sibi-dst 0.3.29.tar.gz → 0.3.31.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/PKG-INFO +2 -1
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/pyproject.toml +2 -1
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/_df_helper.py +4 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/_parquet_artifact.py +8 -6
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +14 -9
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/data_wrapper.py +31 -8
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/filepath_generator.py +1 -1
- sibi_dst-0.3.31/sibi_dst/utils/parquet_saver.py +228 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/storage_manager.py +7 -3
- sibi_dst-0.3.29/sibi_dst/utils/parquet_saver.py +0 -104
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/README.md +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/airflow_manager.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/log_utils.py +0 -0

{sibi_dst-0.3.29 → sibi_dst-0.3.31}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.29
+Version: 0.3.31
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com

@@ -26,6 +26,7 @@ Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
 Requires-Dist: mysqlclient (>=2.2.6,<3.0.0)
 Requires-Dist: nltk (>=3.9.1,<4.0.0)
 Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
+Requires-Dist: osmnx (>=2.0.1,<3.0.0)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
 Requires-Dist: paramiko (>=3.5.0,<4.0.0)
 Requires-Dist: psutil (>=6.1.0,<7.0.0)
{sibi_dst-0.3.29 → sibi_dst-0.3.31}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.29"
+version = "0.3.31"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"

@@ -40,6 +40,7 @@ s3fs = "^2024.12.0"
 nltk = "^3.9.1"
 folium = "^0.19.4"
 geopandas = "^1.0.1"
+osmnx = "^2.0.1"


 [build-system]
{sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/_df_helper.py

@@ -9,6 +9,7 @@ import dask.dataframe as dd
 from dask import delayed, compute
 import pandas as pd
 from pydantic import BaseModel
+import fsspec

 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
 from sibi_dst.utils import Logger

@@ -86,8 +87,11 @@ class DfHelper:
         self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
         self.dt_field = kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
+        self.filesystem = kwargs.pop('filesystem', 'file')
+        self.filesystem_options = kwargs.pop('filesystem_options', {})
         kwargs.setdefault("live", True)
         kwargs.setdefault("logger", self.logger)
+        kwargs.setdefault("fs", fsspec.filesystem('file'))
         self.__post_init(**kwargs)

     def __str__(self):
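The net effect of these two hunks is that DfHelper now accepts `filesystem`, `filesystem_options` and `fs` keyword arguments and defaults `fs` to a local fsspec filesystem. A minimal sketch of how a caller might pass a shared filesystem through the new keywords (the bucket, credentials and backend settings below are hypothetical, not taken from the package):

import fsspec
from sibi_dst.df_helper import DfHelper  # import path assumed from the package layout

# Build one fsspec filesystem and hand it to the helper; kwargs.setdefault("fs", ...)
# keeps a caller-supplied instance instead of the local-filesystem default.
s3 = fsspec.filesystem("s3", key="ACCESS_KEY", secret="SECRET_KEY")  # placeholder credentials
helper = DfHelper(
    backend="parquet",
    parquet_storage_path="s3://example-bucket/data",  # placeholder path
    filesystem="s3",
    filesystem_options={"key": "ACCESS_KEY", "secret": "SECRET_KEY"},
    fs=s3,
)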
{sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/_parquet_artifact.py

@@ -13,7 +13,7 @@ class ParquetArtifact(DfHelper):
         'backend': 'parquet'
     }

-    def __init__(self, data_wrapper_class,
+    def __init__(self, data_wrapper_class, **kwargs):
         self.config = {
             **self.DEFAULT_CONFIG,
             **kwargs,

@@ -39,13 +39,14 @@ class ParquetArtifact(DfHelper):
             raise ValueError('parquet_end_date must be set')

         # Filesystem setup
-        self.filesystem_type = filesystem_type
-        self.filesystem_options = filesystem_options
-        self.fs =
-
+        self.filesystem_type = self.config.setdefault('filesystem_type', 'file')
+        self.filesystem_options = self.config.setdefault('filesystem_options', {})
+        self.fs = self.config.setdefault('fs', None)
+        if self.fs is None:
+            self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+        self.config.setdefault('fs', self.fs)
         # Ensure the directory exists
         self.ensure_directory_exists(self.parquet_storage_path)
-
         super().__init__(**self.config)

     def load(self, **kwargs):

@@ -97,6 +98,7 @@ class ParquetArtifact(DfHelper):
             'history_days_threshold': kwargs.pop('history_days_threshold', 30),
             'max_age_minutes': kwargs.pop('max_age_minutes', 10),
             'show_progress': kwargs.pop('show_progress', False),
+            'fs': self.fs,
             'filesystem_type': self.filesystem_type,
             'filesystem_options': self.filesystem_options,
         }
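With these changes ParquetArtifact pulls `filesystem_type`, `filesystem_options` and `fs` out of its merged config, builds an fsspec filesystem only when none was supplied, and forwards the same `fs` instance to the DataWrapper parameters shown above. A rough usage sketch under assumed names (the wrapper class, dates, storage path and import path are placeholders):

import fsspec
from sibi_dst.df_helper import ParquetArtifact  # import path assumed

fs = fsspec.filesystem("s3", anon=False)  # reuse one filesystem across the artifact
artifact = ParquetArtifact(
    MyDataWrapper,                                        # placeholder data_wrapper_class
    parquet_storage_path="s3://example-bucket/artifact",  # placeholder path
    parquet_start_date="2024-01-01",                      # assumed parameter names
    parquet_end_date="2024-12-31",
    filesystem_type="s3",
    filesystem_options={"anon": False},
    fs=fs,                                                # skips the fsspec.filesystem(...) fallback
)
df = artifact.load()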
{sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/parquet/_parquet_options.py

@@ -13,8 +13,8 @@ from sibi_dst.utils import Logger
 class ParquetConfig(BaseModel):
     load_parquet: bool = False
     parquet_filename: Optional[str] = None
-    parquet_storage_path: Optional[
-    parquet_full_path: Optional[
+    parquet_storage_path: Optional[str] = None
+    parquet_full_path: Optional[str] = None
     parquet_folder_list: Optional[List[str]] = None
     parquet_size_bytes: int = 0
     parquet_max_age_minutes: int = 0

@@ -30,14 +30,17 @@ class ParquetConfig(BaseModel):
         # Configure paths based on fsspec
         if self.logger is None:
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
-        self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(
-            str(self.parquet_storage_path).split("://")[0])
-
+        #self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(
+        #    str(self.parquet_storage_path).split("://")[0])
         # Validation for parquet path
+
+
         if self.parquet_storage_path is None:
             raise ValueError('Parquet storage path must be specified')
+        self.parquet_storage_path = self.parquet_storage_path.rstrip('/')
         if not self.fs.exists(self.parquet_storage_path):
-            raise ValueError('Parquet storage path does not exist')
+            self.fs.mkdirs(self.parquet_storage_path, exist_ok=True)
+            #raise ValueError('Parquet storage path does not exist')
         self.load_parquet = False
         if self.parquet_filename is not None:
             self.parquet_full_path = self.ensure_file_extension(

@@ -57,8 +60,9 @@ class ParquetConfig(BaseModel):
                 raise ValueError('Parquet end date must be greater than start date')

             # Saving to parquet is disabled when start and end dates are provided, as we will load parquet files
-            self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path),
+            self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path), fs=self.fs,
                                                          logger=self.logger).generate_file_paths(start_date, end_date)
+
             self.parquet_size_bytes = self.get_parquet_size_bytes()
             self.load_parquet = True
             # self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0

@@ -84,11 +88,12 @@ class ParquetConfig(BaseModel):
         return total_size

     def load_files(self):
+
         if self.load_parquet:
             if self.parquet_folder_list:
-                return dd.read_parquet(self.parquet_folder_list, engine="pyarrow")
+                return dd.read_parquet(self.parquet_folder_list, engine="pyarrow", filesystem=self.fs)
             else:
-                return dd.read_parquet(self.parquet_full_path, engine="pyarrow")
+                return dd.read_parquet(self.parquet_full_path, engine="pyarrow", filesystem=self.fs)

     @staticmethod
     def ensure_file_extension(filepath: str, extension: str) -> str:
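Passing `filesystem=self.fs` hands the already-configured fsspec filesystem straight to Dask instead of letting `dd.read_parquet` re-infer one from the path strings. The same pattern in isolation (the bucket and folder names are made up, and it assumes a Dask version whose `read_parquet` accepts the `filesystem` argument):

import dask.dataframe as dd
import fsspec

fs = fsspec.filesystem("s3", anon=True)  # hypothetical public bucket
folders = [
    "s3://example-bucket/data/2024-01-01",
    "s3://example-bucket/data/2024-01-02",
]
ddf = dd.read_parquet(folders, engine="pyarrow", filesystem=fs)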
{sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/data_wrapper.py

@@ -22,6 +22,7 @@ class DataWrapper:
                  parquet_filename: str,
                  start_date: Any,
                  end_date: Any,
+                 fs: Optional[fsspec.AbstractFileSystem] = None,
                  filesystem_type: str = "file",
                  filesystem_options: Optional[Dict] = None,
                  verbose: bool = False,

@@ -41,7 +42,7 @@ class DataWrapper:
         self.parquet_filename = parquet_filename
         self.filesystem_type = filesystem_type
         self.filesystem_options = filesystem_options or {}
-        self.fs = fsspec.filesystem(filesystem_type, **self.filesystem_options)
+        self.fs = fs or fsspec.filesystem(filesystem_type, **self.filesystem_options)
         self.verbose = verbose
         self.class_params = class_params or {}
         self.load_params = load_params or {}

@@ -129,23 +130,45 @@ class DataWrapper:
     def is_file_older_than(self, file_path: str) -> bool:
         """
         Check if a file is older than the specified max_age_minutes.
+
+        :param file_path: Path to the file.
+        :return: True if the file is older than max_age_minutes, False otherwise.
         """
         try:
+            # Get file info
             info = self.fs.info(file_path)
-
-
-
-
+
+            # Determine the modification time from available keys
+            file_modification_time = None
+            if "mtime" in info:  # Local filesystem
+                file_modification_time = info["mtime"]
+                file_modification_datetime = datetime.datetime.fromtimestamp(
+                    file_modification_time, tz=datetime.timezone.utc
+                )
+            elif "LastModified" in info:  # S3-compatible filesystem
+                file_modification_datetime = (
+                    info["LastModified"] if isinstance(info["LastModified"], datetime.datetime)
+                    else datetime.datetime.strptime(info["LastModified"], "%Y-%m-%dT%H:%M:%S.%fZ")
+                )
+            else:
+                self.logger.warning(f"Modification time not available for {file_path}.")
+                return True  # Assume file is too old if we cannot determine its age
+
+            # Compare file age
             current_time = datetime.datetime.now(datetime.timezone.utc)
             file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
             self.logger.info(
                 f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
                 f"(threshold: {self.max_age_minutes} minutes)"
             )
-
             return file_age_minutes > self.max_age_minutes
+
         except FileNotFoundError:
-
+            self.logger.warning(f"File {file_path} not found.")
+            return True  # File is considered old if it doesn't exist
+        except Exception as e:
+            self.logger.error(f"Error checking file age for {file_path}: {str(e)}")
+            return True  #

     def process_date(self, date: datetime.date):
         """Process a specific date by regenerating data as necessary."""

@@ -162,7 +185,7 @@ class DataWrapper:
             self.logger.error("No data found for the specified date.")
             return

-        parquet_saver = ParquetSaver(df, folder, self.logger)
+        parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
         parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)

         end_time = datetime.datetime.now()
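The rewritten `is_file_older_than` resolves the modification time from whichever key the backend exposes (`mtime` for local files, `LastModified` for S3-style stores) and treats missing or unknown-age files as stale. A self-contained sketch of that idea using only fsspec (the file path is a throwaway local example):

import datetime
import fsspec

def is_older_than(fs, path, max_age_minutes):
    """Sketch of the age check: True when the file is older than the threshold."""
    try:
        info = fs.info(path)
    except FileNotFoundError:
        return True  # missing files count as stale
    if "mtime" in info:  # local filesystem reports a POSIX timestamp
        modified = datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
    elif "LastModified" in info:  # S3-compatible backends report a datetime
        modified = info["LastModified"]
    else:
        return True  # unknown age: assume it needs regenerating
    age_minutes = (datetime.datetime.now(datetime.timezone.utc) - modified).total_seconds() / 60
    return age_minutes > max_age_minutes

fs = fsspec.filesystem("file")
fs.touch("/tmp/example.parquet")  # throwaway file so the call succeeds
print(is_older_than(fs, "/tmp/example.parquet", max_age_minutes=10))  # freshly touched -> False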
{sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/filepath_generator.py

@@ -91,7 +91,7 @@ class FilePathGenerator:
         if engine == 'dask':
             # Collect individual file paths
             file_pattern = f"{base_dir}/**/*.{self.file_extension}"
-            all_paths = self.fs.glob(file_pattern
+            all_paths = self.fs.glob(file_pattern)

             if not all_paths and self.debug:
                 self.logger.debug(f"No files found with pattern: {file_pattern}")
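For reference, `fs.glob` expands the recursive pattern against whatever filesystem `fs` wraps and returns an empty list when nothing matches; a tiny illustration with an assumed directory and extension:

import fsspec

fs = fsspec.filesystem("file")
file_pattern = "/tmp/parquet-data/**/*.parquet"  # hypothetical base_dir and file_extension
all_paths = fs.glob(file_pattern)                # [] if the tree is empty or missing
print(all_paths)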
sibi_dst-0.3.31/sibi_dst/utils/parquet_saver.py (new file)

@@ -0,0 +1,228 @@
+from pathlib import Path
+from typing import Optional
+
+import pyarrow as pa
+import fsspec
+
+from sibi_dst.utils import Logger
+
+
+class ParquetSaver:
+    def __init__(self, df_result, parquet_storage_path, logger=None, fs=None):
+        """
+        Initialize ParquetSaver.
+        :param df_result: Dask DataFrame to save.
+        :param parquet_storage_path: Base storage path (e.g., "s3://bucket-name/path/").
+        :param logger: Logger instance for logging messages.
+        :param fs: Pre-initialized fsspec filesystem instance. Defaults to 'file' if None.
+        """
+        self.df_result = df_result
+        self.parquet_storage_path = parquet_storage_path.rstrip("/")
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+
+        # Default to the local filesystem if `fs` is not provided
+        self.fs = fs or fsspec.filesystem("file")
+
+    def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
+        """
+        Save the DataFrame to Parquet format.
+        :param parquet_filename: Filename for the Parquet file.
+        :param clear_existing: Whether to clear existing files in the target directory.
+        """
+        full_path = self._construct_full_path(parquet_filename)
+
+        # Ensure directory exists and clear if necessary
+        self._ensure_directory_exists(full_path, clear_existing=clear_existing)
+
+        # Define schema and save DataFrame to Parquet
+        schema = self._define_schema()
+        self._convert_dtypes(schema)
+        self._save_dataframe_to_parquet(full_path, schema)
+
+    def _define_schema(self) -> pa.Schema:
+        """Define a PyArrow schema dynamically based on df_result column types."""
+        pandas_dtype_to_pa = {
+            "object": pa.string(),
+            "string": pa.string(),
+            "Int64": pa.int64(),
+            "int64": pa.int64(),
+            "float64": pa.float64(),
+            "float32": pa.float32(),
+            "bool": pa.bool_(),
+            "boolean": pa.bool_(),  # pandas nullable boolean
+            "datetime64[ns]": pa.timestamp("ns"),
+            "timedelta[ns]": pa.duration("ns"),
+        }
+
+        dtypes = self.df_result.dtypes
+
+        fields = [
+            pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
+            for col, dtype in dtypes.items()
+        ]
+        return pa.schema(fields)
+
+    def _convert_dtypes(self, schema: pa.Schema):
+        """Convert DataFrame columns to match the specified schema."""
+        dtype_mapping = {}
+        for field in schema:
+            col_name = field.name
+            if col_name in self.df_result.columns:
+                if pa.types.is_string(field.type):
+                    dtype_mapping[col_name] = "string"
+                elif pa.types.is_int64(field.type):
+                    dtype_mapping[col_name] = "Int64"
+                elif pa.types.is_float64(field.type):
+                    dtype_mapping[col_name] = "float64"
+                elif pa.types.is_float32(field.type):
+                    dtype_mapping[col_name] = "float32"
+                elif pa.types.is_boolean(field.type):
+                    dtype_mapping[col_name] = "boolean"
+                elif pa.types.is_timestamp(field.type):
+                    dtype_mapping[col_name] = "datetime64[ns]"
+                else:
+                    dtype_mapping[col_name] = "object"
+        self.df_result = self.df_result.astype(dtype_mapping)
+
+    def _construct_full_path(self, parquet_filename: Optional[str]) -> str:
+        """Construct and return the full path for the Parquet file."""
+        parquet_filename = parquet_filename or "default.parquet"
+        return f"{self.parquet_storage_path}/{parquet_filename}"
+
+    def _ensure_directory_exists(self, full_path: str, clear_existing=False):
+        """
+        Ensure that the directory for the path exists, clearing it if specified.
+        :param full_path: Full path for the target file.
+        :param clear_existing: Whether to clear existing files/directories.
+        """
+        directory = "/".join(full_path.split("/")[:-1])
+
+        if self.fs.exists(directory):
+            if clear_existing:
+                self.logger.info(f"Clearing existing directory: {directory}")
+                self.fs.rm(directory, recursive=True)
+        else:
+            self.logger.info(f"Creating directory: {directory}")
+            self.fs.mkdirs(directory, exist_ok=True)
+
+    def _save_dataframe_to_parquet(self, full_path: str, schema: pa.Schema):
+        """Save the DataFrame to Parquet using the specified schema."""
+        if self.fs.exists(full_path):
+            self.logger.info(f"Overwriting existing file: {full_path}")
+            self.fs.rm(full_path, recursive=True)
+
+        self.logger.info(f"Saving Parquet file to: {full_path}")
+        self.df_result.to_parquet(
+            full_path,
+            engine="pyarrow",
+            schema=schema,
+            storage_options=self.fs.storage_options if hasattr(self.fs, "storage_options") else None,
+            write_index=False,
+        )
+
+# from pathlib import Path
+# from typing import Optional
+#
+# import fsspec
+# import pyarrow as pa
+#
+# from sibi_dst.utils import Logger
+#
+#
+# class ParquetSaver:
+#     def __init__(self, df_result, parquet_storage_path, logger=None, fs=None):
+#         # Ensure df_result is a Dask DataFrame
+#         self.fs = fs or fsspec.filesystem("file")
+#         self.df_result = df_result
+#         self.parquet_storage_path = parquet_storage_path
+#         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+#
+#     def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
+#         full_path = self._construct_full_path(parquet_filename)
+#
+#         # We cannot check for empty DataFrame directly with Dask without computation
+#         # Proceed with saving; if the DataFrame is empty, an empty Parquet file will be created
+#
+#         # Ensure directory exists and clear if necessary
+#         self._ensure_directory_exists(full_path, clear_existing=clear_existing)
+#
+#         # Define schema and save DataFrame to Parquet
+#         schema = self._define_schema()
+#         self._convert_dtypes(schema)
+#         self._save_dataframe_to_parquet(full_path, schema)
+#
+#     def _define_schema(self) -> pa.Schema:
+#         """Define a PyArrow schema dynamically based on df_result column types."""
+#         pandas_dtype_to_pa = {
+#             'object': pa.string(),
+#             'string': pa.string(),
+#             'Int64': pa.int64(),
+#             'int64': pa.int64(),
+#             'float64': pa.float64(),
+#             'float32': pa.float32(),
+#             'bool': pa.bool_(),
+#             'boolean': pa.bool_(),  # pandas nullable boolean
+#             'datetime64[ns]': pa.timestamp('ns'),
+#             'timedelta[ns]': pa.duration('ns')
+#         }
+#
+#         dtypes = self.df_result.dtypes  # No need to call .compute()
+#
+#         fields = [
+#             pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
+#             for col, dtype in dtypes.items()
+#         ]
+#         return pa.schema(fields)
+#
+#     def _convert_dtypes(self, schema: pa.Schema):
+#         """Convert DataFrame columns to match the specified schema."""
+#         dtype_mapping = {}
+#         for field in schema:
+#             col_name = field.name
+#             if col_name in self.df_result.columns:
+#                 if pa.types.is_string(field.type):
+#                     dtype_mapping[col_name] = 'string'
+#                 elif pa.types.is_int64(field.type):
+#                     dtype_mapping[col_name] = 'Int64'  # pandas nullable integer
+#                 elif pa.types.is_float64(field.type):
+#                     dtype_mapping[col_name] = 'float64'
+#                 elif pa.types.is_float32(field.type):
+#                     dtype_mapping[col_name] = 'float32'
+#                 elif pa.types.is_boolean(field.type):
+#                     dtype_mapping[col_name] = 'boolean'  # pandas nullable boolean
+#                 elif pa.types.is_timestamp(field.type):
+#                     dtype_mapping[col_name] = 'datetime64[ns]'
+#                 else:
+#                     dtype_mapping[col_name] = 'object'  # Fallback to object
+#         # Convert dtypes
+#         self.df_result = self.df_result.astype(dtype_mapping)
+#
+#     def _construct_full_path(self, parquet_filename: Optional[str]) -> Path:
+#         """Construct and return the full path for the Parquet file."""
+#         _, base_path = fsspec.core.url_to_fs(self.parquet_storage_path)
+#         parquet_filename = parquet_filename or "default.parquet"
+#         return Path(base_path) / parquet_filename
+#
+#     @staticmethod
+#     def _ensure_directory_exists(full_path: Path, clear_existing=False):
+#         """Ensure that the directory for the path exists, clearing it if specified."""
+#         fs, _ = fsspec.core.url_to_fs(str(full_path))
+#         directory = str(full_path.parent)
+#
+#         if fs.exists(directory):
+#             if clear_existing:
+#                 fs.rm(directory, recursive=True)
+#         else:
+#             fs.mkdirs(directory, exist_ok=True)
+#
+#     def _save_dataframe_to_parquet(self, full_path: Path, schema: pa.Schema):
+#         """Save the DataFrame to Parquet using the specified schema."""
+#         fs, _ = fsspec.core.url_to_fs(str(full_path))
+#         print(f"Saving to {str(full_path)}")
+#         if fs.exists(str(full_path)):
+#             fs.rm(str(full_path), recursive=True)
+#
+#         # Save the Dask DataFrame to Parquet
+#         self.df_result.to_parquet(
+#             str(full_path), engine="pyarrow", schema=schema, write_index=False
+#         )
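A rough end-to-end sketch of driving the new class (the toy DataFrame, target directory and filename are invented, and the import path is assumed from the package layout):

import dask.dataframe as dd
import fsspec
import pandas as pd
from sibi_dst.utils import ParquetSaver  # import path assumed

ddf = dd.from_pandas(pd.DataFrame({"id": [1, 2], "name": ["a", "b"]}), npartitions=1)
fs = fsspec.filesystem("file")  # any fsspec filesystem (e.g. s3fs) can be passed instead
saver = ParquetSaver(ddf, parquet_storage_path="/tmp/sibi-dst-demo", fs=fs)
saver.save_to_parquet("example.parquet", clear_existing=True)

Because the writer forwards `self.fs.storage_options` to Dask, the same call pattern can target remote stores as long as the filesystem was created with the right credentials.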
{sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/storage_manager.py

@@ -7,11 +7,12 @@ class StorageManager:
     def __init__(self, storage_path, fs_type="file", fs_options=None):
         """
         Initializes the StorageManager with the base storage path and file system settings.
-        :param storage_path: Base path for the storage.
+        :param storage_path: Base path for the storage (e.g., "s3://my-bucket").
         :param fs_type: File system type (e.g., "file", "s3").
         :param fs_options: Dictionary of options for fsspec file system (e.g., credentials).
         """
-
+        # Ensure the storage_path ends with a slash for consistency
+        self.storage_path = storage_path.rstrip("/")
         self.fs_type = fs_type
         self.fs_options = fs_options or {}
         self.fs = fsspec.filesystem(fs_type, **self.fs_options)

@@ -33,6 +34,7 @@ class StorageManager:
         :param dirs_to_create: List of subdirectories to create.
         :param clear_existing: Whether to clear existing directories.
         """
+        print(f"Setting up directories under: {base_path}")
         if clear_existing:
             print(f"Warning: All existing contents in {base_path} will be removed.")
             if self.fs.exists(base_path):

@@ -44,6 +46,7 @@ class StorageManager:
         # Create subdirectories
         for sub_directory in dirs_to_create:
             sub_path = self.join_paths(base_path, sub_directory)
+            print(f"Creating directory: {sub_path}")
             if clear_existing and self.fs.exists(sub_path):
                 self.fs.rm(sub_path, recursive=True)
             self.fs.mkdirs(sub_path, exist_ok=True)

@@ -59,6 +62,7 @@ class StorageManager:
         # Ensure directories exist (optionally clear existing ones)
         for depot, sub_directories in depots.items():
             depot_path = self.join_paths(self.storage_path, depot)
+            print(f"Rebuilding depot at: {depot_path}")
             self.setup_directories(depot_path, sub_directories, clear_existing=clear_existing)

         # Generate depot_paths dictionary

@@ -86,4 +90,4 @@ class StorageManager:
         """
         print("Rebuilding depot structure...")
         self.rebuild_depot_paths(depots, clear_existing=clear_existing)
-        print("Rebuild complete.")
+        print("Rebuild complete.")
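The docstring tweak and `rstrip("/")` mostly affect how depot paths are joined, while the added prints trace each directory as it is (re)built. A short hedged sketch of exercising it (the bucket, depot layout and import path are invented or assumed):

from sibi_dst.utils import StorageManager  # import path assumed

manager = StorageManager("s3://example-bucket/warehouse/", fs_type="s3",
                         fs_options={"anon": False})       # trailing slash is stripped
depots = {"sales": ["bronze", "silver", "gold"]}            # hypothetical depot layout
manager.rebuild_depot_paths(depots, clear_existing=False)   # prints each path it touches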
sibi_dst-0.3.29/sibi_dst/utils/parquet_saver.py (deleted file)

@@ -1,104 +0,0 @@
-from pathlib import Path
-from typing import Optional
-
-import fsspec
-import pyarrow as pa
-
-from sibi_dst.utils import Logger
-
-
-class ParquetSaver:
-    def __init__(self, df_result, parquet_storage_path, logger=None):
-        # Ensure df_result is a Dask DataFrame
-        self.df_result = df_result
-        self.parquet_storage_path = parquet_storage_path
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-
-    def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
-        full_path = self._construct_full_path(parquet_filename)
-
-        # We cannot check for empty DataFrame directly with Dask without computation
-        # Proceed with saving; if the DataFrame is empty, an empty Parquet file will be created
-
-        # Ensure directory exists and clear if necessary
-        self._ensure_directory_exists(full_path, clear_existing=clear_existing)
-
-        # Define schema and save DataFrame to Parquet
-        schema = self._define_schema()
-        self._convert_dtypes(schema)
-        self._save_dataframe_to_parquet(full_path, schema)
-
-    def _define_schema(self) -> pa.Schema:
-        """Define a PyArrow schema dynamically based on df_result column types."""
-        pandas_dtype_to_pa = {
-            'object': pa.string(),
-            'string': pa.string(),
-            'Int64': pa.int64(),
-            'int64': pa.int64(),
-            'float64': pa.float64(),
-            'float32': pa.float32(),
-            'bool': pa.bool_(),
-            'boolean': pa.bool_(),  # pandas nullable boolean
-            'datetime64[ns]': pa.timestamp('ns'),
-            'timedelta[ns]': pa.duration('ns')
-        }
-
-        dtypes = self.df_result.dtypes  # No need to call .compute()
-
-        fields = [
-            pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
-            for col, dtype in dtypes.items()
-        ]
-        return pa.schema(fields)
-
-    def _convert_dtypes(self, schema: pa.Schema):
-        """Convert DataFrame columns to match the specified schema."""
-        dtype_mapping = {}
-        for field in schema:
-            col_name = field.name
-            if col_name in self.df_result.columns:
-                if pa.types.is_string(field.type):
-                    dtype_mapping[col_name] = 'string'
-                elif pa.types.is_int64(field.type):
-                    dtype_mapping[col_name] = 'Int64'  # pandas nullable integer
-                elif pa.types.is_float64(field.type):
-                    dtype_mapping[col_name] = 'float64'
-                elif pa.types.is_float32(field.type):
-                    dtype_mapping[col_name] = 'float32'
-                elif pa.types.is_boolean(field.type):
-                    dtype_mapping[col_name] = 'boolean'  # pandas nullable boolean
-                elif pa.types.is_timestamp(field.type):
-                    dtype_mapping[col_name] = 'datetime64[ns]'
-                else:
-                    dtype_mapping[col_name] = 'object'  # Fallback to object
-        # Convert dtypes
-        self.df_result = self.df_result.astype(dtype_mapping)
-
-    def _construct_full_path(self, parquet_filename: Optional[str]) -> Path:
-        """Construct and return the full path for the Parquet file."""
-        _, base_path = fsspec.core.url_to_fs(self.parquet_storage_path)
-        parquet_filename = parquet_filename or "default.parquet"
-        return Path(base_path) / parquet_filename
-
-    @staticmethod
-    def _ensure_directory_exists(full_path: Path, clear_existing=False):
-        """Ensure that the directory for the path exists, clearing it if specified."""
-        fs, _ = fsspec.core.url_to_fs(str(full_path))
-        directory = str(full_path.parent)
-
-        if fs.exists(directory):
-            if clear_existing:
-                fs.rm(directory, recursive=True)
-        else:
-            fs.mkdirs(directory, exist_ok=True)
-
-    def _save_dataframe_to_parquet(self, full_path: Path, schema: pa.Schema):
-        """Save the DataFrame to Parquet using the specified schema."""
-        fs, _ = fsspec.core.url_to_fs(str(full_path))
-        if fs.exists(str(full_path)):
-            fs.rm(str(full_path), recursive=True)
-
-        # Save the Dask DataFrame to Parquet
-        self.df_result.to_parquet(
-            str(full_path), engine="pyarrow", schema=schema, write_index=False
-        )