sibi-dst 0.3.20__tar.gz → 0.3.22__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries. Where the registry's diff view truncated the content of a removed line, that content is shown below as `…`.
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/PKG-INFO +1 -1
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/pyproject.toml +1 -1
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/__init__.py +1 -1
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/__init__.py +2 -2
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/_df_helper.py +34 -33
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/_parquet_artifact.py +4 -1
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/_parquet_reader.py +2 -1
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/__init__.py +1 -2
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/_django_db_connection.py +1 -1
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/_django_load_from_db.py +6 -8
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/_django_sql_model_builder.py +5 -5
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -1
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/_io_dask_alt.py +5 -4
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/http/__init__.py +2 -2
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/http/_http_config.py +6 -3
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/parquet/__init__.py +3 -3
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py +4 -2
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +12 -7
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py +3 -1
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py +2 -3
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py +2 -2
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py +5 -3
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py +5 -4
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py +13 -11
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/core/_defaults.py +9 -6
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/core/_filter_handler.py +7 -4
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/core/_params_config.py +3 -2
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/core/_query_config.py +0 -2
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/utils/__init__.py +6 -5
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/utils/_airflow_manager.py +4 -3
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/utils/_clickhouse_writer.py +16 -13
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/utils/_data_wrapper.py +82 -16
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/utils/_date_utils.py +11 -5
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/utils/_df_utils.py +9 -5
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/utils/_file_utils.py +3 -1
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/utils/_filepath_generator.py +4 -2
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/utils/_log_utils.py +1 -1
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/utils/_parquet_saver.py +0 -2
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/README.md +0 -0
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/__init__.py +2 -2
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py +3 -3
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_model/__init__.py +1 -1
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/utils/_credentials.py +1 -1
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/utils/_data_utils.py +0 -0
- {sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/utils/_storage_manager.py +1 -1
{sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/_df_helper.py
RENAMED

```diff
@@ -27,11 +27,12 @@ warnings.filterwarnings(
     category=UserWarning,
 )
 
+
 class DfHelper:
     df: Union[dd.DataFrame, pd.DataFrame] = None
     backend_django: Optional[DjangoConnectionConfig] = None
-    …
-    …
+    _backend_query: Optional[QueryConfig] = None
+    _backend_params: Optional[ParamsConfig] = None
     backend_parquet: Optional[ParquetConfig] = None
     backend_http: Optional[HttpConfig] = None
     backend_sqlalchemy: Optional[SqlAlchemyConnectionConfig] = None
@@ -45,7 +46,7 @@ class DfHelper:
         kwargs = {**self.default_config.copy(), **kwargs}
         self.backend = backend
         self.debug = kwargs.setdefault("debug", False)
-        self.logger = kwargs.get("logger",Logger.default_logger(logger_name=self.__class__.__name__))
+        self.logger = kwargs.get("logger", Logger.default_logger(logger_name=self.__class__.__name__))
         # Configure logger level
         self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
         self.logger.debug("Logger initialized in DEBUG mode.")
@@ -54,15 +55,15 @@ class DfHelper:
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         kwargs.setdefault("live", True)
         kwargs.setdefault("logger", self.logger)
-        self.…
+        self.__post_init(**kwargs)
 
     def __str__(self):
         return self.__class__.__name__
 
-    def …
+    def __post_init(self, **kwargs):
         self.logger.debug(f"backend used: {self.backend}")
-        self.…
-        self.…
+        self._backend_query = self.__get_config(QueryConfig, kwargs)
+        self._backend_params = self.__get_config(ParamsConfig, kwargs)
         if self.backend == 'django_db':
             self.backend_django = self.__get_config(DjangoConnectionConfig, kwargs)
         elif self.backend == 'parquet':
@@ -89,42 +90,42 @@ class DfHelper:
 
     def load(self, **options):
         # this will be the universal method to load data from a df irrespective of the backend
-        df = self.…
+        df = self.__load(**options)
         if self.as_pandas:
             return df.compute()
         return df
 
-    def …
+    def __load(self, **options):
 
         if self.backend == 'django_db':
-            self.…
-            return self.…
+            self._backend_params.parse_params(options)
+            return self.__load_from_db(**options)
         elif self.backend == 'sqlalchemy':
-            self.…
-            return self.…
+            self._backend_params.parse_params(options)
+            return self.__load_from_sqlalchemy(**options)
         elif self.backend == 'parquet':
-            return self.…
+            return self.__load_from_parquet(**options)
         elif self.backend == 'http':
             if asyncio.get_event_loop().is_running():
                 self.logger.debug("Running as a task from an event loop")
-                return asyncio.create_task(self.…
+                return asyncio.create_task(self.__load_from_http(**options))
             else:
                 self.logger.debug("Regular asyncio run...")
-                return asyncio.run(self.…
+                return asyncio.run(self.__load_from_http(**options))
 
-    def …
+    def __load_from_sqlalchemy(self, **options):
         try:
             options.setdefault("debug", self.debug)
             db_loader = SqlAlchemyLoadFromDb(
                 self.backend_sqlalchemy,
-                self.…
-                self.…
+                self._backend_query,
+                self._backend_params,
                 self.logger,
                 **options
             )
             self.df = db_loader.build_and_load()
-            self.…
-            self.…
+            self.__process_loaded_data()
+            self.__post_process_df()
             self.logger.debug("Data successfully loaded from sqlalchemy database.")
         except Exception as e:
             self.logger.debug(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
@@ -132,19 +133,19 @@ class DfHelper:
 
         return self.df
 
-    def …
+    def __load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         try:
             options.setdefault("debug", self.debug)
             db_loader = DjangoLoadFromDb(
                 self.backend_django,
-                self.…
-                self.…
+                self._backend_query,
+                self._backend_params,
                 self.logger,
                 **options
             )
             self.df = db_loader.build_and_load()
-            self.…
-            self.…
+            self.__process_loaded_data()
+            self.__post_process_df()
             self.logger.debug("Data successfully loaded from django database.")
         except Exception as e:
             self.logger.debug(f"Failed to load data from django database: {e}")
@@ -152,7 +153,7 @@ class DfHelper:
 
         return self.df
 
-    async def …
+    async def __load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """Delegate asynchronous HTTP data loading to HttpDatabackend plugin."""
         if not self.backend_http:
             self.logger.debug("HTTP plugin not configured properly.")
@@ -164,12 +165,12 @@ class DfHelper:
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
         return self.df
 
-    def …
+    def __post_process_df(self):
         """
         Efficiently process the DataFrame by filtering, renaming, and setting indices.
         Optimized for large datasets with Dask compatibility.
         """
-        df_params = self.…
+        df_params = self._backend_params.df_params
         fieldnames = df_params.get("fieldnames", None)
         index_col = df_params.get("index_col", None)
         datetime_index = df_params.get("datetime_index", False)
@@ -203,10 +204,10 @@ class DfHelper:
 
         self.logger.debug("Post-processing of DataFrame completed.")
 
-    def …
+    def __process_loaded_data(self):
         self.logger.debug(f"Type of self.df: {type(self.df)}")
         if self.df.map_partitions(len).compute().sum() > 0:
-            field_map = self.…
+            field_map = self._backend_params.field_map or {}
             if isinstance(field_map, dict):
                 rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
                 missing_columns = [k for k in field_map.keys() if k not in self.df.columns]
@@ -237,7 +238,7 @@ class DfHelper:
             cs.save_to_clickhouse(self.df)
             self.logger.debug("Save to ClickHouse completed.")
 
-    def …
+    def __load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = self.backend_parquet.load_files()
         if options:
             """
@@ -274,7 +275,7 @@ class DfHelper:
             raise ValueError("The 'start' date cannot be later than the 'end' date.")
 
         # Reverse map to original field name
-        field_map = getattr(self.…
+        field_map = getattr(self._backend_params, 'field_map', {}) or {}
         reverse_map = {v: k for k, v in field_map.items()}
         mapped_field = reverse_map.get(dt_field, dt_field)
 
```
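The renamed private helpers above are only reachable through the public `load()` entry point, which dispatches on the `backend` value. A minimal usage sketch; the constructor signature and the parquet config kwargs are assumptions inferred from this diff, not a documented API:

```python
# Sketch only: kwargs below are assumptions based on the fields visible in
# this diff (ParquetConfig, debug/as_pandas handling), not a documented API.
from sibi_dst.df_helper import DfHelper

helper = DfHelper(
    backend="parquet",                       # routes __load() to __load_from_parquet()
    parquet_storage_path="/data/warehouse",  # hypothetical path consumed by ParquetConfig
    parquet_filename="events.parquet",       # hypothetical filename
    as_pandas=True,                          # load() returns df.compute() instead of a Dask frame
    debug=True,                              # raises the logger level to DEBUG, per the hunk above
)
df = helper.load()
```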
{sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/_parquet_artifact.py
RENAMED

```diff
@@ -1,6 +1,8 @@
 from typing import Optional, Any, Dict
+
 import dask.dataframe as dd
 import fsspec
+
 from sibi_dst.df_helper import DfHelper
 from sibi_dst.utils import DataWrapper
 from sibi_dst.utils import DateUtils
@@ -106,9 +108,10 @@ class ParquetArtifact(DfHelper):
             'parquet_start_date': start_date.strftime('%Y-%m-%d'),
             'parquet_end_date': end_date.strftime('%Y-%m-%d'),
         }
+
     def ensure_directory_exists(self, path: str) -> None:
         """Ensure the directory exists in the specified filesystem."""
         try:
             self.fs.makedirs(path, exist_ok=True)
         except Exception as e:
-            raise ValueError(f"Error creating directory {path} in filesystem {self.filesystem_type}: {e}")
\ No newline at end of file
+            raise ValueError(f"Error creating directory {path} in filesystem {self.filesystem_type}: {e}")
```
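`ensure_directory_exists` wraps fsspec's `makedirs`; a standalone equivalent on the local filesystem, with an illustrative path:

```python
import fsspec

# Same call pattern as ensure_directory_exists above; the path is illustrative.
fs = fsspec.filesystem("file")
fs.makedirs("/tmp/sibi_dst_demo", exist_ok=True)  # no-op when the directory already exists
```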
{sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/_parquet_reader.py
RENAMED

```diff
@@ -5,6 +5,7 @@ import fsspec
 
 from sibi_dst.df_helper import DfHelper
 
+
 class ParquetReader(DfHelper):
     DEFAULT_CONFIG = {
         'backend': 'parquet'
@@ -46,4 +47,4 @@ class ParquetReader(DfHelper):
             info = self.fs.info(self.parquet_storage_path)
             return info['type'] == 'directory'
         except FileNotFoundError:
-            return False
\ No newline at end of file
+            return False
```
{sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/__init__.py
RENAMED

```diff
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from ._django_db_connection import DjangoConnectionConfig
 from ._io_dask import ReadFrameDask
-
+from ._django_db_connection import DjangoConnectionConfig
 from ._django_load_from_db import DjangoLoadFromDb
 
 __all__ = [
```
{sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/_django_load_from_db.py
RENAMED

```diff
@@ -2,12 +2,12 @@ import warnings
 
 import dask.dataframe as dd
 import pandas as pd
-from IPython.core.hooks import deprecated
 from django.db.models import Q
 
 from sibi_dst.df_helper.backends.django import ReadFrameDask
-from sibi_dst.utils import Logger
 from sibi_dst.df_helper.core import django_field_conversion_map_dask
+from sibi_dst.utils import Logger
+
 
 class DjangoLoadFromDb:
     df: dd.DataFrame
@@ -28,17 +28,16 @@ class DjangoLoadFromDb:
 
     def build_and_load(self):
         self.df = self._build_and_load()
-        #self.df = self._convert_columns(self.df)
+        # self.df = self._convert_columns(self.df)
         return self.df
 
-
     def _build_and_load(self) -> dd.DataFrame:
         query = self.connection_config.model.objects.using(self.connection_config.connection_name)
         if not self.params_config.filters:
             # IMPORTANT: if no filters are provided show only the first n_records
             # this is to prevent loading the entire table by mistake
             n_records = self.query_config.n_records if self.query_config.n_records else 100
-            queryset=query.all()[:n_records]
+            queryset = query.all()[:n_records]
         else:
             q_objects = self.__build_query_objects(self.params_config.filters, self.query_config.use_exclude)
             queryset = query.filter(q_objects)
@@ -99,13 +98,12 @@ class DjangoLoadFromDb:
         # Simplified loop to apply conversions partition-wise
         for field_name, field_type in field_type_map.items():
             if field_name not in df.columns:
-
                 self.logger.debug(f"Column '{field_name}' not found in DataFrame columns.")
                 continue
 
             conversion_func = django_field_conversion_map_dask.get(field_type)
             if not conversion_func:
-                message=f"Field type '{field_type}' not found in conversion_map."
+                message = f"Field type '{field_type}' not found in conversion_map."
                 self.logger.debug(message)
                 continue
 
@@ -130,4 +128,4 @@ class DjangoLoadFromDb:
         except Exception as e:
             self.logger.debug(f"Failed to queue conversion for column '{field_name}': {str(e)}")
 
-        return df
\ No newline at end of file
+        return df
```
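The filtered branch above delegates to `__build_query_objects`, which is not shown in this diff. A hypothetical illustration of how a Django-style filters dict typically maps onto `Q` objects:

```python
from django.db.models import Q

# Hypothetical illustration only; the actual __build_query_objects
# implementation is not part of this diff.
filters = {"status": "active", "created_at__gte": "2024-01-01"}
q_objects = Q()
for lookup, value in filters.items():
    q_objects &= Q(**{lookup: value})   # AND all lookups together
# queryset = query.filter(q_objects)   # or query.exclude(q_objects) when use_exclude is set
```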
{sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/_django_sql_model_builder.py
RENAMED

```diff
@@ -219,9 +219,9 @@ class DjangoSqlModelBuilder:
             if field_type == "AutoField(":
                 continue
             elif (
-                …
-                …
-                …
+                field_type
+                == connection.features.introspected_field_types["AutoField"]
+                + "("
             ):
                 comment_notes.append("AutoField?")
 
@@ -240,8 +240,8 @@ class DjangoSqlModelBuilder:
 
             # Add comment.
             if (
-                …
-                …
+                hasattr(connection.features, "supports_comments")
+                and row.comment
             ):
                 extra_params["db_comment"] = row.comment
             # if connection.features.supports_comments and row.comment:
```
{sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/_io_dask_alt.py
RENAMED

```diff
@@ -1,13 +1,14 @@
 import itertools
+
 import dask.dataframe as dd
+import django
 import pandas as pd
-
 from django.core.cache import cache
+from django.core.exceptions import FieldDoesNotExist
 from django.db import models
 from django.db.models import Field
 from django.utils.encoding import force_str as force_text
-
-from django.core.exceptions import FieldDoesNotExist
+
 
 class ReadFrameDask:
     FieldDoesNotExist = (
@@ -185,4 +186,4 @@ class ReadFrameDask:
         if verbose:
             self.update_with_verbose(dask_df, fieldnames, qs.model._meta.fields)
 
-        return dask_df
\ No newline at end of file
+        return dask_df
```
{sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/http/_http_config.py
RENAMED

```diff
@@ -1,10 +1,13 @@
-from pydantic import BaseModel, HttpUrl, Field, ConfigDict, SecretStr
 from typing import Dict, Optional, Any
-
+
 import dask.dataframe as dd
+import httpx
 import pandas as pd
+from pydantic import BaseModel, HttpUrl, Field, ConfigDict, SecretStr
+
 from sibi_dst.utils import Logger
 
+
 class HttpConfig(BaseModel):
     base_url: HttpUrl
     params: Optional[Dict[str, Any]] = Field(default_factory=dict)
@@ -44,4 +47,4 @@ class HttpConfig(BaseModel):
             raise
         except ValueError as e:
             self.logger.debug(f"Error parsing JSON data: {e}")
-            raise
\ No newline at end of file
+            raise
```
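`HttpConfig` is a Pydantic model, so instantiation validates `base_url` as an `HttpUrl`. A sketch using only the two fields visible in this hunk; any other fields and the fetch coroutine's name are not shown in the diff:

```python
# Sketch only: base_url and params are the fields visible above; everything
# else about HttpConfig is outside this diff.
config = HttpConfig(base_url="https://api.example.com/v1/data", params={"page": 1})
```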
{sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py
RENAMED

```diff
@@ -1,7 +1,9 @@
-import pandas as pd
 import dask.dataframe as dd
+import pandas as pd
+
 from sibi_dst.utils import Logger
 
+
 class ParquetFilterHandler(object):
     def __init__(self, logger=None):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -92,4 +94,4 @@ class ParquetFilterHandler(object):
         else:
             raise ValueError(f"Unsupported operation: {operation}")
 
-        return df
\ No newline at end of file
+        return df
```
{sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/parquet/_parquet_options.py
RENAMED

```diff
@@ -1,12 +1,15 @@
+import datetime
 from pathlib import Path
 from typing import Optional, List
+
 import dask.dataframe as dd
-from pydantic import BaseModel, model_validator, DirectoryPath, FilePath, ConfigDict
 import fsspec
-import datetime
+from pydantic import BaseModel, model_validator, DirectoryPath, FilePath, ConfigDict
+
 from sibi_dst.utils import FilePathGenerator
 from sibi_dst.utils import Logger
 
+
 class ParquetConfig(BaseModel):
     load_parquet: bool = False
     parquet_filename: Optional[str] = None
@@ -27,7 +30,8 @@ class ParquetConfig(BaseModel):
         # Configure paths based on fsspec
         if self.logger is None:
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
-        self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(str(self.parquet_storage_path).split("://")[0])
+        self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(
+            str(self.parquet_storage_path).split("://")[0])
 
         # Validation for parquet path
         if self.parquet_storage_path is None:
@@ -37,7 +41,8 @@ class ParquetConfig(BaseModel):
             self.load_parquet = False
         if self.parquet_filename is not None:
             self.parquet_full_path = self.ensure_file_extension(
-                filepath=self.fs.sep.join([str(self.parquet_storage_path), str(self.parquet_filename)]), extension='parquet'
+                filepath=self.fs.sep.join([str(self.parquet_storage_path), str(self.parquet_filename)]),
+                extension='parquet'
             )
             self.parquet_is_recent = self.is_file_recent()
             self.load_parquet = self.parquet_is_recent and self.fs.exists(self.parquet_full_path)
@@ -52,10 +57,11 @@ class ParquetConfig(BaseModel):
             raise ValueError('Parquet end date must be greater than start date')
 
             # Saving to parquet is disabled when start and end dates are provided, as we will load parquet files
-            self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path), logger=self.logger).generate_file_paths(start_date, end_date)
+            self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path),
+                                                         logger=self.logger).generate_file_paths(start_date, end_date)
             self.parquet_size_bytes = self.get_parquet_size_bytes()
             self.load_parquet = True
-            #self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
+            # self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
         elif self.parquet_end_date is not None:
             raise ValueError('Parquet start date must be specified if end date is provided')
 
@@ -88,4 +94,3 @@ class ParquetConfig(BaseModel):
     def ensure_file_extension(filepath: str, extension: str) -> str:
         path = Path(filepath)
         return str(path.with_suffix(f".{extension}")) if path.suffix != f".{extension}" else filepath
-
```
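`ensure_file_extension` leans on `pathlib.Path.with_suffix`, which both appends a missing extension and replaces a different one:

```python
from pathlib import Path

# The two cases ensure_file_extension handles, with illustrative filenames.
print(Path("sales_2024").with_suffix(".parquet"))      # sales_2024.parquet
print(Path("sales_2024.csv").with_suffix(".parquet"))  # sales_2024.parquet
```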
{sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py
RENAMED

```diff
@@ -74,7 +74,9 @@ class SQLAlchemyDask:
             deprecated specific filter handling to a generic one
             #self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
             """
-            self.query = FilterHandler(backend="sqlalchemy", logger=self.logger).apply_filters(self.query, model=self.model, filters=self.filters)
+            self.query = FilterHandler(backend="sqlalchemy", logger=self.logger).apply_filters(self.query,
+                                                                                               model=self.model,
+                                                                                               filters=self.filters)
         else:
             n_records = 100
             self.query = self.query.limit(n_records)
```
{sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py
RENAMED

```diff
@@ -52,7 +52,6 @@ class SqlAlchemyFilterHandler:
                 return [datetime.date.fromisoformat(v) for v in value]
             return value
 
-
         def handle_date_operator(column, date_op):
             """
             Handle filtering on specific datetime parts (e.g., year, month).
@@ -93,7 +92,7 @@ class SqlAlchemyFilterHandler:
 
             # Get the column from the model
             column = getattr(model, field_name, None)
-            #column = model.__table__.columns.get(field_name)
+            # column = model.__table__.columns.get(field_name)
             if not column:
                 raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")
 
@@ -117,4 +116,4 @@ class SqlAlchemyFilterHandler:
             else:
                 raise ValueError(f"Unsupported operation: {operation}")
 
-        return query
\ No newline at end of file
+        return query
```
{sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py
RENAMED

```diff
@@ -1,5 +1,4 @@
 import dask.dataframe as dd
-import dask_expr
 import pandas as pd
 
 from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
@@ -7,6 +6,7 @@ from sibi_dst.utils import Logger
 from ._io_sqlalchemy_dask import SQLAlchemyDask
 from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
 
+
 class SqlAlchemyLoadFromDb:
     df: dd.DataFrame = None
 
@@ -52,7 +52,7 @@ class SqlAlchemyLoadFromDb:
 
         if self.df is None or len(self.df.head().index) == 0:
             self.logger.debug("Query returned no results.")
-            dask_df=dd.from_pandas(pd.DataFrame(), npartitions=1)
+            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
             return dask_df
         return self.df
```
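The no-results path above returns a single-partition, empty Dask DataFrame rather than `None`; the same pattern in isolation:

```python
import dask.dataframe as dd
import pandas as pd

# An empty one-partition Dask DataFrame, as returned by the guard above.
empty = dd.from_pandas(pd.DataFrame(), npartitions=1)
print(empty.compute().empty)  # True
```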
{sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py
RENAMED

```diff
@@ -1,15 +1,17 @@
 import re
+
 from sqlalchemy import MetaData, Table
 from sqlalchemy.orm import declarative_base, relationship
 
 # Base class for dynamically created models
 Base = declarative_base()
 
-
 apps_label = "datacubes"
 
+
 class SqlAlchemyModelBuilder:
     _model_cache = {}  # Local cache for model classes
+
     def __init__(self, engine, table_name):
         """
         Initialize the model builder with a database engine and specific table.
@@ -58,7 +60,7 @@ class SqlAlchemyModelBuilder:
 
         # Add columns and relationships to the model
         attrs.update(columns)
-        #self.add_relationships(attrs, self.table)
+        # self.add_relationships(attrs, self.table)
         model = Base.registry._class_registry.get(self.class_name)
         if not model:
             model = type(self.class_name, (Base,), attrs)
@@ -126,4 +128,4 @@ class SqlAlchemyModelBuilder:
         column_name = re.sub(r"\W|^(?=\d)", "_", column_name)
         if column_name in {"class", "def", "return", "yield", "global"}:
             column_name += "_field"
-        return column_name
\ No newline at end of file
+        return column_name
```
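The column-name sanitizer in the last hunk is easy to check by hand: `re.sub(r"\W|^(?=\d)", "_", ...)` replaces non-word characters and prefixes names that start with a digit, and the keyword deny-list appends `_field`:

```python
import re

def sanitize(column_name: str) -> str:
    # Same rules as the builder above: non-word chars become "_",
    # a leading digit gets a "_" prefix, keywords get a "_field" suffix.
    column_name = re.sub(r"\W|^(?=\d)", "_", column_name)
    if column_name in {"class", "def", "return", "yield", "global"}:
        column_name += "_field"
    return column_name

print(sanitize("2nd-order value"))  # _2nd_order_value
print(sanitize("class"))            # class_field
```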
{sibi_dst-0.3.20 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py
RENAMED

```diff
@@ -1,9 +1,10 @@
+import datetime
 from typing import Any, Optional, Dict, Type
+
 from pydantic import BaseModel, model_validator
-from sqlmodel import SQLModel, Field, create_engine
 from sqlalchemy import inspect
-from sqlalchemy.sql import text
 from sqlalchemy.exc import OperationalError
+from sqlalchemy.sql import text
 from sqlalchemy.sql.sqltypes import (
     Integer,
     String,
@@ -14,7 +15,7 @@ from sqlalchemy.sql.sqltypes import (
     Time,
     Numeric,
 )
-import datetime
+from sqlmodel import SQLModel, Field, create_engine
 
 
 class SQLModelConnectionConfig(BaseModel):
@@ -130,4 +131,4 @@ class SQLModelConnectionConfig(BaseModel):
     @staticmethod
     def _table2model(table_name: str) -> str:
         """Convert table name to PascalCase model name."""
-        return "".join(word.capitalize() for word in table_name.split("_"))
\ No newline at end of file
+        return "".join(word.capitalize() for word in table_name.split("_"))
```