sibi-dst 0.3.19__tar.gz → 0.3.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/PKG-INFO +1 -1
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/pyproject.toml +1 -1
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/__init__.py +1 -1
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/__init__.py +2 -2
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/_df_helper.py +34 -33
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/_parquet_artifact.py +4 -1
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/_parquet_reader.py +2 -1
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/__init__.py +1 -2
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_django_db_connection.py +1 -1
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_django_load_from_db.py +6 -8
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_django_sql_model_builder.py +5 -5
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -1
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_io_dask_alt.py +5 -4
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/http/__init__.py +2 -2
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/http/_http_config.py +6 -3
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/parquet/__init__.py +3 -3
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py +4 -2
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +12 -7
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py +3 -1
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py +2 -3
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py +2 -2
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py +5 -3
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py +5 -4
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py +13 -11
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/core/_defaults.py +9 -6
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/core/_filter_handler.py +7 -4
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/core/_params_config.py +3 -2
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/core/_query_config.py +0 -2
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/__init__.py +10 -9
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_airflow_manager.py +4 -3
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_clickhouse_writer.py +16 -13
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_data_wrapper.py +7 -4
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_date_utils.py +11 -5
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_df_utils.py +9 -5
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_file_utils.py +3 -1
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_filepath_generator.py +4 -2
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_log_utils.py +1 -1
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_parquet_saver.py +0 -2
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/README.md +0 -0
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/__init__.py +2 -2
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py +3 -3
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_model/__init__.py +1 -1
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_credentials.py +1 -1
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_data_utils.py +0 -0
- {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_storage_manager.py +1 -1
@@ -27,11 +27,12 @@ warnings.filterwarnings(
     category=UserWarning,
 )

+
 class DfHelper:
     df: Union[dd.DataFrame, pd.DataFrame] = None
     backend_django: Optional[DjangoConnectionConfig] = None
-
-
+    _backend_query: Optional[QueryConfig] = None
+    _backend_params: Optional[ParamsConfig] = None
     backend_parquet: Optional[ParquetConfig] = None
     backend_http: Optional[HttpConfig] = None
     backend_sqlalchemy: Optional[SqlAlchemyConnectionConfig] = None

@@ -45,7 +46,7 @@ class DfHelper:
         kwargs = {**self.default_config.copy(), **kwargs}
         self.backend = backend
         self.debug = kwargs.setdefault("debug", False)
-        self.logger = kwargs.get("logger",Logger.default_logger(logger_name=self.__class__.__name__))
+        self.logger = kwargs.get("logger", Logger.default_logger(logger_name=self.__class__.__name__))
         # Configure logger level
         self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
         self.logger.debug("Logger initialized in DEBUG mode.")

@@ -54,15 +55,15 @@ class DfHelper:
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         kwargs.setdefault("live", True)
         kwargs.setdefault("logger", self.logger)
-        self.
+        self.__post_init(**kwargs)

     def __str__(self):
         return self.__class__.__name__

-    def
+    def __post_init(self, **kwargs):
         self.logger.debug(f"backend used: {self.backend}")
-        self.
-        self.
+        self._backend_query = self.__get_config(QueryConfig, kwargs)
+        self._backend_params = self.__get_config(ParamsConfig, kwargs)
         if self.backend == 'django_db':
             self.backend_django = self.__get_config(DjangoConnectionConfig, kwargs)
         elif self.backend == 'parquet':

@@ -89,42 +90,42 @@ class DfHelper:

     def load(self, **options):
         # this will be the universal method to load data from a df irrespective of the backend
-        df = self.
+        df = self.__load(**options)
         if self.as_pandas:
             return df.compute()
         return df

-    def
+    def __load(self, **options):

         if self.backend == 'django_db':
-            self.
-            return self.
+            self._backend_params.parse_params(options)
+            return self.__load_from_db(**options)
         elif self.backend == 'sqlalchemy':
-            self.
-            return self.
+            self._backend_params.parse_params(options)
+            return self.__load_from_sqlalchemy(**options)
         elif self.backend == 'parquet':
-            return self.
+            return self.__load_from_parquet(**options)
         elif self.backend == 'http':
             if asyncio.get_event_loop().is_running():
                 self.logger.debug("Running as a task from an event loop")
-                return asyncio.create_task(self.
+                return asyncio.create_task(self.__load_from_http(**options))
             else:
                 self.logger.debug("Regular asyncio run...")
-                return asyncio.run(self.
+                return asyncio.run(self.__load_from_http(**options))

-    def
+    def __load_from_sqlalchemy(self, **options):
         try:
             options.setdefault("debug", self.debug)
             db_loader = SqlAlchemyLoadFromDb(
                 self.backend_sqlalchemy,
-                self.
-                self.
+                self._backend_query,
+                self._backend_params,
                 self.logger,
                 **options
             )
             self.df = db_loader.build_and_load()
-            self.
-            self.
+            self.__process_loaded_data()
+            self.__post_process_df()
             self.logger.debug("Data successfully loaded from sqlalchemy database.")
         except Exception as e:
             self.logger.debug(f"Failed to load data from sqlalchemy database: {e}: options: {options}")

@@ -132,19 +133,19 @@ class DfHelper:

         return self.df

-    def
+    def __load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         try:
             options.setdefault("debug", self.debug)
             db_loader = DjangoLoadFromDb(
                 self.backend_django,
-                self.
-                self.
+                self._backend_query,
+                self._backend_params,
                 self.logger,
                 **options
             )
             self.df = db_loader.build_and_load()
-            self.
-            self.
+            self.__process_loaded_data()
+            self.__post_process_df()
             self.logger.debug("Data successfully loaded from django database.")
         except Exception as e:
             self.logger.debug(f"Failed to load data from django database: {e}")

@@ -152,7 +153,7 @@ class DfHelper:

         return self.df

-    async def
+    async def __load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """Delegate asynchronous HTTP data loading to HttpDatabackend plugin."""
         if not self.backend_http:
             self.logger.debug("HTTP plugin not configured properly.")

@@ -164,12 +165,12 @@ class DfHelper:
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
             return self.df

-    def
+    def __post_process_df(self):
         """
         Efficiently process the DataFrame by filtering, renaming, and setting indices.
         Optimized for large datasets with Dask compatibility.
         """
-        df_params = self.
+        df_params = self._backend_params.df_params
         fieldnames = df_params.get("fieldnames", None)
         index_col = df_params.get("index_col", None)
         datetime_index = df_params.get("datetime_index", False)

@@ -203,10 +204,10 @@ class DfHelper:

         self.logger.debug("Post-processing of DataFrame completed.")

-    def
+    def __process_loaded_data(self):
         self.logger.debug(f"Type of self.df: {type(self.df)}")
         if self.df.map_partitions(len).compute().sum() > 0:
-            field_map = self.
+            field_map = self._backend_params.field_map or {}
             if isinstance(field_map, dict):
                 rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
                 missing_columns = [k for k in field_map.keys() if k not in self.df.columns]

@@ -237,7 +238,7 @@ class DfHelper:
             cs.save_to_clickhouse(self.df)
             self.logger.debug("Save to ClickHouse completed.")

-    def
+    def __load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = self.backend_parquet.load_files()
         if options:
             """

@@ -274,7 +275,7 @@ class DfHelper:
             raise ValueError("The 'start' date cannot be later than the 'end' date.")

         # Reverse map to original field name
-        field_map = getattr(self.
+        field_map = getattr(self._backend_params, 'field_map', {}) or {}
         reverse_map = {v: k for k, v in field_map.items()}
         mapped_field = reverse_map.get(dt_field, dt_field)
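The renamed private attributes and name-mangled helpers above do not change the public surface: construction plus `load()` remains the entry point. Below is a minimal usage sketch assembled only from the keyword names visible in this diff (backend, debug, as_pandas, live, logger); the connection settings and the filter key are hypothetical placeholders, not documented API.

```python
from sibi_dst.df_helper import DfHelper

# Hypothetical settings; the real keys depend on the chosen backend config class.
helper = DfHelper(
    backend="sqlalchemy",                    # other branches seen above: 'django_db', 'parquet', 'http'
    as_pandas=True,                          # load() calls .compute() on the Dask frame before returning
    debug=True,                              # switches the logger to DEBUG
    connection_url="sqlite:///example.db",   # assumption: consumed by SqlAlchemyConnectionConfig
    table="events",                          # assumption
)

# load(**options) routes the options through ParamsConfig into the backend loader.
df = helper.load(created_at__gte="2024-01-01")  # filter key is illustrative only
```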
@@ -1,6 +1,8 @@
 from typing import Optional, Any, Dict
+
 import dask.dataframe as dd
 import fsspec
+
 from sibi_dst.df_helper import DfHelper
 from sibi_dst.utils import DataWrapper
 from sibi_dst.utils import DateUtils

@@ -106,9 +108,10 @@ class ParquetArtifact(DfHelper):
             'parquet_start_date': start_date.strftime('%Y-%m-%d'),
             'parquet_end_date': end_date.strftime('%Y-%m-%d'),
         }
+
     def ensure_directory_exists(self, path: str) -> None:
         """Ensure the directory exists in the specified filesystem."""
         try:
             self.fs.makedirs(path, exist_ok=True)
         except Exception as e:
-            raise ValueError(f"Error creating directory {path} in filesystem {self.filesystem_type}: {e}")
+            raise ValueError(f"Error creating directory {path} in filesystem {self.filesystem_type}: {e}")
@@ -5,6 +5,7 @@ import fsspec

 from sibi_dst.df_helper import DfHelper

+
 class ParquetReader(DfHelper):
     DEFAULT_CONFIG = {
         'backend': 'parquet'

@@ -46,4 +47,4 @@ class ParquetReader(DfHelper):
             info = self.fs.info(self.parquet_storage_path)
             return info['type'] == 'directory'
         except FileNotFoundError:
-            return False
+            return False
@@ -1,9 +1,8 @@
 from __future__ import annotations

 from ._django_db_connection import DjangoConnectionConfig
-from ._io_dask import ReadFrameDask
-#from ._io_dask_alt import ReadFrameDask
 from ._django_load_from_db import DjangoLoadFromDb
+from ._io_dask import ReadFrameDask

 __all__ = [
     "DjangoConnectionConfig",
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_django_load_from_db.py  RENAMED

@@ -2,12 +2,12 @@ import warnings

 import dask.dataframe as dd
 import pandas as pd
-from IPython.core.hooks import deprecated
 from django.db.models import Q

 from sibi_dst.df_helper.backends.django import ReadFrameDask
-from sibi_dst.utils import Logger
 from sibi_dst.df_helper.core import django_field_conversion_map_dask
+from sibi_dst.utils import Logger
+

 class DjangoLoadFromDb:
     df: dd.DataFrame

@@ -28,17 +28,16 @@ class DjangoLoadFromDb:

     def build_and_load(self):
         self.df = self._build_and_load()
-        #self.df = self._convert_columns(self.df)
+        # self.df = self._convert_columns(self.df)
         return self.df

-
     def _build_and_load(self) -> dd.DataFrame:
         query = self.connection_config.model.objects.using(self.connection_config.connection_name)
         if not self.params_config.filters:
             # IMPORTANT: if no filters are provided show only the first n_records
             # this is to prevent loading the entire table by mistake
             n_records = self.query_config.n_records if self.query_config.n_records else 100
-            queryset=query.all()[:n_records]
+            queryset = query.all()[:n_records]
         else:
             q_objects = self.__build_query_objects(self.params_config.filters, self.query_config.use_exclude)
             queryset = query.filter(q_objects)

@@ -99,13 +98,12 @@ class DjangoLoadFromDb:
         # Simplified loop to apply conversions partition-wise
         for field_name, field_type in field_type_map.items():
             if field_name not in df.columns:
-
                 self.logger.debug(f"Column '{field_name}' not found in DataFrame columns.")
                 continue

             conversion_func = django_field_conversion_map_dask.get(field_type)
             if not conversion_func:
-                message=f"Field type '{field_type}' not found in conversion_map."
+                message = f"Field type '{field_type}' not found in conversion_map."
                 self.logger.debug(message)
                 continue

@@ -130,4 +128,4 @@ class DjangoLoadFromDb:
         except Exception as e:
             self.logger.debug(f"Failed to queue conversion for column '{field_name}': {str(e)}")

-        return df
+        return df
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_django_sql_model_builder.py  RENAMED

@@ -219,9 +219,9 @@ class DjangoSqlModelBuilder:
             if field_type == "AutoField(":
                 continue
             elif (
-
-
-
+                field_type
+                == connection.features.introspected_field_types["AutoField"]
+                + "("
             ):
                 comment_notes.append("AutoField?")

@@ -240,8 +240,8 @@ class DjangoSqlModelBuilder:

             # Add comment.
             if (
-
-
+                hasattr(connection.features, "supports_comments")
+                and row.comment
             ):
                 extra_params["db_comment"] = row.comment
                 # if connection.features.supports_comments and row.comment:
@@ -1,13 +1,14 @@
 import itertools
+
 import dask.dataframe as dd
+import django
 import pandas as pd
-
 from django.core.cache import cache
+from django.core.exceptions import FieldDoesNotExist
 from django.db import models
 from django.db.models import Field
 from django.utils.encoding import force_str as force_text
-
-from django.core.exceptions import FieldDoesNotExist
+

 class ReadFrameDask:
     FieldDoesNotExist = (

@@ -185,4 +186,4 @@
         if verbose:
             self.update_with_verbose(dask_df, fieldnames, qs.model._meta.fields)

-        return dask_df
+        return dask_df
@@ -1,10 +1,13 @@
-from pydantic import BaseModel, HttpUrl, Field, ConfigDict, SecretStr
 from typing import Dict, Optional, Any
-
+
 import dask.dataframe as dd
+import httpx
 import pandas as pd
+from pydantic import BaseModel, HttpUrl, Field, ConfigDict, SecretStr
+
 from sibi_dst.utils import Logger

+
 class HttpConfig(BaseModel):
     base_url: HttpUrl
     params: Optional[Dict[str, Any]] = Field(default_factory=dict)

@@ -44,4 +47,4 @@ class HttpConfig(BaseModel):
             raise
         except ValueError as e:
             self.logger.debug(f"Error parsing JSON data: {e}")
-            raise
+            raise
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py  RENAMED

@@ -1,7 +1,9 @@
-import pandas as pd
 import dask.dataframe as dd
+import pandas as pd
+
 from sibi_dst.utils import Logger

+
 class ParquetFilterHandler(object):
     def __init__(self, logger=None):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

@@ -92,4 +94,4 @@ class ParquetFilterHandler(object):
         else:
             raise ValueError(f"Unsupported operation: {operation}")

-        return df
+        return df
@@ -1,12 +1,15 @@
+import datetime
 from pathlib import Path
 from typing import Optional, List
+
 import dask.dataframe as dd
-from pydantic import BaseModel, model_validator, DirectoryPath, FilePath, ConfigDict
 import fsspec
-import
+from pydantic import BaseModel, model_validator, DirectoryPath, FilePath, ConfigDict
+
 from sibi_dst.utils import FilePathGenerator
 from sibi_dst.utils import Logger

+
 class ParquetConfig(BaseModel):
     load_parquet: bool = False
     parquet_filename: Optional[str] = None

@@ -27,7 +30,8 @@ class ParquetConfig(BaseModel):
         # Configure paths based on fsspec
         if self.logger is None:
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
-        self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(
+        self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(
+            str(self.parquet_storage_path).split("://")[0])

         # Validation for parquet path
         if self.parquet_storage_path is None:

@@ -37,7 +41,8 @@ class ParquetConfig(BaseModel):
             self.load_parquet = False
             if self.parquet_filename is not None:
                 self.parquet_full_path = self.ensure_file_extension(
-                    filepath=self.fs.sep.join([str(self.parquet_storage_path), str(self.parquet_filename)]),
+                    filepath=self.fs.sep.join([str(self.parquet_storage_path), str(self.parquet_filename)]),
+                    extension='parquet'
                 )
                 self.parquet_is_recent = self.is_file_recent()
                 self.load_parquet = self.parquet_is_recent and self.fs.exists(self.parquet_full_path)

@@ -52,10 +57,11 @@ class ParquetConfig(BaseModel):
                 raise ValueError('Parquet end date must be greater than start date')

             # Saving to parquet is disabled when start and end dates are provided, as we will load parquet files
-            self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path),
+            self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path),
+                                                         logger=self.logger).generate_file_paths(start_date, end_date)
             self.parquet_size_bytes = self.get_parquet_size_bytes()
             self.load_parquet = True
-            #self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
+            # self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
         elif self.parquet_end_date is not None:
             raise ValueError('Parquet start date must be specified if end date is provided')

@@ -88,4 +94,3 @@ class ParquetConfig(BaseModel):
     def ensure_file_extension(filepath: str, extension: str) -> str:
         path = Path(filepath)
         return str(path.with_suffix(f".{extension}")) if path.suffix != f".{extension}" else filepath
-
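The reformatted constructor line above keeps the same fsspec idiom: pick the filesystem implementation from the URL scheme, falling back to the local filesystem when the path has no scheme. A standalone sketch of that idiom follows; the example path is made up, and remote protocols such as s3 need the matching fsspec backend (e.g. s3fs) installed.

```python
import fsspec

path = "s3://bucket/warehouse/parquet"   # hypothetical storage path
protocol = path.split("://")[0] if "://" in path else "file"
fs = fsspec.filesystem(protocol)         # LocalFileSystem for "file", S3FileSystem for "s3", ...
print(type(fs).__name__)
```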
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py  RENAMED

@@ -74,7 +74,9 @@ class SQLAlchemyDask:
             deprecated specific filter handling to a generic one
             #self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
             """
-            self.query = FilterHandler(backend="sqlalchemy", logger=self.logger).apply_filters(self.query,
+            self.query = FilterHandler(backend="sqlalchemy", logger=self.logger).apply_filters(self.query,
+                                                                                               model=self.model,
+                                                                                               filters=self.filters)
         else:
             n_records = 100
             self.query = self.query.limit(n_records)
@@ -52,7 +52,6 @@ class SqlAlchemyFilterHandler:
                 return [datetime.date.fromisoformat(v) for v in value]
             return value

-
         def handle_date_operator(column, date_op):
             """
             Handle filtering on specific datetime parts (e.g., year, month).

@@ -93,7 +92,7 @@ class SqlAlchemyFilterHandler:

             # Get the column from the model
             column = getattr(model, field_name, None)
-            #column = model.__table__.columns.get(field_name)
+            # column = model.__table__.columns.get(field_name)
             if not column:
                 raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")

@@ -117,4 +116,4 @@ class SqlAlchemyFilterHandler:
             else:
                 raise ValueError(f"Unsupported operation: {operation}")

-        return query
+        return query
@@ -1,5 +1,4 @@
 import dask.dataframe as dd
-import dask_expr
 import pandas as pd

 from sibi_dst.df_helper.core import ParamsConfig, QueryConfig

@@ -7,6 +6,7 @@ from sibi_dst.utils import Logger
 from ._io_sqlalchemy_dask import SQLAlchemyDask
 from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig

+
 class SqlAlchemyLoadFromDb:
     df: dd.DataFrame = None

@@ -52,7 +52,7 @@ class SqlAlchemyLoadFromDb:

         if self.df is None or len(self.df.head().index) == 0:
             self.logger.debug("Query returned no results.")
-            dask_df=dd.from_pandas(pd.DataFrame(), npartitions=1)
+            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)

             return dask_df
         return self.df
@@ -1,15 +1,17 @@
 import re
+
 from sqlalchemy import MetaData, Table
 from sqlalchemy.orm import declarative_base, relationship

 # Base class for dynamically created models
 Base = declarative_base()

-
 apps_label = "datacubes"

+
 class SqlAlchemyModelBuilder:
     _model_cache = {}  # Local cache for model classes
+
     def __init__(self, engine, table_name):
         """
         Initialize the model builder with a database engine and specific table.

@@ -58,7 +60,7 @@ class SqlAlchemyModelBuilder:

         # Add columns and relationships to the model
         attrs.update(columns)
-        #self.add_relationships(attrs, self.table)
+        # self.add_relationships(attrs, self.table)
         model = Base.registry._class_registry.get(self.class_name)
         if not model:
             model = type(self.class_name, (Base,), attrs)

@@ -126,4 +128,4 @@ class SqlAlchemyModelBuilder:
         column_name = re.sub(r"\W|^(?=\d)", "_", column_name)
         if column_name in {"class", "def", "return", "yield", "global"}:
             column_name += "_field"
-        return column_name
+        return column_name
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py  RENAMED

@@ -1,9 +1,10 @@
+import datetime
 from typing import Any, Optional, Dict, Type
+
 from pydantic import BaseModel, model_validator
-from sqlmodel import SQLModel, Field, create_engine
 from sqlalchemy import inspect
-from sqlalchemy.sql import text
 from sqlalchemy.exc import OperationalError
+from sqlalchemy.sql import text
 from sqlalchemy.sql.sqltypes import (
     Integer,
     String,

@@ -14,7 +15,7 @@ from sqlalchemy.sql.sqltypes import (
     Time,
     Numeric,
 )
-import
+from sqlmodel import SQLModel, Field, create_engine


 class SQLModelConnectionConfig(BaseModel):

@@ -130,4 +131,4 @@ class SQLModelConnectionConfig(BaseModel):
     @staticmethod
     def _table2model(table_name: str) -> str:
         """Convert table name to PascalCase model name."""
-        return "".join(word.capitalize() for word in table_name.split("_"))
+        return "".join(word.capitalize() for word in table_name.split("_"))
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py  RENAMED

@@ -1,19 +1,21 @@
-import dask.dataframe as dd
-from sqlmodel import Session, select, text
-from typing import Any, Dict, Optional
 import logging
+from typing import Any, Dict, Optional
+
+import dask.dataframe as dd
 import pandas as pd
+from sqlmodel import Session, select, text
+

 class SQLModelLoadFromDb:
     df: dd.DataFrame

     def __init__(
-
-
-
-
-
-
+            self,
+            db_connection,
+            db_query: Optional[Dict[str, Any]] = None,
+            db_params: Optional[Dict[str, Any]] = None,
+            logger=None,
+            **kwargs,
     ):
         """
         Initialize the loader with database connection, query, and parameters.

@@ -74,7 +76,7 @@ class SQLModelLoadFromDb:
         results = session.exec(query).fetchall()

         # Convert query results to a Dask DataFrame
-        print("results:",results)
+        print("results:", results)
         if results:
             df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
         else:

@@ -96,4 +98,4 @@ class SQLModelLoadFromDb:
         if field_map:
             rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
             if rename_mapping:
-                self.df = self.df.rename(columns=rename_mapping, meta={v: "object" for v in rename_mapping.values()})
+                self.df = self.df.rename(columns=rename_mapping, meta={v: "object" for v in rename_mapping.values()})
@@ -54,8 +54,10 @@ django_field_conversion_map_dask: Dict[str, callable] = {
     "BooleanField": lambda x: x.astype(bool),
     "NullBooleanField": lambda x: x.astype(bool),
     "DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
-    "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
-
+    "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
+                                                                             meta=("date", "object")),
+    "TimeField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time,
+                                                                             meta=("time", "object")),
     "DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
     "JSONField": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
     "ArrayField": lambda x: x.map_partitions(lambda s: s.apply(eval), meta=("array", "object")),

@@ -72,12 +74,15 @@ sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
     Numeric.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
     Boolean.__name__: lambda x: x.astype(bool),
     DateTime.__name__: lambda x: pd.to_datetime(x, errors="coerce"),
-    Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
-
+    Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
+                                                                               meta=("date", "object")),
+    Time.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time,
+                                                                               meta=("time", "object")),
     JSON.__name__: lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
     UUID.__name__: lambda x: x.astype(str),
 }

+
 # Conversion map with normalized SQLAlchemy field types
 # sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
 #     "String": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("string", "string")),

@@ -129,5 +134,3 @@ def normalize_sqlalchemy_type(field_type):

     # Fallback to raw class name
     return field_type.__class__.__name__
-
-
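The added TimeField/Time converters follow the same Dask pattern as the existing DateField entries: `map_partitions` needs an explicit `meta` so Dask can declare the output dtype without computing anything. A self-contained illustration of that pattern (this is generic Dask usage, not package code):

```python
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"ts": ["2024-01-01 08:30:00", "2024-01-02 17:45:00", None]})
ddf = dd.from_pandas(pdf, npartitions=1)

# Coerce to datetime lazily, then extract the date per partition;
# meta tells Dask the resulting series name and dtype up front.
dates = dd.to_datetime(ddf["ts"], errors="coerce").map_partitions(
    lambda s: s.dt.date, meta=("date", "object")
)
print(dates.compute())
```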
@@ -1,10 +1,13 @@
 import datetime
+
 import dask.dataframe as dd
 import pandas as pd
 from sqlalchemy import func, cast
 from sqlalchemy.sql.sqltypes import Date, Time
+
 from sibi_dst.utils import Logger

+
 class FilterHandler:
     def __init__(self, backend, logger=None):
         """

@@ -15,7 +18,8 @@ class FilterHandler:
             logger: Optional logger for debugging purposes.
         """
         self.backend = backend
-        self.logger = logger or Logger.default_logger(
+        self.logger = logger or Logger.default_logger(
+            logger_name=self.__class__.__name__)  # No-op logger if none provided
         self.backend_methods = self._get_backend_methods(backend)

     def apply_filters(self, query_or_df, model=None, filters=None):

@@ -34,7 +38,7 @@ class FilterHandler:
         for key, value in filters.items():
             field_name, casting, operation = self._parse_filter_key(key)
             parsed_value = self._parse_filter_value(casting, value)
-            #print(field_name, casting, operation, parsed_value)
+            # print(field_name, casting, operation, parsed_value)
             # Get the column and apply backend-specific transformations
             if self.backend == "sqlalchemy":
                 column = self.backend_methods["get_column"](field_name, model, casting)

@@ -67,7 +71,6 @@ class FilterHandler:

         return field_name, casting, operation

-
     def _parse_filter_value(self, casting, value):
         """
         Convert filter value to appropriate type based on the casting (e.g., date).

@@ -213,4 +216,4 @@ class FilterHandler:
         return [
             "gte", "lte", "gt", "lt", "exact", "in", "range",
             "contains", "startswith", "endswith", "isnull",
-        ]
+        ]
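A hedged sketch of how the FilterHandler shown above appears to be driven, based only on the lines visible in this diff (Django-style "field__operation" keys, an optional casting segment such as "date", and the sqlalchemy backend). The import path, the mapped model, and the filter keys are assumptions for illustration, and whether `apply_filters` accepts a 2.0-style `select()` or a legacy query object is not confirmed by the diff.

```python
from sqlalchemy import Column, DateTime, Integer, String, select
from sqlalchemy.orm import declarative_base

from sibi_dst.df_helper.core import FilterHandler  # assumed import path

Base = declarative_base()

class Event(Base):                      # hypothetical model, defined only for this sketch
    __tablename__ = "events"
    id = Column(Integer, primary_key=True)
    status = Column(String)
    created_at = Column(DateTime)

handler = FilterHandler(backend="sqlalchemy")
query = handler.apply_filters(
    select(Event),                      # assumption: a query/select built elsewhere
    model=Event,
    filters={
        "status__exact": "active",              # plain operation
        "created_at__date__gte": "2024-01-01",  # 'date' casting plus 'gte' operation
    },
)
print(query)
```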
@@ -1,7 +1,7 @@
+from typing import Optional, Dict, Union, List

 from pydantic import BaseModel, model_validator, Field

-from typing import Optional, Dict, Union, List
 dataframe_params: Dict[str, Union[None, str, bool, int, None]] = {
     "fieldnames": None,
     "index_col": None,

@@ -25,6 +25,7 @@ dataframe_options: Dict[str, Union[bool, str, int, None]] = {

 LOOKUP_SEP = "__"

+
 class ParamsConfig(BaseModel):
     field_map: Optional[Dict] = Field(default_factory=dict)
     legacy_filters: bool = False

@@ -76,4 +77,4 @@ class ParamsConfig(BaseModel):
         new_filter_field = LOOKUP_SEP.join(new_parts)
         new_filters[new_filter_field] = value

-        self.filters = new_filters
+        self.filters = new_filters
@@ -1,18 +1,19 @@
 from __future__ import annotations
+
+from ._airflow_manager import AirflowDAGManager
+from ._clickhouse_writer import ClickHouseWriter
 from ._credentials import *
-from ._log_utils import Logger
-from ._date_utils import *
 from ._data_utils import DataUtils
+from ._data_wrapper import DataWrapper
+from ._date_utils import *
+from ._df_utils import DfUtils
 from ._file_utils import FileUtils
 from ._filepath_generator import FilePathGenerator
-from .
-from ._storage_manager import StorageManager
+from ._log_utils import Logger
 from ._parquet_saver import ParquetSaver
-from .
-from ._data_wrapper import DataWrapper
-from ._airflow_manager import AirflowDAGManager
+from ._storage_manager import StorageManager

-__all__=[
+__all__ = [
     "ConfigManager",
     "ConfigLoader",
     "Logger",

@@ -27,4 +28,4 @@ __all__=[
     "DfUtils",
     "ClickHouseWriter",
     "AirflowDAGManager",
-]
+]
@@ -1,8 +1,9 @@
 import os
-from jinja2 import Template
 from datetime import datetime
+
 import fsspec
 import httpx
+from jinja2 import Template

 """
 A manager to dynamically generate, save, and upload Airflow DAGs via SSH using fsspec.

@@ -54,8 +55,8 @@ with DAG(
 {% endfor %}
 """

-class AirflowDAGManager:

+class AirflowDAGManager:

     def __init__(self, output_dir, remote_dags_path, ssh_host, ssh_user, ssh_password, url, auth, wrapper_module_path):
         """

@@ -208,4 +209,4 @@ class AirflowDAGManager:
             return response.json()
         except httpx.RequestError as e:
             print(f"Failed to trigger DAG {dag_id}: {e}")
-            raise
+            raise
@@ -1,9 +1,12 @@
+from concurrent.futures import ThreadPoolExecutor
+
 import clickhouse_connect
+import pandas as pd
 from clickhouse_driver import Client
 from dask.dataframe import dd
-
+
 from sibi_dst.utils import Logger
-
+

 class ClickHouseWriter:
     dtype_to_clickhouse = {

@@ -19,20 +22,20 @@ class ClickHouseWriter:
     df: dd.DataFrame

     def __init__(self, logger=None, **kwargs):
-        self.clickhouse_host = kwargs.setdefault('host',"localhost")
-        self.clickhouse_port = kwargs.setdefault('port',8123)
-        self.clickhouse_dbname = kwargs.setdefault('database','sibi_data')
-        self.clickhouse_user = kwargs.setdefault('user','default')
-        self.clickhouse_password = kwargs.setdefault('password','')
-        self.clickhouse_table = kwargs.setdefault('table','test_sibi_table')
+        self.clickhouse_host = kwargs.setdefault('host', "localhost")
+        self.clickhouse_port = kwargs.setdefault('port', 8123)
+        self.clickhouse_dbname = kwargs.setdefault('database', 'sibi_data')
+        self.clickhouse_user = kwargs.setdefault('user', 'default')
+        self.clickhouse_password = kwargs.setdefault('password', '')
+        self.clickhouse_table = kwargs.setdefault('table', 'test_sibi_table')

         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.client = None
-        self.order_by=kwargs.setdefault('order_by','id')
+        self.order_by = kwargs.setdefault('order_by', 'id')

     def save_to_clickhouse(self, df, **kwargs):
         self.df = df.copy()
-        self.order_by = kwargs.setdefault('order_by',self.order_by)
+        self.order_by = kwargs.setdefault('order_by', self.order_by)
         if len(self.df.head().index) == 0:
             self.logger.debug("Dataframe is empty")
             return

@@ -86,8 +89,8 @@ class ClickHouseWriter:
         if engine is None:
             engine = f"ENGINE = MergeTree() order by {self.order_by}"
         dtypes = self.df.dtypes
-        clickhouse_schema = self._generate_clickhouse_schema(dtypes,self.dtype_to_clickhouse)
-        create_table_sql= f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
+        clickhouse_schema = self._generate_clickhouse_schema(dtypes, self.dtype_to_clickhouse)
+        create_table_sql = f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
         self.logger.debug(f"Creating table SQL:{create_table_sql}")
         if self.client:
             self.client.command(create_table_sql)

@@ -200,4 +203,4 @@ class ClickHouseWriter:
             with ThreadPoolExecutor() as executor:
                 executor.map(write_partition, partitions, range(len(partitions)))
         except Exception as e:
-            self.logger.error(f"Error during multi-partition write: {e}")
+            self.logger.error(f"Error during multi-partition write: {e}")
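A hedged usage sketch assembled from the keyword defaults visible in this hunk (host, port, database, user, password, table, order_by) and the `save_to_clickhouse` entry point; it is not taken from package documentation, and a running ClickHouse instance is assumed.

```python
import dask.dataframe as dd
import pandas as pd

from sibi_dst.utils import ClickHouseWriter

writer = ClickHouseWriter(
    host="localhost", port=8123, database="sibi_data",
    user="default", password="", table="test_sibi_table",
    order_by="id",   # used to build "ENGINE = MergeTree() order by <order_by>"
)

df = dd.from_pandas(pd.DataFrame({"id": [1, 2], "value": ["a", "b"]}), npartitions=1)
writer.save_to_clickhouse(df)   # empty frames are logged and skipped, per the code above
```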
@@ -1,12 +1,15 @@
 import datetime
 from typing import Type, Any, Dict, Optional
+
 import fsspec
 import pandas as pd
 from IPython.display import display
-from sibi_dst.utils import Logger
 from tqdm import tqdm
+
+from sibi_dst.utils import Logger
 from sibi_dst.utils import ParquetSaver

+
 class DataWrapper:
     DEFAULT_MAX_AGE_MINUTES = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD = 30

@@ -89,7 +92,7 @@ class DataWrapper:
         # Filter dates in the category where `update_required` is True
         dates_to_process = update_plan_table[
             (update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
-
+        ]["date"].tolist()

         date_iterator = dates_to_process
         if self.show_progress:

@@ -130,7 +133,7 @@ class DataWrapper:
         data_object = self.dataclass(**self.class_params)
         df = data_object.load_period(dt_field=self.date_field, start=date, end=date)

-        if len(df.index)==0:
+        if len(df.index) == 0:
             self.logger.error("No data found for the specified date.")
             return

@@ -194,7 +197,7 @@ class DataWrapper:
                 "missing_file": missing_file,
                 "update_required": update_required,
                 "update_category": category,
-                "datawrapper class":self.dataclass.__name__
+                "datawrapper class": self.dataclass.__name__
             })

         update_plan_table = pd.DataFrame(rows)
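The selection completed in the second hunk is a plain pandas idiom: combine two boolean masks and pull a single column out as a Python list. A minimal standalone illustration with made-up data:

```python
import pandas as pd

plan = pd.DataFrame({
    "date": ["2024-01-01", "2024-01-02", "2024-01-03"],
    "update_category": ["history", "recent", "history"],
    "update_required": [True, False, False],
})

# Keep rows matching both conditions, then extract one column as a list.
dates_to_process = plan[
    (plan["update_category"] == "history") & (plan["update_required"])
]["date"].tolist()
print(dates_to_process)  # ['2024-01-01']
```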
@@ -1,8 +1,9 @@
 import datetime
-from typing import Union, Tuple, Callable, Dict
+from typing import Union, Tuple, Callable, Dict

 import numpy as np
 import pandas as pd
+
 from sibi_dst.utils import Logger


@@ -32,7 +33,8 @@ class DateUtils:
         raise ValueError(f"Unsupported date format: {value}")

     @classmethod
-    def calc_week_range(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> Tuple[
+    def calc_week_range(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> Tuple[
+        datetime.date, datetime.date]:
         """
         Calculate the start and end of the week for a given reference date.
         """

@@ -49,7 +51,8 @@ class DateUtils:
         return datetime.date(year, 1, 1), datetime.date(year, 12, 31)

     @classmethod
-    def get_first_day_of_the_quarter(cls, reference_date: Union[
+    def get_first_day_of_the_quarter(cls, reference_date: Union[
+        str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
         """
         Get the first day of the quarter for a given date.
         """

@@ -58,7 +61,8 @@ class DateUtils:
         return datetime.date(reference_date.year, 3 * quarter - 2, 1)

     @classmethod
-    def get_last_day_of_the_quarter(cls, reference_date: Union[
+    def get_last_day_of_the_quarter(cls, reference_date: Union[
+        str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
         """
         Get the last day of the quarter for a given date.
         """

@@ -116,10 +120,12 @@ class DateUtils:
         'current_month': lambda: cls.get_month_range(n=0),
         'last_month': lambda: cls.get_month_range(n=-1),
         'current_year': lambda: cls.get_year_timerange(today().year),
-        'current_quarter': lambda: (
+        'current_quarter': lambda: (
+            cls.get_first_day_of_the_quarter(today()), cls.get_last_day_of_the_quarter(today())),
         'ytd': lambda: (datetime.date(today().year, 1, 1), today()),
     }

+
 class BusinessDays:
     def __init__(self, holiday_list, logger):
         """
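The quarter helpers reflowed above rest on simple arithmetic: with quarters numbered 1 to 4, the first month of a quarter is 3*quarter - 2, which matches the `datetime.date(reference_date.year, 3 * quarter - 2, 1)` line in the context. A worked standalone example; deriving the quarter number from the month is an assumption about how the class computes it.

```python
import datetime

d = datetime.date(2024, 8, 15)
quarter = (d.month - 1) // 3 + 1                       # August -> quarter 3 (assumed derivation)
first_day = datetime.date(d.year, 3 * quarter - 2, 1)  # 2024-07-01, as in the code above
print(quarter, first_day)
```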
@@ -1,7 +1,9 @@
-import pandas as pd
 import dask.dataframe as dd
+import pandas as pd
+
 from ._log_utils import Logger

+
 class DfUtils:
     def __init__(self, logger=None):
         """

@@ -210,7 +212,7 @@ class DfUtils:
         df['Total'] = df.sum(axis=1, numeric_only=True)
         return df

-    def summarise_data(self,df, summary_column, values_column, rule='D', agg_func='count'):
+    def summarise_data(self, df, summary_column, values_column, rule='D', agg_func='count'):
         """
         Summarizes data by creating a pivot table and resampling.

@@ -233,10 +235,12 @@ class DfUtils:
         df = df.set_index(dd.to_datetime(df.index))

         # Group by index and summary columns
-        df_grouped = df.groupby([dd.to_datetime(df.index)] + [summary_column])[values_column].agg(
+        df_grouped = df.groupby([dd.to_datetime(df.index)] + [summary_column])[values_column].agg(
+            agg_func).reset_index()

         # Pivot the table
-        df_pivot = df_grouped.pivot_table(index='index', columns=summary_column, values=values_column,
+        df_pivot = df_grouped.pivot_table(index='index', columns=summary_column, values=values_column,
+                                          aggfunc='sum').fillna(0)

         # Resample
         df_pivot.index = dd.to_datetime(df_pivot.index)

@@ -269,4 +273,4 @@ class DfUtils:
         Returns:
             DataFrame: Resampled pivot table.
         """
-        return DfUtils.summarise_data(df, summary_columns, value_columns, rule=rule, agg_func=agg_func)
+        return DfUtils.summarise_data(df, summary_columns, value_columns, rule=rule, agg_func=agg_func)
@@ -1,10 +1,12 @@
 import shutil
 from pathlib import Path
 from typing import Optional
+
 import fsspec

 from sibi_dst.utils import Logger

+
 class FileUtils:
     def __init__(self, logger=None):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

@@ -27,7 +29,7 @@ class FileUtils:
             fs.mkdirs(path)

     @staticmethod
-    def construct_full_path(storage_path:str, parquet_filename: Optional[str]) -> Path:
+    def construct_full_path(storage_path: str, parquet_filename: Optional[str]) -> Path:
         """Construct and return the full path for the parquet file."""
         fs, base_path = fsspec.core.url_to_fs(storage_path)
         parquet_filename = parquet_filename or "default.parquet"
@@ -1,7 +1,8 @@
 import datetime
-import fsspec
 import re

+import fsspec
+
 from sibi_dst.utils import Logger


@@ -150,6 +151,7 @@ class FilePathGenerator:
             return datetime.datetime.strptime(date, '%Y-%m-%d')
         return date

+
 """
 Usage:
 # Initialize the generator

@@ -182,4 +184,4 @@ for fp in file_paths:

 df_pandas = pd.concat(dataframes, ignore_index=True)
 print(df_pandas.head())
-"""
+"""
@@ -1,7 +1,6 @@
 from pathlib import Path
 from typing import Optional

-import dask_expr
 import fsspec
 import pyarrow as pa

@@ -103,4 +102,3 @@ class ParquetSaver:
         self.df_result.to_parquet(
             str(full_path), engine="pyarrow", schema=schema, write_index=False
         )
-
File without changes
File without changes
@@ -1,6 +1,6 @@
+from ._sqlachemy_filter_handler import SqlAlchemyFilterHandler
 from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
-from ._sqlalchemy_model_builder import SqlAlchemyModelBuilder
 from ._sqlalchemy_load_from_db import SqlAlchemyLoadFromDb
-from .
+from ._sqlalchemy_model_builder import SqlAlchemyModelBuilder

 __all__ = ['SqlAlchemyConnectionConfig', 'SqlAlchemyModelBuilder', 'SqlAlchemyLoadFromDb', 'SqlAlchemyFilterHandler']
@@ -1,18 +1,19 @@
 from typing import Any, Optional

 from pydantic import BaseModel, model_validator
+from sqlalchemy import create_engine
 from sqlalchemy.exc import OperationalError
 from sqlalchemy.sql import text
-
+
 from ._sqlalchemy_model_builder import SqlAlchemyModelBuilder

+
 class SqlAlchemyConnectionConfig(BaseModel):
     connection_url: str
     table: Optional[str] = None
     model: Any = None
     engine: Optional[Any] = None  # Save engine to reuse it

-
     @model_validator(mode="after")
     def validate_and_initialize(self):
         """

@@ -45,4 +46,3 @@ class SqlAlchemyConnectionConfig(BaseModel):
             connection.execute(text("SELECT 1"))
         except OperationalError as e:
             raise ValueError(f"Failed to connect to the database: {e}")
-
File without changes
File without changes