sibi-dst 0.3.12__tar.gz → 0.3.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/PKG-INFO +4 -3
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/pyproject.toml +4 -3
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/_df_helper.py +21 -21
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/django/_io_dask.py +4 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py +9 -5
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py +15 -16
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py +4 -1
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/utils/_clickhouse_writer.py +3 -3
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/utils/_data_utils.py +35 -77
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/README.md +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/__init__.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/__init__.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/django/__init__.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/django/_django_db_connection.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/django/_django_load_from_db.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/django/_io_dask_alt.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/http/__init__.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/http/_http_config.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/parquet/__init__.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/parquet/_parquet_options.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/sql_alchemy/__init__.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/sql_model/__init__.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/utils/_airflow_manager.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/utils/_credentials.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/utils/_data_wrapper.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/utils/_date_utils.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/utils/_df_utils.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/utils/_file_utils.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/utils/_filepath_generator.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/utils/_log_utils.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/utils/_parquet_saver.py +0 -0
- {sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/utils/_storage_manager.py +0 -0
{sibi_dst-0.3.12 → sibi_dst-0.3.15}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.12
+Version: 0.3.15
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -14,7 +14,8 @@ Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
 Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
 Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
 Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
-Requires-Dist: django (
+Requires-Dist: django (>=5.1.4,<6.0.0)
+Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)
 Requires-Dist: httpx (>=0.27.2,<0.28.0)
 Requires-Dist: ipython (>=8.29.0,<9.0.0)
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
@@ -28,7 +29,7 @@ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
 Requires-Dist: pymysql (>=1.1.1,<2.0.0)
 Requires-Dist: pytest (>=8.3.3,<9.0.0)
 Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
-Requires-Dist:
+Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0)
 Requires-Dist: tornado (>=6.4.1,<7.0.0)
 Requires-Dist: tqdm (>=4.67.0,<5.0.0)
 Requires-Dist: uvicorn (>=0.32.1,<0.33.0)
{sibi_dst-0.3.12 → sibi_dst-0.3.15}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.12"
+version = "0.3.15"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
@@ -13,13 +13,12 @@ dask = {extras = ["complete"], version = "^2024.11.1"}
 pydantic = "^2.9.2"
 tornado = "^6.4.1"
 psutil = "^6.1.0"
-django = "
+django = "^5.1.4"
 pyarrow = "^18.0.0"
 mysqlclient = "^2.2.6"
 pymysql = "^1.1.1"
 httpx = "^0.27.2"
 python-dotenv = "^1.0.1"
-sqlmodel = "^0.0.22"
 tqdm = "^4.67.0"
 openpyxl = "^3.1.5"
 jinja2 = "^3.1.4"
@@ -32,6 +31,8 @@ paramiko = "^3.5.0"
 chardet = "^5.2.0"
 charset-normalizer = "^3.4.0"
 uvicorn = "^0.32.1"
+sqlalchemy = "^2.0.36"
+djangorestframework = "^3.15.2"
 
 
 [build-system]
{sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/_df_helper.py
@@ -7,9 +7,9 @@ import dask.dataframe as dd
 import pandas as pd
 from pydantic import BaseModel
 
-from sibi_dst.utils import ParquetSaver, ClickHouseWriter
 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
 from sibi_dst.utils import Logger
+from sibi_dst.utils import ParquetSaver, ClickHouseWriter
 from .plugins.django import *
 from .plugins.http import HttpConfig
 from .plugins.parquet import ParquetConfig, ParquetFilterHandler
@@ -18,6 +18,7 @@ from .plugins.sql_alchemy import *
 # Define a generic type variable for BaseModel subclasses
 T = TypeVar("T", bound=BaseModel)
 
+
 class DfHelper:
     df: Union[dd.DataFrame, pd.DataFrame] = None
     plugin_django_connection: Optional[DjangoConnectionConfig] = None
@@ -39,13 +40,12 @@ class DfHelper:
         self.debug = kwargs.setdefault("debug", False)
         self.verbose_debug = kwargs.setdefault("verbose_debug", False)
         self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
-        self.dt_field=kwargs.setdefault("dt_field", None)
+        self.dt_field = kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         kwargs.setdefault("live", True)
         kwargs.setdefault("logger", self.logger)
         self.post_init(**kwargs)
 
-
     def post_init(self, **kwargs):
         self.logger.info(f"Source used: {self.source}")
         self.plugin_query = self.__get_config(QueryConfig, kwargs)
@@ -58,7 +58,7 @@ class DfHelper:
         elif self.source == 'http':
             self.plugin_http = HttpConfig(**kwargs)
         elif self.source == 'sqlalchemy':
-            self.plugin_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig,kwargs)
+            self.plugin_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig, kwargs)
 
     @staticmethod
     def __get_config(model: [T], kwargs: Dict[str, Any]) -> Union[T]:
@@ -99,7 +99,6 @@ class DfHelper:
             self.logger.info("Regular asyncio run...")
             return asyncio.run(self._load_from_http(**options))
 
-
     def _load_from_sqlalchemy(self, **options):
         try:
             options.setdefault("debug", self.debug)
@@ -116,7 +115,7 @@ class DfHelper:
             self._post_process_df()
             self.logger.info("Data successfully loaded from sqlalchemy database.")
         except Exception as e:
-            self.logger.error(f"Failed to load data from
+            self.logger.error(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
         return self.df
@@ -138,7 +137,7 @@ class DfHelper:
             self.logger.info("Data successfully loaded from django database.")
         except Exception as e:
             self.logger.error(f"Failed to load data from django database: {e}")
-            self.df=dd.from_pandas(pd.DataFrame(), npartitions=1)
+            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
         return self.df
 
@@ -151,10 +150,9 @@ class DfHelper:
             self.df = await self.plugin_http.fetch_data(**options)
         except Exception as e:
             self.logger.error(f"Failed to load data from http plugin: {e}")
-            self.df=dd.from_pandas(pd.DataFrame(), npartitions=1)
+            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
         return self.df
 
-
     def _post_process_df(self):
         """
         Efficiently process the DataFrame by filtering, renaming, and setting indices.
@@ -195,10 +193,16 @@ class DfHelper:
         self.logger.info("Post-processing of DataFrame completed.")
 
     def _process_loaded_data(self):
-
-
-
+        self.logger.info(f"Type of self.df: {type(self.df)}")
+        if self.df.map_partitions(len).compute().sum() > 0:
+            field_map = self.plugin_params.field_map or {}
+            if isinstance(field_map, dict):
                 rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
+                missing_columns = [k for k in field_map.keys() if k not in self.df.columns]
+
+                if missing_columns:
+                    self.logger.warning(
+                        f"The following columns in field_map are not in the DataFrame: {missing_columns}")
 
         def rename_columns(df, mapping):
             return df.rename(columns=mapping)
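The rewritten `_process_loaded_data` above renames columns through `field_map` and warns about keys that are absent from the frame. A minimal standalone sketch of that pattern; the column names, logger setup, and sample data here are invented for illustration:

```python
# Illustration only: rename columns per a field_map and warn about missing keys.
import logging

import dask.dataframe as dd
import pandas as pd

logger = logging.getLogger("df_helper_example")  # hypothetical logger name
field_map = {"cust_id": "customer_id", "desc": "description", "ghost_col": "ignored"}

ddf = dd.from_pandas(pd.DataFrame({"cust_id": [1, 2], "desc": ["a", "b"]}), npartitions=1)

rename_mapping = {k: v for k, v in field_map.items() if k in ddf.columns}
missing_columns = [k for k in field_map if k not in ddf.columns]
if missing_columns:
    logger.warning("field_map keys not in DataFrame: %s", missing_columns)

ddf = ddf.rename(columns=rename_mapping)
print(list(ddf.columns))  # ['customer_id', 'description']
```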
@@ -214,14 +218,11 @@ class DfHelper:
         ps.save_to_parquet(parquet_filename)
         self.logger.info(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")
 
-    def save_to_clickhouse(self,
-
-
-
-
-        }
-        credentials = {**credentials, **click_config}
-        cs=ClickHouseWriter(logger=self.logger, **credentials)
+    def save_to_clickhouse(self, **credentials):
+        if self.df.map_partitions(len).compute().sum() == 0:
+            self.logger.info("Cannot write to clickhouse since Dataframe is empty")
+            return
+        cs = ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
         self.logger.info("Save to ClickHouse completed.")
 
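Both methods above guard against empty frames with `df.map_partitions(len).compute().sum()`, which sums per-partition row counts instead of materialising the whole Dask frame at once. A small sketch of that check on made-up data:

```python
# Sketch of the row-count guard used in save_to_clickhouse above:
# map_partitions(len) yields one count per partition; summing gives the total rows.
import dask.dataframe as dd
import pandas as pd

def has_rows(ddf: dd.DataFrame) -> bool:
    return ddf.map_partitions(len).compute().sum() > 0

empty = dd.from_pandas(pd.DataFrame(columns=["id"]), npartitions=1)
filled = dd.from_pandas(pd.DataFrame({"id": [1, 2, 3]}), npartitions=2)

print(has_rows(empty))   # False
print(has_rows(filled))  # True
```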
@@ -295,7 +296,6 @@ class DfHelper:
             kwargs[f"{mapped_field}__date__lte"] = end
         return self.load(**kwargs)
 
-
     @staticmethod
     def parse_date(date_str: str) -> Union[datetime.datetime, datetime.date]:
         try:
{sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/django/_io_dask.py
@@ -1,6 +1,7 @@
 import itertools
 
 import dask.dataframe as dd
+import dask_expr
 import django
 import pandas as pd
 from django.core.cache import cache
@@ -239,4 +240,7 @@ class ReadFrameDask:
         if verbose:
             self.update_with_verbose(dask_df, fieldnames, fields)
 
+        if isinstance(dask_df, dask_expr._collection.DataFrame):
+            dask_df = dask_df.to_legacy_dataframe()
+
         return dask_df
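This plugin (and the SQLAlchemy reader further below) now imports `dask_expr` and converts query-planning collections back to the legacy collection before returning. A hedged sketch of the same guard, assuming a Dask 2024.x install where `dd.from_pandas` returns a dask-expr-backed frame and `to_legacy_dataframe()` is available:

```python
# Sketch of the compatibility guard added above: if the frame is a dask-expr
# (query-planning) collection, convert it back to the legacy dask.dataframe type
# so downstream code that expects the old collection keeps working.
import dask.dataframe as dd
import dask_expr
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"id": [1, 2, 3]}), npartitions=1)

if isinstance(ddf, dask_expr._collection.DataFrame):
    ddf = ddf.to_legacy_dataframe()

print(type(ddf))
```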
{sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py
RENAMED
@@ -1,6 +1,7 @@
 import itertools
 
 import dask.dataframe as dd
+import dask_expr
 import pandas as pd
 from sqlalchemy import create_engine, inspect, select
 from sqlalchemy.orm import sessionmaker
@@ -10,7 +11,7 @@ from sibi_dst.utils import Logger
 
 
 class SQLAlchemyDask:
-    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None,
+    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
         """
         Initialize with an SQLAlchemy query and database engine URL.
 
@@ -19,13 +20,13 @@ class SQLAlchemyDask:
         :param engine_url: Database connection string for SQLAlchemy engine.
         :param chunk_size: Number of records per chunk for Dask partitions.
         :param logger: Logger instance for logging.
-        :param
+        :param debug: Whether to print detailed logs.
         """
         self.query = None
         self.model = model
         self.filters = filters
         self.chunk_size = chunk_size
-        self.
+        self.debug = debug
         self.engine = create_engine(engine_url)
         self.Session = sessionmaker(bind=self.engine)
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -118,14 +119,17 @@ class SQLAlchemyDask:
                 partitions.append(dd.from_pandas(df, npartitions=1))
 
             # Concatenate all partitions
-            # print(partitions)
             if partitions:
                 dask_df = dd.concat(partitions, axis=0, ignore_index=True)
             else:
                 dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
 
-            if self.
+            if self.debug:
                 self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
+
+            if isinstance(dask_df, dask_expr._collection.DataFrame):
+                dask_df = dask_df.to_legacy_dataframe()
+
             return dask_df
 
         except Exception as e:
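The `SQLAlchemyDask` reader above builds its result by pulling the query in fixed-size chunks, wrapping each chunk as a single-partition frame, and concatenating them. A rough, simplified equivalent using pandas' chunked `read_sql` (not the package's actual implementation; the connection URL is a placeholder):

```python
# Simplified sketch of the chunked-read pattern: each chunk becomes one Dask
# partition, and the partitions are concatenated into a single Dask DataFrame.
import dask.dataframe as dd
import pandas as pd
from sqlalchemy import create_engine

def read_sql_to_dask(sql: str, engine_url: str, chunk_size: int = 1000) -> dd.DataFrame:
    engine = create_engine(engine_url)  # e.g. "sqlite:///example.db" (placeholder)
    partitions = [
        dd.from_pandas(chunk, npartitions=1)
        for chunk in pd.read_sql(sql, engine, chunksize=chunk_size)
    ]
    if partitions:
        return dd.concat(partitions, axis=0, ignore_index=True)
    return dd.from_pandas(pd.DataFrame(), npartitions=1)
```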
{sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py
@@ -1,22 +1,13 @@
-from typing import Dict
-
 import dask.dataframe as dd
 import pandas as pd
-from sqlalchemy.inspection import inspect
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy import select
-#from sqlmodel import Session, select
 
-from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
-    normalize_sqlalchemy_type
+from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
 from sibi_dst.utils import Logger
 from ._io_sqlalchemy_dask import SQLAlchemyDask
-from ._sqlachemy_filter_handler import SqlAlchemyFilterHandler
 from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
 
-
 class SqlAlchemyLoadFromDb:
-    df: dd.DataFrame
+    df: dd.DataFrame = None
 
     def __init__(
         self,
@@ -43,17 +34,25 @@ class SqlAlchemyLoadFromDb:
         """
         Load data into a Dask DataFrame based on the query and parameters.
         """
-        self.
+        self._build_and_load()
         return self.df
 
     def _build_and_load(self) -> dd.DataFrame:
+
         try:
-            reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000,
-            df =
-
+            # reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, debug=self.debug)
+            self.df = SQLAlchemyDask(
+                model=self.model,
+                filters=self.params_config.filters,
+                engine_url=self.engine.url,
+                logger=self.logger,
+                chunk_size=1000,
+                debug=self.debug).read_frame()
+            if self.df is None or len(self.df.head().index) == 0:
                 self.logger.warning("Query returned no results.")
                 return dd.from_pandas(pd.DataFrame(), npartitions=1)
-
+
+            return self.df
         except Exception as e:
             self.logger.error(f"Failed to load data into Dask DataFrame.{e}")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
{sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py
@@ -82,9 +82,12 @@ class SqlAlchemyModelBuilder:
             dict: Dictionary of column attributes.
         """
         columns = {}
+        reserved_names = ["metadata", "class_", "table"]
+
         for column in table.columns:
             column_name = self.normalize_column_name(column.name)
-
+            if column_name not in reserved_names:
+                columns[column_name] = column
         return columns
 
     def add_relationships(self, attrs, table: Table):
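The model-builder change above filters out column names that would shadow SQLAlchemy declarative attributes. A hypothetical illustration of the filter; the lower-casing below stands in for `normalize_column_name`, whose real behaviour is not shown in this diff:

```python
# Illustration of the reserved-name filter added above: attributes generated from
# reflected columns must avoid names such as "metadata", which SQLAlchemy's
# declarative machinery already uses on model classes.
from sqlalchemy import Column, Integer, MetaData, Table

reserved_names = {"metadata", "class_", "table"}

table = Table(
    "example", MetaData(),
    Column("id", Integer, primary_key=True),
    Column("metadata", Integer),  # would clash with the declarative attribute
)

columns = {}
for column in table.columns:
    name = column.name.lower()  # assumed normalization; the real helper may differ
    if name not in reserved_names:
        columns[name] = column

print(list(columns))  # ['id'] -- the clashing column is skipped
```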
{sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/utils/_clickhouse_writer.py
@@ -31,9 +31,9 @@ class ClickHouseWriter:
         self.order_by=kwargs.setdefault('order_by','id')
 
     def save_to_clickhouse(self, df, **kwargs):
-        self.df = df
+        self.df = df.copy()
         self.order_by = kwargs.setdefault('order_by',self.order_by)
-        if len(self.df.index) == 0:
+        if len(self.df.head().index) == 0:
             self.logger.info("Dataframe is empty")
             return
         self._handle_missing_values()
@@ -122,7 +122,7 @@ class ClickHouseWriter:
         """
         Writes the Dask DataFrame to a ClickHouse table partition by partition.
         """
-        if len(self.df.index) == 0:
+        if len(self.df.head().index) == 0:
             self.logger.info("No data found. Nothing written.")
             return
 
{sibi_dst-0.3.12 → sibi_dst-0.3.15}/sibi_dst/utils/_data_utils.py
@@ -1,77 +1,32 @@
-import pandas as pd
 import dask.dataframe as dd
+import pandas as pd
+
 from sibi_dst.utils import Logger
 
+
 class DataUtils:
 
-    def __init__(self, logger=None):
+    def __init__(self, logger=None, **kwargs):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.debug = kwargs.get('debug', False)
 
     def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
         if not columns:
             self.logger.warning('No columns specified')
-
+        self.logger.debug(f'Dataframe type:{type(df)}')
         columns = [column for column in columns if column in df.columns]
         for col in columns:
-
-
-
-
-
-
-
-            )
-        else:
-            # For Pandas DataFrame, handle mixed types and invalid values
-            df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
-            df[col] = df[col].fillna(fill_value).astype(dtype)
+            # Replace NaN with 0, then convert to boolean
+            df[col] = df[col].map_partitions(
+                lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                .fillna(fill_value)  # Replace NaN with 0
+                .astype(dtype),
+                meta=(col, dtype)
+            )
 
         return df
 
-
-    def transform_numeric_columns(df, columns=None, fill_value=0, transform_func=None):
-        """
-        Transform numeric columns in a DataFrame (Pandas or Dask), handling missing values and applying optional transformations.
-
-        Parameters:
-        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
-        - columns (list of str, optional): Specific columns to transform. If None, all numeric columns are transformed.
-        - fill_value (int or float): The value to replace NA values with.
-        - transform_func (callable, optional): The transformation function to apply.
-          If None, no additional transformation is applied.
-
-        Returns:
-        - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed numeric columns.
-        """
-        if columns is None:
-            # Detect numeric columns
-            columns = df.select_dtypes(include=['number']).columns.tolist()
-
-        if not columns:
-            return df
-
-        columns = [column for column in columns if column in df.columns]
-        # Default transformation function (identity) if none is provided
-        if transform_func is None:
-            transform_func = lambda x: x
-
-        # Batch processing for Dask
-        if isinstance(df, dd.DataFrame):
-            def transform_partition(partition):
-                # Apply transformations for all numeric columns in a single pass
-                partition[columns] = partition[columns].fillna(fill_value).map(transform_func)
-                return partition
-
-            # Apply the transformation function to all specified columns
-            df = df.map_partitions(transform_partition, meta=df)
-        else:
-            # Pandas: Vectorized operations for all specified columns
-            df[columns] = df[columns].fillna(fill_value).map(transform_func)
-
-        return df
-
-    @staticmethod
-    def transform_boolean_columns(df, columns=None):
+    def transform_boolean_columns(self, df, columns=None):
         """
         Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
         and convert them to boolean. Detection is performed using a sample.
@@ -84,23 +39,20 @@ class DataUtils:
         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed boolean columns.
         """
+
         # Apply transformation to each specified column
         for col in columns:
             if col in df.columns:
-
-
-
-
-
-
-
-
-
-
-                # For Pandas DataFrame, handle mixed types and invalid values
-                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
-                df[col] = df[col].fillna(0).astype(int).astype(bool)
-
+                # Replace NaN with 0, then convert to boolean
+                df[col] = df[col].map_partitions(
+                    lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                    .fillna(0)  # Replace NaN with 0
+                    .astype(int)  # Ensure integer type
+                    .astype(bool),  # Convert to boolean
+                    meta=(col, 'bool')
+                )
+        if self.debug:
+            self.logger.debug(f'Dataframe type:{type(df)}, boolean applied to columns: {columns}')
         return df
 
     def merge_lookup_data(self, classname, df, **kwargs):
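The rewritten `transform_boolean_columns` applies the pandas conversion per partition and declares the output dtype through `meta`, so nothing is computed while the graph is built. A minimal usage sketch with made-up data:

```python
# Minimal sketch of the per-partition boolean conversion used above.
# `meta` tells Dask the resulting column dtype without triggering computation.
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"id": [1, 2, 3], "active": [1, 0, None]})
ddf = dd.from_pandas(pdf, npartitions=1)

ddf["active"] = ddf["active"].map_partitions(
    lambda s: pd.to_numeric(s, errors="coerce").fillna(0).astype(int).astype(bool),
    meta=("active", "bool"),
)
print(ddf.compute())
```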
@@ -141,12 +93,19 @@ class DataUtils:
 
         # Get unique IDs from source column
         ids = df[source_col].dropna().unique()
-        if
+        # Compute if it's a Dask Series
+        if isinstance(ids, dd.core.Series):
             ids = ids.compute()
+
+        # Check if any IDs are found
         if not len(ids):
             self.logger.info(f"No IDs found in the source column: {source_col}")
             return df
-
+
+        # Convert to a list only if necessary and sort
+        if not isinstance(ids, list):
+            ids = ids.tolist()
+        ids = sorted(ids)
         # Prepare kwargs for loading lookup data
         load_kwargs = kwargs.copy()
         load_kwargs.update({
@@ -167,14 +126,13 @@ class DataUtils:
         df = df.merge(result, how='left', left_on=source_col, right_on=temp_join_col)
 
         if fillna_source_description_alias and source_description_alias in df.columns:
-            df[source_description_alias]=df[source_description_alias].fillna('')
+            df[source_description_alias] = df[source_description_alias].fillna('')
 
         # Drop temp_join_col if present
         df = df.drop(columns='temp_join_col', errors='ignore')
 
         return df
 
-
     def is_dataframe_empty(self, df):
         """
         Check if a DataFrame (Pandas or Dask) is empty.