sibi-dst 0.3.11__tar.gz → 0.3.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/PKG-INFO +7 -3
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/pyproject.toml +7 -3
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/_df_helper.py +20 -13
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/django/_io_dask.py +4 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/http/_http_config.py +2 -3
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py +4 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/parquet/_parquet_options.py +2 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py +9 -5
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py +15 -16
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/__init__.py +2 -1
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_clickhouse_writer.py +3 -3
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_data_utils.py +91 -62
- sibi_dst-0.3.14/sibi_dst/utils/_data_wrapper.py +238 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_date_utils.py +130 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_df_utils.py +91 -0
- sibi_dst-0.3.11/sibi_dst/utils/_data_wrapper.py +0 -556
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/README.md +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/__init__.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/__init__.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/django/__init__.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/django/_django_db_connection.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/django/_django_load_from_db.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/django/_io_dask_alt.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/http/__init__.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/parquet/__init__.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_alchemy/__init__.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_model/__init__.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_airflow_manager.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_credentials.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_file_utils.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_filepath_generator.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_log_utils.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_parquet_saver.py +0 -0
- {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_storage_manager.py +0 -0
{sibi_dst-0.3.11 → sibi_dst-0.3.14}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.11
+Version: 0.3.14
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -9,10 +9,13 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
+Requires-Dist: chardet (>=5.2.0,<6.0.0)
+Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
 Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
 Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
 Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
-Requires-Dist: django (
+Requires-Dist: django (>=5.1.4,<6.0.0)
+Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)
 Requires-Dist: httpx (>=0.27.2,<0.28.0)
 Requires-Dist: ipython (>=8.29.0,<9.0.0)
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
@@ -26,9 +29,10 @@ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
 Requires-Dist: pymysql (>=1.1.1,<2.0.0)
 Requires-Dist: pytest (>=8.3.3,<9.0.0)
 Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
-Requires-Dist:
+Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0)
 Requires-Dist: tornado (>=6.4.1,<7.0.0)
 Requires-Dist: tqdm (>=4.67.0,<5.0.0)
+Requires-Dist: uvicorn (>=0.32.1,<0.33.0)
 Description-Content-Type: text/markdown
 
 # sibi-dst
```
{sibi_dst-0.3.11 → sibi_dst-0.3.14}/pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.11"
+version = "0.3.14"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
@@ -13,13 +13,12 @@ dask = {extras = ["complete"], version = "^2024.11.1"}
 pydantic = "^2.9.2"
 tornado = "^6.4.1"
 psutil = "^6.1.0"
-django = "
+django = "^5.1.4"
 pyarrow = "^18.0.0"
 mysqlclient = "^2.2.6"
 pymysql = "^1.1.1"
 httpx = "^0.27.2"
 python-dotenv = "^1.0.1"
-sqlmodel = "^0.0.22"
 tqdm = "^4.67.0"
 openpyxl = "^3.1.5"
 jinja2 = "^3.1.4"
@@ -29,6 +28,11 @@ pytest = "^8.3.3"
 clickhouse-connect = "^0.8.7"
 clickhouse-driver = "^0.2.9"
 paramiko = "^3.5.0"
+chardet = "^5.2.0"
+charset-normalizer = "^3.4.0"
+uvicorn = "^0.32.1"
+sqlalchemy = "^2.0.36"
+djangorestframework = "^3.15.2"
 
 
 [build-system]
```
{sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/_df_helper.py

```diff
@@ -4,6 +4,7 @@ from typing import Any, Dict, TypeVar
 from typing import Union, Optional
 
 import dask.dataframe as dd
+import dask_expr
 import pandas as pd
 from pydantic import BaseModel
 
@@ -42,6 +43,7 @@ class DfHelper:
         self.dt_field=kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         kwargs.setdefault("live", True)
+        kwargs.setdefault("logger", self.logger)
         self.post_init(**kwargs)
 
 
@@ -115,7 +117,7 @@ class DfHelper:
             self._post_process_df()
             self.logger.info("Data successfully loaded from sqlalchemy database.")
         except Exception as e:
-            self.logger.error(f"Failed to load data from
+            self.logger.error(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
         return self.df
@@ -194,10 +196,16 @@ class DfHelper:
         self.logger.info("Post-processing of DataFrame completed.")
 
     def _process_loaded_data(self):
-
-
-
+        self.logger.info(f"Type of self.df: {type(self.df)}")
+        if self.df.map_partitions(len).compute().sum() > 0:
+            field_map = self.plugin_params.field_map or {}
+            if isinstance(field_map, dict):
                 rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
+                missing_columns = [k for k in field_map.keys() if k not in self.df.columns]
+
+                if missing_columns:
+                    self.logger.warning(
+                        f"The following columns in field_map are not in the DataFrame: {missing_columns}")
 
         def rename_columns(df, mapping):
             return df.rename(columns=mapping)
@@ -211,21 +219,20 @@ class DfHelper:
     def save_to_parquet(self, parquet_filename: Optional[str] = None):
         ps = ParquetSaver(self.df, self.parquet_storage_path, self.logger)
         ps.save_to_parquet(parquet_filename)
+        self.logger.info(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")
 
-    def save_to_clickhouse(self,
-
-
-
-
-        }
-        credentials = {**credentials, **click_config}
-        cs=ClickHouseWriter(**credentials)
+    def save_to_clickhouse(self, **credentials):
+        if self.df.map_partitions(len).compute().sum() == 0:
+            self.logger.info("Cannot write to clickhouse since Dataframe is empty")
+            return
+        cs=ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
+        self.logger.info("Save to ClickHouse completed.")
 
     def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = self.plugin_parquet.load_files()
         if options:
-            self.df = ParquetFilterHandler().apply_filters_dask(self.df, options)
+            self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
         return self.df
 
     def load_period(self, **kwargs):
```
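The `_process_loaded_data` and `save_to_clickhouse` changes above test for an empty Dask frame with `df.map_partitions(len).compute().sum()`, while other call sites in this release switch to a cheaper `head()`-based probe. A minimal sketch of both checks, assuming only pandas and dask; the toy frame is illustrative, not part of sibi-dst:

```python
# Sketch of the two Dask emptiness checks used in this release.
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=2)

# Exact but potentially expensive: counts the rows of every partition.
total_rows = ddf.map_partitions(len).compute().sum()

# Cheaper heuristic: only materializes the leading rows of the first partition.
probably_empty = len(ddf.head().index) == 0

print(total_rows, probably_empty)
```

The `map_partitions(len)` form triggers a full pass over the data; the `head()` form is a quick probe that can miss data sitting only in later partitions.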
{sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/django/_io_dask.py

```diff
@@ -1,6 +1,7 @@
 import itertools
 
 import dask.dataframe as dd
+import dask_expr
 import django
 import pandas as pd
 from django.core.cache import cache
@@ -239,4 +240,7 @@ class ReadFrameDask:
         if verbose:
             self.update_with_verbose(dask_df, fieldnames, fields)
 
+        if isinstance(dask_df, dask_expr._collection.DataFrame):
+            dask_df = dask_df.to_legacy_dataframe()
+
         return dask_df
```
{sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/http/_http_config.py

```diff
@@ -13,11 +13,10 @@ class HttpConfig(BaseModel):
     api_key: Optional[SecretStr] = None
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    def __init__(self, **data):
+    def __init__(self, logger=None, **data):
         super().__init__(**data)
         # Initialize the logger if not provided
-
-        self.logger = Logger(log_dir='./logs/', logger_name="HttpDataSource", log_file='http_data_source.log')
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
 
     async def fetch_data(self, **options) -> dd.DataFrame:
         """Asynchronously fetch JSON data from HTTP endpoint, substituting options into the URL path."""
```
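Several plugins in this release (`HttpConfig`, `ParquetFilterHandler`, `ParquetConfig`) now accept an optional injected `logger` and fall back to the shared default. A minimal sketch of that pattern, assuming only `sibi_dst.utils.Logger`; the class name is illustrative:

```python
from sibi_dst.utils import Logger


class ExampleSource:  # illustrative consumer, not part of sibi-dst
    def __init__(self, logger=None):
        # Use the injected logger when provided, otherwise build a default one
        # named after the class, matching the pattern in the hunks above.
        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
```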
{sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py

```diff
@@ -1,7 +1,11 @@
 import pandas as pd
 import dask.dataframe as dd
+from sibi_dst.utils import Logger
 
 class ParquetFilterHandler(object):
+    def __init__(self, logger=None):
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+
     @staticmethod
     def apply_filters_dask(df, filters):
         dt_operators = ['date', 'time']
```
{sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/parquet/_parquet_options.py

```diff
@@ -25,6 +25,8 @@ class ParquetConfig(BaseModel):
     @model_validator(mode='after')
     def check_parquet_params(self):
         # Configure paths based on fsspec
+        if self.logger is None:
+            self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
         self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(str(self.parquet_storage_path).split("://")[0])
 
         # Validation for parquet path
```
{sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py

```diff
@@ -1,6 +1,7 @@
 import itertools
 
 import dask.dataframe as dd
+import dask_expr
 import pandas as pd
 from sqlalchemy import create_engine, inspect, select
 from sqlalchemy.orm import sessionmaker
@@ -10,7 +11,7 @@ from sibi_dst.utils import Logger
 
 
 class SQLAlchemyDask:
-    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None,
+    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
         """
         Initialize with an SQLAlchemy query and database engine URL.
 
@@ -19,13 +20,13 @@ class SQLAlchemyDask:
         :param engine_url: Database connection string for SQLAlchemy engine.
         :param chunk_size: Number of records per chunk for Dask partitions.
         :param logger: Logger instance for logging.
-        :param
+        :param debug: Whether to print detailed logs.
         """
         self.query = None
         self.model = model
         self.filters = filters
         self.chunk_size = chunk_size
-        self.
+        self.debug = debug
         self.engine = create_engine(engine_url)
         self.Session = sessionmaker(bind=self.engine)
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -118,14 +119,17 @@ class SQLAlchemyDask:
                 partitions.append(dd.from_pandas(df, npartitions=1))
 
             # Concatenate all partitions
-            # print(partitions)
             if partitions:
                 dask_df = dd.concat(partitions, axis=0, ignore_index=True)
             else:
                 dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
 
-            if self.
+            if self.debug:
                 self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
+
+            if isinstance(dask_df, dask_expr._collection.DataFrame):
+                dask_df = dask_df.to_legacy_dataframe()
+
             return dask_df
 
         except Exception as e:
```
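With dask 2024.x, `dd.from_pandas` and `dd.concat` can return dask-expr collections when query planning is enabled, which is why both readers above normalize the result with `to_legacy_dataframe()` before returning. A hedged sketch of that guard as a standalone helper; the helper name is illustrative, not part of sibi-dst:

```python
# Sketch of the dask-expr compatibility guard added in this release,
# assuming dask 2024.x with the dask_expr backend installed.
import pandas as pd
import dask.dataframe as dd
import dask_expr


def ensure_legacy_dataframe(ddf):
    """Return a legacy dask.dataframe collection even when query planning is on."""
    if isinstance(ddf, dask_expr._collection.DataFrame):
        ddf = ddf.to_legacy_dataframe()
    return ddf


ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2]}), npartitions=1)
legacy = ensure_legacy_dataframe(ddf)
```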
{sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py

```diff
@@ -1,22 +1,13 @@
-from typing import Dict
-
 import dask.dataframe as dd
 import pandas as pd
-from sqlalchemy.inspection import inspect
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy import select
-#from sqlmodel import Session, select
 
-from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
-normalize_sqlalchemy_type
+from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
 from sibi_dst.utils import Logger
 from ._io_sqlalchemy_dask import SQLAlchemyDask
-from ._sqlachemy_filter_handler import SqlAlchemyFilterHandler
 from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
 
-
 class SqlAlchemyLoadFromDb:
-    df: dd.DataFrame
+    df: dd.DataFrame = None
 
     def __init__(
         self,
@@ -43,17 +34,25 @@ class SqlAlchemyLoadFromDb:
         """
         Load data into a Dask DataFrame based on the query and parameters.
         """
-        self.
+        self._build_and_load()
         return self.df
 
     def _build_and_load(self) -> dd.DataFrame:
+
         try:
-            reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000,
-            df =
-
+            # reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, debug=self.debug)
+            self.df = SQLAlchemyDask(
+                model=self.model,
+                filters=self.params_config.filters,
+                engine_url=self.engine.url,
+                logger=self.logger,
+                chunk_size=1000,
+                debug=self.debug).read_frame()
+            if self.df is None or len(self.df.head().index) == 0:
                 self.logger.warning("Query returned no results.")
                 return dd.from_pandas(pd.DataFrame(), npartitions=1)
-
+
+            return self.df
         except Exception as e:
             self.logger.error(f"Failed to load data into Dask DataFrame.{e}")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
```
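The loader now builds the frame by handing the model, filters, and engine URL straight to `SQLAlchemyDask(...).read_frame()`. A hedged sketch of calling the reader directly; the declarative model, the SQLite URL, and the filter key are placeholders, not objects shipped with sibi-dst:

```python
# Sketch of driving SQLAlchemyDask on its own, mirroring the call site above.
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

from sibi_dst.df_helper.plugins.sql_alchemy._io_sqlalchemy_dask import SQLAlchemyDask
from sibi_dst.utils import Logger


class Base(DeclarativeBase):
    pass


class Customer(Base):  # placeholder model for illustration
    __tablename__ = "customer"
    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column()


reader = SQLAlchemyDask(
    model=Customer,
    filters={"id__in": [1, 2, 3]},   # Django-style filter keys assumed here
    engine_url="sqlite:///example.db",
    chunk_size=1000,
    logger=Logger.default_logger(logger_name="example"),
    debug=True,
)
ddf = reader.read_frame()
```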
{sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/__init__.py

```diff
@@ -1,7 +1,7 @@
 from __future__ import annotations
 from ._credentials import ConfigManager, ConfigLoader
 from ._log_utils import Logger
-from ._date_utils import DateUtils
+from ._date_utils import DateUtils, BusinessDays
 from ._data_utils import DataUtils
 from ._file_utils import FileUtils
 from ._filepath_generator import FilePathGenerator
@@ -17,6 +17,7 @@ __all__=[
     "ConfigLoader",
     "Logger",
     "DateUtils",
+    "BusinessDays",
     "FileUtils",
     "DataWrapper",
     "DataUtils",
```
{sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_clickhouse_writer.py

```diff
@@ -31,9 +31,9 @@ class ClickHouseWriter:
         self.order_by=kwargs.setdefault('order_by','id')
 
     def save_to_clickhouse(self, df, **kwargs):
-        self.df = df
+        self.df = df.copy()
         self.order_by = kwargs.setdefault('order_by',self.order_by)
-        if len(self.df.index) == 0:
+        if len(self.df.head().index) == 0:
             self.logger.info("Dataframe is empty")
             return
         self._handle_missing_values()
@@ -122,7 +122,7 @@ class ClickHouseWriter:
         """
         Writes the Dask DataFrame to a ClickHouse table partition by partition.
         """
-        if len(self.df.index) == 0:
+        if len(self.df.head().index) == 0:
             self.logger.info("No data found. Nothing written.")
             return
 
```
{sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_data_utils.py

```diff
@@ -7,6 +7,27 @@ class DataUtils:
     def __init__(self, logger=None):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
 
+    def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
+        if not columns:
+            self.logger.warning('No columns specified')
+
+        columns = [column for column in columns if column in df.columns]
+        for col in columns:
+            if isinstance(df, dd.DataFrame):
+                # Replace NaN with 0, then convert to boolean
+                df[col] = df[col].map_partitions(
+                    lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                    .fillna(fill_value)  # Replace NaN with 0
+                    .astype(dtype),
+                    meta=(col, dtype)
+                )
+            else:
+                # For Pandas DataFrame, handle mixed types and invalid values
+                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
+                df[col] = df[col].fillna(fill_value).astype(dtype)
+
+        return df
+
     @staticmethod
     def transform_numeric_columns(df, columns=None, fill_value=0, transform_func=None):
         """
@@ -14,6 +35,7 @@ class DataUtils:
 
         Parameters:
         - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
+        - columns (list of str, optional): Specific columns to transform. If None, all numeric columns are transformed.
         - fill_value (int or float): The value to replace NA values with.
         - transform_func (callable, optional): The transformation function to apply.
           If None, no additional transformation is applied.
@@ -28,31 +50,28 @@ class DataUtils:
         if not columns:
             return df
 
+        columns = [column for column in columns if column in df.columns]
         # Default transformation function (identity) if none is provided
         if transform_func is None:
             transform_func = lambda x: x
 
-        #
-
-
-
-
-
-
-
-
+        # Batch processing for Dask
+        if isinstance(df, dd.DataFrame):
+            def transform_partition(partition):
+                # Apply transformations for all numeric columns in a single pass
+                partition[columns] = partition[columns].fillna(fill_value).map(transform_func)
+                return partition
+
+            # Apply the transformation function to all specified columns
+            df = df.map_partitions(transform_partition, meta=df)
+        else:
+            # Pandas: Vectorized operations for all specified columns
+            df[columns] = df[columns].fillna(fill_value).map(transform_func)
 
-            df[col] = df[col].fillna(fill_value).astype(meta_type)
-            if isinstance(df, dd.DataFrame):
-                df[col] = df[col].map_partitions(
-                    lambda s: s.apply(transform_func), meta=(col, meta_type)
-                )
-            else:
-                df[col] = df[col].apply(transform_func)
         return df
 
     @staticmethod
-    def transform_boolean_columns(df, columns=None
+    def transform_boolean_columns(df, columns=None):
         """
         Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
         and convert them to boolean. Detection is performed using a sample.
@@ -96,73 +115,67 @@ class DataUtils:
         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with merged lookup data.
         """
-        #
+        # Return early if the DataFrame is empty
         if self.is_dataframe_empty(df):
             return df
 
-        # Extract required parameters
-
-
-
-
-        fillna_source_description_alias = kwargs.pop('fillna_source_description_alias', False)
-        fieldnames = kwargs.get('fieldnames', None)
-        column_names = kwargs.get('column_names', None)
+        # Extract and validate required parameters
+        required_params = ['source_col', 'lookup_col', 'lookup_description_col', 'source_description_alias']
+        missing_params = [param for param in required_params if param not in kwargs]
+        if missing_params:
+            raise ValueError(f"Missing required parameters: {', '.join(missing_params)}")
 
-
-
-
-
-
+        source_col = kwargs.pop('source_col')
+        lookup_col = kwargs.pop('lookup_col')
+        lookup_description_col = kwargs.pop('lookup_description_col')
+        source_description_alias = kwargs.pop('source_description_alias')
+
+        # Optional parameters with default values
+        fillna_source_description_alias = kwargs.pop('fillna_source_description_alias', False)
+        fieldnames = kwargs.pop('fieldnames', (lookup_col, lookup_description_col))
+        column_names = kwargs.pop('column_names', ['temp_join_col', source_description_alias])
 
         if source_col not in df.columns:
-            self.logger.info(f
+            self.logger.info(f"{source_col} not in DataFrame columns")
             return df
 
         # Get unique IDs from source column
         ids = df[source_col].dropna().unique()
         if isinstance(ids, dd.Series):
             ids = ids.compute()
-
-
-        if not ids:
-            self.logger.info(f'No IDs found in the source column: {source_col}')
+        if not len(ids):
+            self.logger.info(f"No IDs found in the source column: {source_col}")
             return df
-
-        # Set default fieldnames and column_names if not provided
-        if fieldnames is None:
-            kwargs['fieldnames'] = (lookup_col, lookup_description_col)
-        if column_names is None:
-            kwargs['column_names'] = ['temp_join_col', source_description_alias]
-
+        ids = sorted(ids.tolist())
         # Prepare kwargs for loading lookup data
         load_kwargs = kwargs.copy()
-        load_kwargs
-
+        load_kwargs.update({
+            'fieldnames': fieldnames,
+            'column_names': column_names,
+            f'{lookup_col}__in': ids
+        })
         # Load lookup data
-        lookup_instance = classname()
+        lookup_instance = classname(debug=True, verbose_debug=True)
         result = lookup_instance.load(**load_kwargs)
-
+        if len(result.index) == 0:
+            self.logger.info(f"No IDs found in the source column: {source_col}")
+            return df
         # Determine the join column on the result DataFrame
-        if 'temp_join_col' in
-            temp_join_col = 'temp_join_col'
-        else:
-            temp_join_col = lookup_col
+        temp_join_col = 'temp_join_col' if 'temp_join_col' in column_names else lookup_col
 
         # Merge DataFrames
         df = df.merge(result, how='left', left_on=source_col, right_on=temp_join_col)
 
         if fillna_source_description_alias and source_description_alias in df.columns:
-            df[source_description_alias]
+            df[source_description_alias]=df[source_description_alias].fillna('')
 
         # Drop temp_join_col if present
-
-        df = df.drop(columns='temp_join_col')
+        df = df.drop(columns='temp_join_col', errors='ignore')
 
         return df
 
-
-    def is_dataframe_empty(df):
+
+    def is_dataframe_empty(self, df):
         """
         Check if a DataFrame (Pandas or Dask) is empty.
 
@@ -173,14 +186,30 @@ class DataUtils:
         - bool: True if the DataFrame is empty, False otherwise.
         """
         if isinstance(df, dd.DataFrame):
-
-
-
+            try:
+                return len(df.index) == 0
+            except Exception as e:
+                self.logger.error(f"Error while processing Dask DataFrame: {e}")
+                return False
+        elif isinstance(df, pd.DataFrame):
             return df.empty
+        else:
+            self.logger.error("Input must be a pandas or dask DataFrame.")
+            return False
 
     @staticmethod
-    def
+    def convert_to_datetime_dask(df, date_fields):
+        """
+        Convert specified columns in a Dask DataFrame to datetime, handling errors gracefully.
+
+        Parameters:
+        - df (dask.dataframe.DataFrame): The Dask DataFrame containing the columns.
+        - date_fields (list of str): List of column names to convert to datetime.
+
+        Returns:
+        - dask.dataframe.DataFrame: Updated DataFrame with specified columns converted to datetime.
+        """
         for col in date_fields:
             if col in df.columns:
-                df[col] =
-                return df
+                df[col] = df[col].map_partitions(pd.to_datetime, errors="coerce", meta=(col, "datetime64[ns]"))
+        return df
```