sibi-dst 0.3.14__py3-none-any.whl → 0.3.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +7 -11
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py +4 -1
- sibi_dst/utils/_data_utils.py +35 -77
- {sibi_dst-0.3.14.dist-info → sibi_dst-0.3.15.dist-info}/METADATA +1 -1
- {sibi_dst-0.3.14.dist-info → sibi_dst-0.3.15.dist-info}/RECORD +6 -6
- {sibi_dst-0.3.14.dist-info → sibi_dst-0.3.15.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED

@@ -4,13 +4,12 @@ from typing import Any, Dict, TypeVar
 from typing import Union, Optional
 
 import dask.dataframe as dd
-import dask_expr
 import pandas as pd
 from pydantic import BaseModel
 
-from sibi_dst.utils import ParquetSaver, ClickHouseWriter
 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
 from sibi_dst.utils import Logger
+from sibi_dst.utils import ParquetSaver, ClickHouseWriter
 from .plugins.django import *
 from .plugins.http import HttpConfig
 from .plugins.parquet import ParquetConfig, ParquetFilterHandler
@@ -19,6 +18,7 @@ from .plugins.sql_alchemy import *
 # Define a generic type variable for BaseModel subclasses
 T = TypeVar("T", bound=BaseModel)
 
+
 class DfHelper:
     df: Union[dd.DataFrame, pd.DataFrame] = None
     plugin_django_connection: Optional[DjangoConnectionConfig] = None
@@ -40,13 +40,12 @@ class DfHelper:
         self.debug = kwargs.setdefault("debug", False)
         self.verbose_debug = kwargs.setdefault("verbose_debug", False)
         self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
-        self.dt_field=kwargs.setdefault("dt_field", None)
+        self.dt_field = kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         kwargs.setdefault("live", True)
         kwargs.setdefault("logger", self.logger)
         self.post_init(**kwargs)
 
-
     def post_init(self, **kwargs):
         self.logger.info(f"Source used: {self.source}")
         self.plugin_query = self.__get_config(QueryConfig, kwargs)
@@ -59,7 +58,7 @@
         elif self.source == 'http':
             self.plugin_http = HttpConfig(**kwargs)
         elif self.source == 'sqlalchemy':
-            self.plugin_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig,kwargs)
+            self.plugin_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig, kwargs)
 
     @staticmethod
     def __get_config(model: [T], kwargs: Dict[str, Any]) -> Union[T]:
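The private `__get_config` helper shares a single `**kwargs` dict across several Pydantic config models by handing each model only the keys it declares. A minimal sketch of that pattern, not the package's verified implementation (the helper name, the demo model, and the pydantic-v2 `model_fields` lookup are assumptions):

```python
from typing import Any, Dict, Type, TypeVar

from pydantic import BaseModel

T = TypeVar("T", bound=BaseModel)

def get_config(model: Type[T], kwargs: Dict[str, Any]) -> T:
    # Keep only the kwargs the model declares, so one kwargs dict
    # can be shared across several config models.
    recognized = {k: v for k, v in kwargs.items() if k in model.model_fields}
    return model(**recognized)

class DemoQueryConfig(BaseModel):  # stand-in, not the package's QueryConfig
    n_records: int = 0
    use_exclude: bool = False

print(get_config(DemoQueryConfig, {"n_records": 10, "debug": True}))
# n_records=10 use_exclude=False
```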
@@ -100,7 +99,6 @@
             self.logger.info("Regular asyncio run...")
             return asyncio.run(self._load_from_http(**options))
 
-
     def _load_from_sqlalchemy(self, **options):
         try:
             options.setdefault("debug", self.debug)
@@ -139,7 +137,7 @@
             self.logger.info("Data successfully loaded from django database.")
         except Exception as e:
             self.logger.error(f"Failed to load data from django database: {e}")
-            self.df=dd.from_pandas(pd.DataFrame(), npartitions=1)
+            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
         return self.df
 
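The loader methods share a fallback convention, visible here and again in the HTTP loader below: log the failure and return an empty single-partition Dask DataFrame so callers always receive a `dd.DataFrame`. A runnable sketch of that convention (`load_or_empty` and `broken_loader` are illustrative names, not package APIs):

```python
import dask.dataframe as dd
import pandas as pd

def load_or_empty(loader):
    # Mirror the fallback above: swallow the error, report it,
    # and hand back an empty one-partition Dask DataFrame.
    try:
        return loader()
    except Exception as e:
        print(f"Failed to load data: {e}")
        return dd.from_pandas(pd.DataFrame(), npartitions=1)

def broken_loader():
    raise ConnectionError("database unreachable")  # made-up failure

df = load_or_empty(broken_loader)
print(len(df.compute()))  # 0 -- downstream code still gets a dd.DataFrame
```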
@@ -152,10 +150,9 @@
             self.df = await self.plugin_http.fetch_data(**options)
         except Exception as e:
             self.logger.error(f"Failed to load data from http plugin: {e}")
-            self.df=dd.from_pandas(pd.DataFrame(), npartitions=1)
+            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
         return self.df
 
-
     def _post_process_df(self):
         """
         Efficiently process the DataFrame by filtering, renaming, and setting indices.
@@ -225,7 +222,7 @@
         if self.df.map_partitions(len).compute().sum() == 0:
             self.logger.info("Cannot write to clickhouse since Dataframe is empty")
             return
-        cs=ClickHouseWriter(logger=self.logger, **credentials)
+        cs = ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
         self.logger.info("Save to ClickHouse completed.")
 
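The ClickHouse guard above counts rows without collecting the frame's contents in one piece: `map_partitions(len)` yields one row count per partition, and the summed result decides whether there is anything to write. The same idiom in isolation:

```python
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"a": []}), npartitions=2)

# One integer per partition; the sum is the total row count.
if ddf.map_partitions(len).compute().sum() == 0:
    print("Cannot write to clickhouse since Dataframe is empty")
```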
@@ -299,7 +296,6 @@
             kwargs[f"{mapped_field}__date__lte"] = end
         return self.load(**kwargs)
 
-
     @staticmethod
     def parse_date(date_str: str) -> Union[datetime.datetime, datetime.date]:
         try:
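The `__date__lte` key above is a Django-ORM-style filter lookup: the surrounding method (not fully shown in this hunk) builds start/end bounds as filter kwargs and passes them to `load()`. A sketch of the kwarg shape, with an invented field name and an assumed symmetric lower bound:

```python
# 'created_at' is an invented mapped field; the real mapping comes from dt_field.
mapped_field = "created_at"
start, end = "2024-01-01", "2024-01-31"

kwargs = {
    f"{mapped_field}__date__gte": start,  # assumed lower bound
    f"{mapped_field}__date__lte": end,    # the lookup visible in the diff
}
print(kwargs)
# {'created_at__date__gte': '2024-01-01', 'created_at__date__lte': '2024-01-31'}
```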
sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py
CHANGED

@@ -82,9 +82,12 @@ class SqlAlchemyModelBuilder:
             dict: Dictionary of column attributes.
         """
         columns = {}
+        reserved_names = ["metadata", "class_", "table"]
+
         for column in table.columns:
             column_name = self.normalize_column_name(column.name)
-            columns[column_name] = column
+            if column_name not in reserved_names:
+                columns[column_name] = column
         return columns
 
     def add_relationships(self, attrs, table: Table):
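The new `reserved_names` filter guards against column names that SQLAlchemy's declarative machinery claims for itself: declaring a mapped attribute called `metadata`, for example, fails at class-creation time. A small demonstration (table and column names invented):

```python
import sqlalchemy as sa
from sqlalchemy.orm import declarative_base

Base = declarative_base()

try:
    class Broken(Base):
        __tablename__ = "broken"
        id = sa.Column(sa.Integer, primary_key=True)
        # 'metadata' already refers to Base.metadata on declarative classes.
        metadata = sa.Column(sa.String)
except sa.exc.InvalidRequestError as e:
    print(f"rejected: {e}")  # prints an "Attribute name 'metadata' is reserved" error
```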
sibi_dst/utils/_data_utils.py
CHANGED

@@ -1,77 +1,32 @@
-import pandas as pd
 import dask.dataframe as dd
+import pandas as pd
+
 from sibi_dst.utils import Logger
 
+
 class DataUtils:
 
-    def __init__(self, logger=None):
+    def __init__(self, logger=None, **kwargs):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.debug = kwargs.get('debug', False)
 
     def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
         if not columns:
             self.logger.warning('No columns specified')
-
+        self.logger.debug(f'Dataframe type:{type(df)}')
         columns = [column for column in columns if column in df.columns]
         for col in columns:
-
-
-
-
-
-
-
-                )
-            else:
-                # For Pandas DataFrame, handle mixed types and invalid values
-                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
-                df[col] = df[col].fillna(fill_value).astype(dtype)
+            # Replace NaN with 0, then convert to boolean
+            df[col] = df[col].map_partitions(
+                lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                .fillna(fill_value)  # Replace NaN with 0
+                .astype(dtype),
+                meta=(col, dtype)
+            )
 
         return df
 
-
-    def transform_numeric_columns(df, columns=None, fill_value=0, transform_func=None):
-        """
-        Transform numeric columns in a DataFrame (Pandas or Dask), handling missing values and applying optional transformations.
-
-        Parameters:
-        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
-        - columns (list of str, optional): Specific columns to transform. If None, all numeric columns are transformed.
-        - fill_value (int or float): The value to replace NA values with.
-        - transform_func (callable, optional): The transformation function to apply.
-          If None, no additional transformation is applied.
-
-        Returns:
-        - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed numeric columns.
-        """
-        if columns is None:
-            # Detect numeric columns
-            columns = df.select_dtypes(include=['number']).columns.tolist()
-
-        if not columns:
-            return df
-
-        columns = [column for column in columns if column in df.columns]
-        # Default transformation function (identity) if none is provided
-        if transform_func is None:
-            transform_func = lambda x: x
-
-        # Batch processing for Dask
-        if isinstance(df, dd.DataFrame):
-            def transform_partition(partition):
-                # Apply transformations for all numeric columns in a single pass
-                partition[columns] = partition[columns].fillna(fill_value).map(transform_func)
-                return partition
-
-            # Apply the transformation function to all specified columns
-            df = df.map_partitions(transform_partition, meta=df)
-        else:
-            # Pandas: Vectorized operations for all specified columns
-            df[columns] = df[columns].fillna(fill_value).map(transform_func)
-
-        return df
-
-    @staticmethod
-    def transform_boolean_columns(df, columns=None):
+    def transform_boolean_columns(self, df, columns=None):
         """
         Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
         and convert them to boolean. Detection is performed using a sample.
@@ -84,23 +39,20 @@
         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed boolean columns.
         """
+
         # Apply transformation to each specified column
         for col in columns:
             if col in df.columns:
-
-
-
-
-
-
-
-
-
-
-                # For Pandas DataFrame, handle mixed types and invalid values
-                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
-                df[col] = df[col].fillna(0).astype(int).astype(bool)
-
+                # Replace NaN with 0, then convert to boolean
+                df[col] = df[col].map_partitions(
+                    lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                    .fillna(0)  # Replace NaN with 0
+                    .astype(int)  # Ensure integer type
+                    .astype(bool),  # Convert to boolean
+                    meta=(col, 'bool')
+                )
+        if self.debug:
+            self.logger.debug(f'Dataframe type:{type(df)}, boolean applied to columns: {columns}')
         return df
 
     def merge_lookup_data(self, classname, df, **kwargs):
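Both transforms now assume a Dask DataFrame and coerce each column partition by partition, passing `meta` so Dask knows the output dtype without sampling. The same pattern in isolation (sample data invented):

```python
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"flag": ["1", "0", None, "x"]}), npartitions=2)

# Coerce to numeric (invalid values -> NaN), fill NaN with 0, cast to bool.
# meta declares the result dtype up front, so Dask skips inference.
ddf["flag"] = ddf["flag"].map_partitions(
    lambda s: pd.to_numeric(s, errors="coerce").fillna(0).astype(int).astype(bool),
    meta=("flag", "bool"),
)
print(ddf.compute())
#     flag
# 0   True
# 1  False
# 2  False
# 3  False
```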
@@ -141,12 +93,19 @@
 
         # Get unique IDs from source column
         ids = df[source_col].dropna().unique()
-        if
+        # Compute if it's a Dask Series
+        if isinstance(ids, dd.core.Series):
             ids = ids.compute()
+
+        # Check if any IDs are found
         if not len(ids):
             self.logger.info(f"No IDs found in the source column: {source_col}")
             return df
-
+
+        # Convert to a list only if necessary and sort
+        if not isinstance(ids, list):
+            ids = ids.tolist()
+        ids = sorted(ids)
         # Prepare kwargs for loading lookup data
         load_kwargs = kwargs.copy()
         load_kwargs.update({
@@ -167,14 +126,13 @@
         df = df.merge(result, how='left', left_on=source_col, right_on=temp_join_col)
 
         if fillna_source_description_alias and source_description_alias in df.columns:
-            df[source_description_alias]=df[source_description_alias].fillna('')
+            df[source_description_alias] = df[source_description_alias].fillna('')
 
         # Drop temp_join_col if present
         df = df.drop(columns='temp_join_col', errors='ignore')
 
         return df
 
-
     def is_dataframe_empty(self, df):
         """
         Check if a DataFrame (Pandas or Dask) is empty.
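`merge_lookup_data` now normalizes the unique-ID series before querying the lookup source: materialize it if it is Dask-backed, bail out when empty, then convert to a sorted plain list. The same steps in isolation (column name and values invented; the package checks `dd.core.Series`, while the public alias `dd.Series` is used here):

```python
import dask.dataframe as dd
import pandas as pd

df = dd.from_pandas(pd.DataFrame({"customer_id": [3, 1, None, 3, 2]}), npartitions=2)

ids = df["customer_id"].dropna().unique()
if isinstance(ids, dd.Series):  # compute only when Dask-backed
    ids = ids.compute()
if not len(ids):
    print("No IDs found in the source column: customer_id")
else:
    if not isinstance(ids, list):
        ids = ids.tolist()
    print(sorted(ids))  # [1.0, 2.0, 3.0]
```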
{sibi_dst-0.3.14.dist-info → sibi_dst-0.3.15.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
 sibi_dst/__init__.py,sha256=1KaC0LYTHxjpENq-NXI325WcEYZ8GCBrHGkLoFxEcu0,251
 sibi_dst/df_helper/__init__.py,sha256=JXJBY47G6wOYhzNI646OBl3pSGWIy4282-3qPGYHU7w,167
-sibi_dst/df_helper/_df_helper.py,sha256=
+sibi_dst/df_helper/_df_helper.py,sha256=ZWhPj9K5q_amJ7eBOrvwAvncxRnI-baveKWWQWfyND8,13354
 sibi_dst/df_helper/_parquet_artifact.py,sha256=f5oHwXtsNW6-ONSFsRB0AniVefA0THzP92J-nugp9vo,4973
 sibi_dst/df_helper/core/__init__.py,sha256=NSYY_evzq6XEkO06Nz6xLH5KznzRGI44cLbrnN3zHXQ,503
 sibi_dst/df_helper/core/_defaults.py,sha256=pJU-lX7w4nrt0Anx35j08mVr_0oMGn1bTA_iCl_p1qI,6700
@@ -23,7 +23,7 @@ sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=1WQ390XBFWO
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=H8ypUjLKzYYl9BerfJjX_Uv9qBVkBR-wZiQlh3uRQXg,4669
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py,sha256=HtMsfH5com4dLVJxh3wdMUpQI3mz0cKDJz0CmFS2S8U,1648
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=balWGKun0NKIfhLZW-_DCOhKuTzTo_C2NwZoKFwuSJo,2329
-sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=
+sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=rzzZdcRB5TS9uJ3ZIGQiNf04e3u2akqJEsoGCuyPE3c,4467
 sibi_dst/df_helper/plugins/sql_model/__init__.py,sha256=MXd4OOdTqR4cENSV733SGodPO6eQMCexANs-3w0qL5U,226
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py,sha256=6jmMjKIv5Btysj3kZMaXQ98IqKQkhnOC-JWtb1B8rus,4265
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py,sha256=bLD4tEcGDKkJCfSO4b13_89tzVJcpz55I6uw9D4ERnE,3751
@@ -31,7 +31,7 @@ sibi_dst/utils/__init__.py,sha256=nkX7tASNn57kw998YdqQQGY8qXv2J4LC4-g0GoQSiic,83
 sibi_dst/utils/_airflow_manager.py,sha256=rlt3eolR5QvtxWhAtBTCpHXvxftnKM-ibPMv3fVwNZk,7524
 sibi_dst/utils/_clickhouse_writer.py,sha256=kNBQeDn3D4Javrz5L8uU_5itf8Mrvm9l29uxcmcKlbg,8555
 sibi_dst/utils/_credentials.py,sha256=8i6z7y3y5S-6mSk4xrT2AwhzCA32mTn1n1iYX9IVyHk,1724
-sibi_dst/utils/_data_utils.py,sha256=
+sibi_dst/utils/_data_utils.py,sha256=uw0SW9G4GrvTX4IdUd8fmsMTMEG5aXOFcWOv4Au3H5g,7016
 sibi_dst/utils/_data_wrapper.py,sha256=SmNv1UoZLq7ovRVy4wipsWLMidKJXcRTp4HtxmaCQdk,9399
 sibi_dst/utils/_date_utils.py,sha256=KYB07puKDrSG8tOm_i1HGX0TjLNUtSWjwfsCYBmW9co,10619
 sibi_dst/utils/_df_utils.py,sha256=9_dNYoZ9_ofU0t_sxMdsXALWCuh02gvqUrei-6Lhr6w,10910
@@ -40,6 +40,6 @@ sibi_dst/utils/_filepath_generator.py,sha256=ytPSZ9GYOnnSP25zwA-0NjFHupPRZyXwixW
 sibi_dst/utils/_log_utils.py,sha256=AAenyubYUjk77WqiaNkjgkxws3dnAMIdaGl2Ryz_cA4,2245
 sibi_dst/utils/_parquet_saver.py,sha256=-A0o_vucyYe7wlwiby_0_yS-ZfT2GHwImyQHrCIBNwk,9051
 sibi_dst/utils/_storage_manager.py,sha256=KP2HBXnLUMMquqcO30ecfuoU7g1z8RtaV3Dv0TvEXoY,3856
-sibi_dst-0.3.
-sibi_dst-0.3.
-sibi_dst-0.3.
+sibi_dst-0.3.15.dist-info/METADATA,sha256=0XU32Bgt1RYV7Y12lmDxq_YmHaXya5d2qMYfYP8Yic0,2090
+sibi_dst-0.3.15.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+sibi_dst-0.3.15.dist-info/RECORD,,

{sibi_dst-0.3.14.dist-info → sibi_dst-0.3.15.dist-info}/WHEEL
File without changes
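Each RECORD row is `path,sha256=<digest>,<size>`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with padding stripped (per PEP 376/427). A sketch of how such a row is produced (the path is illustrative):

```python
import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    # Hash the file, encode urlsafe-base64 without '=' padding, append size.
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"

# e.g. record_entry("sibi_dst/__init__.py")
# -> "sibi_dst/__init__.py,sha256=1KaC0LYT...,251"
```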