dragon-ml-toolbox 19.14.0__py3-none-any.whl → 20.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
- dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
- ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
- ml_tools/ETL_cleaning/_basic_clean.py +351 -0
- ml_tools/ETL_cleaning/_clean_tools.py +128 -0
- ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
- ml_tools/ETL_cleaning/_imprimir.py +13 -0
- ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
- ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
- ml_tools/ETL_engineering/_imprimir.py +24 -0
- ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
- ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
- ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
- ml_tools/GUI_tools/_imprimir.py +12 -0
- ml_tools/IO_tools/_IO_loggers.py +235 -0
- ml_tools/IO_tools/_IO_save_load.py +151 -0
- ml_tools/IO_tools/_IO_utils.py +140 -0
- ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
- ml_tools/IO_tools/_imprimir.py +14 -0
- ml_tools/MICE/_MICE_imputation.py +132 -0
- ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
- ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
- ml_tools/MICE/_imprimir.py +11 -0
- ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
- ml_tools/ML_callbacks/_base.py +101 -0
- ml_tools/ML_callbacks/_checkpoint.py +232 -0
- ml_tools/ML_callbacks/_early_stop.py +208 -0
- ml_tools/ML_callbacks/_imprimir.py +12 -0
- ml_tools/ML_callbacks/_scheduler.py +197 -0
- ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
- ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
- ml_tools/ML_chain/_dragon_chain.py +140 -0
- ml_tools/ML_chain/_imprimir.py +11 -0
- ml_tools/ML_configuration/__init__.py +90 -0
- ml_tools/ML_configuration/_base_model_config.py +69 -0
- ml_tools/ML_configuration/_finalize.py +366 -0
- ml_tools/ML_configuration/_imprimir.py +47 -0
- ml_tools/ML_configuration/_metrics.py +593 -0
- ml_tools/ML_configuration/_models.py +206 -0
- ml_tools/ML_configuration/_training.py +124 -0
- ml_tools/ML_datasetmaster/__init__.py +28 -0
- ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
- ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
- ml_tools/ML_datasetmaster/_imprimir.py +15 -0
- ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
- ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
- ml_tools/ML_evaluation/__init__.py +53 -0
- ml_tools/ML_evaluation/_classification.py +629 -0
- ml_tools/ML_evaluation/_feature_importance.py +409 -0
- ml_tools/ML_evaluation/_imprimir.py +25 -0
- ml_tools/ML_evaluation/_loss.py +92 -0
- ml_tools/ML_evaluation/_regression.py +273 -0
- ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
- ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
- ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
- ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
- ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
- ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
- ml_tools/ML_finalize_handler/__init__.py +10 -0
- ml_tools/ML_finalize_handler/_imprimir.py +8 -0
- ml_tools/ML_inference/__init__.py +22 -0
- ml_tools/ML_inference/_base_inference.py +166 -0
- ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
- ml_tools/ML_inference/_dragon_inference.py +332 -0
- ml_tools/ML_inference/_imprimir.py +11 -0
- ml_tools/ML_inference/_multi_inference.py +180 -0
- ml_tools/ML_inference_sequence/__init__.py +10 -0
- ml_tools/ML_inference_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
- ml_tools/ML_inference_vision/__init__.py +10 -0
- ml_tools/ML_inference_vision/_imprimir.py +8 -0
- ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
- ml_tools/ML_models/__init__.py +32 -0
- ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
- ml_tools/ML_models/_base_mlp_attention.py +198 -0
- ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
- ml_tools/ML_models/_dragon_tabular.py +248 -0
- ml_tools/ML_models/_imprimir.py +18 -0
- ml_tools/ML_models/_mlp_attention.py +134 -0
- ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
- ml_tools/ML_models_sequence/__init__.py +10 -0
- ml_tools/ML_models_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
- ml_tools/ML_models_vision/__init__.py +29 -0
- ml_tools/ML_models_vision/_base_wrapper.py +254 -0
- ml_tools/ML_models_vision/_image_classification.py +182 -0
- ml_tools/ML_models_vision/_image_segmentation.py +108 -0
- ml_tools/ML_models_vision/_imprimir.py +16 -0
- ml_tools/ML_models_vision/_object_detection.py +135 -0
- ml_tools/ML_optimization/__init__.py +21 -0
- ml_tools/ML_optimization/_imprimir.py +13 -0
- ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
- ml_tools/ML_optimization/_single_dragon.py +203 -0
- ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
- ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
- ml_tools/ML_scaler/__init__.py +10 -0
- ml_tools/ML_scaler/_imprimir.py +8 -0
- ml_tools/ML_trainer/__init__.py +20 -0
- ml_tools/ML_trainer/_base_trainer.py +297 -0
- ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
- ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
- ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
- ml_tools/ML_trainer/_imprimir.py +10 -0
- ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
- ml_tools/ML_utilities/_artifact_finder.py +382 -0
- ml_tools/ML_utilities/_imprimir.py +16 -0
- ml_tools/ML_utilities/_inspection.py +325 -0
- ml_tools/ML_utilities/_train_tools.py +205 -0
- ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
- ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
- ml_tools/ML_vision_transformers/_imprimir.py +14 -0
- ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
- ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
- ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
- ml_tools/PSO_optimization/_imprimir.py +10 -0
- ml_tools/SQL/__init__.py +7 -0
- ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
- ml_tools/SQL/_imprimir.py +8 -0
- ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
- ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
- ml_tools/VIF/_imprimir.py +10 -0
- ml_tools/_core/__init__.py +7 -1
- ml_tools/_core/_logger.py +8 -18
- ml_tools/_core/_schema_load_ops.py +43 -0
- ml_tools/_core/_script_info.py +2 -2
- ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
- ml_tools/data_exploration/_analysis.py +214 -0
- ml_tools/data_exploration/_cleaning.py +566 -0
- ml_tools/data_exploration/_features.py +583 -0
- ml_tools/data_exploration/_imprimir.py +32 -0
- ml_tools/data_exploration/_plotting.py +487 -0
- ml_tools/data_exploration/_schema_ops.py +176 -0
- ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
- ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
- ml_tools/ensemble_evaluation/_imprimir.py +14 -0
- ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
- ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
- ml_tools/ensemble_inference/_imprimir.py +9 -0
- ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
- ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
- ml_tools/ensemble_learning/_imprimir.py +10 -0
- ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
- ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
- ml_tools/excel_handler/_imprimir.py +13 -0
- ml_tools/{keys.py → keys/__init__.py} +4 -1
- ml_tools/keys/_imprimir.py +11 -0
- ml_tools/{_core → keys}/_keys.py +2 -0
- ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
- ml_tools/math_utilities/_imprimir.py +11 -0
- ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
- ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
- ml_tools/optimization_tools/_imprimir.py +13 -0
- ml_tools/optimization_tools/_optimization_bounds.py +236 -0
- ml_tools/optimization_tools/_optimization_plots.py +218 -0
- ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
- ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
- ml_tools/path_manager/_imprimir.py +15 -0
- ml_tools/path_manager/_path_tools.py +346 -0
- ml_tools/plot_fonts/__init__.py +8 -0
- ml_tools/plot_fonts/_imprimir.py +8 -0
- ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
- ml_tools/schema/__init__.py +15 -0
- ml_tools/schema/_feature_schema.py +223 -0
- ml_tools/schema/_gui_schema.py +191 -0
- ml_tools/schema/_imprimir.py +10 -0
- ml_tools/{serde.py → serde/__init__.py} +4 -2
- ml_tools/serde/_imprimir.py +10 -0
- ml_tools/{_core → serde}/_serde.py +3 -8
- ml_tools/{utilities.py → utilities/__init__.py} +11 -6
- ml_tools/utilities/_imprimir.py +18 -0
- ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
- ml_tools/utilities/_utility_tools.py +192 -0
- dragon_ml_toolbox-19.14.0.dist-info/RECORD +0 -111
- ml_tools/ML_chaining_inference.py +0 -8
- ml_tools/ML_configuration.py +0 -86
- ml_tools/ML_configuration_pytab.py +0 -14
- ml_tools/ML_datasetmaster.py +0 -10
- ml_tools/ML_evaluation.py +0 -16
- ml_tools/ML_evaluation_multi.py +0 -12
- ml_tools/ML_finalize_handler.py +0 -8
- ml_tools/ML_inference.py +0 -12
- ml_tools/ML_models.py +0 -14
- ml_tools/ML_models_advanced.py +0 -14
- ml_tools/ML_models_pytab.py +0 -14
- ml_tools/ML_optimization.py +0 -14
- ml_tools/ML_optimization_pareto.py +0 -8
- ml_tools/ML_scaler.py +0 -8
- ml_tools/ML_sequence_datasetmaster.py +0 -8
- ml_tools/ML_sequence_evaluation.py +0 -10
- ml_tools/ML_sequence_inference.py +0 -8
- ml_tools/ML_sequence_models.py +0 -8
- ml_tools/ML_trainer.py +0 -12
- ml_tools/ML_vision_datasetmaster.py +0 -12
- ml_tools/ML_vision_evaluation.py +0 -10
- ml_tools/ML_vision_inference.py +0 -8
- ml_tools/ML_vision_models.py +0 -18
- ml_tools/SQL.py +0 -8
- ml_tools/_core/_ETL_cleaning.py +0 -694
- ml_tools/_core/_IO_tools.py +0 -498
- ml_tools/_core/_ML_callbacks.py +0 -702
- ml_tools/_core/_ML_configuration.py +0 -1332
- ml_tools/_core/_ML_configuration_pytab.py +0 -102
- ml_tools/_core/_ML_evaluation.py +0 -867
- ml_tools/_core/_ML_evaluation_multi.py +0 -544
- ml_tools/_core/_ML_inference.py +0 -646
- ml_tools/_core/_ML_models.py +0 -668
- ml_tools/_core/_ML_models_pytab.py +0 -693
- ml_tools/_core/_ML_trainer.py +0 -2323
- ml_tools/_core/_ML_utilities.py +0 -886
- ml_tools/_core/_ML_vision_models.py +0 -644
- ml_tools/_core/_data_exploration.py +0 -1909
- ml_tools/_core/_optimization_tools.py +0 -493
- ml_tools/_core/_schema.py +0 -359
- ml_tools/plot_fonts.py +0 -8
- ml_tools/schema.py +0 -12
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
ml_tools/data_exploration/_features.py (new file)

@@ -0,0 +1,583 @@

import pandas as pd
from pandas.api.types import is_numeric_dtype, is_object_dtype
import numpy as np
from typing import Any, Optional, Union
import re

from .._core import get_logger


_LOGGER = get_logger("Data Exploration: Feature Ops")


__all__ = [
    "split_features_targets",
    "split_continuous_binary",
    "split_continuous_categorical_targets",
    "encode_categorical_features",
    "reconstruct_one_hot",
    "reconstruct_binary",
    "reconstruct_multibinary",
]

def split_features_targets(df: pd.DataFrame, targets: list[str]):
    """
    Splits a DataFrame's columns into features and targets.

    Args:
        df (pd.DataFrame): Pandas DataFrame containing the dataset.
        targets (list[str]): List of column names to be treated as target variables.

    Returns:
        tuple: A tuple containing:
            - pd.DataFrame: Features dataframe.
            - pd.DataFrame: Targets dataframe.

    Prints:
        - Shape of the original dataframe.
        - Shape of the features dataframe.
        - Shape of the targets dataframe.
    """
    # 1. Validate that all requested target columns exist
    missing_targets = [t for t in targets if t not in df.columns]
    if missing_targets:
        _LOGGER.error(f"Target columns not found in DataFrame: {missing_targets}")
        raise ValueError()

    # 2. Perform the split
    df_targets = df[targets]
    df_features = df.drop(columns=targets)

    # 3. Print summary
    print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")

    return df_features, df_targets

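A minimal usage sketch for the function above. The toy DataFrame and the top-level import path are assumptions (the subpackage's `__init__.py` is expected to re-export these helpers, per the `_GRUPOS` listing in the second hunk below):

```python
import pandas as pd

from ml_tools.data_exploration import split_features_targets  # assumed re-export

df = pd.DataFrame({
    "age": [25, 32, 47],
    "income": [40_000, 52_000, 61_000],
    "churn": [0, 1, 0],  # target column
})

# Returns (features, targets) and prints the three shapes as a side effect.
df_features, df_targets = split_features_targets(df, targets=["churn"])
```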
def split_continuous_binary(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split DataFrame into two DataFrames: one with continuous columns, one with binary columns.
    Normalize binary values like 0.0/1.0 to 0/1 if detected.

    Parameters:
        df (pd.DataFrame): Input DataFrame with only numeric columns.

    Returns:
        Tuple(pd.DataFrame, pd.DataFrame): (continuous_columns_df, binary_columns_df)

    Raises:
        TypeError: If any column is not numeric.
    """
    if not all(np.issubdtype(dtype, np.number) for dtype in df.dtypes):
        _LOGGER.error("All columns must be numeric (int or float).")
        raise TypeError()

    binary_cols = []
    continuous_cols = []

    for col in df.columns:
        series = df[col]
        unique_values = set(series[~series.isna()].unique())

        if unique_values.issubset({0, 1}):
            binary_cols.append(col)
        elif unique_values.issubset({0.0, 1.0}):
            df[col] = df[col].apply(lambda x: 0 if x == 0.0 else (1 if x == 1.0 else x))
            binary_cols.append(col)
        else:
            continuous_cols.append(col)

    binary_cols.sort()

    df_cont = df[continuous_cols]
    df_bin = df[binary_cols]

    print(f"Continuous columns shape: {df_cont.shape}")
    print(f"Binary columns shape: {df_bin.shape}")

    return df_cont, df_bin  # type: ignore

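A quick sketch of `split_continuous_binary` on hypothetical all-numeric data; a non-numeric column would raise `TypeError`:

```python
import pandas as pd

from ml_tools.data_exploration import split_continuous_binary  # assumed re-export

df = pd.DataFrame({
    "height": [1.70, 1.82, 1.65],  # continuous
    "smoker": [0.0, 1.0, 0.0],     # detected as binary
})

df_cont, df_bin = split_continuous_binary(df)
# df_cont contains 'height'; df_bin contains 'smoker'
```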
def split_continuous_categorical_targets(
    df: pd.DataFrame,
    categorical_cols: list[str],
    target_cols: list[str]
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Splits the DataFrame into three subsets: Continuous, Categorical, and Targets.

    Logic:
        1. Categorical and Target columns are explicitly provided.
        2. Continuous columns are inferred (All columns - Categorical - Targets).
        3. Continuous columns should be numeric and have more than 2 unique values.

    Args:
        df (pd.DataFrame): Input DataFrame.
        categorical_cols (list[str]): List of categorical column names.
        target_cols (list[str]): List of target column names.

    Returns:
        Tuple (pd.DataFrame, pd.DataFrame, pd.DataFrame):
            (df_continuous, df_categorical, df_targets)
    """
    # Set operations to find inferred continuous columns
    all_cols = set(df.columns)
    cat_set = set(categorical_cols)
    tgt_set = set(target_cols)

    # Basic input validation
    missing_cat = cat_set - all_cols
    missing_tgt = tgt_set - all_cols
    if missing_cat:
        _LOGGER.error(f"Categorical columns not found in DataFrame: {missing_cat}")
        raise ValueError()
    if missing_tgt:
        _LOGGER.error(f"Target columns not found in DataFrame: {missing_tgt}")
        raise ValueError()

    # Identify continuous columns
    inferred_continuous = list(all_cols - cat_set - tgt_set)
    inferred_continuous.sort()  # Ensure deterministic order

    # Validate inferred continuous columns
    for col in inferred_continuous:
        series = df[col]

        # Check 1: Must be numeric
        if not is_numeric_dtype(series):
            _LOGGER.warning(f"Column '{col}' was inferred as continuous but is not numeric (dtype: {series.dtype}).")

        # Check 2: Must have > 2 unique values (cardinality check)
        # We drop NA to count actual unique values
        unique_count = series.dropna().nunique()
        if unique_count <= 2:
            _LOGGER.warning(f"Column '{col}' was inferred as continuous but has only {unique_count} unique value(s). It might be binary or constant.")

    # Split DataFrames
    df_continuous = df[inferred_continuous]
    df_categorical = df[categorical_cols]  # Preserve user order
    df_targets = df[target_cols]  # Preserve user order

    _LOGGER.info(
        f"Split complete.\n"
        f" - Continuous: {df_continuous.shape}\n"
        f" - Categorical: {df_categorical.shape}\n"
        f" - Targets: {df_targets.shape}"
    )

    return df_continuous, df_categorical, df_targets

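A sketch of the three-way split on hypothetical data; the continuous set is whatever remains after removing the declared categorical and target columns:

```python
import pandas as pd

from ml_tools.data_exploration import split_continuous_categorical_targets  # assumed re-export

df = pd.DataFrame({
    "temperature": [20.5, 21.0, 19.8],  # inferred as continuous
    "city": ["A", "B", "A"],
    "demand": [100, 120, 90],
})

df_cont, df_cat, df_tgt = split_continuous_categorical_targets(
    df, categorical_cols=["city"], target_cols=["demand"]
)
```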
def encode_categorical_features(
    df_categorical: pd.DataFrame,
    encode_nulls: bool,
    null_label: str = "Other",
    verbose: int = 1
) -> tuple[pd.DataFrame, dict[str, dict[str, int]]]:
    """
    Encodes all columns in the provided DataFrame as categorical features using Label Encoding.

    This function generates a unique integer mapping for the values in each column.

    Args:
        df_categorical (pd.DataFrame): DataFrame containing ONLY the categorical columns to encode.
        encode_nulls (bool):
            - If True, Nulls (NaN/None) are encoded as a distinct category (0). Real categories will start from 1.
            - If False, Nulls are left as NaN. Real categories start from 0.
        null_label (str): Label used for the null category in the returned mapping if `encode_nulls` is True.
        verbose (int):
            - 0: Error level only.
            - 1: Info and Warning levels.
            - 2: Debug/Print everything (includes per-column summary).

    Returns:
        Tuple (pd.DataFrame, Dict):
            - pd.DataFrame: A new DataFrame with all columns encoded as integers.
            - Dict[str, Dict[str, int]]: A dictionary where keys are column names and values are the value-to-integer mappings.
    """
    df_encoded = df_categorical.copy()
    mappings: dict[str, dict[str, int]] = {}

    cols_to_process = df_encoded.columns.tolist()

    if verbose >= 1:
        _LOGGER.info(f"Encoding {len(cols_to_process)} categorical column(s).")

    for col_name in cols_to_process:
        has_nulls = df_encoded[col_name].isnull().any()

        # Get unique values (excluding nulls) to determine categories
        # Sorting ensures deterministic integer assignment
        raw_unique_values = df_encoded[col_name].dropna().unique()
        categories = sorted([str(cat) for cat in raw_unique_values])

        # --- Check for constant columns ---
        # Note: If encode_nulls=True and we have nulls, it's effectively binary (Null vs Value), so we keep it.
        is_effectively_binary = encode_nulls and has_nulls
        if len(categories) <= 1 and not is_effectively_binary:
            if verbose >= 1:
                _LOGGER.warning(f"Column '{col_name}' has only {len(categories)} unique non-null value(s).")

        # --- Encoding Logic ---
        if encode_nulls and has_nulls:
            # Mode A: Encode Nulls.
            # Null -> 0
            # Categories -> 1, 2, 3...

            mapping = {category: i + 1 for i, category in enumerate(categories)}

            # 1. Map existing non-null values (cast to str first to match mapping keys)
            mapped_series = df_encoded[col_name].astype(str).map(mapping)

            # 2. Fill NaNs with 0
            df_encoded[col_name] = mapped_series.fillna(0).astype(int)

            # --- Handle Mapping Dict Collision ---
            current_null_label = null_label
            if current_null_label in mapping:
                current_null_label = "__NULL__"
                if verbose >= 1:
                    _LOGGER.warning(f"Collision in '{col_name}': '{null_label}' is a real category. Using '{current_null_label}' for nulls.")

            # Add null key to user mapping
            user_mapping = {**mapping, current_null_label: 0}
            mappings[col_name] = user_mapping

        else:
            # Mode B: Ignore Nulls (preserve them as NaN) or No Nulls exist.
            # Categories -> 0, 1, 2...

            mapping = {category: i for i, category in enumerate(categories)}

            # Map values.
            # Note: map() on a Series with NaNs will result in NaNs for those positions.
            # use 'Int64' (capital I) to handle Integers with <NA> values cleanly.
            df_encoded[col_name] = df_encoded[col_name].astype(str).map(mapping).astype("Int64")

            mappings[col_name] = mapping

        if verbose >= 2:
            cardinality = len(mappings[col_name])
            print(f" - Encoded '{col_name}': {cardinality} categories.")

    return df_encoded, mappings

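To illustrate the two null-handling modes described in the docstring, a hedged sketch with a hypothetical column containing a missing value:

```python
import pandas as pd

from ml_tools.data_exploration import encode_categorical_features  # assumed re-export

df_cat = pd.DataFrame({"color": ["red", "blue", None, "red"]})

# Mode A: nulls become category 0, real categories start at 1.
encoded_a, mappings = encode_categorical_features(df_cat, encode_nulls=True)
# mappings -> {'color': {'blue': 1, 'red': 2, 'Other': 0}}
# encoded_a['color'] -> 2, 1, 0, 2

# Mode B: nulls are preserved as <NA>, real categories start at 0.
encoded_b, _ = encode_categorical_features(df_cat, encode_nulls=False)
# encoded_b['color'] -> 1, 0, <NA>, 1 (nullable Int64 dtype)
```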
def reconstruct_one_hot(
    df: pd.DataFrame,
    features_to_reconstruct: list[Union[str, tuple[str, Optional[str]]]],
    separator: str = '_',
    baseline_category_name: Optional[str] = "Other",
    drop_original: bool = True,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Reconstructs original categorical columns from a one-hot encoded DataFrame.

    This function identifies groups of one-hot encoded columns based on a common
    prefix (base feature name) and a separator. It then collapses each group
    into a single column containing the categorical value.

    Args:
        df (pd.DataFrame):
            The input DataFrame with one-hot encoded columns.
        features_to_reconstruct (List[str | Tuple[str, str | None]]):
            A list defining the features to reconstruct. This list can contain:

            - A string: (e.g., "Color")
                This reconstructs the feature 'Color' and assumes all-zero rows represent the baseline category ("Other" by default).
            - A tuple: (e.g., ("Pet", "Dog"))
                This reconstructs 'Pet' and maps all-zero rows to the baseline category "Dog".
            - A tuple with None: (e.g., ("Size", None))
                This reconstructs 'Size' and maps all-zero rows to NaN.
            Example:
                [
                    "Mood",            # All-zeros -> "Other"
                    ("Color", "Red"),  # All-zeros -> "Red"
                    ("Size", None)     # All-zeros -> NaN
                ]
        separator (str):
            The character separating the base name from the categorical value in
            the column names (e.g., '_' in 'B_a').
        baseline_category_name (str | None):
            The baseline category name to use by default if it is not explicitly provided.
        drop_original (bool):
            If True, the original one-hot encoded columns will be dropped from
            the returned DataFrame.
        verbose (bool):
            If True, prints the details of each reconstruction.

    Returns:
        pd.DataFrame:
            A new DataFrame with the specified one-hot encoded features
            reconstructed into single categorical columns.

    <br>

    ## Note:

    This function is designed to be robust, but users should be aware of two key edge cases:

    1. **Ambiguous Base Feature Prefixes**: If the base names in `features_to_reconstruct` contain names where one is a prefix of another (e.g., `['feat', 'feat_ext']`), the order is critical. The function will match columns greedily. To avoid incorrect grouping, always list the **most specific base names first** (e.g., `['feat_ext', 'feat']`).

    2. **Malformed One-Hot Data**: If a row contains multiple `1`s within the same feature group (e.g., both `B_a` and `B_c` are `1`), the function will not raise an error. It uses `.idxmax()`, which returns the first column that contains the maximum value. This means it will silently select the first category it encounters and ignore the others, potentially masking an upstream data issue.
    """
    if not isinstance(df, pd.DataFrame):
        _LOGGER.error("Input must be a pandas DataFrame.")
        raise TypeError()

    if not (baseline_category_name is None or isinstance(baseline_category_name, str)):
        _LOGGER.error("The baseline_category must be None or a string.")
        raise TypeError()

    new_df = df.copy()
    all_ohe_cols_to_drop = []
    reconstructed_count = 0

    # --- 1. Parse and validate the reconstruction config ---
    # This normalizes the input into a clean {base_name: baseline_val} dict
    reconstruction_config: dict[str, Optional[str]] = {}
    try:
        for item in features_to_reconstruct:
            if isinstance(item, str):
                # Case 1: "Color"
                base_name = item
                baseline_val = baseline_category_name
            elif isinstance(item, tuple) and len(item) == 2:
                # Case 2: ("Pet", "dog") or ("Size", None)
                base_name, baseline_val = item
                if not (isinstance(base_name, str) and (isinstance(baseline_val, str) or baseline_val is None)):
                    _LOGGER.error(f"Invalid tuple format for '{item}'. Must be (str, str|None).")
                    raise ValueError()
            else:
                _LOGGER.error(f"Invalid item '{item}'. Must be str or (str, str|None) tuple.")
                raise ValueError()

            if base_name in reconstruction_config and verbose:
                _LOGGER.warning(f"Duplicate entry for '{base_name}' found. Using the last provided configuration.")

            reconstruction_config[base_name] = baseline_val

    except Exception as e:
        _LOGGER.error(f"Failed to parse 'features_to_reconstruct' argument: {e}")
        raise ValueError("Invalid configuration for 'features_to_reconstruct'.") from e

    _LOGGER.info(f"Attempting to reconstruct {len(reconstruction_config)} one-hot encoded feature(s).")

    # Main logic
    for base_name, baseline_category in reconstruction_config.items():
        # Regex to find all columns belonging to this base feature.
        pattern = f"^{re.escape(base_name)}{re.escape(separator)}"

        # Find matching columns
        ohe_cols = [col for col in df.columns if re.match(pattern, col)]

        if not ohe_cols:
            _LOGGER.warning(f"No one-hot encoded columns found for base feature '{base_name}'. Skipping.")
            continue

        # For each row, find the column name with the maximum value (which is 1)
        reconstructed_series = new_df[ohe_cols].idxmax(axis=1)  # type: ignore

        # Extract the categorical value (the suffix) from the column name
        # Use n=1 in split to handle cases where the category itself might contain the separator
        new_column_values = reconstructed_series.str.split(separator, n=1).str[1]  # type: ignore

        # Handle rows where all OHE columns were 0 (e.g., original value was NaN or a dropped baseline).
        all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0  # type: ignore

        if baseline_category is not None:
            # A baseline category was provided
            new_column_values.loc[all_zero_mask] = baseline_category
        else:
            # No baseline provided: assign NaN
            new_column_values.loc[all_zero_mask] = np.nan  # type: ignore

        if verbose:
            print(f" - Mapped 'all-zero' rows for '{base_name}' to baseline: '{baseline_category}'.")

        # Assign the new reconstructed column to the DataFrame
        new_df[base_name] = new_column_values

        all_ohe_cols_to_drop.extend(ohe_cols)
        reconstructed_count += 1
        if verbose:
            print(f" - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")

    # Cleanup
    if drop_original and all_ohe_cols_to_drop:
        # Drop the original OHE columns, ensuring no duplicates in the drop list
        unique_cols_to_drop = list(set(all_ohe_cols_to_drop))
        new_df.drop(columns=unique_cols_to_drop, inplace=True)
        _LOGGER.info(f"Dropped {len(unique_cols_to_drop)} original one-hot encoded columns.")

    _LOGGER.info(f"Successfully reconstructed {reconstructed_count} feature(s).")

    return new_df

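A sketch of `reconstruct_one_hot` with hypothetical one-hot columns, showing how an all-zero row (the dropped category in a drop-first encoding) is mapped to the supplied baseline:

```python
import pandas as pd

from ml_tools.data_exploration import reconstruct_one_hot  # assumed re-export

df = pd.DataFrame({
    "Color_red":  [1, 0, 0],
    "Color_blue": [0, 1, 0],  # the third row is all-zero
})

out = reconstruct_one_hot(df, features_to_reconstruct=[("Color", "green")])
# out['Color'] -> 'red', 'blue', 'green'; the OHE columns are dropped by default
```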
def reconstruct_binary(
    df: pd.DataFrame,
    reconstruction_map: dict[str, tuple[str, Any, Any]],
    drop_original: bool = True,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Reconstructs new categorical columns from existing binary (0/1) columns.

    Used to reverse a binary encoding by mapping 0 and 1 back to
    descriptive categorical labels.

    Args:
        df (pd.DataFrame):
            The input DataFrame.
        reconstruction_map (Dict[str, Tuple[str, Any, Any]]):
            A dictionary defining the reconstructions.
            Format:
                { "new_col_name": ("source_col_name", "label_for_0", "label_for_1") }
            Example:
                {
                    "Sex": ("Sex_male", "Female", "Male"),
                    "Smoker": ("Is_Smoker", "No", "Yes")
                }
        drop_original (bool):
            If True, the original binary source columns (e.g., "Sex_male")
            will be dropped from the returned DataFrame.
        verbose (bool):
            If True, prints the details of each reconstruction.

    Returns:
        pd.DataFrame:
            A new DataFrame with the reconstructed categorical columns.

    Raises:
        TypeError: If `df` is not a pandas DataFrame.
        ValueError: If `reconstruction_map` is not a dictionary or a
            configuration is invalid (e.g., column name collision).

    Notes:
        - The function operates on a copy of the DataFrame.
        - Rows with `NaN` in the source column will have `NaN` in the
          new column.
        - Values in the source column other than 0 or 1 (e.g., 2) will
          result in `NaN` in the new column.
    """
    if not isinstance(df, pd.DataFrame):
        _LOGGER.error("Input must be a pandas DataFrame.")
        raise TypeError()

    if not isinstance(reconstruction_map, dict):
        _LOGGER.error("`reconstruction_map` must be a dictionary with the required format.")
        raise ValueError()

    new_df = df.copy()
    source_cols_to_drop: list[str] = []
    reconstructed_count = 0

    _LOGGER.info(f"Attempting to reconstruct {len(reconstruction_map)} binary feature(s).")

    for new_col_name, config in reconstruction_map.items():

        # --- 1. Validation ---
        if not (isinstance(config, tuple) and len(config) == 3):
            _LOGGER.error(f"Config for '{new_col_name}' is invalid. Must be a 3-item tuple.")
            raise ValueError()

        source_col, label_for_0, label_for_1 = config

        if source_col not in new_df.columns:
            _LOGGER.error(f"Source column '{source_col}' for new column '{new_col_name}' not found.")
            raise ValueError()

        if new_col_name in new_df.columns and new_col_name != source_col and verbose:
            _LOGGER.warning(f"New column '{new_col_name}' already exists and will be overwritten.")

        # --- 2. Reconstruction ---
        mapping_dict = {0: label_for_0, 1: label_for_1}
        new_df[new_col_name] = new_df[source_col].map(mapping_dict)

        # --- 3. Logging/Tracking ---
        # Only mark source for dropping if it's NOT the same as the new column
        if source_col != new_col_name:
            source_cols_to_drop.append(source_col)

        reconstructed_count += 1
        if verbose:
            print(f" - Reconstructed '{new_col_name}' from '{source_col}' (0='{label_for_0}', 1='{label_for_1}').")

    # --- 4. Cleanup ---
    if drop_original and source_cols_to_drop:
        unique_cols_to_drop = list(set(source_cols_to_drop))
        new_df.drop(columns=unique_cols_to_drop, inplace=True)
        _LOGGER.info(f"Dropped {len(unique_cols_to_drop)} original binary source column(s).")

    _LOGGER.info(f"Successfully reconstructed {reconstructed_count} feature(s).")

    return new_df

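A sketch of `reconstruct_binary` on a hypothetical 0/1 column:

```python
import pandas as pd

from ml_tools.data_exploration import reconstruct_binary  # assumed re-export

df = pd.DataFrame({"Sex_male": [1, 0, 1]})

out = reconstruct_binary(
    df,
    reconstruction_map={"Sex": ("Sex_male", "Female", "Male")},  # (source, label_for_0, label_for_1)
)
# out['Sex'] -> 'Male', 'Female', 'Male'; 'Sex_male' is dropped by default
```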
def reconstruct_multibinary(
    df: pd.DataFrame,
    pattern: str,
    pos_label: str = "Yes",
    neg_label: str = "No",
    case_sensitive: bool = False,
    verbose: bool = True
) -> tuple[pd.DataFrame, list[str]]:
    """
    Identifies binary columns matching a regex pattern and converts their numeric
    values (0/1) into categorical string labels (e.g., "No"/"Yes").

    This allows mass-labeling of binary features so they are treated as proper
    categorical variables with meaningful keys during subsequent encoding steps.

    Args:
        df (pd.DataFrame): The input DataFrame.
        pattern (str): Regex pattern to identify the group of binary columns.
        pos_label (str): The label to assign to 1 or True (default "Yes").
        neg_label (str): The label to assign to 0 or False (default "No").
        case_sensitive (bool): If True, regex matching is case-sensitive.
        verbose (bool): If True, prints a summary of the operation.

    Returns:
        Tuple(pd.DataFrame, List[str]):
            - A new DataFrame with the matched columns converted to strings.
            - A list of the column names that were modified.
    """
    if not isinstance(df, pd.DataFrame):
        _LOGGER.error("Input must be a pandas DataFrame.")
        raise TypeError()

    new_df = df.copy()

    # 1. Find columns matching the regex
    mask = new_df.columns.str.contains(pattern, case=case_sensitive, regex=True)
    target_columns = new_df.columns[mask].to_list()

    if not target_columns:
        _LOGGER.warning(f"No columns found matching pattern '{pattern}'. Returning original DataFrame.")
        return new_df, list()

    # 2. Define robust mapping (handles ints, floats, and booleans)
    # Note: Any value not in this map will become NaN
    mapping_dict = {
        0: neg_label,
        0.0: neg_label,
        False: neg_label,
        1: pos_label,
        1.0: pos_label,
        True: pos_label
    }

    converted_count = 0

    # 3. Apply mapping
    for col in target_columns:
        # Only map columns that are numeric/boolean or object; other dtypes are left untouched
        if is_numeric_dtype(new_df[col]) or is_object_dtype(new_df[col]):
            # We cast to object implicitly by mapping to strings
            new_df[col] = new_df[col].map(mapping_dict)
            converted_count += 1

    if verbose:
        _LOGGER.info(f"Reconstructed {converted_count} binary columns matching '{pattern}'.")

    return new_df, target_columns
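And a sketch of `reconstruct_multibinary`, mass-labeling hypothetical flag columns selected by regex:

```python
import pandas as pd

from ml_tools.data_exploration import reconstruct_multibinary  # assumed re-export

df = pd.DataFrame({
    "has_garage": [1, 0, 1],
    "has_pool":   [0, 0, 1],
    "price":      [250, 180, 320],  # not matched by the pattern, left untouched
})

out, changed = reconstruct_multibinary(df, pattern=r"^has_")
# changed -> ['has_garage', 'has_pool']; out['has_garage'] -> 'Yes', 'No', 'Yes'
```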
ml_tools/data_exploration/_imprimir.py (new file)

@@ -0,0 +1,32 @@

from .._core import _imprimir_disponibles

_GRUPOS = [
    "summarize_dataframe",
    "show_null_columns",
    "drop_constant_columns",
    "drop_rows_with_missing_data",
    "drop_columns_with_missing_data",
    "drop_macro",
    "clean_column_names",
    "plot_value_distributions",
    "split_features_targets",
    "split_continuous_binary",
    "split_continuous_categorical_targets",
    "encode_categorical_features",
    "clip_outliers_single",
    "clip_outliers_multi",
    "drop_outlier_samples",
    "plot_continuous_vs_target",
    "plot_categorical_vs_target",
    "plot_correlation_heatmap",
    "finalize_feature_schema",
    "apply_feature_schema",
    "match_and_filter_columns_by_regex",
    "standardize_percentages",
    "reconstruct_one_hot",
    "reconstruct_binary",
    "reconstruct_multibinary",
]

def info():
    _imprimir_disponibles(_GRUPOS)
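The `_imprimir` modules back a discoverability helper shared across the refactored subpackages: `info()` prints the public names listed in `_GRUPOS`. A hedged sketch, assuming the subpackage exposes `info` at the top level:

```python
from ml_tools import data_exploration  # assumes info() is re-exported by the subpackage

data_exploration.info()  # prints the function names listed in _GRUPOS
```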