dragon-ml-toolbox 12.8.0__tar.gz → 12.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-12.8.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-12.9.0}/PKG-INFO +1 -1
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/data_exploration.py +88 -23
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/pyproject.toml +1 -1
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/LICENSE +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/README.md +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ETL_cleaning.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ETL_engineering.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/GUI_tools.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/MICE_imputation.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_datasetmaster.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_evaluation.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_evaluation_multi.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_inference.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_models.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_optimization.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_scaler.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_simple_optimization.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_trainer.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_utilities.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/PSO_optimization.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/SQL.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/_logger.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/_script_info.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/constants.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/custom_logger.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ensemble_evaluation.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ensemble_inference.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/keys.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/math_utilities.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/optimization_tools.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/path_manager.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/serde.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/utilities.py +0 -0
- {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/setup.cfg +0 -0
|
@@ -346,6 +346,7 @@ def encode_categorical_features(
|
|
|
346
346
|
df: pd.DataFrame,
|
|
347
347
|
columns_to_encode: List[str],
|
|
348
348
|
encode_nulls: bool,
|
|
349
|
+
null_label: str = "Other",
|
|
349
350
|
split_resulting_dataset: bool = True,
|
|
350
351
|
verbose: bool = True
|
|
351
352
|
) -> Tuple[Dict[str, Dict[str, int]], pd.DataFrame, Optional[pd.DataFrame]]:
|
|
@@ -359,13 +360,15 @@ def encode_categorical_features(
|
|
|
359
360
|
Args:
|
|
360
361
|
df (pd.DataFrame): The input DataFrame.
|
|
361
362
|
columns_to_encode (List[str]): A list of column names to be encoded.
|
|
362
|
-
encode_nulls (bool):
|
|
363
|
-
|
|
364
|
-
If False, Nulls are ignored and categories start from 0.
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
363
|
+
encode_nulls (bool):
|
|
364
|
+
- If True, encodes Null values as a distinct category 'null_label' with a value of 0. Other categories start from 1.
|
|
365
|
+
- If False, Nulls are ignored and categories start from 0.
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
null_label (str): Category to encode Nulls to if `encode_nulls` is True. If a name collision with `null_label` occurs, the fallback key will be "__NULL__".
|
|
369
|
+
split_resulting_dataset (bool):
|
|
370
|
+
- If True, returns two separate DataFrames, one with non-categorical columns and one with the encoded columns.
|
|
371
|
+
- If False, returns a single DataFrame with all columns.
|
|
369
372
|
verbose (bool): If True, prints encoding progress.
|
|
370
373
|
|
|
371
374
|
Returns:
|
|
@@ -376,6 +379,9 @@ def encode_categorical_features(
|
|
|
376
379
|
- pd.DataFrame: The original dataframe with or without encoded columns (see `split_resulting_dataset`).
|
|
377
380
|
|
|
378
381
|
- pd.DataFrame | None: If `split_resulting_dataset` is True, the encoded columns as a new dataframe.
|
|
382
|
+
|
|
383
|
+
## **Note:**
|
|
384
|
+
Use `encode_nulls=False` when encoding binary values with missing entries; otherwise, a malformed encoding will be returned silently.
|
|
379
385
|
"""
|
|
380
386
|
df_encoded = df.copy()
|
|
381
387
|
|
|
@@ -401,8 +407,16 @@ def encode_categorical_features(
|
|
|
401
407
|
mapped_series = df_encoded[col_name].astype(str).map(mapping)
|
|
402
408
|
df_encoded[col_name] = mapped_series.fillna(0).astype(int)
|
|
403
409
|
|
|
410
|
+
# --- Validate the null category ---
|
|
411
|
+
# Ensure the key for 0 doesn't collide with a real category.
|
|
412
|
+
if null_label in mapping.keys():
|
|
413
|
+
# COLLISION! null_label is a real category
|
|
414
|
+
original_label = null_label
|
|
415
|
+
null_label = "__NULL__" # fallback
|
|
416
|
+
_LOGGER.warning(f"Column '{col_name}': '{original_label}' is a real category. Mapping nulls (0) to '{null_label}' instead.")
|
|
417
|
+
|
|
404
418
|
# Create the complete user-facing map, including the null label (default "Other")
|
|
405
|
-
user_mapping = {**mapping,
|
|
419
|
+
user_mapping = {**mapping, null_label: 0}
|
|
406
420
|
mappings[col_name] = user_mapping
|
|
407
421
|
else:
|
|
408
422
|
# ignore nulls
|
|
@@ -1009,9 +1023,10 @@ def create_transformer_categorical_map(
|
|
|
1009
1023
|
|
|
1010
1024
|
def reconstruct_one_hot(
|
|
1011
1025
|
df: pd.DataFrame,
|
|
1012
|
-
|
|
1026
|
+
features_to_reconstruct: List[Union[str, Tuple[str, Optional[str]]]],
|
|
1013
1027
|
separator: str = '_',
|
|
1014
|
-
drop_original: bool = True
|
|
1028
|
+
drop_original: bool = True,
|
|
1029
|
+
verbose: bool = True
|
|
1015
1030
|
) -> pd.DataFrame:
|
|
1016
1031
|
"""
|
|
1017
1032
|
Reconstructs original categorical columns from a one-hot encoded DataFrame.
|
|
@@ -1023,9 +1038,20 @@ def reconstruct_one_hot(
|
|
|
1023
1038
|
Args:
|
|
1024
1039
|
df (pd.DataFrame):
|
|
1025
1040
|
The input DataFrame with one-hot encoded columns.
|
|
1026
|
-
|
|
1027
|
-
A list
|
|
1028
|
-
|
|
1041
|
+
features_to_reconstruct (List[str | Tuple[str, str | None]]):
|
|
1042
|
+
A list defining the features to reconstruct. This list can contain:
|
|
1043
|
+
|
|
1044
|
+
- A string: (e.g., "Color")
|
|
1045
|
+
This reconstructs the feature 'Color' and assumes all-zero rows represent missing data (NaN).
|
|
1046
|
+
- A tuple: (e.g., ("Pet", "Dog"))
|
|
1047
|
+
This reconstructs 'Pet' and maps all-zero rows to the baseline category "Dog" (handling 'drop_first=True' scenarios).
|
|
1048
|
+
- A tuple with None: (e.g., ("Size", None))
|
|
1049
|
+
This is explicit and behaves identically to just passing "Size". All-zero rows will be mapped to NaN.
|
|
1050
|
+
Example:
|
|
1051
|
+
[
|
|
1052
|
+
"Mood", # All-zeros -> NaN
|
|
1053
|
+
("Color", "Red"), # All-zeros -> "Red"
|
|
1054
|
+
]
|
|
1029
1055
|
separator (str):
|
|
1030
1056
|
The character separating the base name from the categorical value in
|
|
1031
1057
|
the column names (e.g., '_' in 'B_a').
|
|
@@ -1055,10 +1081,39 @@ def reconstruct_one_hot(
|
|
|
1055
1081
|
new_df = df.copy()
|
|
1056
1082
|
all_ohe_cols_to_drop = []
|
|
1057
1083
|
reconstructed_count = 0
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1084
|
+
|
|
1085
|
+
# --- 1. Parse and validate the reconstruction config ---
|
|
1086
|
+
# This normalizes the input into a clean {base_name: baseline_val} dict
|
|
1087
|
+
reconstruction_config: Dict[str, Optional[str]] = {}
|
|
1088
|
+
try:
|
|
1089
|
+
for item in features_to_reconstruct:
|
|
1090
|
+
if isinstance(item, str):
|
|
1091
|
+
# Case 1: "Color"
|
|
1092
|
+
base_name = item
|
|
1093
|
+
baseline_val = None
|
|
1094
|
+
elif isinstance(item, tuple) and len(item) == 2:
|
|
1095
|
+
# Case 2: ("Pet", "dog") or ("Size", None)
|
|
1096
|
+
base_name, baseline_val = item
|
|
1097
|
+
if not (isinstance(base_name, str) and (isinstance(baseline_val, str) or baseline_val is None)):
|
|
1098
|
+
_LOGGER.error(f"Invalid tuple format for '{item}'. Must be (str, str|None).")
|
|
1099
|
+
raise ValueError()
|
|
1100
|
+
else:
|
|
1101
|
+
_LOGGER.error(f"Invalid item '{item}'. Must be str or (str, str|None) tuple.")
|
|
1102
|
+
raise ValueError()
|
|
1103
|
+
|
|
1104
|
+
if base_name in reconstruction_config and verbose:
|
|
1105
|
+
_LOGGER.warning(f"Duplicate entry for '{base_name}' found. Using the last provided configuration.")
|
|
1106
|
+
|
|
1107
|
+
reconstruction_config[base_name] = baseline_val
|
|
1108
|
+
|
|
1109
|
+
except Exception as e:
|
|
1110
|
+
_LOGGER.error(f"Failed to parse 'features_to_reconstruct' argument: {e}")
|
|
1111
|
+
raise ValueError("Invalid configuration for 'features_to_reconstruct'.") from e
|
|
1112
|
+
|
|
1113
|
+
_LOGGER.info(f"Attempting to reconstruct {len(reconstruction_config)} one-hot encoded feature(s).")
|
|
1114
|
+
|
|
1115
|
+
# Main logic
|
|
1116
|
+
for base_name, baseline_category in reconstruction_config.items():
|
|
1062
1117
|
# Regex to find all columns belonging to this base feature.
|
|
1063
1118
|
pattern = f"^{re.escape(base_name)}{re.escape(separator)}"
|
|
1064
1119
|
|
|
@@ -1070,24 +1125,34 @@ def reconstruct_one_hot(
|
|
|
1070
1125
|
continue
|
|
1071
1126
|
|
|
1072
1127
|
# For each row, find the column name with the maximum value (which is 1)
|
|
1073
|
-
reconstructed_series = new_df[ohe_cols].idxmax(axis=1)
|
|
1128
|
+
reconstructed_series = new_df[ohe_cols].idxmax(axis=1) # type: ignore
|
|
1074
1129
|
|
|
1075
1130
|
# Extract the categorical value (the suffix) from the column name
|
|
1076
1131
|
# Use n=1 in split to handle cases where the category itself might contain the separator
|
|
1077
1132
|
new_column_values = reconstructed_series.str.split(separator, n=1).str[1]
|
|
1078
1133
|
|
|
1079
|
-
# Handle rows where all OHE columns were 0 (e.g., original value was NaN).
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1134
|
+
# Handle rows where all OHE columns were 0 (e.g., original value was NaN or a dropped baseline).
|
|
1135
|
+
all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0 # type: ignore
|
|
1136
|
+
|
|
1137
|
+
if baseline_category is not None:
|
|
1138
|
+
# A baseline category was provided
|
|
1139
|
+
new_column_values.loc[all_zero_mask] = baseline_category
|
|
1140
|
+
else:
|
|
1141
|
+
# No baseline provided: assign NaN
|
|
1142
|
+
new_column_values.loc[all_zero_mask] = np.nan # type: ignore
|
|
1143
|
+
|
|
1144
|
+
if verbose:
|
|
1145
|
+
print(f" - Mapped 'all-zero' rows for '{base_name}' to baseline: '{baseline_category}'.")
|
|
1083
1146
|
|
|
1084
1147
|
# Assign the new reconstructed column to the DataFrame
|
|
1085
1148
|
new_df[base_name] = new_column_values
|
|
1086
1149
|
|
|
1087
1150
|
all_ohe_cols_to_drop.extend(ohe_cols)
|
|
1088
1151
|
reconstructed_count += 1
|
|
1089
|
-
|
|
1152
|
+
if verbose:
|
|
1153
|
+
print(f" - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")
|
|
1090
1154
|
|
|
1155
|
+
# Cleanup
|
|
1091
1156
|
if drop_original and all_ohe_cols_to_drop:
|
|
1092
1157
|
# Drop the original OHE columns, ensuring no duplicates in the drop list
|
|
1093
1158
|
unique_cols_to_drop = list(set(all_ohe_cols_to_drop))
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/dragon_ml_toolbox.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/dragon_ml_toolbox.egg-info/requires.txt
RENAMED
|
File without changes
|
{dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/dragon_ml_toolbox.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|