dragon-ml-toolbox 12.8.0__py3-none-any.whl → 12.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. See the package's registry page for more details.
- {dragon_ml_toolbox-12.8.0.dist-info → dragon_ml_toolbox-12.9.1.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-12.8.0.dist-info → dragon_ml_toolbox-12.9.1.dist-info}/RECORD +7 -7
- ml_tools/data_exploration.py +95 -23
- {dragon_ml_toolbox-12.8.0.dist-info → dragon_ml_toolbox-12.9.1.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-12.8.0.dist-info → dragon_ml_toolbox-12.9.1.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-12.8.0.dist-info → dragon_ml_toolbox-12.9.1.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-12.8.0.dist-info → dragon_ml_toolbox-12.9.1.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
dragon_ml_toolbox-12.
|
|
2
|
-
dragon_ml_toolbox-12.
|
|
1
|
+
dragon_ml_toolbox-12.9.1.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
|
|
2
|
+
dragon_ml_toolbox-12.9.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
|
|
3
3
|
ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
|
|
4
4
|
ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
|
|
5
5
|
ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
|
|
@@ -24,7 +24,7 @@ ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
|
|
|
24
24
|
ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
|
|
25
25
|
ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
|
|
26
26
|
ml_tools/custom_logger.py,sha256=xot-VeZFigKjcVxADgzvI54vZO_MqMMejo7JmDED8Xo,5892
|
|
27
|
-
ml_tools/data_exploration.py,sha256=
|
|
27
|
+
ml_tools/data_exploration.py,sha256=9Bbppxi6WWSAotB1tCwwWPOEkx7Vs-yvCAhesVplIBY,50618
|
|
28
28
|
ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
|
|
29
29
|
ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
|
|
30
30
|
ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
|
|
@@ -35,7 +35,7 @@ ml_tools/optimization_tools.py,sha256=P074YCuZzkqkONnAsM-Zb9DTX_i8cRkkJLpwAWz6CR
|
|
|
35
35
|
ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
|
|
36
36
|
ml_tools/serde.py,sha256=UIshIesHRFmxr8F6B3LxGG8bYc1HHK-nlE3kENSZL18,5288
|
|
37
37
|
ml_tools/utilities.py,sha256=OcAyV1tEcYAfOWlGjRgopsjDLxU3DcI5EynzvWV4q3A,15754
|
|
38
|
-
dragon_ml_toolbox-12.
|
|
39
|
-
dragon_ml_toolbox-12.
|
|
40
|
-
dragon_ml_toolbox-12.
|
|
41
|
-
dragon_ml_toolbox-12.
|
|
38
|
+
dragon_ml_toolbox-12.9.1.dist-info/METADATA,sha256=oQWsgVpaYAb7-91f2DpCuMUNCmP1OuHmwzMCeSgVQU8,6166
|
|
39
|
+
dragon_ml_toolbox-12.9.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
40
|
+
dragon_ml_toolbox-12.9.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
41
|
+
dragon_ml_toolbox-12.9.1.dist-info/RECORD,,
|
ml_tools/data_exploration.py
CHANGED
|
@@ -346,6 +346,7 @@ def encode_categorical_features(
|
|
|
346
346
|
df: pd.DataFrame,
|
|
347
347
|
columns_to_encode: List[str],
|
|
348
348
|
encode_nulls: bool,
|
|
349
|
+
null_label: str = "Other",
|
|
349
350
|
split_resulting_dataset: bool = True,
|
|
350
351
|
verbose: bool = True
|
|
351
352
|
) -> Tuple[Dict[str, Dict[str, int]], pd.DataFrame, Optional[pd.DataFrame]]:
|
|
@@ -359,13 +360,14 @@ def encode_categorical_features(
|
|
|
359
360
|
Args:
|
|
360
361
|
df (pd.DataFrame): The input DataFrame.
|
|
361
362
|
columns_to_encode (List[str]): A list of column names to be encoded.
|
|
362
|
-
encode_nulls (bool):
|
|
363
|
-
|
|
364
|
-
If False, Nulls are ignored and categories start from 0.
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
If
|
|
363
|
+
encode_nulls (bool):
|
|
364
|
+
- If True, encodes Null values as a distinct category 'null_label' with a value of 0. Other categories start from 1.
|
|
365
|
+
- If False, Nulls are ignored and categories start from 0.
|
|
366
|
+
|
|
367
|
+
null_label (str): Category to encode Nulls to if `encode_nulls` is True. If a name collision with `null_label` occurs, the fallback key will be "__NULL__".
|
|
368
|
+
split_resulting_dataset (bool):
|
|
369
|
+
- If True, returns two separate DataFrames, one with non-categorical columns and one with the encoded columns.
|
|
370
|
+
- If False, returns a single DataFrame with all columns.
|
|
369
371
|
verbose (bool): If True, prints encoding progress.
|
|
370
372
|
|
|
371
373
|
Returns:
|
|
@@ -376,6 +378,9 @@ def encode_categorical_features(
|
|
|
376
378
|
- pd.DataFrame: The original dataframe with or without encoded columns (see `split_resulting_dataset`).
|
|
377
379
|
|
|
378
380
|
- pd.DataFrame | None: If `split_resulting_dataset` is True, the encoded columns as a new dataframe.
|
|
381
|
+
|
|
382
|
+
## **Note:**
|
|
383
|
+
Use `encode_nulls=False` when encoding binary values with missing entries or a malformed encoding will be returned silently.
|
|
379
384
|
"""
|
|
380
385
|
df_encoded = df.copy()
|
|
381
386
|
|
|
@@ -401,8 +406,16 @@ def encode_categorical_features(
|
|
|
401
406
|
mapped_series = df_encoded[col_name].astype(str).map(mapping)
|
|
402
407
|
df_encoded[col_name] = mapped_series.fillna(0).astype(int)
|
|
403
408
|
|
|
409
|
+
# --- Validate nulls category---
|
|
410
|
+
# Ensure the key for 0 doesn't collide with a real category.
|
|
411
|
+
if null_label in mapping.keys():
|
|
412
|
+
# COLLISION! null_label is a real category
|
|
413
|
+
original_label = null_label
|
|
414
|
+
null_label = "__NULL__" # fallback
|
|
415
|
+
_LOGGER.warning(f"Column '{col_name}': '{original_label}' is a real category. Mapping nulls (0) to '{null_label}' instead.")
|
|
416
|
+
|
|
404
417
|
# Create the complete user-facing map including "Other"
|
|
405
|
-
user_mapping = {**mapping,
|
|
418
|
+
user_mapping = {**mapping, null_label: 0}
|
|
406
419
|
mappings[col_name] = user_mapping
|
|
407
420
|
else:
|
|
408
421
|
# ignore nulls
|
|
@@ -1009,9 +1022,11 @@ def create_transformer_categorical_map(
|
|
|
1009
1022
|
|
|
1010
1023
|
def reconstruct_one_hot(
|
|
1011
1024
|
df: pd.DataFrame,
|
|
1012
|
-
|
|
1025
|
+
features_to_reconstruct: List[Union[str, Tuple[str, Optional[str]]]],
|
|
1013
1026
|
separator: str = '_',
|
|
1014
|
-
|
|
1027
|
+
baseline_category_name: str = "Other",
|
|
1028
|
+
drop_original: bool = True,
|
|
1029
|
+
verbose: bool = True
|
|
1015
1030
|
) -> pd.DataFrame:
|
|
1016
1031
|
"""
|
|
1017
1032
|
Reconstructs original categorical columns from a one-hot encoded DataFrame.
|
|
@@ -1023,12 +1038,26 @@ def reconstruct_one_hot(
|
|
|
1023
1038
|
Args:
|
|
1024
1039
|
df (pd.DataFrame):
|
|
1025
1040
|
The input DataFrame with one-hot encoded columns.
|
|
1026
|
-
|
|
1027
|
-
A list
|
|
1028
|
-
|
|
1041
|
+
features_to_reconstruct (List[str | Tuple[str, str | None]]):
|
|
1042
|
+
A list defining the features to reconstruct. This list can contain:
|
|
1043
|
+
|
|
1044
|
+
- A string: (e.g., "Color")
|
|
1045
|
+
This reconstructs the feature 'Color' and assumes all-zero rows represent the baseline category ("Other" by default).
|
|
1046
|
+
- A tuple: (e.g., ("Pet", "Dog"))
|
|
1047
|
+
This reconstructs 'Pet' and maps all-zero rows to the baseline category "Dog".
|
|
1048
|
+
- A tuple with None: (e.g., ("Size", None))
|
|
1049
|
+
This reconstructs 'Size' and maps all-zero rows to the NaN value.
|
|
1050
|
+
Example:
|
|
1051
|
+
[
|
|
1052
|
+
"Mood", # All-zeros -> "Other"
|
|
1053
|
+
("Color", "Red"), # All-zeros -> "Red"
|
|
1054
|
+
("Size", None) # All-zeros -> NaN
|
|
1055
|
+
]
|
|
1029
1056
|
separator (str):
|
|
1030
1057
|
The character separating the base name from the categorical value in
|
|
1031
1058
|
the column names (e.g., '_' in 'B_a').
|
|
1059
|
+
baseline_category_name (str):
|
|
1060
|
+
The baseline category name to use by default if it is not explicitly provided.
|
|
1032
1061
|
drop_original (bool):
|
|
1033
1062
|
If True, the original one-hot encoded columns will be dropped from
|
|
1034
1063
|
the returned DataFrame.
|
|
@@ -1051,14 +1080,47 @@ def reconstruct_one_hot(
|
|
|
1051
1080
|
if not isinstance(df, pd.DataFrame):
|
|
1052
1081
|
_LOGGER.error("Input must be a pandas DataFrame.")
|
|
1053
1082
|
raise TypeError()
|
|
1083
|
+
|
|
1084
|
+
if not isinstance(baseline_category_name, str):
|
|
1085
|
+
_LOGGER.error("The baseline_category must be a string.")
|
|
1086
|
+
raise TypeError()
|
|
1054
1087
|
|
|
1055
1088
|
new_df = df.copy()
|
|
1056
1089
|
all_ohe_cols_to_drop = []
|
|
1057
1090
|
reconstructed_count = 0
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1091
|
+
|
|
1092
|
+
# --- 1. Parse and validate the reconstruction config ---
|
|
1093
|
+
# This normalizes the input into a clean {base_name: baseline_val} dict
|
|
1094
|
+
reconstruction_config: Dict[str, Optional[str]] = {}
|
|
1095
|
+
try:
|
|
1096
|
+
for item in features_to_reconstruct:
|
|
1097
|
+
if isinstance(item, str):
|
|
1098
|
+
# Case 1: "Color"
|
|
1099
|
+
base_name = item
|
|
1100
|
+
baseline_val = baseline_category_name
|
|
1101
|
+
elif isinstance(item, tuple) and len(item) == 2:
|
|
1102
|
+
# Case 2: ("Pet", "dog") or ("Size", None)
|
|
1103
|
+
base_name, baseline_val = item
|
|
1104
|
+
if not (isinstance(base_name, str) and (isinstance(baseline_val, str) or baseline_val is None)):
|
|
1105
|
+
_LOGGER.error(f"Invalid tuple format for '{item}'. Must be (str, str|None).")
|
|
1106
|
+
raise ValueError()
|
|
1107
|
+
else:
|
|
1108
|
+
_LOGGER.error(f"Invalid item '{item}'. Must be str or (str, str|None) tuple.")
|
|
1109
|
+
raise ValueError()
|
|
1110
|
+
|
|
1111
|
+
if base_name in reconstruction_config and verbose:
|
|
1112
|
+
_LOGGER.warning(f"Duplicate entry for '{base_name}' found. Using the last provided configuration.")
|
|
1113
|
+
|
|
1114
|
+
reconstruction_config[base_name] = baseline_val
|
|
1115
|
+
|
|
1116
|
+
except Exception as e:
|
|
1117
|
+
_LOGGER.error(f"Failed to parse 'features_to_reconstruct' argument: {e}")
|
|
1118
|
+
raise ValueError("Invalid configuration for 'features_to_reconstruct'.") from e
|
|
1119
|
+
|
|
1120
|
+
_LOGGER.info(f"Attempting to reconstruct {len(reconstruction_config)} one-hot encoded feature(s).")
|
|
1121
|
+
|
|
1122
|
+
# Main logic
|
|
1123
|
+
for base_name, baseline_category in reconstruction_config.items():
|
|
1062
1124
|
# Regex to find all columns belonging to this base feature.
|
|
1063
1125
|
pattern = f"^{re.escape(base_name)}{re.escape(separator)}"
|
|
1064
1126
|
|
|
@@ -1070,24 +1132,34 @@ def reconstruct_one_hot(
|
|
|
1070
1132
|
continue
|
|
1071
1133
|
|
|
1072
1134
|
# For each row, find the column name with the maximum value (which is 1)
|
|
1073
|
-
reconstructed_series = new_df[ohe_cols].idxmax(axis=1)
|
|
1135
|
+
reconstructed_series = new_df[ohe_cols].idxmax(axis=1) # type: ignore
|
|
1074
1136
|
|
|
1075
1137
|
# Extract the categorical value (the suffix) from the column name
|
|
1076
1138
|
# Use n=1 in split to handle cases where the category itself might contain the separator
|
|
1077
1139
|
new_column_values = reconstructed_series.str.split(separator, n=1).str[1]
|
|
1078
1140
|
|
|
1079
|
-
# Handle rows where all OHE columns were 0 (e.g., original value was NaN).
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1141
|
+
# Handle rows where all OHE columns were 0 (e.g., original value was NaN or a dropped baseline).
|
|
1142
|
+
all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0 # type: ignore
|
|
1143
|
+
|
|
1144
|
+
if baseline_category is not None:
|
|
1145
|
+
# A baseline category was provided
|
|
1146
|
+
new_column_values.loc[all_zero_mask] = baseline_category
|
|
1147
|
+
else:
|
|
1148
|
+
# No baseline provided: assign NaN
|
|
1149
|
+
new_column_values.loc[all_zero_mask] = np.nan # type: ignore
|
|
1150
|
+
|
|
1151
|
+
if verbose:
|
|
1152
|
+
print(f" - Mapped 'all-zero' rows for '{base_name}' to baseline: '{baseline_category}'.")
|
|
1083
1153
|
|
|
1084
1154
|
# Assign the new reconstructed column to the DataFrame
|
|
1085
1155
|
new_df[base_name] = new_column_values
|
|
1086
1156
|
|
|
1087
1157
|
all_ohe_cols_to_drop.extend(ohe_cols)
|
|
1088
1158
|
reconstructed_count += 1
|
|
1089
|
-
|
|
1159
|
+
if verbose:
|
|
1160
|
+
print(f" - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")
|
|
1090
1161
|
|
|
1162
|
+
# Cleanup
|
|
1091
1163
|
if drop_original and all_ohe_cols_to_drop:
|
|
1092
1164
|
# Drop the original OHE columns, ensuring no duplicates in the drop list
|
|
1093
1165
|
unique_cols_to_drop = list(set(all_ohe_cols_to_drop))
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|