dragon-ml-toolbox 12.8.0__py3-none-any.whl → 12.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 12.8.0
3
+ Version: 12.9.1
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,5 +1,5 @@
1
- dragon_ml_toolbox-12.8.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
2
- dragon_ml_toolbox-12.8.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
1
+ dragon_ml_toolbox-12.9.1.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
2
+ dragon_ml_toolbox-12.9.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
3
3
  ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
4
4
  ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
5
5
  ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
@@ -24,7 +24,7 @@ ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
24
24
  ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
25
25
  ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
26
26
  ml_tools/custom_logger.py,sha256=xot-VeZFigKjcVxADgzvI54vZO_MqMMejo7JmDED8Xo,5892
27
- ml_tools/data_exploration.py,sha256=joaJPgXeweYMAn-xnMOzUIE8VvKvbEPenVjVHM21U4c,46914
27
+ ml_tools/data_exploration.py,sha256=9Bbppxi6WWSAotB1tCwwWPOEkx7Vs-yvCAhesVplIBY,50618
28
28
  ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
29
29
  ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
30
30
  ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
@@ -35,7 +35,7 @@ ml_tools/optimization_tools.py,sha256=P074YCuZzkqkONnAsM-Zb9DTX_i8cRkkJLpwAWz6CR
35
35
  ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
36
36
  ml_tools/serde.py,sha256=UIshIesHRFmxr8F6B3LxGG8bYc1HHK-nlE3kENSZL18,5288
37
37
  ml_tools/utilities.py,sha256=OcAyV1tEcYAfOWlGjRgopsjDLxU3DcI5EynzvWV4q3A,15754
38
- dragon_ml_toolbox-12.8.0.dist-info/METADATA,sha256=zbA_0bdkX_96eSpkx7QGZelCTKrckDXUdvmHE4oCNMI,6166
39
- dragon_ml_toolbox-12.8.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
40
- dragon_ml_toolbox-12.8.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
41
- dragon_ml_toolbox-12.8.0.dist-info/RECORD,,
38
+ dragon_ml_toolbox-12.9.1.dist-info/METADATA,sha256=oQWsgVpaYAb7-91f2DpCuMUNCmP1OuHmwzMCeSgVQU8,6166
39
+ dragon_ml_toolbox-12.9.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
40
+ dragon_ml_toolbox-12.9.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
41
+ dragon_ml_toolbox-12.9.1.dist-info/RECORD,,
@@ -346,6 +346,7 @@ def encode_categorical_features(
346
346
  df: pd.DataFrame,
347
347
  columns_to_encode: List[str],
348
348
  encode_nulls: bool,
349
+ null_label: str = "Other",
349
350
  split_resulting_dataset: bool = True,
350
351
  verbose: bool = True
351
352
  ) -> Tuple[Dict[str, Dict[str, int]], pd.DataFrame, Optional[pd.DataFrame]]:
@@ -359,13 +360,14 @@ def encode_categorical_features(
359
360
  Args:
360
361
  df (pd.DataFrame): The input DataFrame.
361
362
  columns_to_encode (List[str]): A list of column names to be encoded.
362
- encode_nulls (bool): If True, encodes Null values as a distinct category
363
- "Other" with a value of 0. Other categories start from 1.
364
- If False, Nulls are ignored and categories start from 0.
365
- Note: Use False when encoding binary values with missing entries.
366
- split_resulting_dataset (bool): If True, returns two separate DataFrames:
367
- one with non-categorical columns and one with the encoded columns.
368
- If False, returns a single DataFrame with all columns.
363
+ encode_nulls (bool):
364
+ - If True, encodes Null values as a distinct category 'null_label' with a value of 0. Other categories start from 1.
365
+ - If False, Nulls are ignored and categories start from 0.
366
+
367
+ null_label (str): Category to encode Nulls to if `encode_nulls` is True. If a name collision with `null_label` occurs, the fallback key will be "__NULL__".
368
+ split_resulting_dataset (bool):
369
+ - If True, returns two separate DataFrames, one with non-categorical columns and one with the encoded columns.
370
+ - If False, returns a single DataFrame with all columns.
369
371
  verbose (bool): If True, prints encoding progress.
370
372
 
371
373
  Returns:
@@ -376,6 +378,9 @@ def encode_categorical_features(
376
378
  - pd.DataFrame: The original dataframe with or without encoded columns (see `split_resulting_dataset`).
377
379
 
378
380
  - pd.DataFrame | None: If `split_resulting_dataset` is True, the encoded columns as a new dataframe.
381
+
382
+ ## **Note:**
383
+ Use `encode_nulls=False` when encoding binary values with missing entries; otherwise a malformed encoding will be returned silently.
379
384
  """
380
385
  df_encoded = df.copy()
381
386
 
@@ -401,8 +406,16 @@ def encode_categorical_features(
401
406
  mapped_series = df_encoded[col_name].astype(str).map(mapping)
402
407
  df_encoded[col_name] = mapped_series.fillna(0).astype(int)
403
408
 
409
+ # --- Validate nulls category ---
410
+ # Ensure the key for 0 doesn't collide with a real category.
411
+ if null_label in mapping.keys():
412
+ # COLLISION! null_label is a real category
413
+ original_label = null_label
414
+ null_label = "__NULL__" # fallback
415
+ _LOGGER.warning(f"Column '{col_name}': '{original_label}' is a real category. Mapping nulls (0) to '{null_label}' instead.")
416
+
404
417
  # Create the complete user-facing map including "Other"
405
- user_mapping = {**mapping, "Other": 0}
418
+ user_mapping = {**mapping, null_label: 0}
406
419
  mappings[col_name] = user_mapping
407
420
  else:
408
421
  # ignore nulls
@@ -1009,9 +1022,11 @@ def create_transformer_categorical_map(
1009
1022
 
1010
1023
  def reconstruct_one_hot(
1011
1024
  df: pd.DataFrame,
1012
- base_feature_names: List[str],
1025
+ features_to_reconstruct: List[Union[str, Tuple[str, Optional[str]]]],
1013
1026
  separator: str = '_',
1014
- drop_original: bool = True
1027
+ baseline_category_name: str = "Other",
1028
+ drop_original: bool = True,
1029
+ verbose: bool = True
1015
1030
  ) -> pd.DataFrame:
1016
1031
  """
1017
1032
  Reconstructs original categorical columns from a one-hot encoded DataFrame.
@@ -1023,12 +1038,26 @@ def reconstruct_one_hot(
1023
1038
  Args:
1024
1039
  df (pd.DataFrame):
1025
1040
  The input DataFrame with one-hot encoded columns.
1026
- base_features (List[str]):
1027
- A list of base feature names to reconstruct. For example, if you have
1028
- columns 'B_a', 'B_b', 'B_c', you would pass `['B']`.
1041
+ features_to_reconstruct (List[str | Tuple[str, str | None]]):
1042
+ A list defining the features to reconstruct. This list can contain:
1043
+
1044
+ - A string: (e.g., "Color")
1045
+ This reconstructs the feature 'Color' and assumes all-zero rows represent the baseline category ("Other" by default).
1046
+ - A tuple: (e.g., ("Pet", "Dog"))
1047
+ This reconstructs 'Pet' and maps all-zero rows to the baseline category "Dog".
1048
+ - A tuple with None: (e.g., ("Size", None))
1049
+ This reconstructs 'Size' and maps all-zero rows to NaN.
1050
+ Example:
1051
+ [
1052
+ "Mood", # All-zeros -> "Other"
1053
+ ("Color", "Red"), # All-zeros -> "Red"
1054
+ ("Size", None) # All-zeros -> NaN
1055
+ ]
1029
1056
  separator (str):
1030
1057
  The character separating the base name from the categorical value in
1031
1058
  the column names (e.g., '_' in 'B_a').
1059
+ baseline_category_name (str):
1060
+ The baseline category name to use by default if it is not explicitly provided.
1032
1061
  drop_original (bool):
1033
1062
  If True, the original one-hot encoded columns will be dropped from
1034
1063
  the returned DataFrame.
@@ -1051,14 +1080,47 @@ def reconstruct_one_hot(
1051
1080
  if not isinstance(df, pd.DataFrame):
1052
1081
  _LOGGER.error("Input must be a pandas DataFrame.")
1053
1082
  raise TypeError()
1083
+
1084
+ if not isinstance(baseline_category_name, str):
1085
+ _LOGGER.error("The baseline_category must be a string.")
1086
+ raise TypeError()
1054
1087
 
1055
1088
  new_df = df.copy()
1056
1089
  all_ohe_cols_to_drop = []
1057
1090
  reconstructed_count = 0
1058
-
1059
- _LOGGER.info(f"Attempting to reconstruct {len(base_feature_names)} one-hot encoded feature(s).")
1060
-
1061
- for base_name in base_feature_names:
1091
+
1092
+ # --- 1. Parse and validate the reconstruction config ---
1093
+ # This normalizes the input into a clean {base_name: baseline_val} dict
1094
+ reconstruction_config: Dict[str, Optional[str]] = {}
1095
+ try:
1096
+ for item in features_to_reconstruct:
1097
+ if isinstance(item, str):
1098
+ # Case 1: "Color"
1099
+ base_name = item
1100
+ baseline_val = baseline_category_name
1101
+ elif isinstance(item, tuple) and len(item) == 2:
1102
+ # Case 2: ("Pet", "dog") or ("Size", None)
1103
+ base_name, baseline_val = item
1104
+ if not (isinstance(base_name, str) and (isinstance(baseline_val, str) or baseline_val is None)):
1105
+ _LOGGER.error(f"Invalid tuple format for '{item}'. Must be (str, str|None).")
1106
+ raise ValueError()
1107
+ else:
1108
+ _LOGGER.error(f"Invalid item '{item}'. Must be str or (str, str|None) tuple.")
1109
+ raise ValueError()
1110
+
1111
+ if base_name in reconstruction_config and verbose:
1112
+ _LOGGER.warning(f"Duplicate entry for '{base_name}' found. Using the last provided configuration.")
1113
+
1114
+ reconstruction_config[base_name] = baseline_val
1115
+
1116
+ except Exception as e:
1117
+ _LOGGER.error(f"Failed to parse 'features_to_reconstruct' argument: {e}")
1118
+ raise ValueError("Invalid configuration for 'features_to_reconstruct'.") from e
1119
+
1120
+ _LOGGER.info(f"Attempting to reconstruct {len(reconstruction_config)} one-hot encoded feature(s).")
1121
+
1122
+ # Main logic
1123
+ for base_name, baseline_category in reconstruction_config.items():
1062
1124
  # Regex to find all columns belonging to this base feature.
1063
1125
  pattern = f"^{re.escape(base_name)}{re.escape(separator)}"
1064
1126
 
@@ -1070,24 +1132,34 @@ def reconstruct_one_hot(
1070
1132
  continue
1071
1133
 
1072
1134
  # For each row, find the column name with the maximum value (which is 1)
1073
- reconstructed_series = new_df[ohe_cols].idxmax(axis=1)
1135
+ reconstructed_series = new_df[ohe_cols].idxmax(axis=1) # type: ignore
1074
1136
 
1075
1137
  # Extract the categorical value (the suffix) from the column name
1076
1138
  # Use n=1 in split to handle cases where the category itself might contain the separator
1077
1139
  new_column_values = reconstructed_series.str.split(separator, n=1).str[1]
1078
1140
 
1079
- # Handle rows where all OHE columns were 0 (e.g., original value was NaN).
1080
- # In these cases, idxmax returns the first column name, but the sum of values is 0.
1081
- all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0
1082
- new_column_values.loc[all_zero_mask] = np.nan # type: ignore
1141
+ # Handle rows where all OHE columns were 0 (e.g., original value was NaN or a dropped baseline).
1142
+ all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0 # type: ignore
1143
+
1144
+ if baseline_category is not None:
1145
+ # A baseline category was provided
1146
+ new_column_values.loc[all_zero_mask] = baseline_category
1147
+ else:
1148
+ # No baseline provided: assign NaN
1149
+ new_column_values.loc[all_zero_mask] = np.nan # type: ignore
1150
+
1151
+ if verbose:
1152
+ print(f" - Mapped 'all-zero' rows for '{base_name}' to baseline: '{baseline_category}'.")
1083
1153
 
1084
1154
  # Assign the new reconstructed column to the DataFrame
1085
1155
  new_df[base_name] = new_column_values
1086
1156
 
1087
1157
  all_ohe_cols_to_drop.extend(ohe_cols)
1088
1158
  reconstructed_count += 1
1089
- print(f" - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")
1159
+ if verbose:
1160
+ print(f" - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")
1090
1161
 
1162
+ # Cleanup
1091
1163
  if drop_original and all_ohe_cols_to_drop:
1092
1164
  # Drop the original OHE columns, ensuring no duplicates in the drop list
1093
1165
  unique_cols_to_drop = list(set(all_ohe_cols_to_drop))