dragon-ml-toolbox 12.8.0__tar.gz → 12.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic.

Files changed (46)
  1. {dragon_ml_toolbox-12.8.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-12.9.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/data_exploration.py +88 -23
  4. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/pyproject.toml +1 -1
  5. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/LICENSE +0 -0
  6. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/LICENSE-THIRD-PARTY.md +0 -0
  7. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/README.md +0 -0
  8. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  9. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  10. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  11. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  12. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ETL_cleaning.py +0 -0
  13. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ETL_engineering.py +0 -0
  14. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/GUI_tools.py +0 -0
  15. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/MICE_imputation.py +0 -0
  16. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_callbacks.py +0 -0
  17. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_datasetmaster.py +0 -0
  18. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_evaluation.py +0 -0
  19. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_evaluation_multi.py +0 -0
  20. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_inference.py +0 -0
  21. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_models.py +0 -0
  22. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_optimization.py +0 -0
  23. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_scaler.py +0 -0
  24. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_simple_optimization.py +0 -0
  25. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_trainer.py +0 -0
  26. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_utilities.py +0 -0
  27. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/PSO_optimization.py +0 -0
  28. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/RNN_forecast.py +0 -0
  29. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/SQL.py +0 -0
  30. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/VIF_factor.py +0 -0
  31. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/__init__.py +0 -0
  32. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/_logger.py +0 -0
  33. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/_script_info.py +0 -0
  34. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/constants.py +0 -0
  35. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/custom_logger.py +0 -0
  36. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ensemble_evaluation.py +0 -0
  37. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ensemble_inference.py +0 -0
  38. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ensemble_learning.py +0 -0
  39. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/handle_excel.py +0 -0
  40. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/keys.py +0 -0
  41. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/math_utilities.py +0 -0
  42. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/optimization_tools.py +0 -0
  43. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/path_manager.py +0 -0
  44. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/serde.py +0 -0
  45. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/ml_tools/utilities.py +0 -0
  46. {dragon_ml_toolbox-12.8.0 → dragon_ml_toolbox-12.9.0}/setup.cfg +0 -0
PKG-INFO:

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 12.8.0
+Version: 12.9.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT

dragon_ml_toolbox.egg-info/PKG-INFO:

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 12.8.0
+Version: 12.9.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT
ml_tools/data_exploration.py:

@@ -346,6 +346,7 @@ def encode_categorical_features(
     df: pd.DataFrame,
     columns_to_encode: List[str],
     encode_nulls: bool,
+    null_label: str = "Other",
     split_resulting_dataset: bool = True,
     verbose: bool = True
 ) -> Tuple[Dict[str, Dict[str, int]], pd.DataFrame, Optional[pd.DataFrame]]:
@@ -359,13 +360,15 @@ def encode_categorical_features(
     Args:
         df (pd.DataFrame): The input DataFrame.
         columns_to_encode (List[str]): A list of column names to be encoded.
-        encode_nulls (bool): If True, encodes Null values as a distinct category
-            "Other" with a value of 0. Other categories start from 1.
-            If False, Nulls are ignored and categories start from 0.
-            Note: Use False when encoding binary values with missing entries.
-        split_resulting_dataset (bool): If True, returns two separate DataFrames:
-            one with non-categorical columns and one with the encoded columns.
-            If False, returns a single DataFrame with all columns.
+        encode_nulls (bool):
+            - If True, encodes Null values as a distinct category 'null_label' with a value of 0. Other categories start from 1.
+            - If False, Nulls are ignored and categories start from 0.
+
+
+        null_label (str): Category to encode Nulls to if `encode_nulls` is True. If a name collision with `null_label` occurs, the fallback key will be "__NULL__".
+        split_resulting_dataset (bool):
+            - If True, returns two separate DataFrames, one with non-categorical columns and one with the encoded columns.
+            - If False, returns a single DataFrame with all columns.
         verbose (bool): If True, prints encoding progress.
 
     Returns:
@@ -376,6 +379,9 @@ def encode_categorical_features(
         - pd.DataFrame: The original dataframe with or without encoded columns (see `split_resulting_dataset`).
 
         - pd.DataFrame | None: If `split_resulting_dataset` is True, the encoded columns as a new dataframe.
+
+    ## **Note:**
+    Use `encode_nulls=False` when encoding binary values with missing entries or a malformed encoding will be returned silently.
     """
     df_encoded = df.copy()
 
@@ -401,8 +407,16 @@ def encode_categorical_features(
             mapped_series = df_encoded[col_name].astype(str).map(mapping)
             df_encoded[col_name] = mapped_series.fillna(0).astype(int)
 
+            # --- Validate nulls category---
+            # Ensure the key for 0 doesn't collide with a real category.
+            if null_label in mapping.keys():
+                # COLLISION! null_label is a real category
+                original_label = null_label
+                null_label = "__NULL__" # fallback
+                _LOGGER.warning(f"Column '{col_name}': '{original_label}' is a real category. Mapping nulls (0) to '{null_label}' instead.")
+
             # Create the complete user-facing map including "Other"
-            user_mapping = {**mapping, "Other": 0}
+            user_mapping = {**mapping, null_label: 0}
             mappings[col_name] = user_mapping
         else:
             # ignore nulls
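The hunks above add and wire up the new `null_label` parameter. As an illustration only (the import path and the exact integer codes assigned to real categories are assumptions based on the signature and docstring shown in this diff), a minimal usage sketch:

    import pandas as pd
    # Import path assumed from the package layout (ml_tools/data_exploration.py).
    from ml_tools.data_exploration import encode_categorical_features

    df = pd.DataFrame({
        "Color": ["Red", "Blue", None, "Other"],  # "Other" collides with the default null_label
        "Price": [1.0, 2.0, 3.0, 4.0],
    })

    # encode_nulls=True maps nulls to 0; since "Other" is a real category here,
    # the mapping key for 0 is expected to fall back to "__NULL__" (with a warning).
    mappings, df_rest, df_encoded = encode_categorical_features(
        df,
        columns_to_encode=["Color"],
        encode_nulls=True,
        null_label="Other",            # the default
        split_resulting_dataset=True,  # df_rest keeps "Price", df_encoded holds "Color"
    )
    # mappings["Color"] should contain "__NULL__": 0 plus the real categories starting at 1.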
@@ -1009,9 +1023,10 @@ def create_transformer_categorical_map(
 
 def reconstruct_one_hot(
     df: pd.DataFrame,
-    base_feature_names: List[str],
+    features_to_reconstruct: List[Union[str, Tuple[str, Optional[str]]]],
     separator: str = '_',
-    drop_original: bool = True
+    drop_original: bool = True,
+    verbose: bool = True
 ) -> pd.DataFrame:
     """
     Reconstructs original categorical columns from a one-hot encoded DataFrame.
@@ -1023,9 +1038,20 @@ def reconstruct_one_hot(
     Args:
         df (pd.DataFrame):
             The input DataFrame with one-hot encoded columns.
-        base_features (List[str]):
-            A list of base feature names to reconstruct. For example, if you have
-            columns 'B_a', 'B_b', 'B_c', you would pass `['B']`.
+        features_to_reconstruct (List[str | Tuple[str, str | None]]):
+            A list defining the features to reconstruct. This list can contain:
+
+            - A string: (e.g., "Color")
+              This reconstructs the feature 'Color' and assumes all-zero rows represent missing data NaN.
+            - A tuple: (e.g., ("Pet", "Dog"))
+              This reconstructs 'Pet' and maps all-zero rows to the baseline category "Dog" (handling 'drop_first=True' scenarios).
+            - A tuple with None: (e.g., ("Size", None))
+              This is explicit and behaves identically to just passing "Size". All-zero rows will be mapped to NaN.
+            Example:
+                [
+                    "Mood",            # All-zeros -> NaN
+                    ("Color", "Red"),  # All-zeros -> "Red"
+                ]
         separator (str):
             The character separating the base name from the categorical value in
             the column names (e.g., '_' in 'B_a').
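Following the docstring above, a hedged usage sketch of the new `features_to_reconstruct` argument (not part of the diff; the import path is assumed from the package layout):

    import pandas as pd
    from ml_tools.data_exploration import reconstruct_one_hot  # assumed import path

    ohe_df = pd.DataFrame({
        "Mood_happy":  [1, 0, 0],
        "Mood_sad":    [0, 1, 0],   # row 2 has no Mood flag set -> NaN (no baseline given)
        "Color_Blue":  [0, 1, 0],
        "Color_Green": [0, 0, 0],   # rows 0 and 2 have no Color flag set -> baseline "Red"
    })

    restored = reconstruct_one_hot(
        ohe_df,
        features_to_reconstruct=[
            "Mood",            # all-zero rows -> NaN
            ("Color", "Red"),  # all-zero rows -> "Red" (drop_first baseline)
        ],
        separator="_",
    )
    # Expected: restored["Mood"]  -> ["happy", "sad", NaN]
    #           restored["Color"] -> ["Red", "Blue", "Red"]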
@@ -1055,10 +1081,39 @@ def reconstruct_one_hot(
     new_df = df.copy()
     all_ohe_cols_to_drop = []
     reconstructed_count = 0
-
-    _LOGGER.info(f"Attempting to reconstruct {len(base_feature_names)} one-hot encoded feature(s).")
-
-    for base_name in base_feature_names:
+
+    # --- 1. Parse and validate the reconstruction config ---
+    # This normalizes the input into a clean {base_name: baseline_val} dict
+    reconstruction_config: Dict[str, Optional[str]] = {}
+    try:
+        for item in features_to_reconstruct:
+            if isinstance(item, str):
+                # Case 1: "Color"
+                base_name = item
+                baseline_val = None
+            elif isinstance(item, tuple) and len(item) == 2:
+                # Case 2: ("Pet", "dog") or ("Size", None)
+                base_name, baseline_val = item
+                if not (isinstance(base_name, str) and (isinstance(baseline_val, str) or baseline_val is None)):
+                    _LOGGER.error(f"Invalid tuple format for '{item}'. Must be (str, str|None).")
+                    raise ValueError()
+            else:
+                _LOGGER.error(f"Invalid item '{item}'. Must be str or (str, str|None) tuple.")
+                raise ValueError()
+
+            if base_name in reconstruction_config and verbose:
+                _LOGGER.warning(f"Duplicate entry for '{base_name}' found. Using the last provided configuration.")
+
+            reconstruction_config[base_name] = baseline_val
+
+    except Exception as e:
+        _LOGGER.error(f"Failed to parse 'features_to_reconstruct' argument: {e}")
+        raise ValueError("Invalid configuration for 'features_to_reconstruct'.") from e
+
+    _LOGGER.info(f"Attempting to reconstruct {len(reconstruction_config)} one-hot encoded feature(s).")
+
+    # Main logic
+    for base_name, baseline_category in reconstruction_config.items():
         # Regex to find all columns belonging to this base feature.
         pattern = f"^{re.escape(base_name)}{re.escape(separator)}"
 
@@ -1070,24 +1125,34 @@ def reconstruct_one_hot(
             continue
 
         # For each row, find the column name with the maximum value (which is 1)
-        reconstructed_series = new_df[ohe_cols].idxmax(axis=1)
+        reconstructed_series = new_df[ohe_cols].idxmax(axis=1) # type: ignore
 
         # Extract the categorical value (the suffix) from the column name
         # Use n=1 in split to handle cases where the category itself might contain the separator
         new_column_values = reconstructed_series.str.split(separator, n=1).str[1]
 
-        # Handle rows where all OHE columns were 0 (e.g., original value was NaN).
-        # In these cases, idxmax returns the first column name, but the sum of values is 0.
-        all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0
-        new_column_values.loc[all_zero_mask] = np.nan # type: ignore
+        # Handle rows where all OHE columns were 0 (e.g., original value was NaN or a dropped baseline).
+        all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0 # type: ignore
+
+        if baseline_category is not None:
+            # A baseline category was provided
+            new_column_values.loc[all_zero_mask] = baseline_category
+        else:
+            # No baseline provided: assign NaN
+            new_column_values.loc[all_zero_mask] = np.nan # type: ignore
+
+        if verbose:
+            print(f" - Mapped 'all-zero' rows for '{base_name}' to baseline: '{baseline_category}'.")
 
         # Assign the new reconstructed column to the DataFrame
        new_df[base_name] = new_column_values
 
         all_ohe_cols_to_drop.extend(ohe_cols)
         reconstructed_count += 1
-        print(f" - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")
+        if verbose:
+            print(f" - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")
 
+    # Cleanup
     if drop_original and all_ohe_cols_to_drop:
         # Drop the original OHE columns, ensuring no duplicates in the drop list
         unique_cols_to_drop = list(set(all_ohe_cols_to_drop))
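The all-zero handling in the hunk above relies on the fact that `idxmax` still returns the first column name for all-zero rows, so those rows are detected via the row sum and overwritten. A standalone pandas sketch of the same idea (illustrative only, not the library code):

    import pandas as pd

    ohe = pd.DataFrame({"Pet_cat": [1, 0, 0], "Pet_fish": [0, 1, 0]})

    # idxmax picks the column holding the 1 for each row...
    labels = ohe.idxmax(axis=1).str.split("_", n=1).str[1]

    # ...but for all-zero rows it just returns the first column, so those rows
    # are detected separately and mapped to the baseline (or NaN).
    all_zero = ohe.sum(axis=1) == 0
    labels.loc[all_zero] = "dog"   # baseline category, e.g. from ("Pet", "dog")

    print(labels.tolist())  # ['cat', 'fish', 'dog']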
pyproject.toml:

@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "12.8.0"
+version = "12.9.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl L. Loza Vidaurre", email = "luigiloza@gmail.com" }