dragon-ml-toolbox 12.7.0.tar.gz → 12.9.0.tar.gz

This diff compares the contents of two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.


Files changed (46)
  1. {dragon_ml_toolbox-12.7.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-12.9.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_utilities.py +13 -2
  4. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/data_exploration.py +88 -22
  5. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/optimization_tools.py +3 -3
  6. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/pyproject.toml +1 -1
  7. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/LICENSE +0 -0
  8. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/LICENSE-THIRD-PARTY.md +0 -0
  9. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/README.md +0 -0
  10. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  11. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  12. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  13. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  14. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ETL_cleaning.py +0 -0
  15. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ETL_engineering.py +0 -0
  16. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/GUI_tools.py +0 -0
  17. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/MICE_imputation.py +0 -0
  18. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_callbacks.py +0 -0
  19. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_datasetmaster.py +0 -0
  20. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_evaluation.py +0 -0
  21. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_evaluation_multi.py +0 -0
  22. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_inference.py +0 -0
  23. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_models.py +0 -0
  24. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_optimization.py +0 -0
  25. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_scaler.py +0 -0
  26. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_simple_optimization.py +0 -0
  27. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ML_trainer.py +0 -0
  28. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/PSO_optimization.py +0 -0
  29. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/RNN_forecast.py +0 -0
  30. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/SQL.py +0 -0
  31. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/VIF_factor.py +0 -0
  32. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/__init__.py +0 -0
  33. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/_logger.py +0 -0
  34. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/_script_info.py +0 -0
  35. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/constants.py +0 -0
  36. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/custom_logger.py +0 -0
  37. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ensemble_evaluation.py +0 -0
  38. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ensemble_inference.py +0 -0
  39. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/ensemble_learning.py +0 -0
  40. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/handle_excel.py +0 -0
  41. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/keys.py +0 -0
  42. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/math_utilities.py +0 -0
  43. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/path_manager.py +0 -0
  44. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/serde.py +0 -0
  45. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/ml_tools/utilities.py +0 -0
  46. {dragon_ml_toolbox-12.7.0 → dragon_ml_toolbox-12.9.0}/setup.cfg +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 12.7.0
+Version: 12.9.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT

dragon_ml_toolbox.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 12.7.0
+Version: 12.9.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT

ml_tools/ML_utilities.py
@@ -1,12 +1,13 @@
 import pandas as pd
 from pathlib import Path
-from typing import Union, Any
+from typing import Union, Any, Optional
 
 from .path_manager import make_fullpath, list_subdirectories, list_files_by_extension
 from ._script_info import _script_info
 from ._logger import _LOGGER
 from .keys import DatasetKeys, PytorchModelArchitectureKeys, PytorchArtifactPathKeys, SHAPKeys
 from .utilities import load_dataframe
+from .custom_logger import save_list_strings
 
 
 __all__ = [
@@ -139,6 +140,7 @@ def find_model_artifacts(target_directory: Union[str,Path], load_scaler: bool, v
 def select_features_by_shap(
     root_directory: Union[str, Path],
     shap_threshold: float,
+    log_feature_names_directory: Optional[Union[str, Path]],
     verbose: bool = True) -> list[str]:
     """
     Scans subdirectories to find SHAP summary CSVs, then extracts feature
@@ -148,11 +150,13 @@ def select_features_by_shap(
     importance scores aggregated from multiple models.
 
     Args:
-        root_directory (Union[str, Path]):
+        root_directory (str | Path):
            The path to the root directory that contains model subdirectories.
        shap_threshold (float):
            The minimum mean absolute SHAP value for a feature to be included
            in the final list.
+        log_feature_names_directory (str | Path | None):
+            If given, saves the chosen feature names as a .txt file in this directory.
 
     Returns:
        list[str]:
@@ -211,6 +215,13 @@ def select_features_by_shap(
     final_features = sorted(list(master_feature_set))
     if verbose:
         _LOGGER.info(f"Selected {len(final_features)} unique features across all files.")
+
+    if log_feature_names_directory is not None:
+        save_names_path = make_fullpath(log_feature_names_directory, make=True, enforce="directory")
+        save_list_strings(list_strings=final_features,
+                          directory=save_names_path,
+                          filename=DatasetKeys.FEATURE_NAMES,
+                          verbose=verbose)
 
     return final_features
 
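The new `log_feature_names_directory` parameter is added without a default value, so existing callers of `select_features_by_shap` must now pass either a directory or `None`. A minimal usage sketch of the 12.9.0 signature (the import path follows the `ml_tools` package layout above; the directory paths are hypothetical):

    from pathlib import Path
    from ml_tools.ML_utilities import select_features_by_shap

    # Collect every feature whose mean |SHAP| value reaches 0.05 in any
    # model subdirectory, and also log the selected names to a .txt file.
    selected = select_features_by_shap(
        root_directory=Path("results/models"),        # hypothetical model-run directory
        shap_threshold=0.05,
        log_feature_names_directory=Path("reports"),  # new in 12.9.0; pass None to skip logging
        verbose=True,
    )
    print(selected)  # sorted list of unique feature names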

ml_tools/data_exploration.py
@@ -346,6 +346,7 @@ def encode_categorical_features(
     df: pd.DataFrame,
     columns_to_encode: List[str],
     encode_nulls: bool,
+    null_label: str = "Other",
     split_resulting_dataset: bool = True,
     verbose: bool = True
 ) -> Tuple[Dict[str, Dict[str, int]], pd.DataFrame, Optional[pd.DataFrame]]:
@@ -359,12 +360,15 @@
     Args:
         df (pd.DataFrame): The input DataFrame.
         columns_to_encode (List[str]): A list of column names to be encoded.
-        encode_nulls (bool): If True, encodes Null values as a distinct category
-            "Other" with a value of 0. Other categories start from 1.
-            If False, Nulls are ignored and categories start from 0.
-        split_resulting_dataset (bool): If True, returns two separate DataFrames:
-            one with non-categorical columns and one with the encoded columns.
-            If False, returns a single DataFrame with all columns.
+        encode_nulls (bool):
+            - If True, encodes Null values as a distinct category 'null_label' with a value of 0. Other categories start from 1.
+            - If False, Nulls are ignored and categories start from 0.
+
+
+        null_label (str): Category to encode Nulls to if `encode_nulls` is True. If a name collision with `null_label` occurs, the fallback key will be "__NULL__".
+        split_resulting_dataset (bool):
+            - If True, returns two separate DataFrames, one with non-categorical columns and one with the encoded columns.
+            - If False, returns a single DataFrame with all columns.
         verbose (bool): If True, prints encoding progress.
 
     Returns:
@@ -375,6 +379,9 @@
         - pd.DataFrame: The original dataframe with or without encoded columns (see `split_resulting_dataset`).
 
         - pd.DataFrame | None: If `split_resulting_dataset` is True, the encoded columns as a new dataframe.
+
+    ## **Note:**
+    Use `encode_nulls=False` when encoding binary values with missing entries or a malformed encoding will be returned silently.
     """
     df_encoded = df.copy()
 
@@ -400,8 +407,16 @@
             mapped_series = df_encoded[col_name].astype(str).map(mapping)
             df_encoded[col_name] = mapped_series.fillna(0).astype(int)
 
+            # --- Validate nulls category---
+            # Ensure the key for 0 doesn't collide with a real category.
+            if null_label in mapping.keys():
+                # COLLISION! null_label is a real category
+                original_label = null_label
+                null_label = "__NULL__" # fallback
+                _LOGGER.warning(f"Column '{col_name}': '{original_label}' is a real category. Mapping nulls (0) to '{null_label}' instead.")
+
             # Create the complete user-facing map including "Other"
-            user_mapping = {**mapping, "Other": 0}
+            user_mapping = {**mapping, null_label: 0}
             mappings[col_name] = user_mapping
         else:
             # ignore nulls
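A short sketch of the new `null_label` parameter (import path per the package layout; the data is illustrative, and the exact integers assigned to the real categories may differ):

    import pandas as pd
    from ml_tools.data_exploration import encode_categorical_features

    df = pd.DataFrame({"Pet": ["cat", "dog", None, "cat"], "Age": [2, 5, 1, 7]})

    # Nulls are encoded as the 0-valued category named by `null_label`;
    # real categories start from 1.
    mappings, main_df, encoded_df = encode_categorical_features(
        df,
        columns_to_encode=["Pet"],
        encode_nulls=True,
        null_label="Missing",  # previously hardcoded as "Other"
    )
    print(mappings["Pet"])  # e.g. {"cat": 1, "dog": 2, "Missing": 0}

If "Missing" were itself a real category in the column, the key for 0 would fall back to "__NULL__" and a warning would be logged.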
@@ -1008,9 +1023,10 @@
 
 def reconstruct_one_hot(
     df: pd.DataFrame,
-    base_feature_names: List[str],
+    features_to_reconstruct: List[Union[str, Tuple[str, Optional[str]]]],
     separator: str = '_',
-    drop_original: bool = True
+    drop_original: bool = True,
+    verbose: bool = True
 ) -> pd.DataFrame:
     """
     Reconstructs original categorical columns from a one-hot encoded DataFrame.
@@ -1022,9 +1038,20 @@
     Args:
         df (pd.DataFrame):
             The input DataFrame with one-hot encoded columns.
-        base_features (List[str]):
-            A list of base feature names to reconstruct. For example, if you have
-            columns 'B_a', 'B_b', 'B_c', you would pass `['B']`.
+        features_to_reconstruct (List[str | Tuple[str, str | None]]):
+            A list defining the features to reconstruct. This list can contain:
+
+            - A string: (e.g., "Color")
+                This reconstructs the feature 'Color' and assumes all-zero rows represent missing data NaN.
+            - A tuple: (e.g., ("Pet", "Dog"))
+                This reconstructs 'Pet' and maps all-zero rows to the baseline category "Dog" (handling 'drop_first=True' scenarios).
+            - A tuple with None: (e.g., ("Size", None))
+                This is explicit and behaves identically to just passing "Size". All-zero rows will be mapped to NaN.
+            Example:
+                [
+                    "Mood",           # All-zeros -> NaN
+                    ("Color", "Red"), # All-zeros -> "Red"
+                ]
         separator (str):
             The character separating the base name from the categorical value in
             the column names (e.g., '_' in 'B_a').
@@ -1054,10 +1081,39 @@
     new_df = df.copy()
     all_ohe_cols_to_drop = []
     reconstructed_count = 0
-
-    _LOGGER.info(f"Attempting to reconstruct {len(base_feature_names)} one-hot encoded feature(s).")
-
-    for base_name in base_feature_names:
+
+    # --- 1. Parse and validate the reconstruction config ---
+    # This normalizes the input into a clean {base_name: baseline_val} dict
+    reconstruction_config: Dict[str, Optional[str]] = {}
+    try:
+        for item in features_to_reconstruct:
+            if isinstance(item, str):
+                # Case 1: "Color"
+                base_name = item
+                baseline_val = None
+            elif isinstance(item, tuple) and len(item) == 2:
+                # Case 2: ("Pet", "dog") or ("Size", None)
+                base_name, baseline_val = item
+                if not (isinstance(base_name, str) and (isinstance(baseline_val, str) or baseline_val is None)):
+                    _LOGGER.error(f"Invalid tuple format for '{item}'. Must be (str, str|None).")
+                    raise ValueError()
+            else:
+                _LOGGER.error(f"Invalid item '{item}'. Must be str or (str, str|None) tuple.")
+                raise ValueError()
+
+            if base_name in reconstruction_config and verbose:
+                _LOGGER.warning(f"Duplicate entry for '{base_name}' found. Using the last provided configuration.")
+
+            reconstruction_config[base_name] = baseline_val
+
+    except Exception as e:
+        _LOGGER.error(f"Failed to parse 'features_to_reconstruct' argument: {e}")
+        raise ValueError("Invalid configuration for 'features_to_reconstruct'.") from e
+
+    _LOGGER.info(f"Attempting to reconstruct {len(reconstruction_config)} one-hot encoded feature(s).")
+
+    # Main logic
+    for base_name, baseline_category in reconstruction_config.items():
         # Regex to find all columns belonging to this base feature.
         pattern = f"^{re.escape(base_name)}{re.escape(separator)}"
 
@@ -1069,24 +1125,34 @@
             continue
 
         # For each row, find the column name with the maximum value (which is 1)
-        reconstructed_series = new_df[ohe_cols].idxmax(axis=1)
+        reconstructed_series = new_df[ohe_cols].idxmax(axis=1) # type: ignore
 
         # Extract the categorical value (the suffix) from the column name
        # Use n=1 in split to handle cases where the category itself might contain the separator
        new_column_values = reconstructed_series.str.split(separator, n=1).str[1]
 
-        # Handle rows where all OHE columns were 0 (e.g., original value was NaN).
-        # In these cases, idxmax returns the first column name, but the sum of values is 0.
-        all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0
-        new_column_values.loc[all_zero_mask] = np.nan # type: ignore
+        # Handle rows where all OHE columns were 0 (e.g., original value was NaN or a dropped baseline).
+        all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0 # type: ignore
+
+        if baseline_category is not None:
+            # A baseline category was provided
+            new_column_values.loc[all_zero_mask] = baseline_category
+        else:
+            # No baseline provided: assign NaN
+            new_column_values.loc[all_zero_mask] = np.nan # type: ignore
+
+        if verbose:
+            print(f"  - Mapped 'all-zero' rows for '{base_name}' to baseline: '{baseline_category}'.")
 
         # Assign the new reconstructed column to the DataFrame
         new_df[base_name] = new_column_values
 
         all_ohe_cols_to_drop.extend(ohe_cols)
         reconstructed_count += 1
-        print(f"  - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")
+        if verbose:
+            print(f"  - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")
 
+    # Cleanup
     if drop_original and all_ohe_cols_to_drop:
         # Drop the original OHE columns, ensuring no duplicates in the drop list
         unique_cols_to_drop = list(set(all_ohe_cols_to_drop))
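A sketch of the reworked `reconstruct_one_hot` call, mixing both accepted formats (hypothetical columns; `separator='_'` and `drop_original=True` are the defaults):

    import pandas as pd
    from ml_tools.data_exploration import reconstruct_one_hot

    df = pd.DataFrame({
        "Color_Blue": [1, 0, 0],
        "Color_Red":  [0, 1, 0],
        "Mood_happy": [0, 1, 0],
        "Mood_sad":   [1, 0, 0],
    })

    restored = reconstruct_one_hot(
        df,
        features_to_reconstruct=[
            ("Color", "Green"),  # all-zero rows -> baseline "Green" (drop_first=True case)
            "Mood",              # all-zero rows -> NaN
        ],
    )
    # Row 2 is all zeros in both groups: "Color" becomes "Green", "Mood" becomes NaN.
    print(restored)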

ml_tools/optimization_tools.py
@@ -98,7 +98,7 @@ def create_optimization_bounds(
 
     # 3. Populate categorical bounds (Index-based)
     # The indices in categorical_map (e.g., {2: 4}) directly correspond
-    # to the indices in our new `feature_names` list.
+    # to the indices in the `feature_names` list.
     for index, cardinality in categorical_map.items():
         if not (0 <= index < total_features):
             _LOGGER.error(f"Categorical index {index} is out of range for the {total_features} features.")
@@ -125,8 +125,8 @@
             # Map name to its index in the *feature-only* list
             index = feature_names.index(name)
         except ValueError:
-            _LOGGER.error(f"Feature name '{name}' from 'continuous_bounds_map' not found in the CSV's feature columns.")
-            raise ValueError()
+            _LOGGER.warning(f"Feature name '{name}' from 'continuous_bounds_map' not found in the CSV's feature columns.")
+            continue
 
         if lower_bounds[index] is not None:
             # This index was already set by the categorical map

pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "12.7.0"
+version = "12.9.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl L. Loza Vidaurre", email = "luigiloza@gmail.com" }