dragon-ml-toolbox 12.1.0__tar.gz → 12.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (46)
  1. {dragon_ml_toolbox-12.1.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-12.3.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/ML_optimization.py +73 -15
  4. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/data_exploration.py +107 -3
  5. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/optimization_tools.py +1 -1
  6. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/pyproject.toml +1 -1
  7. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/LICENSE +0 -0
  8. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/LICENSE-THIRD-PARTY.md +0 -0
  9. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/README.md +0 -0
  10. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  11. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  12. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  13. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  14. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/ETL_cleaning.py +0 -0
  15. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/ETL_engineering.py +0 -0
  16. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/GUI_tools.py +0 -0
  17. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/MICE_imputation.py +0 -0
  18. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/ML_callbacks.py +0 -0
  19. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/ML_datasetmaster.py +0 -0
  20. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/ML_evaluation.py +0 -0
  21. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/ML_evaluation_multi.py +0 -0
  22. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/ML_inference.py +0 -0
  23. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/ML_models.py +0 -0
  24. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/ML_scaler.py +0 -0
  25. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/ML_simple_optimization.py +0 -0
  26. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/ML_trainer.py +0 -0
  27. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/ML_utilities.py +0 -0
  28. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/PSO_optimization.py +0 -0
  29. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/RNN_forecast.py +0 -0
  30. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/SQL.py +0 -0
  31. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/VIF_factor.py +0 -0
  32. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/__init__.py +0 -0
  33. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/_logger.py +0 -0
  34. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/_script_info.py +0 -0
  35. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/constants.py +0 -0
  36. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/custom_logger.py +0 -0
  37. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/ensemble_evaluation.py +0 -0
  38. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/ensemble_inference.py +0 -0
  39. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/ensemble_learning.py +0 -0
  40. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/handle_excel.py +0 -0
  41. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/keys.py +0 -0
  42. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/math_utilities.py +0 -0
  43. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/path_manager.py +0 -0
  44. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/serde.py +0 -0
  45. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/ml_tools/utilities.py +0 -0
  46. {dragon_ml_toolbox-12.1.0 → dragon_ml_toolbox-12.3.0}/setup.cfg +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 12.1.0
+Version: 12.3.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT
dragon_ml_toolbox.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 12.1.0
+Version: 12.3.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT
ml_tools/ML_optimization.py

@@ -24,6 +24,7 @@ from .math_utilities import discretize_categorical_values

 __all__ = [
     "MLOptimizer",
+    "FitnessEvaluator",
     "create_pytorch_problem",
     "run_optimization"
 ]
@@ -33,8 +34,8 @@ class MLOptimizer:
     """
     A wrapper class for setting up and running EvoTorch optimization tasks.

-    This class combines the functionality of `create_pytorch_problem` and
-    `run_optimization` functions into a single, streamlined workflow.
+    This class combines the functionality of `FitnessEvaluator`, `create_pytorch_problem`, and
+    `run_optimization` into a single, streamlined workflow.

     SNES and CEM algorithms do not accept bounds, the given bounds will be used as an initial starting point.

@@ -91,9 +92,16 @@ class MLOptimizer:
                 False if it starts at 1 (e.g., [1, 2, 3]).
             **searcher_kwargs: Additional keyword arguments for the selected search algorithm's constructor.
         """
+        # Make a fitness function
+        self.evaluator = FitnessEvaluator(
+            inference_handler=inference_handler,
+            categorical_index_map=categorical_index_map,
+            discretize_start_at_zero=discretize_start_at_zero
+        )
+
         # Call the existing factory function to get the problem and searcher factory
         self.problem, self.searcher_factory = create_pytorch_problem(
-            inference_handler=inference_handler,
+            evaluator=self.evaluator,
             bounds=bounds,
             task=task,
             algorithm=algorithm,
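
For callers, the public surface of `MLOptimizer` is unchanged; the wrapper now builds a `FitnessEvaluator` internally from the same arguments. Below is a minimal construction sketch, assuming an already-initialized `PyTorchInferenceHandler` named `handler`; the keyword names mirror those used inside `__init__` above, while the import path, bounds, and cardinality values are invented for illustration:

# Hypothetical usage sketch (values are illustrative only).
from ml_tools.ML_optimization import MLOptimizer  # assumed import path based on the sdist layout

# `handler` is assumed to be an already-initialized PyTorchInferenceHandler.
lower = [0.0, 0.0, 0.0]   # two continuous features plus one categorical slot
upper = [10.0, 5.0, 2.0]  # categorical column has cardinality 3 -> valid codes 0..2

optimizer = MLOptimizer(
    inference_handler=handler,
    bounds=(lower, upper),
    task="max",
    algorithm="Genetic",
    categorical_index_map={2: 3},   # column 2 holds a categorical with 3 levels
    discretize_start_at_zero=True,
)
# optimizer.evaluator is the FitnessEvaluator built internally (see the class added below).
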
@@ -144,10 +152,67 @@ class MLOptimizer:
             categorical_mappings=self.categorical_mappings,
             discretize_start_at_zero=self.discretize_start_at_zero
         )
+
+
+class FitnessEvaluator:
+    """
+    A callable class that wraps the PyTorch model inference handler and performs
+    on-the-fly discretization for the EvoTorch fitness function.
+
+    This class is automatically instantiated by MLOptimizer and passed to
+    create_pytorch_problem, encapsulating the evaluation logic.
+    """
+    def __init__(self,
+                 inference_handler: PyTorchInferenceHandler,
+                 categorical_index_map: Optional[Dict[int, int]] = None,
+                 discretize_start_at_zero: bool = True):
+        """
+        Initializes the fitness evaluator.
+
+        Args:
+            inference_handler (PyTorchInferenceHandler):
+                An initialized inference handler containing the model.
+            categorical_index_map (Dict[int, int] | None):
+                Maps {column_index: cardinality} for discretization.
+            discretize_start_at_zero (bool):
+                True if discrete encoding starts at 0.
+        """
+        self.inference_handler = inference_handler
+        self.categorical_index_map = categorical_index_map
+        self.discretize_start_at_zero = discretize_start_at_zero
+
+        # Expose the device
+        self.device = self.inference_handler.device
+
+    def __call__(self, solution_tensor: torch.Tensor) -> torch.Tensor:
+        """
+        This is the fitness function EvoTorch will call.
+
+        It receives a batch of continuous solutions, discretizes the
+        categorical ones, and returns the model's predictions.
+        """
+        # Clone to avoid modifying the optimizer's internal state (SNES, CEM, GA)
+        processed_tensor = solution_tensor.clone()
+
+        if self.categorical_index_map:
+            for col_idx, cardinality in self.categorical_index_map.items():
+                # 1. Round (using torch.floor(x + 0.5) for "round half up" behavior)
+                rounded_col = torch.floor(processed_tensor[:, col_idx] + 0.5)
+
+                # 2. Determine clamping bounds
+                min_bound = 0 if self.discretize_start_at_zero else 1
+                max_bound = cardinality - 1 if self.discretize_start_at_zero else cardinality
+
+                # 3. Clamp the values and update the processed tensor
+                processed_tensor[:, col_idx] = torch.clamp(rounded_col, min_bound, max_bound)
+
+        # Use the *processed_tensor* for prediction
+        predictions = self.inference_handler.predict_batch(processed_tensor)[PyTorchInferenceKeys.PREDICTIONS]
+        return predictions.flatten()


 def create_pytorch_problem(
-    inference_handler: PyTorchInferenceHandler,
+    evaluator: FitnessEvaluator,
     bounds: Tuple[List[float], List[float]],
     task: Literal["min", "max"],
     algorithm: Literal["SNES", "CEM", "Genetic"] = "Genetic",
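
The behavioral core of the new class is the rounding-and-clamping step in `__call__`: the optimizer keeps proposing continuous values, but categorical columns are snapped to valid integer codes before each batch is sent to the model. A standalone sketch of just that step in plain PyTorch, with the tensor values and the {column_index: cardinality} map invented for illustration:

import torch

# Toy batch of 3 candidate solutions with 3 features each;
# column 2 encodes a categorical feature with cardinality 3 (valid codes 0, 1, 2).
solutions = torch.tensor([
    [0.2, 4.7, -0.4],
    [1.9, 3.1,  1.5],
    [7.3, 0.8,  9.0],
])
categorical_index_map = {2: 3}
discretize_start_at_zero = True

processed = solutions.clone()  # keep the optimizer's own tensor untouched
for col_idx, cardinality in categorical_index_map.items():
    rounded = torch.floor(processed[:, col_idx] + 0.5)     # round half up
    lo = 0 if discretize_start_at_zero else 1
    hi = cardinality - 1 if discretize_start_at_zero else cardinality
    processed[:, col_idx] = torch.clamp(rounded, lo, hi)   # clamp to the valid code range

print(processed[:, 2])  # tensor([0., 2., 2.]): -0.4 -> 0, 1.5 -> 2, 9.0 -> 2
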
@@ -162,7 +227,7 @@ def create_pytorch_problem(
     The Genetic Algorithm works directly with the bounds, and operators such as SimulatedBinaryCrossOver and GaussianMutation.

     Args:
-        inference_handler (PyTorchInferenceHandler): An initialized inference handler containing the model and weights.
+        evaluator (FitnessEvaluator): A callable class that wraps the model inference and handles on-the-fly discretization.
         bounds (tuple[list[float], list[float]]): A tuple containing the lower and upper bounds for the solution features.
             Use the `optimization_tools.create_optimization_bounds()` helper to easily generate this and ensure unbiased categorical bounds.
         task (str): The optimization goal, either "minimize" or "maximize".
@@ -180,20 +245,13 @@ def create_pytorch_problem(
     upper_bounds = list(bounds[1])

     solution_length = len(lower_bounds)
-    device = inference_handler.device
+    device = evaluator.device

-    # Define the fitness function that EvoTorch will call.
-    def fitness_func(solution_tensor: torch.Tensor) -> torch.Tensor:
-        # Directly use the continuous-valued tensor from the optimizer for prediction
-        predictions = inference_handler.predict_batch(solution_tensor)[PyTorchInferenceKeys.PREDICTIONS]
-        return predictions.flatten()
-
-
     # Create the Problem instance.
     if algorithm == "CEM" or algorithm == "SNES":
         problem = evotorch.Problem(
             objective_sense=task,
-            objective_func=fitness_func,
+            objective_func=evaluator,
             solution_length=solution_length,
             initial_bounds=(lower_bounds, upper_bounds),
             device=device,
@@ -219,7 +277,7 @@ def create_pytorch_problem(
     elif algorithm == "Genetic":
         problem = evotorch.Problem(
             objective_sense=task,
-            objective_func=fitness_func,
+            objective_func=evaluator,
             solution_length=solution_length,
             bounds=(lower_bounds, upper_bounds),
             device=device,
ml_tools/data_exploration.py

@@ -3,7 +3,7 @@ from pandas.api.types import is_numeric_dtype
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
-from typing import Union, Literal, Dict, Tuple, List, Optional
+from typing import Union, Literal, Dict, Tuple, List, Optional, Any
 from pathlib import Path
 import re

@@ -33,7 +33,8 @@ __all__ = [
     "match_and_filter_columns_by_regex",
     "standardize_percentages",
     "create_transformer_categorical_map",
-    "reconstruct_one_hot"
+    "reconstruct_one_hot",
+    "reconstruct_binary"
 ]


@@ -1081,7 +1082,110 @@ def reconstruct_one_hot(
         unique_cols_to_drop = list(set(all_ohe_cols_to_drop))
         new_df.drop(columns=unique_cols_to_drop, inplace=True)
         _LOGGER.info(f"Dropped {len(unique_cols_to_drop)} original one-hot encoded columns.")
-
+
+    _LOGGER.info(f"Successfully reconstructed {reconstructed_count} feature(s).")
+
+    return new_df
+
+
+def reconstruct_binary(
+    df: pd.DataFrame,
+    reconstruction_map: Dict[str, Tuple[str, Any, Any]],
+    drop_original: bool = True,
+    verbose: bool = True
+) -> pd.DataFrame:
+    """
+    Reconstructs new categorical columns from existing binary (0/1) columns.
+
+    Used to reverse a binary encoding by mapping 0 and 1 back to
+    descriptive categorical labels.
+
+    Args:
+        df (pd.DataFrame):
+            The input DataFrame.
+        reconstruction_map (Dict[str, Tuple[str, Any, Any]]):
+            A dictionary defining the reconstructions.
+            Format:
+            { "new_col_name": ("source_col_name", "label_for_0", "label_for_1") }
+            Example:
+            {
+                "Sex": ("Sex_male", "Female", "Male"),
+                "Smoker": ("Is_Smoker", "No", "Yes")
+            }
+        drop_original (bool):
+            If True, the original binary source columns (e.g., "Sex_male")
+            will be dropped from the returned DataFrame.
+        verbose (bool):
+            If True, prints the details of each reconstruction.
+
+    Returns:
+        pd.DataFrame:
+            A new DataFrame with the reconstructed categorical columns.
+
+    Raises:
+        TypeError: If `df` is not a pandas DataFrame.
+        ValueError: If `reconstruction_map` is not a dictionary or a
+            configuration is invalid (e.g., column name collision).
+
+    Notes:
+        - The function operates on a copy of the DataFrame.
+        - Rows with `NaN` in the source column will have `NaN` in the
+          new column.
+        - Values in the source column other than 0 or 1 (e.g., 2) will
+          result in `NaN` in the new column.
+    """
+    if not isinstance(df, pd.DataFrame):
+        _LOGGER.error("Input must be a pandas DataFrame.")
+        raise TypeError()
+
+    if not isinstance(reconstruction_map, dict):
+        _LOGGER.error("`reconstruction_map` must be a dictionary with the required format.")
+        raise ValueError()
+
+    new_df = df.copy()
+    source_cols_to_drop: List[str] = []
+    reconstructed_count = 0
+
+    _LOGGER.info(f"Attempting to reconstruct {len(reconstruction_map)} binary feature(s).")
+
+    for new_col_name, config in reconstruction_map.items():
+
+        # --- 1. Validation ---
+        if not (isinstance(config, tuple) and len(config) == 3):
+            _LOGGER.error(f"Config for '{new_col_name}' is invalid. Must be a 3-item tuple. Skipping.")
+            raise ValueError()
+
+        source_col, label_for_0, label_for_1 = config
+
+        if source_col not in new_df.columns:
+            _LOGGER.error(f"Source column '{source_col}' for new column '{new_col_name}' not found. Skipping.")
+            raise ValueError()
+
+        if new_col_name in new_df.columns and verbose:
+            _LOGGER.warning(f"New column '{new_col_name}' already exists and will be overwritten.")
+
+        if new_col_name == source_col:
+            _LOGGER.error(f"New column name '{new_col_name}' cannot be the same as source column '{source_col}'.")
+            raise ValueError()
+
+        # --- 2. Reconstruction ---
+        # .map() handles 0, 1, preserves NaNs, and converts any other value to NaN.
+        mapping_dict = {0: label_for_0, 1: label_for_1}
+        new_df[new_col_name] = new_df[source_col].map(mapping_dict)
+
+        # --- 3. Logging/Tracking ---
+        source_cols_to_drop.append(source_col)
+        reconstructed_count += 1
+        if verbose:
+            print(f"  - Reconstructed '{new_col_name}' from '{source_col}' (0='{label_for_0}', 1='{label_for_1}').")
+
+    # --- 4. Cleanup ---
+    if drop_original and source_cols_to_drop:
+        # Use set() to avoid duplicates if the same source col was used
+        unique_cols_to_drop = list(set(source_cols_to_drop))
+        new_df.drop(columns=unique_cols_to_drop, inplace=True)
+        _LOGGER.info(f"Dropped {len(unique_cols_to_drop)} original binary source column(s).")
+
     _LOGGER.info(f"Successfully reconstructed {reconstructed_count} feature(s).")

     return new_df
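
A brief usage sketch for the new `reconstruct_binary` helper, following the `reconstruction_map` format documented above; the DataFrame contents and the import path `ml_tools.data_exploration` are illustrative assumptions based on the sdist layout:

import numpy as np
import pandas as pd
from ml_tools.data_exploration import reconstruct_binary  # assumed import path

df = pd.DataFrame({
    "Sex_male": [1, 0, np.nan, 1],
    "Is_Smoker": [0, 1, 0, 2],   # the stray 2 becomes NaN in the new column
    "Age": [34, 51, 29, 45],
})

restored = reconstruct_binary(
    df,
    reconstruction_map={
        "Sex": ("Sex_male", "Female", "Male"),
        "Smoker": ("Is_Smoker", "No", "Yes"),
    },
    verbose=False,
)

print(restored)
#       Sex Smoker  Age
# 0    Male     No   34
# 1  Female    Yes   51
# 2     NaN     No   29
# 3    Male    NaN   45
# (Sex_male and Is_Smoker are dropped because drop_original defaults to True.)
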
ml_tools/optimization_tools.py

@@ -66,7 +66,7 @@ def create_optimization_bounds(
     # 1. Read header and determine feature names
     full_csv_path = make_fullpath(csv_path, enforce="file")
     try:
-        df_header = pd.read_csv(full_csv_path, nrows=0)
+        df_header = pd.read_csv(full_csv_path, nrows=0, encoding="utf-8")
     except Exception as e:
         _LOGGER.error(f"Failed to read header from CSV: {e}")
         raise
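
The only change here is an explicit UTF-8 encoding for the header read. For reference, `pd.read_csv(..., nrows=0)` parses only the header row, which is why it is used to collect feature names cheaply; a minimal standalone sketch, with the file name invented:

import pandas as pd

# nrows=0 parses column names without loading any data rows, and an explicit
# encoding avoids platform-dependent default-codec surprises.
df_header = pd.read_csv("features.csv", nrows=0, encoding="utf-8")
feature_names = list(df_header.columns)
print(feature_names)
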
pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "12.1.0"
+version = "12.3.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl L. Loza Vidaurre", email = "luigiloza@gmail.com" }