dragon-ml-toolbox 1.4.1__py3-none-any.whl → 1.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,11 +5,10 @@ import xgboost as xgb
5
5
  import lightgbm as lgb
6
6
  from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
7
7
  from sklearn.base import ClassifierMixin
8
- from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
9
- from typing import Literal, Union, Tuple, Dict
8
+ from typing import Literal, Union, Tuple, Dict, Optional
10
9
  import polars as pl
11
10
  from functools import partial
12
- from .utilities import sanitize_filename, _script_info
11
+ from .utilities import sanitize_filename, _script_info, threshold_binary_values
13
12
 
14
13
 
15
14
  __all__ = [
@@ -20,14 +19,14 @@ __all__ = [
20
19
 
21
20
  class ObjectiveFunction():
22
21
  """
23
- Callable objective function designed for optimizing continuous outputs from regression models.
22
+ Callable objective function designed for optimizing continuous outputs from tree-based regression models.
24
23
 
25
- The target serialized file (joblib) must include a 'model' and a 'scaler'. Additionally 'feature_names' and 'target_name' will be parsed if present.
24
+ The target serialized file (joblib) must include a trained tree-based 'model'. Additionally 'feature_names' and 'target_name' will be parsed if present.
26
25
 
27
26
  Parameters
28
27
  ----------
29
28
  trained_model_path : str
30
- Path to a serialized model and its scaler (joblib) compatible with scikit-learn-like `.predict`.
29
+ Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
31
30
  add_noise : bool
32
31
  Whether to apply multiplicative noise to the input features during evaluation.
33
32
  binary_features : int, default=0
@@ -35,15 +34,14 @@ class ObjectiveFunction():
35
34
  task : Literal, default 'maximization'
36
35
  Whether to maximize or minimize the target.
37
36
  """
38
- def __init__(self, trained_model_path: str, add_noise: bool=True, task: Literal["maximization", "minimization"]="maximization", binary_features: int=0) -> None:
37
+ def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int=0) -> None:
39
38
  self.binary_features = binary_features
40
39
  self.is_hybrid = False if binary_features <= 0 else True
41
40
  self.use_noise = add_noise
42
41
  self._artifact = joblib.load(trained_model_path)
43
42
  self.model = self._get_from_artifact('model')
44
- self.scaler = self._get_from_artifact('scaler')
45
- self.feature_names: list[str] = self._get_from_artifact('feature_names') # type: ignore
46
- self.target_name: str = self._get_from_artifact('target_name') # type: ignore
43
+ self.feature_names: Optional[list[str]] = self._get_from_artifact('feature_names') # type: ignore
44
+ self.target_name: Optional[str] = self._get_from_artifact('target_name') # type: ignore
47
45
  self.task = task
48
46
  self.check_model() # check for classification models and None values
49
47
 
@@ -51,16 +49,15 @@ class ObjectiveFunction():
51
49
  if self.use_noise:
52
50
  features_array = self.add_noise(features_array)
53
51
  if self.is_hybrid:
54
- features_array = self._handle_hybrid(features_array)
52
+ features_array = threshold_binary_values(input_array=features_array, binary_features=self.binary_features)
55
53
 
56
54
  if features_array.ndim == 1:
57
55
  features_array = features_array.reshape(1, -1)
58
56
 
59
- # scale features as the model expects
60
- features_array = self.scaler.transform(features_array) # type: ignore
61
-
62
57
  result = self.model.predict(features_array) # type: ignore
63
58
  scalar = result.item()
59
+ # print(f"[DEBUG] Model predicted: {scalar}")
60
+
64
61
  # pso minimizes by default, so we return the negative value to maximize
65
62
  if self.task == "maximization":
66
63
  return -scalar
@@ -68,33 +65,22 @@ class ObjectiveFunction():
68
65
  return scalar
69
66
 
70
67
  def add_noise(self, features_array):
71
- noise_range = np.random.uniform(0.95, 1.05, size=features_array.shape)
72
- new_feature_values = features_array * noise_range
73
- return new_feature_values
74
-
75
- def _handle_hybrid(self, features_array):
76
- total_features = features_array.shape[0]
77
- if self.binary_features > total_features:
78
- raise ValueError("self.binary_features exceeds total number of features.")
79
-
80
- # Handle corner case where all features are binary
81
- if self.binary_features == total_features:
82
- feat_binary = (features_array > 0.5).astype(int)
83
- return feat_binary
84
-
85
- # Normal case: split into continuous and binary parts
86
- feat_continuous = features_array[:-self.binary_features]
87
- feat_binary = (features_array[-self.binary_features:] > 0.5).astype(int) #threshold binary values
88
- new_feature_values = np.concatenate([feat_continuous, feat_binary])
89
- return new_feature_values
68
+ if self.binary_features > 0:
69
+ split_idx = -self.binary_features
70
+ cont_part = features_array[:split_idx]
71
+ bin_part = features_array[split_idx:]
72
+ noise = np.random.uniform(0.95, 1.05, size=cont_part.shape)
73
+ cont_noised = cont_part * noise
74
+ return np.concatenate([cont_noised, bin_part])
75
+ else:
76
+ noise = np.random.uniform(0.95, 1.05, size=features_array.shape)
77
+ return features_array * noise
90
78
 
91
79
  def check_model(self):
92
80
  if isinstance(self.model, ClassifierMixin) or isinstance(self.model, xgb.XGBClassifier) or isinstance(self.model, lgb.LGBMClassifier):
93
81
  raise ValueError(f"[Model Check Failed] ❌\nThe loaded model ({type(self.model).__name__}) is a Classifier.\nOptimization is not suitable for standard classification tasks.")
94
82
  if self.model is None:
95
83
  raise ValueError("Loaded model is None")
96
- if self.scaler is None:
97
- raise ValueError("Loaded scaler is None")
98
84
 
99
85
  def _get_from_artifact(self, key: str):
100
86
  val = self._artifact.get(key)
@@ -105,7 +91,7 @@ class ObjectiveFunction():
105
91
  return result
106
92
 
107
93
  def __repr__(self):
108
- return (f"<ObjectiveFunction(model={type(self.model).__name__}, scaler={type(self.scaler).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")
94
+ return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")
109
95
 
110
96
 
111
97
  def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
@@ -142,11 +128,11 @@ def run_pso(lower_boundaries: list[float],
142
128
  auto_binary_boundaries: bool=True,
143
129
  target_name: Union[str, None]=None,
144
130
  feature_names: Union[list[str], None]=None,
145
- swarm_size: int=100,
146
- max_iterations: int=100,
131
+ swarm_size: int=200,
132
+ max_iterations: int=1000,
147
133
  inequality_constrain_function=None,
148
- post_hoc_analysis: Union[int, None]=None,
149
- workers: int=5) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
134
+ post_hoc_analysis: Optional[int]=3,
135
+ workers: int=3) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
150
136
  """
151
137
  Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
152
138
 
@@ -157,7 +143,7 @@ def run_pso(lower_boundaries: list[float],
157
143
  upper_boundaries : list[float]
158
144
  Upper bounds for each feature in the search space (as many as features expected by the model).
159
145
  objective_function : ObjectiveFunction
160
- A callable object encapsulating a regression model and its scaler.
146
+ A callable object encapsulating a tree-based regression model.
161
147
  save_results_dir : str
162
148
  Directory path to save the results CSV file.
163
149
  auto_binary_boundaries : bool
@@ -172,7 +158,7 @@ def run_pso(lower_boundaries: list[float],
172
158
  Maximum number of iterations for the optimization algorithm.
173
159
  inequality_constrain_function : callable or None, optional
174
160
  Optional function defining inequality constraints to be respected by the optimization.
175
- post_hoc_analysis : int or None, optional
161
+ post_hoc_analysis : int or None
176
162
  If specified, runs the optimization multiple times to perform post hoc analysis. The value indicates the number of repetitions.
177
163
  workers : int
178
164
  Number of parallel processes to use.
@@ -191,7 +177,6 @@ def run_pso(lower_boundaries: list[float],
191
177
  Notes
192
178
  -----
193
179
  - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
194
- - Feature values are scaled before being passed to the model and inverse-transformed before result saving.
195
180
  """
196
181
  # Append binary boundaries
197
182
  binary_number = objective_function.binary_features
@@ -229,12 +214,15 @@ def run_pso(lower_boundaries: list[float],
229
214
  best_features, best_target, *_ = _pso(**arguments)
230
215
  # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
231
216
 
232
- # inverse transformation
233
- best_features = np.array(best_features).reshape(1, -1)
234
- best_features_real = objective_function.scaler.inverse_transform(best_features).flatten() # type: ignore
217
+ # flip best_target if maximization was used
218
+ if objective_function.task == "maximization":
219
+ best_target = -best_target
220
+
221
+ # threshold binary features
222
+ best_features_threshold = threshold_binary_values(best_features, binary_number)
235
223
 
236
224
  # name features
237
- best_features_named = {name: value for name, value in zip(names, best_features_real)}
225
+ best_features_named = {name: value for name, value in zip(names, best_features_threshold)}
238
226
  best_target_named = {target_name: best_target}
239
227
 
240
228
  # save results
@@ -248,11 +236,14 @@ def run_pso(lower_boundaries: list[float],
248
236
  best_features, best_target, *_ = _pso(**arguments)
249
237
  # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
250
238
 
251
- # inverse transformation
252
- best_features = np.array(best_features).reshape(1, -1)
253
- best_features_real = objective_function.scaler.inverse_transform(best_features).flatten() # type: ignore
239
+ # flip best_target if maximization was used
240
+ if objective_function.task == "maximization":
241
+ best_target = -best_target
242
+
243
+ # threshold binary features
244
+ best_features_threshold = threshold_binary_values(best_features, binary_number)
254
245
 
255
- for i, best_feature in enumerate(best_features_real):
246
+ for i, best_feature in enumerate(best_features_threshold):
256
247
  all_best_features[i].append(best_feature)
257
248
  all_best_targets.append(best_target)
258
249
 
@@ -270,7 +261,7 @@ def info():
270
261
  _script_info(__all__)
271
262
 
272
263
 
273
- ### SOURCE CODE FOR PSO ###
264
+ ### SOURCE CODE FOR PSO FROM PYSWARM ###
274
265
  def _obj_wrapper(func, args, kwargs, x):
275
266
  return func(x, *args, **kwargs)
276
267
 
ml_tools/utilities.py CHANGED
@@ -4,7 +4,7 @@ import pandas as pd
4
4
  import os
5
5
  from pathlib import Path
6
6
  import re
7
- from typing import Literal
7
+ from typing import Literal, Union, Sequence
8
8
 
9
9
 
10
10
  # Keep track of available tools
@@ -15,7 +15,8 @@ __all__ = [
15
15
  "merge_dataframes",
16
16
  "save_dataframe",
17
17
  "normalize_mixed_list",
18
- "sanitize_filename"
18
+ "sanitize_filename",
19
+ "threshold_binary_values"
19
20
  ]
20
21
 
21
22
 
@@ -94,7 +95,8 @@ def yield_dataframes_from_dir(datasets_dir: str):
94
95
  def merge_dataframes(
95
96
  *dfs: pd.DataFrame,
96
97
  reset_index: bool = False,
97
- direction: Literal["horizontal", "vertical"] = "horizontal"
98
+ direction: Literal["horizontal", "vertical"] = "horizontal",
99
+ verbose: bool=True
98
100
  ) -> pd.DataFrame:
99
101
  """
100
102
  Merges multiple DataFrames either horizontally or vertically.
@@ -118,8 +120,9 @@ def merge_dataframes(
118
120
  if len(dfs) < 2:
119
121
  raise ValueError("At least 2 DataFrames must be provided.")
120
122
 
121
- for i, df in enumerate(dfs, start=1):
122
- print(f"DataFrame {i} shape: {df.shape}")
123
+ if verbose:
124
+ for i, df in enumerate(dfs, start=1):
125
+ print(f"DataFrame {i} shape: {df.shape}")
123
126
 
124
127
 
125
128
  if direction == "horizontal":
@@ -141,8 +144,9 @@ def merge_dataframes(
141
144
 
142
145
  if reset_index:
143
146
  merged_df = merged_df.reset_index(drop=True)
144
-
145
- print(f"Merged DataFrame shape: {merged_df.shape}")
147
+
148
+ if verbose:
149
+ print(f"Merged DataFrame shape: {merged_df.shape}")
146
150
 
147
151
  return merged_df
148
152
 
@@ -170,7 +174,7 @@ def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
170
174
  output_path = os.path.join(save_dir, filename)
171
175
 
172
176
  df.to_csv(output_path, index=False, encoding='utf-8')
173
- print(f"✅ Saved file: '{filename}'")
177
+ print(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")
174
178
 
175
179
 
176
180
  def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
@@ -263,6 +267,38 @@ def sanitize_filename(filename: str) -> str:
263
267
  return sanitized
264
268
 
265
269
 
270
+ def threshold_binary_values(
271
+ input_array: Union[Sequence[float], np.ndarray],
272
+ binary_features: int
273
+ ) -> np.ndarray:
274
+ """
275
+ Thresholds binary features in a 1D numeric sequence. Binary features must be located at the end of the sequence.
276
+
277
+ Converts binary elements to values (0 or 1) using a threshold of 0.5. The rest of the array (assumed to be continuous features) is returned unchanged.
278
+
279
+ Parameters:
280
+ input_array (Union[Sequence[float], np.ndarray]) : A one-dimensional collection of numeric values. The binary features must be located at the end of the array.
281
+
282
+ binary_features (int) : Number of binary features to threshold from the end of the array. Must be between 0 and the total number of elements.
283
+
284
+ Returns:
285
+ np.ndarray : A 1D NumPy array where the final `binary_features` values have been binarized.
286
+ """
287
+ array = np.asarray(input_array).flatten()
288
+ total = array.shape[0]
289
+
290
+ if binary_features < 0 or binary_features > total:
291
+ raise ValueError("Binary features must be between 0 and the total number of features.")
292
+
293
+ if binary_features == 0:
294
+ return array
295
+
296
+ cont_part = array[:-binary_features]
297
+ bin_part = (array[-binary_features:] > 0.5).astype(int)
298
+
299
+ return np.concatenate([cont_part, bin_part])
300
+
301
+
266
302
  def _script_info(all_data: list[str]):
267
303
  """
268
304
  List available names.
@@ -1,19 +0,0 @@
1
- dragon_ml_toolbox-1.4.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
- dragon_ml_toolbox-1.4.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
3
- ml_tools/MICE_imputation.py,sha256=CK0tYZ_kQkdETohOlhI7RP7oFkJTXrP-XtIxb--dzpU,9726
4
- ml_tools/VIF_factor.py,sha256=LQWr1P8WYij07FX_3RZC6Rr22bfAMnrt0Lhvi7SbBpY,9846
5
- ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- ml_tools/data_exploration.py,sha256=FXP5i6bQo8J3RCyLRmlX-qJVh4VH8DbMjrdUmyd1mF0,18708
7
- ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
8
- ml_tools/ensemble_learning.py,sha256=khXXRiR7boWwI4CAvb2bxzS3fhLADNETMOiRe3ihZ4Y,28821
9
- ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
10
- ml_tools/logger.py,sha256=NOtL3YSuffAGmpTpXjY-uJjqFLdRG_jpL7MDyloBw9c,4712
11
- ml_tools/particle_swarm_optimization.py,sha256=714kZo6lvUvRaPTtj6kJGecZwHcehcSkLysokXAf3No,20706
12
- ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
13
- ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
14
- ml_tools/utilities.py,sha256=z2JPy4GM2YBLUC0sPq7aNLuesPFAQu5KNcsgmuOywdU,8738
15
- ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
16
- dragon_ml_toolbox-1.4.1.dist-info/METADATA,sha256=0XdPwNWe81rCvJLJfSS5XvB2ZdJKpBLLoqMU5uxYLMc,2516
17
- dragon_ml_toolbox-1.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
18
- dragon_ml_toolbox-1.4.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
19
- dragon_ml_toolbox-1.4.1.dist-info/RECORD,,