dragon-ml-toolbox 1.4.8__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,490 @@
+ import numpy as np
+ from pathlib import Path
+ import xgboost as xgb
+ import lightgbm as lgb
+ from sklearn.ensemble import HistGradientBoostingRegressor
+ from sklearn.base import ClassifierMixin
+ from typing import Literal, Union, Tuple, Dict, Optional
+ import pandas as pd
+ from copy import deepcopy
+ from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe, make_fullpath
+ import torch
+ from tqdm import trange
+
+
+ __all__ = [
+     "ObjectiveFunction",
+     "multiple_objective_functions_from_dir",
+     "run_pso"
+ ]
+
+
+ class ObjectiveFunction():
+     """
+     Callable objective function designed for optimizing continuous outputs from tree-based regression models.
+
+     The target serialized file (joblib) must include a trained tree-based 'model'. 'feature_names' and 'target_name' are also parsed if present.
+
+     Parameters
+     ----------
+     trained_model_path : str | Path
+         Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
+     add_noise : bool
+         Whether to apply multiplicative noise to the input features during evaluation.
+     task : Literal["maximization", "minimization"]
+         Whether to maximize or minimize the target.
+     binary_features : int
+         Number of binary features located at the END of the feature vector. The model should be trained with continuous features first, followed by binary features.
+     """
+     def __init__(self, trained_model_path: Union[str, Path], add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int) -> None:
+         self.binary_features = binary_features
+         self.is_hybrid = binary_features > 0
+         self.use_noise = add_noise
+         self._artifact = deserialize_object(trained_model_path, verbose=False, raise_on_error=True)
+         self.model = self._get_from_artifact('model')
+         self.feature_names: Optional[list[str]] = self._get_from_artifact('feature_names') # type: ignore
+         self.target_name: Optional[str] = self._get_from_artifact('target_name') # type: ignore
+         self.task = task
+         self.check_model() # check for classification models and None values
+
+     def __call__(self, features_array: np.ndarray) -> np.ndarray:
+         """
+         Batched evaluation for PSO. Accepts a 2D array (n_samples, n_features).
+
+         Applies optional noise and hybrid binary thresholding.
+
+         Returns
+         -------
+         np.ndarray
+             1D array of length n_samples containing predicted target values.
+         """
+         assert features_array.ndim == 2, f"Expected 2D array, got shape {features_array.shape}"
+
+         # Apply noise if enabled
+         if self.use_noise:
+             features_array = self.add_noise(features_array)
+
+         # Apply binary thresholding if enabled
+         if self.is_hybrid:
+             features_array = threshold_binary_values_batch(features_array, self.binary_features)
+
+         # Ensure correct type
+         features_array = features_array.astype(np.float32)
+
+         # Evaluate
+         result = self.model.predict(features_array) # type: ignore
+
+         # Flip sign if maximizing
+         if self.task == "maximization":
+             return -result
+         return result
+
+     def add_noise(self, features_array: np.ndarray) -> np.ndarray:
+         """
+         Apply multiplicative noise to an input feature batch (2D).
+         Binary features (if present) are excluded from noise injection.
+
+         Parameters
+         ----------
+         features_array : np.ndarray
+             Input array of shape (batch_size, n_features)
+
+         Returns
+         -------
+         np.ndarray
+             Noised array of the same shape
+         """
+         assert features_array.ndim == 2, "Expected 2D array for batch noise injection"
+
+         if self.binary_features > 0:
+             split_idx = -self.binary_features
+             cont_part = features_array[:, :split_idx]
+             bin_part = features_array[:, split_idx:]
+
+             noise = np.random.uniform(0.95, 1.05, size=cont_part.shape)
+             cont_noised = cont_part * noise
+
+             return np.hstack([cont_noised, bin_part])
+         else:
+             noise = np.random.uniform(0.95, 1.05, size=features_array.shape)
+             return features_array * noise
+
+     def check_model(self):
+         if isinstance(self.model, (ClassifierMixin, xgb.XGBClassifier, lgb.LGBMClassifier)):
+             raise ValueError(f"[Model Check Failed] ❌\nThe loaded model ({type(self.model).__name__}) is a Classifier.\nOptimization is not suitable for standard classification tasks.")
+         if self.model is None:
+             raise ValueError("Loaded model is None")
+
+     def _get_from_artifact(self, key: str):
+         if self._artifact is None:
+             raise TypeError("Failed to load the model artifact.")
+         val = self._artifact.get(key)
+         if key == "feature_names":
+             result = val if isinstance(val, list) and val else None
+         else:
+             result = val if val else None
+         return result
+
+     def __repr__(self):
+         return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")
+
+
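As a quick orientation to the new API, here is a minimal usage sketch (not part of the release; the artifact path and feature layout are hypothetical, and `ObjectiveFunction` is assumed to be imported from this module):

```python
import numpy as np

# Hypothetical artifact: a regressor trained on 3 continuous + 2 binary features
objective = ObjectiveFunction(
    trained_model_path="models/regressor.joblib",  # hypothetical path
    add_noise=True,
    task="maximization",
    binary_features=2,
)

# Batched evaluation, one row per PSO particle
candidates = np.random.rand(5, 5)  # (n_particles, n_features)
scores = objective(candidates)     # shape (5,); negated, since task="maximization"
```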
+ def multiple_objective_functions_from_dir(directory: Union[str, Path], add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int):
+     """
+     Loads multiple objective functions from serialized models in the given directory.
+
+     Each `.joblib` file is loaded and wrapped as an `ObjectiveFunction` instance. Returns a list of such instances along with their corresponding names.
+
+     Parameters:
+         directory (str | Path) : Path to the directory containing `.joblib` files (serialized models).
+         add_noise (bool) : Whether to apply multiplicative noise to the input features during evaluation.
+         task (Literal["maximization", "minimization"]) : Defines the nature of the optimization task.
+         binary_features (int) : Number of binary features expected by each objective function.
+
+     Returns:
+         (tuple[list[ObjectiveFunction], list[str]]) : A tuple containing:
+             - list of `ObjectiveFunction` instances.
+             - list of corresponding filenames.
+     """
+     objective_functions = list()
+     objective_function_names = list()
+     for file_name, file_path in list_files_by_extension(directory=directory, extension='joblib').items():
+         current_objective = ObjectiveFunction(trained_model_path=file_path,
+                                               add_noise=add_noise,
+                                               task=task,
+                                               binary_features=binary_features)
+         objective_functions.append(current_objective)
+         objective_function_names.append(file_name)
+     return objective_functions, objective_function_names
+
+
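A hedged sketch of batch-loading objectives (the folder name is hypothetical):

```python
# Hypothetical folder containing one .joblib artifact per target
objectives, names = multiple_objective_functions_from_dir(
    directory="models/",
    add_noise=False,
    task="minimization",
    binary_features=0,
)
for name, objective in zip(names, objectives):
    print(name, objective)  # uses ObjectiveFunction.__repr__
```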
+ def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
+     assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
+     assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
+     lower = np.array(lower_boundaries)
+     upper = np.array(upper_boundaries)
+     return lower, upper
+
+
+ def _set_feature_names(size: int, names: Union[list[str], None]):
+     if names is None:
+         return [str(i) for i in range(1, size+1)]
+     else:
+         assert len(names) == size, "The list of feature names does not match the number of features."
+         return names
+
+
+ def _save_results(*dicts, save_dir: Union[str, Path], target_name: str):
+     combined_dict = dict()
+     for single_dict in dicts:
+         combined_dict.update(single_dict)
+
+     df = pd.DataFrame(combined_dict)
+
+     save_dataframe(df=df, save_dir=save_dir, filename=f"Optimization_{target_name}")
+
+
+ def run_pso(lower_boundaries: list[float],
+             upper_boundaries: list[float],
+             objective_function: ObjectiveFunction,
+             save_results_dir: Union[str, Path],
+             auto_binary_boundaries: bool=True,
+             target_name: Union[str, None]=None,
+             feature_names: Union[list[str], None]=None,
+             swarm_size: int=200,
+             max_iterations: int=3000,
+             random_state: int=101,
+             post_hoc_analysis: Optional[int]=10) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
+     """
+     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
+
+     Parameters
+     ----------
+     lower_boundaries : list[float]
+         Lower bounds for each feature in the search space (as many as features expected by the model).
+     upper_boundaries : list[float]
+         Upper bounds for each feature in the search space (as many as features expected by the model).
+     objective_function : ObjectiveFunction
+         A callable object encapsulating a tree-based regression model.
+     save_results_dir : str | Path
+         Directory path to save the results CSV file.
+     auto_binary_boundaries : bool
+         Use `ObjectiveFunction.binary_features` to append as many binary boundaries as needed to `lower_boundaries` and `upper_boundaries` automatically.
+     target_name : str or None, optional
+         Name of the target variable. If None, attempts to retrieve it from the ObjectiveFunction object.
+     feature_names : list[str] or None, optional
+         List of feature names. If None, attempts to retrieve them from the ObjectiveFunction or generates generic names.
+     swarm_size : int
+         Number of particles in the swarm.
+     max_iterations : int
+         Maximum number of iterations for the optimization algorithm.
+     random_state : int
+         Random seed; used only when a single optimization run is performed.
+     post_hoc_analysis : int or None
+         If specified, runs the optimization multiple times to perform post hoc analysis. The value indicates the number of repetitions.
+
+     Returns
+     -------
+     Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]
+         If `post_hoc_analysis` is None, returns two dictionaries:
+             - feature_names: Feature values that yield the best result.
+             - target_name: Best result obtained for the target variable.
+
+         If `post_hoc_analysis` is an integer, returns two dictionaries:
+             - feature_names: Lists of best feature values for each repetition.
+             - target_name: List of best target values across repetitions.
+
+     Notes
+     -----
+     - PSO minimizes the objective function by default; maximization is handled inside the ObjectiveFunction (which negates predictions), and the sign is flipped back here.
+     """
+     # Select device
+     if torch.cuda.is_available():
+         device = torch.device("cuda")
+     elif torch.backends.mps.is_available():
+         device = torch.device("mps")
+     else:
+         device = torch.device("cpu")
+     print(f"[PSO] Using device: '{device}'")
+
+     # Use local deep copies to prevent in-place list modification
+     local_lower_boundaries = deepcopy(lower_boundaries)
+     local_upper_boundaries = deepcopy(upper_boundaries)
+
+     # Append binary boundaries
+     binary_number = objective_function.binary_features
+     if auto_binary_boundaries and binary_number > 0:
+         local_lower_boundaries.extend([0] * binary_number)
+         local_upper_boundaries.extend([1] * binary_number)
+
+     # Set the total number of features
+     size_of_features = len(local_lower_boundaries)
+
+     lower, upper = _set_boundaries(local_lower_boundaries, local_upper_boundaries)
+
+     # feature names
+     if feature_names is None and objective_function.feature_names is not None:
+         feature_names = objective_function.feature_names
+     names = _set_feature_names(size=size_of_features, names=feature_names)
+
+     # target name
+     if target_name is None and objective_function.target_name is not None:
+         target_name = objective_function.target_name
+     if target_name is None:
+         target_name = "Target"
+
+     arguments = {
+         "func": objective_function,
+         "lb": lower,
+         "ub": upper,
+         "device": device,
+         "swarmsize": swarm_size,
+         "maxiter": max_iterations,
+         "particle_output": False,
+     }
+
+     save_results_path = make_fullpath(save_results_dir, make=True)
+
+     if post_hoc_analysis is None or post_hoc_analysis == 1:
+         arguments.update({"seed": random_state})
+
+         best_features, best_target, *_ = _pso(**arguments)
+         # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
+
+         # flip best_target if maximization was used
+         if objective_function.task == "maximization":
+             best_target = -best_target
+
+         # threshold binary features
+         best_features_threshold = threshold_binary_values(best_features, binary_number)
+
+         # name features
+         best_features_named = {name: value for name, value in zip(names, best_features_threshold)}
+         best_target_named = {target_name: best_target}
+
+         # save results
+         _save_results(best_features_named, best_target_named, save_dir=save_results_path, target_name=target_name)
+
+         return best_features_named, best_target_named
+     else:
+         all_best_targets = list()
+         all_best_features = [[] for _ in range(size_of_features)]
+         for _ in range(post_hoc_analysis):
+             best_features, best_target, *_ = _pso(**arguments)
+             # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
+
+             # flip best_target if maximization was used
+             if objective_function.task == "maximization":
+                 best_target = -best_target
+
+             # threshold binary features
+             best_features_threshold = threshold_binary_values(best_features, binary_number)
+
+             for i, best_feature in enumerate(best_features_threshold):
+                 all_best_features[i].append(best_feature)
+             all_best_targets.append(best_target)
+
+         # name features
+         all_best_features_named = {name: list_values for name, list_values in zip(names, all_best_features)}
+         all_best_targets_named = {target_name: all_best_targets}
+
+         # save results
+         _save_results(all_best_features_named, all_best_targets_named, save_dir=save_results_path, target_name=target_name)
+
+         return all_best_features_named, all_best_targets_named # type: ignore
+
+
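A sketch of a full optimization run (not part of the release; paths are hypothetical, and only the continuous bounds are passed since `auto_binary_boundaries=True` appends the binary ones):

```python
# Assumes ObjectiveFunction and run_pso are imported from this module
objective = ObjectiveFunction(
    trained_model_path="models/regressor.joblib",  # hypothetical artifact
    add_noise=False,
    task="maximization",
    binary_features=2,
)

best_features, best_target = run_pso(
    lower_boundaries=[0.0, 0.0, -1.0],  # continuous features only;
    upper_boundaries=[10.0, 5.0, 1.0],  # binary bounds are appended automatically
    objective_function=objective,
    save_results_dir="results/",        # hypothetical output folder
    post_hoc_analysis=None,             # single run, seeded with random_state
)
print(best_features)  # {feature_name: best_value, ...}
print(best_target)    # {target_name: best_value}
```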
+ def info():
+     _script_info(__all__)
+
+
+ def _pso(func: ObjectiveFunction,
+          lb: np.ndarray,
+          ub: np.ndarray,
+          device: torch.device,
+          swarmsize=100,
+          maxiter=100,
+          omega = 0.729, # Clerc and Kennedy's constriction coefficient
+          phip = 1.49445, # Clerc and Kennedy's constriction coefficient
+          phig = 1.49445, # Clerc and Kennedy's constriction coefficient
+          tolerance = 1e-8,
+          particle_output=False,
+          seed: Optional[int] = None):
+     """
+     Internal PSO implementation using PyTorch tensors for acceleration on CUDA or MPS devices.
+
+     Parameters
+     ----------
+     func : callable
+         Callable objective function with batched evaluation support. Must accept a 2D NumPy array
+         of shape (n_particles, n_features) and return a 1D NumPy array of shape (n_particles,).
+
+     lb : np.ndarray
+         Lower bounds for each feature (1D array of length n_features).
+
+     ub : np.ndarray
+         Upper bounds for each feature (1D array of length n_features).
+
+     device : torch.device
+         Device used for the tensor computations (CPU, CUDA, or MPS).
+
+     swarmsize : int
+         Number of particles in the swarm (i.e., batch size per iteration).
+
+     maxiter : int
+         Maximum number of iterations to perform; early stopping may end the run sooner.
+
+     omega : float
+         Inertia weight controlling velocity retention across iterations.
+         - Typical range: [0.4, 0.9]
+         - Lower values encourage convergence, higher values promote exploration.
+         - The default value (0.729) comes from Clerc & Kennedy's constriction method.
+
+     phip : float
+         Cognitive acceleration coefficient.
+         - Controls how strongly particles are pulled toward their own best-known positions.
+         - Typical range: [0.5, 2.5]
+         - Default from Clerc & Kennedy's recommended setting.
+
+     phig : float
+         Social acceleration coefficient.
+         - Controls how strongly particles are pulled toward the swarm's global best.
+         - Typical range: [0.5, 2.5]
+         - Default from Clerc & Kennedy's recommended setting.
+
+     tolerance : float
+         Early-stopping threshold on the change of the global best score between iterations.
+
+     particle_output : bool, default=False
+         If True, returns the full history of particle positions and objective scores at each iteration.
+
+     seed : int or None, default=None
+         Random seed for reproducibility. If None, the PyTorch RNG is left in its current state.
+
+     Returns
+     -------
+     best_position : np.ndarray
+         1D array of shape (n_features,) representing the best solution found.
+
+     best_score : float
+         Objective value at `best_position`.
+
+     history_positions : list[np.ndarray], optional
+         Only returned if `particle_output=True`. List of particle positions per iteration.
+         Each element has shape (swarmsize, n_features).
+
+     history_scores : list[np.ndarray], optional
+         Only returned if `particle_output=True`. List of objective scores per iteration.
+         Each element has shape (swarmsize,).
+     """
+     if seed is not None:
+         torch.manual_seed(seed)
+
+     ndim = len(lb)
+     lb_t = torch.tensor(lb, dtype=torch.float32, device=device, requires_grad=False)
+     ub_t = torch.tensor(ub, dtype=torch.float32, device=device, requires_grad=False)
+
+     # Initialize positions and velocities
+     r = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
+     positions = lb_t + r * (ub_t - lb_t) # shape: (swarmsize, ndim)
+     velocities = torch.zeros_like(positions, requires_grad=False)
+
+     # Initialize best positions and scores
+     personal_best_positions = positions.clone()
+     personal_best_scores = torch.full((swarmsize,), float('inf'), device=device, requires_grad=False)
+
+     global_best_score = float('inf')
+     global_best_position = torch.zeros(ndim, device=device, requires_grad=False)
+
+     # History (optional)
+     if particle_output:
+         history_positions = []
+         history_scores = []
+
+     # Main loop
+     previous_best_score = float('inf')
+     progress = trange(maxiter, desc="PSO", unit="iter", leave=True) # tqdm bar
+     with torch.no_grad():
+         for i in progress:
+             # Evaluate objective for all particles
+             positions_np = positions.detach().cpu().numpy() # shape: (swarmsize, n_features)
+             scores_np = func(positions_np) # shape: (swarmsize,)
+             scores = torch.tensor(scores_np, device=device, dtype=torch.float32)
+
+             # Update personal bests
+             improved = scores < personal_best_scores
+             personal_best_scores = torch.where(improved, scores, personal_best_scores)
+             personal_best_positions = torch.where(improved[:, None], positions, personal_best_positions)
+
+             # Update global best
+             min_score, min_idx = torch.min(personal_best_scores, dim=0)
+             if min_score < global_best_score:
+                 global_best_score = min_score.item()
+                 global_best_position = personal_best_positions[min_idx].clone()
+
+             # Early stopping criterion
+             if abs(previous_best_score - global_best_score) < tolerance:
+                 progress.set_description(f"PSO (early stop at iteration {i+1})")
+                 break
+             previous_best_score = global_best_score
+
+             # Optional: track history for debugging/visualization
+             if particle_output:
+                 history_positions.append(positions.detach().cpu().numpy())
+                 history_scores.append(scores_np)
+
+             # Velocity update
+             rp = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
+             rg = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
+
+             cognitive = phip * rp * (personal_best_positions - positions)
+             social = phig * rg * (global_best_position - positions)
+             velocities = omega * velocities + cognitive + social
+
+             # Position update
+             positions = positions + velocities
+
+             # Clamp to search space bounds
+             positions = torch.max(positions, lb_t)
+             positions = torch.min(positions, ub_t)
+
+     # Move to CPU and convert to NumPy
+     best_position = global_best_position.detach().cpu().numpy()
+     best_score = global_best_score
+
+     if particle_output:
+         return best_position, best_score, history_positions, history_scores
+     else:
+         return best_position, best_score
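For reference, the velocity and position updates in the main loop above implement the standard PSO equations with Clerc & Kennedy's constriction defaults (ω = 0.729, φ_p = φ_g = 1.49445):

```latex
v_i^{(t+1)} = \omega\, v_i^{(t)} + \varphi_p\, r_p \odot \big(p_i - x_i^{(t)}\big) + \varphi_g\, r_g \odot \big(g - x_i^{(t)}\big)

x_i^{(t+1)} = \min\!\Big(\max\big(x_i^{(t)} + v_i^{(t+1)},\ lb\big),\ ub\Big)
```

where $r_p, r_g \sim U(0,1)$ are drawn independently per particle and dimension, $p_i$ is particle $i$'s personal best, and $g$ is the swarm's global best.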
ml_tools/VIF_factor.py CHANGED
@@ -2,12 +2,12 @@
  import pandas as pd
  import numpy as np
  import matplotlib.pyplot as plt
- from typing import Optional
+ from typing import Optional, Union
  from statsmodels.stats.outliers_influence import variance_inflation_factor
  from statsmodels.tools.tools import add_constant
  import warnings
- import os
- from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info
+ from pathlib import Path
+ from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info, make_fullpath


  __all__ = [
@@ -22,7 +22,7 @@ def compute_vif(
      use_columns: Optional[list[str]] = None,
      ignore_columns: Optional[list[str]] = None,
      max_features_to_plot: int = 20,
-     save_dir: Optional[str] = None,
+     save_dir: Optional[Union[str, Path]] = None,
      filename: Optional[str] = None,
      fontsize: int = 14,
      show_plot: bool = True,
@@ -36,7 +36,7 @@ def compute_vif(
          use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
          ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `use_columns` is provided.
          max_features_to_plot (int): Adjust the number of features shown in the plot.
-         save_dir (str | None): Directory to save the plot as SVG. If None, the plot is not saved.
+         save_dir (str | Path | None): Directory to save the plot as SVG. If None, the plot is not saved.
          filename (str | None): Optional filename for saving the plot. Defaults to "VIF_plot.svg".
          fontsize (int): Base fontsize to scale title and labels on the plot.
          show_plot (bool): Display plot.
@@ -128,15 +128,16 @@ def compute_vif(
      plt.tight_layout()

      if save_dir:
-         os.makedirs(save_dir, exist_ok=True)
+         save_path = make_fullpath(save_dir, make=True)
          if filename is None:
              filename = "VIF_plot.svg"
          else:
              filename = sanitize_filename(filename)
+             filename = "VIF_" + filename
          if not filename.endswith(".svg"):
              filename += ".svg"
-         save_path = os.path.join(save_dir, "VIF_" + filename)
-         plt.savefig(save_path, format='svg', bbox_inches='tight')
+         full_save_path = save_path / filename
+         plt.savefig(full_save_path, format='svg', bbox_inches='tight')
          print(f"\tSaved VIF plot: '{filename}'")

      if show_plot:
@@ -176,9 +177,9 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
      return result_df, to_drop


- def compute_vif_multi(input_directory: str,
-                       output_plot_directory: str,
-                       output_dataset_directory: Optional[str] = None,
+ def compute_vif_multi(input_directory: Union[str, Path],
+                       output_plot_directory: Union[str, Path],
+                       output_dataset_directory: Optional[Union[str, Path]] = None,
                        use_columns: Optional[list[str]] = None,
                        ignore_columns: Optional[list[str]] = None,
                        max_features_to_plot: int = 20,
@@ -188,9 +189,9 @@ def compute_vif_multi(input_directory: str,
      Generates a bar plot of VIF values. Optionally drops columns with VIF >= 10 and saves as a new CSV file.

      Args:
-         input_directory (str): Target directory with CSV files able to be loaded as DataFrame.
-         output_plot_directory (str): Save plots to this directory.
-         output_dataset_directory (str | None): If provided, saves new CSV files to this directory.
+         input_directory (str | Path): Target directory with CSV files that can be loaded as DataFrames.
+         output_plot_directory (str | Path): Save plots to this directory.
+         output_dataset_directory (str | Path | None): If provided, saves new CSV files to this directory.
          use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
          ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `use_columns` is provided.
          max_features_to_plot (int): Adjust the number of features shown in the plot.
@@ -202,7 +203,9 @@ def compute_vif_multi(input_directory: str,
      A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
      """
      if output_dataset_directory is not None:
-         os.makedirs(output_dataset_directory, exist_ok=True)
+         output_dataset_path = make_fullpath(output_dataset_directory, make=True)
+     else:
+         output_dataset_path = None

      for df, df_name in yield_dataframes_from_dir(datasets_dir=input_directory):
          vif_dataframe = compute_vif(df=df,
@@ -215,12 +218,12 @@ def compute_vif_multi(input_directory: str,
                                      show_plot=False,
                                      verbose=False)

-         if output_dataset_directory is not None:
+         if output_dataset_path is not None:
              new_filename = df_name + '_VIF'
              result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)

              if len(dropped_cols) > 0:
-                 save_dataframe(df=result_df, save_dir=output_dataset_directory, filename=new_filename)
+                 save_dataframe(df=result_df, save_dir=output_dataset_path, filename=new_filename)


  def info():
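For reference, the variance inflation factor computed here (via statsmodels' `variance_inflation_factor`) for feature $j$ is

```latex
\mathrm{VIF}_j = \frac{1}{1 - R_j^2}
```

where $R_j^2$ is the coefficient of determination from regressing feature $j$ on all remaining features; this is the quantity behind the 1 / 5 / 10 rule of thumb quoted in the `compute_vif_multi` docstring above.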
@@ -1,3 +1,8 @@
+ """
+ DEPRECATED
+ """
+
+
  import numpy as np
  import os
  import xgboost as xgb