dragon-ml-toolbox 1.4.4__tar.gz → 1.4.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dragon-ml-toolbox might be problematic.

Files changed (24)
  1. {dragon_ml_toolbox-1.4.4/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-1.4.6}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/ml_tools/MICE_imputation.py +22 -6
  4. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/ml_tools/data_exploration.py +33 -38
  5. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/ml_tools/ensemble_learning.py +4 -3
  6. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/ml_tools/particle_swarm_optimization.py +41 -9
  7. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/ml_tools/utilities.py +185 -26
  8. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/pyproject.toml +1 -1
  9. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/LICENSE +0 -0
  10. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/LICENSE-THIRD-PARTY.md +0 -0
  11. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/README.md +0 -0
  12. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  13. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  14. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  15. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  16. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/ml_tools/VIF_factor.py +0 -0
  17. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/ml_tools/__init__.py +0 -0
  18. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/ml_tools/datasetmaster.py +0 -0
  19. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/ml_tools/handle_excel.py +0 -0
  20. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/ml_tools/logger.py +0 -0
  21. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/ml_tools/pytorch_models.py +0 -0
  22. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/ml_tools/trainer.py +0 -0
  23. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/ml_tools/vision_helpers.py +0 -0
  24. {dragon_ml_toolbox-1.4.4 → dragon_ml_toolbox-1.4.6}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.4.4
+Version: 1.4.6
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.4.4
+Version: 1.4.6
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
ml_tools/MICE_imputation.py

@@ -3,8 +3,9 @@ import miceforest as mf
 import os
 import matplotlib.pyplot as plt
 import numpy as np
-from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe
+from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe, threshold_binary_values
 from plotnine import ggplot, labs, theme, element_blank # type: ignore
+from typing import Optional


 __all__ = [
@@ -17,7 +18,7 @@ __all__ = [
 ]


-def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
+def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str]]=None, resulting_datasets: int=1, iterations: int=20, random_state: int=101):

     # Initialize kernel with number of imputed datasets to generate
     kernel = mf.ImputationKernel(
@@ -35,6 +36,16 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
     if imputed_datasets is None or len(imputed_datasets) == 0:
         raise ValueError("No imputed datasets were generated. Check the MICE process.")

+    # threshold binary columns
+    if binary_columns is not None:
+        invalid_binary_columns = set(binary_columns) - set(df.columns)
+        if invalid_binary_columns:
+            print(f"⚠️ These 'binary columns' are not in the dataset: {invalid_binary_columns}")
+        valid_binary_columns = [col for col in binary_columns if col not in invalid_binary_columns]
+        for imputed_df in imputed_datasets:
+            for binary_column_name in valid_binary_columns:
+                imputed_df[binary_column_name] = threshold_binary_values(imputed_df[binary_column_name]) # type: ignore
+
     if resulting_datasets == 1:
         imputed_dataset_names = [f"{df_name}_MICE"]
     else:
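
The new `binary_columns` handling above snaps imputed values in 0/1 columns back to exact 0 or 1 after MICE finishes. A minimal usage sketch, assuming the module is importable as ml_tools.MICE_imputation and using a made-up toy frame:

import numpy as np
import pandas as pd
from ml_tools.MICE_imputation import apply_mice

# Hypothetical data: "age" is continuous, "smoker" is 0/1 with missing entries
df = pd.DataFrame({
    "age": [34.0, np.nan, 51.0, 29.0, np.nan, 44.0],
    "smoker": [1.0, 0.0, np.nan, 1.0, 0.0, np.nan],
})

# Imputed "smoker" values such as 0.37 are thresholded back to 0 or 1 (0.5 cutoff)
kernel, imputed_datasets, imputed_names = apply_mice(
    df=df, df_name="toy", binary_columns=["smoker"], resulting_datasets=1, iterations=5
)
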
@@ -106,7 +117,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
         # Adjust plot display for the X axis
         _ticks = np.arange(iterations_cap)
         _labels = np.arange(1, iterations_cap + 1)
-        plt.xticks(ticks=_ticks, labels=_labels)
+        plt.xticks(ticks=_ticks, labels=_labels) # type: ignore
         plt.grid(True)

         feature_save_name = sanitize_filename(feature_name)
@@ -202,7 +213,12 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     print(f"{local_dir_name} completed.")


-def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
+def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
+                      save_datasets_dir: str, save_metrics_dir: str,
+                      binary_columns: Optional[list[str]]=None,
+                      resulting_datasets: int=1,
+                      iterations: int=20,
+                      random_state: int=101):
     """
     Call functions in sequence for each dataset in the provided path or directory:
     1. Load dataframe
@@ -211,7 +227,7 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datas
     4. Save convergence metrics
     5. Save distribution metrics

-    Target columns must be skipped from the imputation.
+    Target columns must be skipped from the imputation. Binary columns will be thresholded after imputation.
     """
     # Check paths
     os.makedirs(save_datasets_dir, exist_ok=True)
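
The reworked signature threads `binary_columns` from the pipeline down to `apply_mice`. A hedged end-to-end sketch; the paths and column names below are placeholders, not part of the package:

from ml_tools.MICE_imputation import run_mice_pipeline

run_mice_pipeline(
    df_path_or_dir="data/raw",              # a CSV file or a directory of CSVs
    target_columns=["target"],              # skipped from imputation
    save_datasets_dir="data/imputed",
    save_metrics_dir="reports/mice",
    binary_columns=["smoker", "is_urban"],  # thresholded to 0/1 after imputation
    resulting_datasets=1,
    iterations=20,
)
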
@@ -229,7 +245,7 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datas

         df, df_targets = _skip_targets(df, target_columns)

-        kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)
+        kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, binary_columns=binary_columns, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)

         save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)

ml_tools/data_exploration.py

@@ -5,9 +5,10 @@ import seaborn as sns
 from IPython import get_ipython
 from IPython.display import clear_output
 import time
-from typing import Union, Literal, Dict, Tuple, Iterator
+from typing import Union, Literal, Dict, Tuple, List
 import os
 from ml_tools.utilities import sanitize_filename, _script_info
+import re


 # Keep track of all available tools, show using `info()`
@@ -23,7 +24,7 @@ __all__ = [
     "plot_value_distributions",
     "clip_outliers_single",
     "clip_outliers_multi",
-    "distribute_datasets_by_target"
+    "match_and_filter_columns_by_regex"
 ]


@@ -90,18 +91,18 @@ def split_features_targets(df: pd.DataFrame, targets: list[str]):

     Returns:
         tuple: A tuple containing:
-            - pd.DataFrame: Targets dataframe.
             - pd.DataFrame: Features dataframe.
+            - pd.DataFrame: Targets dataframe.

     Prints:
         - Shape of the original dataframe.
-        - Shape of the targets dataframe.
         - Shape of the features dataframe.
+        - Shape of the targets dataframe.
     """
     df_targets = df[targets]
     df_features = df.drop(columns=targets)
-    print(f"Original shape: {df.shape}\nTargets shape: {df_targets.shape}\nFeatures shape: {df_features.shape}")
-    return df_targets, df_features
+    print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
+    return df_features, df_targets


 def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
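
Note the breaking change in the hunk above: `split_features_targets` now returns `(features, targets)` instead of `(targets, features)`. A small sketch of the new unpacking order (column names are illustrative):

import pandas as pd
from ml_tools.data_exploration import split_features_targets

df = pd.DataFrame({"x1": [1, 2], "x2": [3, 4], "y": [0, 1]})

# 1.4.6: features first, then targets
df_features, df_targets = split_features_targets(df, targets=["y"])
# Code written against 1.4.4 (`df_targets, df_features = ...`) must swap its unpacking.
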
@@ -246,9 +247,6 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
         cbar_kws={"shrink": 0.8}
     )

-    # sanitize the plot title
-    plot_title = sanitize_filename(plot_title)
-
     plt.title(plot_title)
     plt.xticks(rotation=45, ha='right')
     plt.yticks(rotation=0)
@@ -256,6 +254,8 @@
     plt.tight_layout()

     if save_dir:
+        # sanitize the plot title to save the file
+        plot_title = sanitize_filename(plot_title)
         os.makedirs(save_dir, exist_ok=True)
         full_path = os.path.join(save_dir, plot_title + ".svg")
         plt.savefig(full_path, bbox_inches="tight", format='svg')
@@ -519,38 +519,34 @@ def clip_outliers_multi(
     return new_df


-def distribute_datasets_by_target(
+def match_and_filter_columns_by_regex(
     df: pd.DataFrame,
-    target_columns: list[str],
-    verbose: bool = False
-    ) -> Iterator[Tuple[str, pd.DataFrame]]:
+    pattern: str,
+    case_sensitive: bool = False,
+    escape_pattern: bool = False
+    ) -> Tuple[pd.DataFrame, List[str]]:
     """
-    Yields cleaned DataFrames for each target column, where rows with missing
-    target values are removed. The target column is placed at the end.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Preprocessed dataframe with all feature and target columns ready to train.
-    target_columns : List[str]
-        List of target column names to generate per-target DataFrames.
-    verbose: bool
-        Whether to print info for each yielded dataset.
-
-    Yields
-    ------
-    Tuple[str, pd.DataFrame]
-        * First element is the target column name.
-        * Second element is the corresponding cleaned DataFrame.
+    Return a tuple of (filtered DataFrame, matched column names) based on a regex pattern.
+
+    Parameters:
+        df (pd.DataFrame): The DataFrame to search.
+        pattern (str): The regex pattern to match column names (use a raw string).
+        case_sensitive (bool): Whether matching is case-sensitive.
+        escape_pattern (bool): If True, the pattern is escaped with `re.escape()` to treat it literally.
+
+    Returns:
+        (Tuple[pd.DataFrame, list[str]]): A DataFrame filtered to matched columns, and a list of matching column names.
     """
-    valid_targets = [col for col in df.columns if col in target_columns]
-    feature_columns = [col for col in df.columns if col not in valid_targets]
+    if escape_pattern:
+        pattern = re.escape(pattern)

-    for target in valid_targets:
-        subset = df[feature_columns + [target]].dropna(subset=[target])
-        if verbose:
-            print(f"Target: '{target}' - Dataframe shape: {subset.shape}")
-        yield target, subset
+    mask = df.columns.str.contains(pattern, case=case_sensitive, regex=True)
+    matched_columns = df.columns[mask].to_list()
+    filtered_df = df.loc[:, mask]
+
+    print(f"{len(matched_columns)} column(s) match the regex pattern '{pattern}'.")
+
+    return filtered_df, matched_columns


 def _is_notebook():
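
`match_and_filter_columns_by_regex` replaces `distribute_datasets_by_target` in this module (the latter moves to ml_tools/utilities.py, see below). A short sketch with made-up column names:

import pandas as pd
from ml_tools.data_exploration import match_and_filter_columns_by_regex

df = pd.DataFrame({"cat_red": [1, 0], "cat_blue": [0, 1], "price": [9.5, 7.2]})

# Keep only the one-hot columns; matching is case-insensitive by default
subset, matched = match_and_filter_columns_by_regex(df, pattern=r"^cat_")
# matched -> ['cat_red', 'cat_blue']

# With escape_pattern=True the pattern is treated as a literal column name
price_only, _ = match_and_filter_columns_by_regex(df, pattern="price", escape_pattern=True)
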
@@ -559,4 +555,3 @@ def _is_notebook():

 def info():
     _script_info(__all__)
-
ml_tools/ensemble_learning.py

@@ -20,7 +20,7 @@ from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
 import shap

-from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info
+from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info, serialize_object

 import warnings # Ignore warnings
 warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -485,8 +485,9 @@ def _local_directories(model_name: str, dataset_id: str, save_dir: str):
 def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str):
     #Sanitize filenames to save
     sanitized_target_name = sanitize_filename(target_name)
-    full_path = os.path.join(save_directory, f"{model_name}_{sanitized_target_name}.joblib")
-    joblib.dump({'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}, full_path)
+    filename = f"{model_name}_{sanitized_target_name}"
+    to_save = {'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}
+    serialize_object(obj=to_save, save_dir=save_directory, filename=filename, verbose=False, raise_on_error=True)

 # function to evaluate the model and save metrics (Classification)
 def evaluate_model_classification(
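
`_save_model` still writes a dict with 'model', 'feature_names' and 'target_name' keys, only now through `serialize_object` instead of a bare `joblib.dump`. A sketch of reading such an artifact back; the file path is hypothetical:

from ml_tools.utilities import deserialize_object

artifact = deserialize_object("saved_models/XGBoost_my_target.joblib", verbose=True)
if artifact is not None:
    model = artifact["model"]
    feature_names = artifact["feature_names"]
    target_name = artifact["target_name"]
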
ml_tools/particle_swarm_optimization.py

@@ -8,11 +8,12 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import polars as pl
 from functools import partial
-from .utilities import sanitize_filename, _script_info, threshold_binary_values
+from .utilities import sanitize_filename, _script_info, threshold_binary_values, deserialize_object, list_files_by_extension


 __all__ = [
     "ObjectiveFunction",
+    "multiple_objective_functions_from_dir",
     "run_pso"
 ]

@@ -29,16 +30,16 @@ class ObjectiveFunction():
         Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
     add_noise : bool
         Whether to apply multiplicative noise to the input features during evaluation.
-    binary_features : int, default=0
-        Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.
-    task : Literal, default 'maximization'
+    task : (Literal["maximization", "minimization"])
         Whether to maximize or minimize the target.
+    binary_features : int
+        Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.
     """
-    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int=0) -> None:
+    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int) -> None:
         self.binary_features = binary_features
         self.is_hybrid = False if binary_features <= 0 else True
         self.use_noise = add_noise
-        self._artifact = joblib.load(trained_model_path)
+        self._artifact = deserialize_object(trained_model_path, verbose=False, raise_on_error=True)
         self.model = self._get_from_artifact('model')
         self.feature_names: Optional[list[str]] = self._get_from_artifact('feature_names') # type: ignore
         self.target_name: Optional[str] = self._get_from_artifact('target_name') # type: ignore
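
`binary_features` is now a required argument and the artifact is loaded through `deserialize_object`. A construction sketch; the path and feature count are placeholders:

from ml_tools.particle_swarm_optimization import ObjectiveFunction

objective = ObjectiveFunction(
    trained_model_path="saved_models/XGBoost_my_target.joblib",
    add_noise=False,
    task="maximization",
    binary_features=2,   # binary features sit at the END of the feature vector; 0 disables hybrid handling
)
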
@@ -49,7 +50,7 @@ class ObjectiveFunction():
         if self.use_noise:
             features_array = self.add_noise(features_array)
         if self.is_hybrid:
-            features_array = threshold_binary_values(input_array=features_array, binary_features=self.binary_features)
+            features_array = threshold_binary_values(input_array=features_array, binary_values=self.binary_features) # type: ignore

         if features_array.ndim == 1:
             features_array = features_array.reshape(1, -1)
@@ -83,6 +84,8 @@ class ObjectiveFunction():
             raise ValueError("Loaded model is None")

     def _get_from_artifact(self, key: str):
+        if self._artifact is None:
+            raise TypeError("Load model error")
         val = self._artifact.get(key)
         if key == "feature_names":
             result = val if isinstance(val, list) and val else None
@@ -94,6 +97,35 @@ class ObjectiveFunction():
         return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")


+def multiple_objective_functions_from_dir(directory: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int):
+    """
+    Loads multiple objective functions from serialized models in the given directory.
+
+    Each `.joblib` file which is loaded and wrapped as an `ObjectiveFunction` instance. Returns a list of such instances along with their corresponding names.
+
+    Parameters:
+        directory (str) : Path to the directory containing `.joblib` files (serialized models).
+        add_noise (bool) : Whether to apply multiplicative noise to the input features during evaluation.
+        task (Literal["maximization", "minimization"]) : Defines the nature of the optimization task.
+        binary_features (int) : Number of binary features expected by each objective function.
+
+    Returns:
+        (tuple[list[ObjectiveFunction], list[str]]) : A tuple containing:
+        - list of `ObjectiveFunction` instances.
+        - list of corresponding filenames.
+    """
+    objective_functions = list()
+    objective_function_names = list()
+    for file_name, file_path in list_files_by_extension(directory=directory, extension='joblib').items():
+        current_objective = ObjectiveFunction(trained_model_path=file_path,
+                                              add_noise=add_noise,
+                                              task=task,
+                                              binary_features=binary_features)
+        objective_functions.append(current_objective)
+        objective_function_names.append(file_name)
+    return objective_functions, objective_function_names
+
+
 def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
     assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
     assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
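
A usage sketch of the new helper, which wraps every `.joblib` artifact found in a directory (the directory path is a placeholder):

from ml_tools.particle_swarm_optimization import multiple_objective_functions_from_dir

objectives, names = multiple_objective_functions_from_dir(
    directory="saved_models",
    add_noise=False,
    task="maximization",
    binary_features=2,
)
for name, objective in zip(names, objectives):
    print(name, objective)   # each instance is ready to be passed to run_pso
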
@@ -129,9 +161,9 @@ def run_pso(lower_boundaries: list[float],
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
             swarm_size: int=200,
-            max_iterations: int=1500,
+            max_iterations: int=1000,
             inequality_constrain_function=None,
-            post_hoc_analysis: Optional[int]=5,
+            post_hoc_analysis: Optional[int]=3,
             workers: int=1) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
ml_tools/utilities.py

@@ -1,22 +1,29 @@
 import math
 import numpy as np
 import pandas as pd
+import polars as pl
 import os
 from pathlib import Path
 import re
-from typing import Literal, Union, Sequence
+from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple
+import joblib
+from joblib.externals.loky.process_executor import TerminatedWorkerError


 # Keep track of available tools
 __all__ = [
     "list_csv_paths",
+    "list_files_by_extension",
     "load_dataframe",
     "yield_dataframes_from_dir",
     "merge_dataframes",
     "save_dataframe",
     "normalize_mixed_list",
     "sanitize_filename",
-    "threshold_binary_values"
+    "threshold_binary_values",
+    "serialize_object",
+    "deserialize_object",
+    "distribute_datasets_by_target"
 ]

@@ -28,7 +35,7 @@ def list_csv_paths(directory: str) -> dict[str, str]:
         directory (str): Path to the directory containing `.csv` files.

     Returns:
-        (dict[str, str]): Mapping {name, path}.
+        (dict[str, str]): Dictionary mapping {filename: filepath}.
     """
     dir_path = Path(directory).expanduser().resolve()

@@ -42,13 +49,47 @@ def list_csv_paths(directory: str) -> dict[str, str]:
     # make a dictionary of paths and names
     name_path_dict = {p.stem: str(p) for p in csv_paths}

-    print("🗂️ CSV files found:")
+    print("\n🗂️ CSV files found:")
     for name in name_path_dict.keys():
         print(f"\t{name}")

     return name_path_dict


+def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
+    """
+    Lists all files with the specified extension in the given directory and returns a mapping:
+    filenames (without extensions) to their absolute paths.
+
+    Parameters:
+        directory (str): Path to the directory to search in.
+        extension (str): File extension to search for (e.g., 'json', 'txt').
+
+    Returns:
+        (dict[str, str]): Dictionary mapping {filename: filepath}.
+    """
+    dir_path = Path(directory).expanduser().resolve()
+
+    if not dir_path.is_dir():
+        raise FileNotFoundError(f"Directory not found: {dir_path}")
+
+    # Normalize the extension (remove leading dot if present)
+    normalized_ext = extension.lstrip(".").lower()
+    pattern = f"*.{normalized_ext}"
+
+    matched_paths = list(dir_path.glob(pattern))
+    if not matched_paths:
+        raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")
+
+    name_path_dict = {p.stem: str(p) for p in matched_paths}
+
+    print(f"\n📂 '{normalized_ext.upper()}' files found:")
+    for name in name_path_dict:
+        print(f"\t{name}")
+
+    return name_path_dict
+
+
 def load_dataframe(df_path: str) -> tuple[pd.DataFrame, str]:
     """
     Load a CSV file into a pandas DataFrame and extract the base name (without extension) from the file path.
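
A usage sketch of `list_files_by_extension`; the directory is a placeholder, keys are file stems and values are absolute paths:

from ml_tools.utilities import list_files_by_extension

models = list_files_by_extension(directory="saved_models", extension="joblib")
for stem, path in models.items():
    print(stem, "->", path)
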
@@ -194,12 +235,9 @@ def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:

     Returns:
         List[float]: A list of normalized float values summing to 1.0.
-            Values significantly smaller than the median scale are scaled up
-            before normalization to correct likely input errors.

     Notes:
         - Zeros and None values remain zero.
-        - If all input values are zero or None, the function returns a list of zeros.
         - Input strings are automatically cast to floats if possible.

     Example:
@@ -268,35 +306,156 @@ def sanitize_filename(filename: str) -> str:


 def threshold_binary_values(
-    input_array: Union[Sequence[float], np.ndarray],
-    binary_features: int
-    ) -> np.ndarray:
+    input_array: Union[Sequence[float], np.ndarray, pd.Series, pl.Series],
+    binary_values: Optional[int] = None
+    ) -> Union[np.ndarray, pd.Series, pl.Series, list[float], tuple[float]]:
     """
-    Thresholds binary features in a 1D numeric sequence. Binary features must be located at the end of the sequence.
+    Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.

-    Converts binary elements to values (0 or 1) using a threshold of 0.5. The rest of the array (assumed to be continuous features) is returned unchanged.
+    Parameters:
+        input_array: 1D sequence, NumPy array, pandas Series, or polars Series.
+        binary_values (Optional[int]) :
+            - If `None`, all values are treated as binary.
+            - If `int`, only this many last `binary_values` are thresholded.
+
+    Returns:
+        Same type as input, with binary elements binarized to 0 or 1 using a 0.5 threshold.
+    """
+    original_type = type(input_array)
+
+    if isinstance(input_array, pl.Series):
+        array = input_array.to_numpy()
+    elif isinstance(input_array, (pd.Series, np.ndarray)):
+        array = np.asarray(input_array)
+    elif isinstance(input_array, (list, tuple)):
+        array = np.array(input_array)
+    else:
+        raise TypeError("Unsupported input type")
+
+    array = array.flatten()
+    total = array.shape[0]
+
+    bin_count = total if binary_values is None else binary_values
+    if not (0 <= bin_count <= total):
+        raise ValueError("binary_values must be between 0 and the total number of elements")
+
+    if bin_count == 0:
+        result = array
+    else:
+        cont_part = array[:-bin_count] if bin_count < total else np.array([])
+        bin_part = (array[-bin_count:] > 0.5).astype(int)
+        result = np.concatenate([cont_part, bin_part])
+
+    if original_type is pd.Series:
+        return pd.Series(result, index=input_array.index if hasattr(input_array, 'index') else None) # type: ignore
+    elif original_type is pl.Series:
+        return pl.Series(input_array.name if hasattr(input_array, 'name') else "binary", result) # type: ignore
+    elif original_type is list:
+        return result.tolist()
+    elif original_type is tuple:
+        return tuple(result)
+    else:
+        return result
+
+
+def serialize_object(obj: Any, save_dir: str, filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[str]:
+    """
+    Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.

     Parameters:
-        input_array (Union[Sequence[float], np.ndarray]) : A one-dimensional collection of numeric values. The binary features must be located at the end of the array.
+        obj (Any) : The Python object to serialize.
+        save_dir (str) : Directory path where the serialized object will be saved.
+        filename (str) : Name for the output file, extension will be appended if needed.
+
+    Returns:
+        (str | None) : The full file path where the object was saved if successful; otherwise, None.
+    """
+    try:
+        os.makedirs(save_dir, exist_ok=True)
+        sanitized_name = sanitize_filename(filename)
+        if not sanitized_name.endswith('.joblib'):
+            sanitized_name = sanitized_name + ".joblib"
+        full_path = os.path.join(save_dir, sanitized_name)
+        joblib.dump(obj, full_path)
+    except (IOError, OSError, TypeError, TerminatedWorkerError) as e:
+        message = f"❌ Failed to serialize object of type '{type(obj)}': {e}"
+        if raise_on_error:
+            raise Exception(message)
+        else:
+            print(message)
+        return None
+    else:
+        if verbose:
+            print(f"✅ Object of type '{type(obj)}' saved to '{full_path}'")
+        return full_path

-        binary_features (int) : Number of binary features to threshold from the end of the array. Must be between 0 and the total number of elements.
+
+def deserialize_object(filepath: str, verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
+    """
+    Loads a serialized object from a .joblib file.
+
+    Parameters:
+        filepath (str): Full path to the serialized .joblib file.

     Returns:
-        np.ndarray : A 1D NumPy array where the final `binary_features` values have been binarized.
+        (Any | None): The deserialized Python object, or None if loading fails.
     """
-    array = np.asarray(input_array).flatten()
-    total = array.shape[0]
-
-    if binary_features < 0 or binary_features > total:
-        raise ValueError("Binary features must be between 0 and the total number of features.")
-
-    if binary_features == 0:
-        return array
+    if not os.path.exists(filepath):
+        print(f"❌ File does not exist: {filepath}")
+        return None
+    try:
+        obj = joblib.load(filepath)
+    except (IOError, OSError, EOFError, TypeError, ValueError) as e:
+        message = f"❌ Failed to deserialize object from '{filepath}': {e}"
+        if raise_on_error:
+            raise Exception(message)
+        else:
+            print(message)
+        return None
+    else:
+        if verbose:
+            print(f"✅ Loaded object of type '{type(obj)}'")
+        return obj
+

-    cont_part = array[:-binary_features]
-    bin_part = (array[-binary_features:] > 0.5).astype(int)
+def distribute_datasets_by_target(
+    df_or_path: Union[pd.DataFrame, str],
+    target_columns: list[str],
+    verbose: bool = False
+    ) -> Iterator[Tuple[str, pd.DataFrame]]:
+    """
+    Yields cleaned DataFrames for each target column, where rows with missing
+    target values are removed. The target column is placed at the end.
+
+    Parameters
+    ----------
+    df_or_path : [pd.DataFrame | str]
+        Dataframe or path to Dataframe with all feature and target columns ready to split and train a model.
+    target_columns : List[str]
+        List of target column names to generate per-target DataFrames.
+    verbose: bool
+        Whether to print info for each yielded dataset.
+
+    Yields
+    ------
+    Tuple[str, pd.DataFrame]
+        * Target name.
+        * Pandas DataFrame.
+    """
+    # Validate path
+    if isinstance(df_or_path, str):
+        df, _ = load_dataframe(df_or_path)
+    else:
+        df = df_or_path

-    return np.concatenate([cont_part, bin_part])
+    valid_targets = [col for col in df.columns if col in target_columns]
+    feature_columns = [col for col in df.columns if col not in valid_targets]
+
+    for target in valid_targets:
+        subset = df[feature_columns + [target]].dropna(subset=[target])
+        if verbose:
+            print(f"Target: '{target}' - Dataframe shape: {subset.shape}")
+        yield target, subset


 def _script_info(all_data: list[str]):
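
A few hedged examples of the reworked utilities above; data, names and paths are made up. `threshold_binary_values` now preserves the input container type and treats all values as binary when `binary_values` is None; `serialize_object` and `deserialize_object` form a joblib round trip; `distribute_datasets_by_target` (moved here from data_exploration.py) also accepts a CSV path.

import numpy as np
import pandas as pd
from ml_tools.utilities import (threshold_binary_values, serialize_object,
                                deserialize_object, distribute_datasets_by_target)

# binary_values=None: every value is thresholded; a list comes back as a list
print(threshold_binary_values([0.2, 0.7, 0.5001]))                           # [0.0, 1.0, 1.0]

# Only the last 2 values are thresholded; the leading value stays continuous
print(threshold_binary_values(np.array([3.7, 0.2, 0.9]), binary_values=2))   # [3.7 0.  1. ]

# joblib round trip
path = serialize_object({"a": 1}, save_dir="artifacts", filename="demo")
restored = deserialize_object(path) if path else None

# One cleaned frame per target: target column last, rows with a missing target dropped
df = pd.DataFrame({"x": [1, 2, 3], "y1": [0.1, None, 0.3], "y2": [None, 1.0, 2.0]})
for target, subset in distribute_datasets_by_target(df, target_columns=["y1", "y2"], verbose=True):
    print(target, subset.shape)
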
pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "1.4.4"
+version = "1.4.6"
 description = "A collection of tools for data science and machine learning projects"
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }