dragon-ml-toolbox 1.4.5__tar.gz → 1.4.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {dragon_ml_toolbox-1.4.5/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-1.4.7}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/data_exploration.py +36 -5
  4. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/particle_swarm_optimization.py +50 -12
  5. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/utilities.py +39 -4
  6. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/pyproject.toml +1 -1
  7. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/LICENSE +0 -0
  8. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/LICENSE-THIRD-PARTY.md +0 -0
  9. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/README.md +0 -0
  10. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  11. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  12. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  13. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  14. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/MICE_imputation.py +0 -0
  15. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/VIF_factor.py +0 -0
  16. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/__init__.py +0 -0
  17. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/datasetmaster.py +0 -0
  18. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/ensemble_learning.py +0 -0
  19. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/handle_excel.py +0 -0
  20. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/logger.py +0 -0
  21. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/pytorch_models.py +0 -0
  22. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/trainer.py +0 -0
  23. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/vision_helpers.py +0 -0
  24. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/setup.cfg +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.4.5
+Version: 1.4.7
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
dragon_ml_toolbox.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.4.5
+Version: 1.4.7
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
ml_tools/data_exploration.py
@@ -5,9 +5,10 @@ import seaborn as sns
 from IPython import get_ipython
 from IPython.display import clear_output
 import time
-from typing import Union, Literal, Dict, Tuple, Iterator
+from typing import Union, Literal, Dict, Tuple, List
 import os
 from ml_tools.utilities import sanitize_filename, _script_info
+import re
 
 
 # Keep track of all available tools, show using `info()`
@@ -22,7 +23,8 @@ __all__ = [
     "check_value_distributions",
     "plot_value_distributions",
     "clip_outliers_single",
-    "clip_outliers_multi"
+    "clip_outliers_multi",
+    "match_and_filter_columns_by_regex"
 ]
 
 
@@ -245,9 +247,6 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
         cbar_kws={"shrink": 0.8}
     )
 
-    # sanitize the plot title
-    plot_title = sanitize_filename(plot_title)
-
     plt.title(plot_title)
     plt.xticks(rotation=45, ha='right')
     plt.yticks(rotation=0)
@@ -255,6 +254,8 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
     plt.tight_layout()
 
     if save_dir:
+        # sanitize the plot title to save the file
+        plot_title = sanitize_filename(plot_title)
         os.makedirs(save_dir, exist_ok=True)
         full_path = os.path.join(save_dir, plot_title + ".svg")
         plt.savefig(full_path, bbox_inches="tight", format='svg')
@@ -518,6 +519,36 @@ def clip_outliers_multi(
     return new_df
 
 
+def match_and_filter_columns_by_regex(
+    df: pd.DataFrame,
+    pattern: str,
+    case_sensitive: bool = False,
+    escape_pattern: bool = False
+) -> Tuple[pd.DataFrame, List[str]]:
+    """
+    Return a tuple of (filtered DataFrame, matched column names) based on a regex pattern.
+
+    Parameters:
+        df (pd.DataFrame): The DataFrame to search.
+        pattern (str): The regex pattern to match column names (use a raw string).
+        case_sensitive (bool): Whether matching is case-sensitive.
+        escape_pattern (bool): If True, the pattern is escaped with `re.escape()` to treat it literally.
+
+    Returns:
+        (Tuple[pd.DataFrame, list[str]]): A DataFrame filtered to matched columns, and a list of matching column names.
+    """
+    if escape_pattern:
+        pattern = re.escape(pattern)
+
+    mask = df.columns.str.contains(pattern, case=case_sensitive, regex=True)
+    matched_columns = df.columns[mask].to_list()
+    filtered_df = df.loc[:, mask]
+
+    print(f"{len(matched_columns)} column(s) match the regex pattern '{pattern}'.")
+
+    return filtered_df, matched_columns
+
+
 def _is_notebook():
     return get_ipython() is not None
 
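For orientation, here is a minimal usage sketch of the new `match_and_filter_columns_by_regex` helper; the DataFrame and its column names are hypothetical, not part of the package:

import pandas as pd
from ml_tools.data_exploration import match_and_filter_columns_by_regex

# Hypothetical DataFrame with mixed column names
df = pd.DataFrame({
    "temp_min": [1.0, 2.0],
    "temp_max": [3.0, 4.0],
    "humidity": [0.4, 0.5],
})

# Keep only the columns whose names start with "temp"
filtered_df, matched = match_and_filter_columns_by_regex(df, r"^temp")
# Prints: 2 column(s) match the regex pattern '^temp'.
# matched == ["temp_min", "temp_max"]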
ml_tools/particle_swarm_optimization.py
@@ -8,11 +8,13 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import polars as pl
 from functools import partial
-from .utilities import sanitize_filename, _script_info, threshold_binary_values, deserialize_object
+from copy import deepcopy
+from .utilities import sanitize_filename, _script_info, threshold_binary_values, deserialize_object, list_files_by_extension
 
 
 __all__ = [
     "ObjectiveFunction",
+    "multiple_objective_functions_from_dir",
     "run_pso"
 ]
 
@@ -29,12 +31,12 @@ class ObjectiveFunction():
         Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
     add_noise : bool
         Whether to apply multiplicative noise to the input features during evaluation.
-    binary_features : int, default=0
-        Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.
-    task : Literal, default 'maximization'
+    task : (Literal["maximization", "minimization"])
         Whether to maximize or minimize the target.
+    binary_features : int
+        Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.
     """
-    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int=0) -> None:
+    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int) -> None:
         self.binary_features = binary_features
         self.is_hybrid = False if binary_features <= 0 else True
         self.use_noise = add_noise
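Note that `binary_features` lost its `=0` default here, so 1.4.5-style calls that omitted it now raise a `TypeError`. A sketch of the updated call, with a hypothetical model path:

from ml_tools.particle_swarm_optimization import ObjectiveFunction

objective = ObjectiveFunction(
    trained_model_path="model.joblib",  # hypothetical path to a serialized model
    add_noise=True,
    task="maximization",
    binary_features=0,  # now explicit; pass 0 for purely continuous feature vectors
)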
@@ -96,6 +98,35 @@ class ObjectiveFunction():
         return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")
 
 
+def multiple_objective_functions_from_dir(directory: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int):
+    """
+    Loads multiple objective functions from serialized models in the given directory.
+
+    Each `.joblib` file is loaded and wrapped as an `ObjectiveFunction` instance. Returns a list of such instances along with their corresponding names.
+
+    Parameters:
+        directory (str) : Path to the directory containing `.joblib` files (serialized models).
+        add_noise (bool) : Whether to apply multiplicative noise to the input features during evaluation.
+        task (Literal["maximization", "minimization"]) : Defines the nature of the optimization task.
+        binary_features (int) : Number of binary features expected by each objective function.
+
+    Returns:
+        (tuple[list[ObjectiveFunction], list[str]]) : A tuple containing:
+            - list of `ObjectiveFunction` instances.
+            - list of corresponding filenames.
+    """
+    objective_functions = list()
+    objective_function_names = list()
+    for file_name, file_path in list_files_by_extension(directory=directory, extension='joblib').items():
+        current_objective = ObjectiveFunction(trained_model_path=file_path,
+                                              add_noise=add_noise,
+                                              task=task,
+                                              binary_features=binary_features)
+        objective_functions.append(current_objective)
+        objective_function_names.append(file_name)
+    return objective_functions, objective_function_names
+
+
 def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
     assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
     assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
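A hedged usage sketch for the new loader; the directory is hypothetical and only needs to contain `.joblib` files that `ObjectiveFunction` can deserialize:

from ml_tools.particle_swarm_optimization import multiple_objective_functions_from_dir

# Hypothetical directory holding model_a.joblib, model_b.joblib, ...
objectives, names = multiple_objective_functions_from_dir(
    directory="models/",
    add_noise=False,
    task="maximization",
    binary_features=2,
)
for name, objective in zip(names, objectives):
    print(name, objective)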
@@ -131,9 +162,9 @@ def run_pso(lower_boundaries: list[float],
            target_name: Union[str, None]=None,
            feature_names: Union[list[str], None]=None,
            swarm_size: int=200,
-           max_iterations: int=1500,
+           max_iterations: int=1000,
            inequality_constrain_function=None,
-           post_hoc_analysis: Optional[int]=5,
+           post_hoc_analysis: Optional[int]=3,
            workers: int=1) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
@@ -180,18 +211,25 @@
     -----
     - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
     """
+    # set local deep copies to prevent in-place list modification
+    local_lower_boundaries = deepcopy(lower_boundaries)
+    local_upper_boundaries = deepcopy(upper_boundaries)
+
     # Append binary boundaries
     binary_number = objective_function.binary_features
     if auto_binary_boundaries and binary_number > 0:
-        lower_boundaries.extend([0] * binary_number)
-        upper_boundaries.extend([1] * binary_number)
+        local_lower_boundaries.extend([0] * binary_number)
+        local_upper_boundaries.extend([1] * binary_number)
+
+    # Set the total length of features
+    size_of_features = len(local_lower_boundaries)
 
-    lower, upper = _set_boundaries(lower_boundaries, upper_boundaries)
+    lower, upper = _set_boundaries(local_lower_boundaries, local_upper_boundaries)
 
     # feature names
     if feature_names is None and objective_function.feature_names is not None:
         feature_names = objective_function.feature_names
-    names = _set_feature_names(size=len(lower_boundaries), names=feature_names)
+    names = _set_feature_names(size=size_of_features, names=feature_names)
 
     # target name
     if target_name is None and objective_function.target_name is not None:
@@ -233,7 +271,7 @@ def run_pso(lower_boundaries: list[float],
         return best_features_named, best_target_named
     else:
         all_best_targets = list()
-        all_best_features = [[] for _ in range(len(lower_boundaries))]
+        all_best_features = [[] for _ in range(size_of_features)]
         for _ in range(post_hoc_analysis):
             best_features, best_target, *_ = _pso(**arguments)
             # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
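The `deepcopy` change above is behavioral: in 1.4.5, `run_pso` with `auto_binary_boundaries=True` extended the caller's boundary lists in place, so a second call saw already-lengthened lists. A self-contained sketch of the difference, using made-up values:

from copy import deepcopy

lower = [0.0, 0.0]                 # caller-owned boundaries
binary_number = 2

# 1.4.5 behavior: extending the argument mutates the caller's list
lower.extend([0] * binary_number)
assert lower == [0.0, 0.0, 0, 0]   # silently grew

# 1.4.7 behavior: extend a local deep copy instead
lower = [0.0, 0.0]
local_lower = deepcopy(lower)
local_lower.extend([0] * binary_number)
assert lower == [0.0, 0.0]         # caller's list untouched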
ml_tools/utilities.py
@@ -13,6 +13,7 @@ from joblib.externals.loky.process_executor import TerminatedWorkerError
 # Keep track of available tools
 __all__ = [
     "list_csv_paths",
+    "list_files_by_extension",
     "load_dataframe",
     "yield_dataframes_from_dir",
     "merge_dataframes",
@@ -34,7 +35,7 @@ def list_csv_paths(directory: str) -> dict[str, str]:
         directory (str): Path to the directory containing `.csv` files.
 
     Returns:
-        (dict[str, str]): Mapping {name, path}.
+        (dict[str, str]): Dictionary mapping {filename: filepath}.
     """
     dir_path = Path(directory).expanduser().resolve()
 
@@ -48,13 +49,47 @@ def list_csv_paths(directory: str) -> dict[str, str]:
     # make a dictionary of paths and names
     name_path_dict = {p.stem: str(p) for p in csv_paths}
 
-    print("🗂️ CSV files found:")
+    print("\n🗂️ CSV files found:")
     for name in name_path_dict.keys():
         print(f"\t{name}")
 
     return name_path_dict
 
 
+def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
+    """
+    Lists all files with the specified extension in the given directory and returns a mapping
+    of filenames (without extensions) to their absolute paths.
+
+    Parameters:
+        directory (str): Path to the directory to search in.
+        extension (str): File extension to search for (e.g., 'json', 'txt').
+
+    Returns:
+        (dict[str, str]): Dictionary mapping {filename: filepath}.
+    """
+    dir_path = Path(directory).expanduser().resolve()
+
+    if not dir_path.is_dir():
+        raise FileNotFoundError(f"Directory not found: {dir_path}")
+
+    # Normalize the extension (remove leading dot if present)
+    normalized_ext = extension.lstrip(".").lower()
+    pattern = f"*.{normalized_ext}"
+
+    matched_paths = list(dir_path.glob(pattern))
+    if not matched_paths:
+        raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")
+
+    name_path_dict = {p.stem: str(p) for p in matched_paths}
+
+    print(f"\n📂 '{normalized_ext.upper()}' files found:")
+    for name in name_path_dict:
+        print(f"\t{name}")
+
+    return name_path_dict
+
+
 def load_dataframe(df_path: str) -> tuple[pd.DataFrame, str]:
     """
     Load a CSV file into a pandas DataFrame and extract the base name (without extension) from the file path.
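A minimal usage sketch for the new `list_files_by_extension` utility; the directory and its files are hypothetical:

from ml_tools.utilities import list_files_by_extension

# Hypothetical directory containing model_a.joblib and model_b.joblib
models = list_files_by_extension(directory="models/", extension=".joblib")
# Prints the "📂 'JOBLIB' files found:" listing, then returns e.g.
# {"model_a": "/abs/path/models/model_a.joblib", "model_b": "/abs/path/models/model_b.joblib"}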
@@ -404,8 +439,8 @@ def distribute_datasets_by_target(
     Yields
     ------
     Tuple[str, pd.DataFrame]
-        * First element is the target column name.
-        * Second element is the corresponding cleaned DataFrame.
+        * Target name.
+        * Pandas DataFrame.
     """
     # Validate path
     if isinstance(df_or_path, str):
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "1.4.5"
+version = "1.4.7"
 description = "A collection of tools for data science and machine learning projects"
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }