dragon-ml-toolbox 1.4.5__py3-none-any.whl → 1.4.6__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.

dragon_ml_toolbox-1.4.5.dist-info/METADATA → dragon_ml_toolbox-1.4.6.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.4.5
+Version: 1.4.6
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT

dragon_ml_toolbox-1.4.5.dist-info/RECORD → dragon_ml_toolbox-1.4.6.dist-info/RECORD

@@ -1,19 +1,19 @@
-dragon_ml_toolbox-1.4.5.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-1.4.5.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=jDnniT0tgD0uw1NpjibsPF-qK3wmOKgTykLG2iNQU7E,1840
+dragon_ml_toolbox-1.4.6.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-1.4.6.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=jDnniT0tgD0uw1NpjibsPF-qK3wmOKgTykLG2iNQU7E,1840
 ml_tools/MICE_imputation.py,sha256=JMe9hyidJadFTHW7AHkNQ_fduTxH6CEh7_Ouy2LhCOQ,11096
 ml_tools/VIF_factor.py,sha256=HEBsLJy_qSDaPw1Btha5B7omxN4wjJXg-sqoetCjCJw,10016
 ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ml_tools/data_exploration.py,sha256=vSjqKwtmPm1RHhHdC5AWH8Edg78nTM5SowUzWG2AxdY,18951
+ml_tools/data_exploration.py,sha256=X9mYZdynRGghT06GeOdVsfGBTFa342Ko-MkMImDll-M,20123
 ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
 ml_tools/ensemble_learning.py,sha256=xJyEbkFObm5YX6DmDW10FOUjSeYeBRhHLvncWZv_uTo,37319
 ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
 ml_tools/logger.py,sha256=ZTtUB9HTkNs5zHTdYRKNbKADjUkuObsF7s8U5pNnVRA,4716
-ml_tools/particle_swarm_optimization.py,sha256=c5TG_MnCgxzkcREkqbJHqi6x_fovikUv2ePtKFL4HL8,20193
+ml_tools/particle_swarm_optimization.py,sha256=ByCYFV8PWP9CYGZ0wblphtmDLRbSezY9a0_fGqGWQV4,21891
 ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
 ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
-ml_tools/utilities.py,sha256=6369V_8VcDBPqUBOYS6VI6JKt0-wq_xVq9voTOU6VsQ,14515
+ml_tools/utilities.py,sha256=Ir3Yw4SuWMLKnbnl4Qzudn5U8CgcQ7zMtNqcllZMHeM,15682
 ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
-dragon_ml_toolbox-1.4.5.dist-info/METADATA,sha256=BDFkLly4Ylq2yPdQC3T4UVFr21fWjYlNgudq_bVWk-g,2516
-dragon_ml_toolbox-1.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-1.4.5.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-1.4.5.dist-info/RECORD,,
+dragon_ml_toolbox-1.4.6.dist-info/METADATA,sha256=SDqa8Cz72fH669cfuMIcVX02SC0DeK0UmKU-fAPx4AU,2516
+dragon_ml_toolbox-1.4.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-1.4.6.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-1.4.6.dist-info/RECORD,,

ml_tools/data_exploration.py CHANGED

@@ -5,9 +5,10 @@ import seaborn as sns
 from IPython import get_ipython
 from IPython.display import clear_output
 import time
-from typing import Union, Literal, Dict, Tuple, Iterator
+from typing import Union, Literal, Dict, Tuple, List
 import os
 from ml_tools.utilities import sanitize_filename, _script_info
+import re


 # Keep track of all available tools, show using `info()`
@@ -22,7 +23,8 @@ __all__ = [
     "check_value_distributions",
     "plot_value_distributions",
     "clip_outliers_single",
-    "clip_outliers_multi"
+    "clip_outliers_multi",
+    "match_and_filter_columns_by_regex"
 ]

@@ -245,9 +247,6 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
         cbar_kws={"shrink": 0.8}
     )

-    # sanitize the plot title
-    plot_title = sanitize_filename(plot_title)
-
     plt.title(plot_title)
     plt.xticks(rotation=45, ha='right')
     plt.yticks(rotation=0)
@@ -255,6 +254,8 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
     plt.tight_layout()

     if save_dir:
+        # sanitize the plot title to save the file
+        plot_title = sanitize_filename(plot_title)
         os.makedirs(save_dir, exist_ok=True)
         full_path = os.path.join(save_dir, plot_title + ".svg")
         plt.savefig(full_path, bbox_inches="tight", format='svg')
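
With this change the on-figure title keeps its original characters, and sanitize_filename() runs only when building the output file name. A minimal sketch of the resulting behavior, assuming the remaining parameters keep their defaults (toy data, hypothetical directory):

    import pandas as pd
    from ml_tools.data_exploration import plot_correlation_heatmap

    df = pd.DataFrame({"a": [1, 2, 3], "b": [2, 4, 6]})  # toy numeric frame

    # Shows the heatmap with the title as given; only the saved file name
    # is sanitized, producing e.g. "plots/<sanitized-title>.svg"
    plot_correlation_heatmap(df, save_dir="plots")
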
@@ -518,6 +519,36 @@ def clip_outliers_multi(
     return new_df


+def match_and_filter_columns_by_regex(
+    df: pd.DataFrame,
+    pattern: str,
+    case_sensitive: bool = False,
+    escape_pattern: bool = False
+) -> Tuple[pd.DataFrame, List[str]]:
+    """
+    Return a tuple of (filtered DataFrame, matched column names) based on a regex pattern.
+
+    Parameters:
+        df (pd.DataFrame): The DataFrame to search.
+        pattern (str): The regex pattern to match column names (use a raw string).
+        case_sensitive (bool): Whether matching is case-sensitive.
+        escape_pattern (bool): If True, the pattern is escaped with `re.escape()` to treat it literally.
+
+    Returns:
+        (Tuple[pd.DataFrame, list[str]]): A DataFrame filtered to matched columns, and a list of matching column names.
+    """
+    if escape_pattern:
+        pattern = re.escape(pattern)
+
+    mask = df.columns.str.contains(pattern, case=case_sensitive, regex=True)
+    matched_columns = df.columns[mask].to_list()
+    filtered_df = df.loc[:, mask]
+
+    print(f"{len(matched_columns)} column(s) match the regex pattern '{pattern}'.")
+
+    return filtered_df, matched_columns
+
+
 def _is_notebook():
     return get_ipython() is not None

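
A minimal usage sketch of the new helper, based on the signature above (the DataFrame and its column names are hypothetical):

    import pandas as pd
    from ml_tools.data_exploration import match_and_filter_columns_by_regex

    df = pd.DataFrame({
        "sensor_1": [0.1, 0.2],
        "sensor_2": [0.3, 0.4],
        "Sensor_3": [0.5, 0.6],
        "label": [0, 1],
    })

    # Matching is case-insensitive by default, so all three sensor columns match
    filtered, matched = match_and_filter_columns_by_regex(df, r"sensor_\d+")
    # prints: 3 column(s) match the regex pattern 'sensor_\d+'.
    # matched == ["sensor_1", "sensor_2", "Sensor_3"]
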

ml_tools/particle_swarm_optimization.py CHANGED

@@ -8,11 +8,12 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import polars as pl
 from functools import partial
-from .utilities import sanitize_filename, _script_info, threshold_binary_values, deserialize_object
+from .utilities import sanitize_filename, _script_info, threshold_binary_values, deserialize_object, list_files_by_extension


 __all__ = [
     "ObjectiveFunction",
+    "multiple_objective_functions_from_dir",
     "run_pso"
 ]

@@ -29,12 +30,12 @@ class ObjectiveFunction():
         Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
     add_noise : bool
         Whether to apply multiplicative noise to the input features during evaluation.
-    binary_features : int, default=0
-        Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.
-    task : Literal, default 'maximization'
+    task : (Literal["maximization", "minimization"])
         Whether to maximize or minimize the target.
+    binary_features : int
+        Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.
     """
-    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int=0) -> None:
+    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int) -> None:
         self.binary_features = binary_features
         self.is_hybrid = False if binary_features <= 0 else True
         self.use_noise = add_noise
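
The practical effect: binary_features no longer defaults to 0 and must be passed explicitly. A construction sketch (the model path is hypothetical):

    objective = ObjectiveFunction(
        trained_model_path="models/regressor.joblib",  # hypothetical serialized model
        add_noise=True,
        task="maximization",
        binary_features=2,  # binary features sit at the END of the feature vector
    )
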
@@ -96,6 +97,35 @@ class ObjectiveFunction():
         return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")


+def multiple_objective_functions_from_dir(directory: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int):
+    """
+    Loads multiple objective functions from serialized models in the given directory.
+
+    Each `.joblib` file is loaded and wrapped as an `ObjectiveFunction` instance. Returns a list of such instances along with their corresponding names.
+
+    Parameters:
+        directory (str) : Path to the directory containing `.joblib` files (serialized models).
+        add_noise (bool) : Whether to apply multiplicative noise to the input features during evaluation.
+        task (Literal["maximization", "minimization"]) : Defines the nature of the optimization task.
+        binary_features (int) : Number of binary features expected by each objective function.
+
+    Returns:
+        (tuple[list[ObjectiveFunction], list[str]]) : A tuple containing:
+            - list of `ObjectiveFunction` instances.
+            - list of corresponding filenames.
+    """
+    objective_functions = list()
+    objective_function_names = list()
+    for file_name, file_path in list_files_by_extension(directory=directory, extension='joblib').items():
+        current_objective = ObjectiveFunction(trained_model_path=file_path,
+                                              add_noise=add_noise,
+                                              task=task,
+                                              binary_features=binary_features)
+        objective_functions.append(current_objective)
+        objective_function_names.append(file_name)
+    return objective_functions, objective_function_names
+
+
 def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
     assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
     assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
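
A usage sketch for the new loader, assuming a hypothetical models/ directory that holds one serialized model per optimization target:

    objectives, names = multiple_objective_functions_from_dir(
        directory="models/",
        add_noise=False,
        task="minimization",
        binary_features=0,
    )
    for name, objective in zip(names, objectives):
        print(name, objective)  # __repr__ shows model type, noise, hybrid flag, task
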
@@ -131,9 +161,9 @@ def run_pso(lower_boundaries: list[float],
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
             swarm_size: int=200,
-            max_iterations: int=1500,
+            max_iterations: int=1000,
             inequality_constrain_function=None,
-            post_hoc_analysis: Optional[int]=5,
+            post_hoc_analysis: Optional[int]=3,
             workers: int=1) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.

ml_tools/utilities.py CHANGED
@@ -13,6 +13,7 @@ from joblib.externals.loky.process_executor import TerminatedWorkerError
 # Keep track of available tools
 __all__ = [
     "list_csv_paths",
+    "list_files_by_extension",
     "load_dataframe",
     "yield_dataframes_from_dir",
     "merge_dataframes",
@@ -34,7 +35,7 @@ def list_csv_paths(directory: str) -> dict[str, str]:
         directory (str): Path to the directory containing `.csv` files.

     Returns:
-        (dict[str, str]): Mapping {name, path}.
+        (dict[str, str]): Dictionary mapping {filename: filepath}.
     """
     dir_path = Path(directory).expanduser().resolve()

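
The clarified return contract, sketched with hypothetical file names:

    csv_map = list_csv_paths("data/")  # directory containing train.csv and test.csv
    # csv_map == {"train": "/abs/path/data/train.csv",
    #             "test": "/abs/path/data/test.csv"}
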
@@ -48,13 +49,47 @@ def list_csv_paths(directory: str) -> dict[str, str]:
     # make a dictionary of paths and names
     name_path_dict = {p.stem: str(p) for p in csv_paths}

-    print("🗂️ CSV files found:")
+    print("\n🗂️ CSV files found:")
     for name in name_path_dict.keys():
         print(f"\t{name}")

     return name_path_dict


+def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
+    """
+    Lists all files with the specified extension in the given directory and returns
+    a mapping of filenames (without extensions) to their absolute paths.
+
+    Parameters:
+        directory (str): Path to the directory to search in.
+        extension (str): File extension to search for (e.g., 'json', 'txt').
+
+    Returns:
+        (dict[str, str]): Dictionary mapping {filename: filepath}.
+    """
+    dir_path = Path(directory).expanduser().resolve()
+
+    if not dir_path.is_dir():
+        raise FileNotFoundError(f"Directory not found: {dir_path}")
+
+    # Normalize the extension (remove leading dot if present)
+    normalized_ext = extension.lstrip(".").lower()
+    pattern = f"*.{normalized_ext}"
+
+    matched_paths = list(dir_path.glob(pattern))
+    if not matched_paths:
+        raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")
+
+    name_path_dict = {p.stem: str(p) for p in matched_paths}
+
+    print(f"\n📂 '{normalized_ext.upper()}' files found:")
+    for name in name_path_dict:
+        print(f"\t{name}")
+
+    return name_path_dict
+
+
 def load_dataframe(df_path: str) -> tuple[pd.DataFrame, str]:
     """
     Load a CSV file into a pandas DataFrame and extract the base name (without extension) from the file path.
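
A usage sketch for the new helper (the directory is hypothetical; the leading dot in the extension is optional, since it is stripped during normalization):

    from ml_tools.utilities import list_files_by_extension

    models = list_files_by_extension("models/", ".joblib")  # same as "joblib"
    for name, path in models.items():
        print(name, "->", path)
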
@@ -404,8 +439,8 @@ def distribute_datasets_by_target(
     Yields
     ------
     Tuple[str, pd.DataFrame]
-        * First element is the target column name.
-        * Second element is the corresponding cleaned DataFrame.
+        * Target name.
+        * Pandas DataFrame.
     """
     # Validate path
     if isinstance(df_or_path, str):