dragon-ml-toolbox 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 1.4.0
+ Version: 1.4.1
  Summary: A collection of tools for data science and machine learning projects
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -80,7 +80,7 @@ Clone the repository and install in editable mode with optional dependencies:
  ```bash
  git clone https://github.com/DrAg0n-BoRn/ML_tools.git
  cd ML_tools
- pip install -e '.[pytorch]'
+ pip install -e .
  ```

  ## Usage
@@ -91,3 +91,19 @@ After installation, import modules like this:
  from ml_tools.utilities import sanitize_filename
  from ml_tools.logger import custom_logger
  ```
+
+ ## Available modules
+
+ ```bash
+ data_exploration
+ datasetmaster
+ ensemble_learning
+ handle_excel
+ logger
+ MICE_imputation
+ particle_swarm_optimization
+ trainer
+ utilities
+ VIF_factor
+ vision_helpers
+ ```
@@ -0,0 +1,19 @@
+ dragon_ml_toolbox-1.4.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+ dragon_ml_toolbox-1.4.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
+ ml_tools/MICE_imputation.py,sha256=CK0tYZ_kQkdETohOlhI7RP7oFkJTXrP-XtIxb--dzpU,9726
+ ml_tools/VIF_factor.py,sha256=LQWr1P8WYij07FX_3RZC6Rr22bfAMnrt0Lhvi7SbBpY,9846
+ ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ml_tools/data_exploration.py,sha256=FXP5i6bQo8J3RCyLRmlX-qJVh4VH8DbMjrdUmyd1mF0,18708
+ ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
+ ml_tools/ensemble_learning.py,sha256=khXXRiR7boWwI4CAvb2bxzS3fhLADNETMOiRe3ihZ4Y,28821
+ ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
+ ml_tools/logger.py,sha256=NOtL3YSuffAGmpTpXjY-uJjqFLdRG_jpL7MDyloBw9c,4712
+ ml_tools/particle_swarm_optimization.py,sha256=714kZo6lvUvRaPTtj6kJGecZwHcehcSkLysokXAf3No,20706
+ ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
+ ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
+ ml_tools/utilities.py,sha256=z2JPy4GM2YBLUC0sPq7aNLuesPFAQu5KNcsgmuOywdU,8738
+ ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
+ dragon_ml_toolbox-1.4.1.dist-info/METADATA,sha256=0XdPwNWe81rCvJLJfSS5XvB2ZdJKpBLLoqMU5uxYLMc,2516
+ dragon_ml_toolbox-1.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-1.4.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-1.4.1.dist-info/RECORD,,
ml_tools/MICE_imputation.py CHANGED
@@ -3,9 +3,20 @@ import miceforest as mf
  import os
  import matplotlib.pyplot as plt
  import numpy as np
- from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename
+ from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info
  from plotnine import ggplot, labs, theme, element_blank # type: ignore

+
+ __all__ = [
+ "apply_mice",
+ "save_imputed_datasets",
+ "get_na_column_names",
+ "get_convergence_diagnostic",
+ "get_imputed_distributions",
+ "run_mice_pipeline"
+ ]
+
+
  def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):

  # Initialize kernel with number of imputed datasets to generate
@@ -210,7 +221,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
  if os.path.isfile(df_path_or_dir):
  all_file_paths = [df_path_or_dir]
  elif os.path.isdir(df_path_or_dir):
- all_file_paths = list_csv_paths(df_path_or_dir).values()
+ all_file_paths = list(list_csv_paths(df_path_or_dir).values())
  else:
  raise ValueError(f"Invalid path or directory: {df_path_or_dir}")

@@ -226,3 +237,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
  get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_dir)

  get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)
+
+
+ def info():
+ _script_info(__all__)
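Every module in 1.4.1 gains a module-level `info()` helper backed by `_script_info` from `ml_tools.utilities` (its implementation appears in the utilities diff below), which simply enumerates the module's `__all__`. A minimal usage sketch, assuming the 1.4.1 package is installed:

```python
# Illustrative only: list the public tools of a module via the new info() helper.
from ml_tools import MICE_imputation

MICE_imputation.info()
# Prints, per _script_info:
# Available functions and objects:
# 1 - apply_mice
# 2 - save_imputed_datasets
# ... and so on through __all__
```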
ml_tools/VIF_factor.py CHANGED
@@ -7,12 +7,19 @@ from statsmodels.stats.outliers_influence import variance_inflation_factor
  from statsmodels.tools.tools import add_constant
  import warnings
  import os
- from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe
+ from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info
+
+
+ __all__ = [
+ "compute_vif",
+ "drop_vif_based",
+ "compute_vif_multi"
+ ]


  def compute_vif(
  df: pd.DataFrame,
- target_columns: Optional[list[str]] = None,
+ use_columns: Optional[list[str]] = None,
  ignore_columns: Optional[list[str]] = None,
  max_features_to_plot: int = 20,
  save_dir: Optional[str] = None,
@@ -25,7 +32,7 @@ def compute_vif(

  Args:
  df (pd.DataFrame): The input DataFrame.
- target_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
+ use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
  ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
  max_features_to_plot (int): Adjust the number of features shown in the plot.
  save_dir (str | None): Directory to save the plot as SVG. If None, the plot is not saved.
@@ -42,20 +49,20 @@ def compute_vif(
  A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
  """
  ground_truth_cols = df.columns.to_list()
- if target_columns is None:
+ if use_columns is None:
  sanitized_columns = df.select_dtypes(include='number').columns.tolist()
  missing_features = set(ground_truth_cols) - set(sanitized_columns)
  if missing_features:
  print(f"⚠️ These columns are not Numeric:\n{missing_features}")
  else:
  sanitized_columns = list()
- for feature in target_columns:
+ for feature in use_columns:
  if feature not in ground_truth_cols:
  print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
  else:
  sanitized_columns.append(feature)

- if ignore_columns is not None and target_columns is None:
+ if ignore_columns is not None and use_columns is None:
  missing_ignore = set(ignore_columns) - set(ground_truth_cols)
  if missing_ignore:
  print(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
@@ -137,7 +144,7 @@ def compute_vif(
  return vif_data.drop(columns="color")


- def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10.0) -> pd.DataFrame:
+ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10.0) -> tuple[pd.DataFrame, list[str]]:
  """
  Drops columns from the original DataFrame based on their VIF values exceeding a given threshold.

@@ -147,7 +154,9 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
  threshold (float): VIF threshold above which columns will be dropped.

  Returns:
- pd.DataFrame: A new DataFrame with high-VIF columns removed.
+ (tuple[pd.DataFrame, list[str]]):
+ - A new DataFrame with high-VIF columns removed.
+ - A list with dropped column names.
  """
  # Ensure expected structure
  if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
@@ -162,13 +171,13 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
  if result_df.empty:
  print(f"\t⚠️ Warning: All columns were dropped.")

- return result_df
+ return result_df, to_drop


  def compute_vif_multi(input_directory: str,
  output_plot_directory: str,
  output_dataset_directory: Optional[str] = None,
- target_columns: Optional[list[str]] = None,
+ use_columns: Optional[list[str]] = None,
  ignore_columns: Optional[list[str]] = None,
  max_features_to_plot: int = 20,
  fontsize: int = 14):
@@ -180,7 +189,7 @@ def compute_vif_multi(input_directory: str,
  input_directory (str): Target directory with CSV files able to be loaded as DataFrame.
  output_plot_directory (str): Save plots to this directory.
  output_dataset_directory (str | None): If provided, saves new CSV files to this directory.
- target_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
+ use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
  ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
  max_features_to_plot (int): Adjust the number of features shown in the plot.
  fontsize (int): Base fontsize to scale title and labels on hte plot.
@@ -195,7 +204,7 @@ def compute_vif_multi(input_directory: str,

  for df, df_name in yield_dataframes_from_dir(datasets_dir=input_directory):
  vif_dataframe = compute_vif(df=df,
- target_columns=target_columns,
+ use_columns=use_columns,
  ignore_columns=ignore_columns,
  max_features_to_plot=max_features_to_plot,
  fontsize=fontsize,
@@ -205,5 +214,11 @@ def compute_vif_multi(input_directory: str,

  if output_dataset_directory is not None:
  new_filename = 'VIF_' + df_name
- result_df = drop_vif_based(df=df, vif_df=vif_dataframe)
- save_dataframe(df=result_df, save_dir=output_dataset_directory, filename=new_filename)
+ result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)
+
+ if len(dropped_cols) > 0:
+ save_dataframe(df=result_df, save_dir=output_dataset_directory, filename=new_filename)
+
+
+ def info():
+ _script_info(__all__)
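Two call-site-visible changes in this file: `compute_vif` takes `use_columns` instead of `target_columns`, and `drop_vif_based` now returns a `(DataFrame, dropped_columns)` tuple. A hedged usage sketch; the toy data and column names are made up:

```python
# Illustrative sketch of the 1.4.1 VIF API; values are fabricated for demonstration.
import pandas as pd
from ml_tools.VIF_factor import compute_vif, drop_vif_based

df = pd.DataFrame({
    "x1": [1.0, 2.0, 3.0, 4.0],
    "x2": [2.1, 3.9, 6.2, 8.1],   # nearly collinear with x1
    "x3": [0.3, 1.7, 0.9, 2.2],
})

vif_df = compute_vif(df, use_columns=["x1", "x2", "x3"])        # renamed parameter
reduced_df, dropped_cols = drop_vif_based(df, vif_df, threshold=10.0)  # tuple return
print(dropped_cols)   # column names whose VIF exceeded the threshold
```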
ml_tools/data_exploration.py CHANGED
@@ -9,22 +9,23 @@ from typing import Union, Literal, Dict, Tuple
  import os
  import sys
  import textwrap
- from ml_tools.utilities import sanitize_filename
+ from ml_tools.utilities import sanitize_filename, _script_info


- # Keep track of all available functions, show using `info()`
- __all__ = ["summarize_dataframe",
- "drop_rows_with_missing_data",
- "split_features_targets",
- "show_null_columns",
- "drop_columns_with_missing_data",
- "split_continuous_binary",
- "plot_correlation_heatmap",
- "check_value_distributions",
- "plot_value_distributions",
- "clip_outliers_single",
- "clip_outliers_multi",
- "merge_dataframes"]
+ # Keep track of all available tools, show using `info()`
+ __all__ = [
+ "summarize_dataframe",
+ "drop_rows_with_missing_data",
+ "split_features_targets",
+ "show_null_columns",
+ "drop_columns_with_missing_data",
+ "split_continuous_binary",
+ "plot_correlation_heatmap",
+ "check_value_distributions",
+ "plot_value_distributions",
+ "clip_outliers_single",
+ "clip_outliers_multi"
+ ]


  def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
@@ -58,34 +59,6 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
  return summary


- def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
- """
- Displays a table of columns with missing values, showing both the count and
- percentage of missing entries per column.
-
- Parameters:
- df (pd.DataFrame): The input DataFrame.
- round_digits (int): Number of decimal places for the percentage.
-
- Returns:
- pd.DataFrame: A DataFrame summarizing missing values in each column.
- """
- null_counts = df.isnull().sum()
- null_percent = df.isnull().mean() * 100
-
- # Filter only columns with at least one null
- mask = null_counts > 0
- null_summary = pd.DataFrame({
- 'Missing Count': null_counts[mask],
- 'Missing %': null_percent[mask].round(round_digits)
- })
-
- # Sort by descending percentage of missing values
- null_summary = null_summary.sort_values(by='Missing %', ascending=False)
- # print(null_summary)
- return null_summary
-
-
  def drop_rows_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
  """
  Drops rows with more than `threshold` fraction of missing values.
@@ -132,6 +105,57 @@ def split_features_targets(df: pd.DataFrame, targets: list[str]):
  return df_targets, df_features


+ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
+ """
+ Displays a table of columns with missing values, showing both the count and
+ percentage of missing entries per column.
+
+ Parameters:
+ df (pd.DataFrame): The input DataFrame.
+ round_digits (int): Number of decimal places for the percentage.
+
+ Returns:
+ pd.DataFrame: A DataFrame summarizing missing values in each column.
+ """
+ null_counts = df.isnull().sum()
+ null_percent = df.isnull().mean() * 100
+
+ # Filter only columns with at least one null
+ mask = null_counts > 0
+ null_summary = pd.DataFrame({
+ 'Missing Count': null_counts[mask],
+ 'Missing %': null_percent[mask].round(round_digits)
+ })
+
+ # Sort by descending percentage of missing values
+ null_summary = null_summary.sort_values(by='Missing %', ascending=False)
+ # print(null_summary)
+ return null_summary
+
+
+ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
+ """
+ Drops columns with more than `threshold` fraction of missing values.
+
+ Parameters:
+ df (pd.DataFrame): The input DataFrame.
+ threshold (float): Fraction of missing values above which columns are dropped.
+
+ Returns:
+ pd.DataFrame: A new DataFrame without the dropped columns.
+ """
+ missing_fraction = df.isnull().mean()
+ cols_to_drop = missing_fraction[missing_fraction > threshold].index
+
+ if len(cols_to_drop) > 0:
+ print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
+ print(list(cols_to_drop))
+ else:
+ print(f"No columns have more than {threshold*100:.0f}% missing data.")
+
+ return df.drop(columns=cols_to_drop)
+
+
  def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
  """
  Split DataFrame into two DataFrames: one with continuous columns, one with binary columns.
@@ -174,29 +198,6 @@ def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFram

  return df_cont, df_bin # type: ignore

-
- def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
- """
- Drops columns with more than `threshold` fraction of missing values.
-
- Parameters:
- df (pd.DataFrame): The input DataFrame.
- threshold (float): Fraction of missing values above which columns are dropped.
-
- Returns:
- pd.DataFrame: A new DataFrame without the dropped columns.
- """
- missing_fraction = df.isnull().mean()
- cols_to_drop = missing_fraction[missing_fraction > threshold].index
-
- if len(cols_to_drop) > 0:
- print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
- print(list(cols_to_drop))
- else:
- print(f"No columns have more than {threshold*100:.0f}% missing data.")
-
- return df.drop(columns=cols_to_drop)
-

  def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None, method: Literal["pearson", "kendall", "spearman"]="pearson", plot_title: str="Correlation Heatmap"):
  """
@@ -513,83 +514,10 @@ def clip_outliers_multi(
  return new_df


- def merge_dataframes(
- *dfs: pd.DataFrame,
- reset_index: bool = False,
- direction: Literal["horizontal", "vertical"] = "horizontal"
- ) -> pd.DataFrame:
- """
- Merges multiple DataFrames either horizontally or vertically.
-
- Parameters:
- *dfs (pd.DataFrame): Variable number of DataFrames to merge.
- reset_index (bool): Whether to reset index in the final merged DataFrame.
- direction (["horizontal" | "vertical"]):
- - "horizontal": Merge on index, adding columns.
- - "vertical": Append rows; all DataFrames must have identical columns.
-
- Returns:
- pd.DataFrame: A single merged DataFrame.
-
- Raises:
- ValueError:
- - If fewer than 2 DataFrames are provided.
- - If indexes do not match for horizontal merge.
- - If column names or order differ for vertical merge.
- """
- if len(dfs) < 2:
- raise ValueError("At least 2 DataFrames must be provided.")
-
- for i, df in enumerate(dfs, start=1):
- print(f"DataFrame {i} shape: {df.shape}")
-
-
- if direction == "horizontal":
- reference_index = dfs[0].index
- for i, df in enumerate(dfs, start=1):
- if not df.index.equals(reference_index):
- raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
- merged_df = pd.concat(dfs, axis=1)
-
- elif direction == "vertical":
- reference_columns = dfs[0].columns
- for i, df in enumerate(dfs, start=1):
- if not df.columns.equals(reference_columns):
- raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
- merged_df = pd.concat(dfs, axis=0)
-
- else:
- raise ValueError(f"Invalid merge direction: {direction}")
-
- if reset_index:
- merged_df = merged_df.reset_index(drop=True)
-
- print(f"Merged DataFrame shape: {merged_df.shape}")
-
- return merged_df
-
-
  def _is_notebook():
  return get_ipython() is not None


- def info(full_info: bool=True):
- """
- List available functions and their descriptions.
- """
- print("Available functions for data exploration:")
- if full_info:
- module = sys.modules[__name__]
- for name in __all__:
- obj = getattr(module, name, None)
- if callable(obj):
- doc = obj.__doc__ or "No docstring provided."
- formatted_doc = textwrap.indent(textwrap.dedent(doc.strip()), prefix=" ")
- print(f"\n{name}:\n{formatted_doc}")
- else:
- for i, name in enumerate(__all__, start=1):
- print(f"{i} - {name}")
-
+ def info():
+ _script_info(__all__)

- if __name__ == "__main__":
- info()
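The two relocated helpers above (`show_null_columns`, `drop_columns_with_missing_data`) are unchanged in behavior. A short usage sketch with a toy DataFrame, assuming the package and its dependencies are installed:

```python
# Illustrative sketch of the missing-data helpers shown in the diff above.
import numpy as np
import pandas as pd
from ml_tools.data_exploration import show_null_columns, drop_columns_with_missing_data

df = pd.DataFrame({
    "a": [1.0, np.nan, 3.0, np.nan],     # 50% missing -> kept at threshold 0.7
    "b": [np.nan, np.nan, np.nan, 4.0],  # 75% missing -> dropped at threshold 0.7
    "c": [1, 2, 3, 4],
})

print(show_null_columns(df))                         # per-column missing count and %
cleaned = drop_columns_with_missing_data(df, threshold=0.7)
print(cleaned.columns.tolist())                      # ['a', 'c']
```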
ml_tools/datasetmaster.py CHANGED
@@ -11,6 +11,15 @@ from PIL import Image
  from torchvision.datasets import ImageFolder
  from torchvision import transforms
  import matplotlib.pyplot as plt
+ from .utilities import _script_info
+
+
+ __all__ = [
+ "DatasetMaker",
+ "PytorchDataset",
+ "make_vision_dataset",
+ "SequenceDataset",
+ ]


  class DatasetMaker():
@@ -592,4 +601,7 @@ class SequenceDataset():

  def __len__(self):
  return f"Train: {len(self.train_dataset)}, Test: {len(self.test_dataset)}"
-
+
+
+ def info():
+ _script_info(__all__)
ml_tools/ensemble_learning.py CHANGED
@@ -21,7 +21,7 @@ from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
  from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
  import shap

- from .utilities import yield_dataframes_from_dir
+ from .utilities import yield_dataframes_from_dir, sanitize_filename

  import warnings # Ignore warnings
  warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -245,7 +245,9 @@ def _local_directories(model_name: str, dataset_id: str, save_dir: str):

  # save model
  def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str, scaler_object: Union[StandardScaler, MinMaxScaler, MaxAbsScaler]):
- full_path = os.path.join(save_directory, f"{model_name}_{target_name}.joblib")
+ #Sanitize filenames to save
+ sanitized_target_name = sanitize_filename(target_name)
+ full_path = os.path.join(save_directory, f"{model_name}_{sanitized_target_name}.joblib")
  joblib.dump({'model': trained_model, 'scaler':scaler_object, 'feature_names': feature_names, 'target_name':target_name}, full_path)

  # function to evaluate the model and save metrics (Classification)
@@ -298,7 +300,8 @@ def evaluate_model_classification(
  )

  # Save text report
- report_path = os.path.join(save_dir, f"Classification_Report_{target_id}.txt")
+ sanitized_target_id = sanitize_filename(target_id)
+ report_path = os.path.join(save_dir, f"Classification_Report_{sanitized_target_id}.txt")
  with open(report_path, "w") as f:
  f.write(f"{model_name} - {target_id}\t\tAccuracy: {accuracy:.2f}\n")
  f.write("Classification Report:\n")
@@ -328,7 +331,7 @@ def evaluate_model_classification(
  text.set_fontsize(title_fontsize+4)

  fig.tight_layout()
- fig_path = os.path.join(save_dir, f"Confusion_Matrix_{target_id}.svg")
+ fig_path = os.path.join(save_dir, f"Confusion_Matrix_{sanitized_target_id}.svg")
  fig.savefig(fig_path, format="svg", bbox_inches="tight")
  plt.close(fig)

@@ -411,7 +414,8 @@ def plot_roc_curve(

  # Save figure
  os.makedirs(save_directory, exist_ok=True)
- save_path = os.path.join(save_directory, f"ROC_{target_name}.svg")
+ sanitized_target_name = sanitize_filename(target_name)
+ save_path = os.path.join(save_directory, f"ROC_{sanitized_target_name}.svg")
  fig.savefig(save_path, bbox_inches="tight", format="svg")

  return fig
@@ -435,7 +439,8 @@ def evaluate_model_regression(model, model_name: str,
  r2 = r2_score(single_y_test, y_pred)

  # Create formatted report
- report_path = os.path.join(save_dir, f"Regression_Report_{target_id}.txt")
+ sanitized_target_id = sanitize_filename(target_id)
+ report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_id}.txt")
  with open(report_path, "w") as f:
  f.write(f"{model_name} - {target_id} Regression Performance\n")
  f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
@@ -453,7 +458,7 @@ def evaluate_model_regression(model, model_name: str,
  plt.title(f"{model_name} - Residual Plot for {target_id}", fontsize=base_fontsize)
  plt.grid(True)
  plt.tight_layout()
- plt.savefig(os.path.join(save_dir, f"Residual_Plot_{target_id}.svg"), bbox_inches='tight', format="svg")
+ plt.savefig(os.path.join(save_dir, f"Residual_Plot_{sanitized_target_id}.svg"), bbox_inches='tight', format="svg")
  plt.close()

  # Create true vs predicted values plot
@@ -466,12 +471,13 @@ def evaluate_model_regression(model, model_name: str,
  plt.ylabel('Predictions', fontsize=base_fontsize)
  plt.title(f"{model_name} - True vs Predicted for {target_id}", fontsize=base_fontsize)
  plt.grid(True)
- plot_path = os.path.join(save_dir, f"Regression_Plot_{target_id}.svg")
+ plot_path = os.path.join(save_dir, f"Regression_Plot_{sanitized_target_id}.svg")
  plt.savefig(plot_path, bbox_inches='tight', format="svg")
  plt.close()

  return y_pred

+
  # Get SHAP values
  def get_shap_values(
  model,
@@ -498,7 +504,8 @@ def get_shap_values(
  features_to_explain: Should match the model's training data format, including scaling.
  save_dir: Directory to save visualizations
  """
-
+ sanitized_target_id = sanitize_filename(target_id)
+
  def _apply_plot_style():
  styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
  for style in styles:
@@ -560,7 +567,7 @@ def get_shap_values(
  _create_shap_plot(
  shap_values=class_shap,
  features=features_to_explain,
- save_path=os.path.join(save_dir, f"SHAP_{target_id}_Class{class_name}_{plot_type}.svg"),
+ save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_Class{class_name}_{plot_type}.svg"),
  plot_type=plot_type,
  title=f"{model_name} - {target_id} (Class {class_name})"
  )
@@ -570,7 +577,7 @@ def get_shap_values(
  _create_shap_plot(
  shap_values=values,
  features=features_to_explain,
- save_path=os.path.join(save_dir, f"SHAP_{target_id}_{plot_type}.svg"),
+ save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_{plot_type}.svg"),
  plot_type=plot_type,
  title=f"{model_name} - {target_id}"
  )
@@ -580,10 +587,11 @@ def get_shap_values(
  _create_shap_plot(
  shap_values=shap_values,
  features=features_to_explain,
- save_path=os.path.join(save_dir, f"SHAP_{target_id}_{plot_type}.svg"),
+ save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_{plot_type}.svg"),
  plot_type=plot_type,
  title=f"{model_name} - {target_id}"
  )
+ #START_O

  explainer = shap.TreeExplainer(model)
  shap_values = explainer.shap_values(features_to_explain)
@@ -672,6 +680,6 @@ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list

  def _check_paths(datasets_dir: str, save_dir:str):
  if not os.path.isdir(save_dir):
- os.makedirs(save_dir)
+ os.makedirs(save_dir)
  if not os.path.isdir(datasets_dir):
  raise IOError(f"Datasets directory '{datasets_dir}' not found.")
ml_tools/handle_excel.py CHANGED
@@ -2,6 +2,16 @@ import os
  from openpyxl import load_workbook, Workbook
  import pandas as pd
  from typing import List, Optional
+ from utilities import _script_info, sanitize_filename
+
+
+ __all__ = [
+ "unmerge_and_split_excel",
+ "unmerge_and_split_from_directory",
+ "validate_excel_schema",
+ "vertical_merge_transform_excel",
+ "horizontal_merge_transform_excel"
+ ]


  def unmerge_and_split_excel(filepath: str) -> None:
@@ -25,12 +35,12 @@ def unmerge_and_split_excel(filepath: str) -> None:
  ws = wb[sheet_name]
  new_wb = Workbook()
  new_ws = new_wb.active
- new_ws.title = sheet_name
+ new_ws.title = sheet_name # type: ignore

  # Copy all cell values
  for row in ws.iter_rows():
  for cell in row:
- new_ws.cell(row=cell.row, column=cell.column, value=cell.value)
+ new_ws.cell(row=cell.row, column=cell.column, value=cell.value) # type: ignore

  # Fill and unmerge merged regions
  for merged_range in list(ws.merged_cells.ranges):
@@ -41,10 +51,10 @@ def unmerge_and_split_excel(filepath: str) -> None:
  value = ws.cell(row=min_row, column=min_col).value
  for row in range(min_row, max_row + 1):
  for col in range(min_col, max_col + 1):
- new_ws.cell(row=row, column=col, value=value)
+ new_ws.cell(row=row, column=col, value=value) # type: ignore

  # Construct flat output file name
- sanitized_sheet_name = sheet_name.replace("/", "_").replace("\\", "_")
+ sanitized_sheet_name = sanitize_filename(sheet_name)
  output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
  output_path = os.path.join(base_dir, output_filename)
  new_wb.save(output_path)
@@ -85,12 +95,12 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
  ws = wb[sheet_name]
  new_wb = Workbook()
  new_ws = new_wb.active
- new_ws.title = sheet_name
+ new_ws.title = sheet_name # type: ignore

  # Copy all cell values
  for row in ws.iter_rows():
  for cell in row:
- new_ws.cell(row=cell.row, column=cell.column, value=cell.value)
+ new_ws.cell(row=cell.row, column=cell.column, value=cell.value) # type: ignore

  # Fill and unmerge merged regions
  for merged_range in list(ws.merged_cells.ranges):
@@ -101,10 +111,10 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
  value = ws.cell(row=min_row, column=min_col).value
  for row in range(min_row, max_row + 1):
  for col in range(min_col, max_col + 1):
- new_ws.cell(row=row, column=col, value=value)
+ new_ws.cell(row=row, column=col, value=value) # type: ignore

  # Construct flat output file name
- sanitized_sheet_name = sheet_name.replace("/", "_").replace("\\", "_")
+ sanitized_sheet_name = sanitize_filename(sheet_name)
  output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
  output_path = os.path.join(output_dir, output_filename)
  new_wb.save(output_path)
@@ -151,7 +161,7 @@ def validate_excel_schema(
  wb = load_workbook(file_path, read_only=True)
  ws = wb.active # Only check the first worksheet

- header = [cell.value for cell in next(ws.iter_rows(max_row=1))]
+ header = [cell.value for cell in next(ws.iter_rows(max_row=1))] # type: ignore

  if strict:
  if header != expected_columns:
@@ -202,6 +212,11 @@ def vertical_merge_transform_excel(

  if not excel_files:
  raise ValueError("No Excel files found in the target directory.")
+
+ # sanitize filename
+ csv_filename = sanitize_filename(csv_filename)
+ # make directory
+ os.makedirs(output_dir, exist_ok=True)

  csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
  csv_path = os.path.join(output_dir, csv_filename)
@@ -260,6 +275,11 @@ def horizontal_merge_transform_excel(
  excel_files = [f for f in raw_excel_files if not f.startswith('~')] # Exclude temporary files
  if not excel_files:
  raise ValueError("No Excel files found in the target directory.")
+
+ # sanitize filename
+ csv_filename = sanitize_filename(csv_filename)
+ # make directory
+ os.makedirs(output_dir, exist_ok=True)

  csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
  csv_path = os.path.join(output_dir, csv_filename)
@@ -308,3 +328,6 @@ def horizontal_merge_transform_excel(
  if duplicate_columns:
  print(f"⚠️ Duplicate columns: {duplicate_columns}")

+
+ def info():
+ _script_info(__all__)
ml_tools/logger.py CHANGED
@@ -5,7 +5,12 @@ import pandas as pd
  from openpyxl.styles import Font, PatternFill
  import traceback
  import json
- from ml_tools.utilities import sanitize_filename
+ from ml_tools.utilities import sanitize_filename, _script_info
+
+
+ __all__ = [
+ "custom_logger"
+ ]


  def custom_logger(
@@ -143,3 +148,7 @@ def _log_exception_to_log(exc: BaseException, path: str) -> None:
  def _log_dict_to_json(data: Dict[Any, Any], path: str) -> None:
  with open(path, 'w', encoding='utf-8') as f:
  json.dump(data, f, indent=4, ensure_ascii=False)
+
+
+ def info():
+ _script_info(__all__)
ml_tools/particle_swarm_optimization.py CHANGED
@@ -5,23 +5,29 @@ import xgboost as xgb
  import lightgbm as lgb
  from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
  from sklearn.base import ClassifierMixin
- from sklearn.preprocessing import StandardScaler
+ from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
  from typing import Literal, Union, Tuple, Dict
- from collections.abc import Sequence
  import polars as pl
  from functools import partial
+ from .utilities import sanitize_filename, _script_info
+
+
+ __all__ = [
+ "ObjectiveFunction",
+ "run_pso"
+ ]


  class ObjectiveFunction():
  """
  Callable objective function designed for optimizing continuous outputs from regression models.

- The trained model must include a 'model' and a 'scaler'. Additionally 'feature_names' and 'target_name' will be parsed if present.
+ The target serialized file (joblib) must include a 'model' and a 'scaler'. Additionally 'feature_names' and 'target_name' will be parsed if present.

  Parameters
  ----------
  trained_model_path : str
- Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
+ Path to a serialized model and its scaler (joblib) compatible with scikit-learn-like `.predict`.
  add_noise : bool
  Whether to apply multiplicative noise to the input features during evaluation.
  binary_features : int, default=0
@@ -67,8 +73,18 @@ class ObjectiveFunction():
  return new_feature_values

  def _handle_hybrid(self, features_array):
- feat_continuous = features_array[:self.binary_features]
- feat_binary = (features_array[self.binary_features:] > 0.5).astype(int) #threshold binary values
+ total_features = features_array.shape[0]
+ if self.binary_features > total_features:
+ raise ValueError("self.binary_features exceeds total number of features.")
+
+ # Handle corner case where all features are binary
+ if self.binary_features == total_features:
+ feat_binary = (features_array > 0.5).astype(int)
+ return feat_binary
+
+ # Normal case: split into continuous and binary parts
+ feat_continuous = features_array[:-self.binary_features]
+ feat_binary = (features_array[-self.binary_features:] > 0.5).astype(int) #threshold binary values
  new_feature_values = np.concatenate([feat_continuous, feat_binary])
  return new_feature_values

@@ -92,7 +108,7 @@ class ObjectiveFunction():
  return (f"<ObjectiveFunction(model={type(self.model).__name__}, scaler={type(self.scaler).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")


- def _set_boundaries(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]):
+ def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
  assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
  assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
  lower = np.array(lower_boundaries)
@@ -112,31 +128,40 @@ def _save_results(*dicts, save_dir: str, target_name: str):
  combined_dict = dict()
  for single_dict in dicts:
  combined_dict.update(single_dict)
-
- full_path = os.path.join(save_dir, f"results_{target_name}.csv")
+
+ sanitized_target_name = sanitize_filename(target_name)
+
+ full_path = os.path.join(save_dir, f"Optimization_{sanitized_target_name}.csv")
  pl.DataFrame(combined_dict).write_csv(full_path)


- def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float], objective_function: ObjectiveFunction,
- save_results_dir: str,
+ def run_pso(lower_boundaries: list[float],
+ upper_boundaries: list[float],
+ objective_function: ObjectiveFunction,
+ save_results_dir: str,
+ auto_binary_boundaries: bool=True,
  target_name: Union[str, None]=None,
  feature_names: Union[list[str], None]=None,
- swarm_size: int=100, max_iterations: int=100,
+ swarm_size: int=100,
+ max_iterations: int=100,
  inequality_constrain_function=None,
- post_hoc_analysis: Union[int, None]=None) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
+ post_hoc_analysis: Union[int, None]=None,
+ workers: int=5) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
  """
- Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results.
+ Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.

  Parameters
  ----------
- lower_boundaries : Sequence[float]
- Lower bounds for each feature in the search space.
- upper_boundaries : Sequence[float]
- Upper bounds for each feature in the search space.
+ lower_boundaries : list[float]
+ Lower bounds for each feature in the search space (as many as features expected by the model).
+ upper_boundaries : list[float]
+ Upper bounds for each feature in the search space (as many as features expected by the model).
  objective_function : ObjectiveFunction
  A callable object encapsulating a regression model and its scaler.
  save_results_dir : str
  Directory path to save the results CSV file.
+ auto_binary_boundaries : bool
+ Use `ObjectiveFunction.binary_features` to append as many binary boundaries as needed to `lower_boundaries` and `upper_boundaries` automatically.
  target_name : str or None, optional
  Name of the target variable. If None, attempts to retrieve from the ObjectiveFunction object.
  feature_names : list[str] or None, optional
@@ -149,30 +174,38 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
  Optional function defining inequality constraints to be respected by the optimization.
  post_hoc_analysis : int or None, optional
  If specified, runs the optimization multiple times to perform post hoc analysis. The value indicates the number of repetitions.
+ workers : int
+ Number of parallel processes to use.

  Returns
  -------
  Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]
  If `post_hoc_analysis` is None, returns two dictionaries:
- - best_features_named: Feature values (after inverse scaling) that yield the best result.
- - best_target_named: Best result obtained for the target variable.
+ - feature_names: Feature values (after inverse scaling) that yield the best result.
+ - target_name: Best result obtained for the target variable.

  If `post_hoc_analysis` is an integer, returns two dictionaries:
- - all_best_features_named: Lists of best feature values (after inverse scaling) for each repetition.
- - all_best_targets_named: List of best target values across repetitions.
+ - feature_names: Lists of best feature values (after inverse scaling) for each repetition.
+ - target_name: List of best target values across repetitions.

  Notes
  -----
  - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
  - Feature values are scaled before being passed to the model and inverse-transformed before result saving.
  """
+ # Append binary boundaries
+ binary_number = objective_function.binary_features
+ if auto_binary_boundaries and binary_number > 0:
+ lower_boundaries.extend([0] * binary_number)
+ upper_boundaries.extend([1] * binary_number)
+
  lower, upper = _set_boundaries(lower_boundaries, upper_boundaries)
-
+
  # feature names
  if feature_names is None and objective_function.feature_names is not None:
  feature_names = objective_function.feature_names
  names = _set_feature_names(size=len(lower_boundaries), names=feature_names)
-
+
  # target name
  if target_name is None and objective_function.target_name is not None:
  target_name = objective_function.target_name
@@ -186,13 +219,15 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
  "f_ieqcons": inequality_constrain_function,
  "swarmsize": swarm_size,
  "maxiter": max_iterations,
- "processes": 1,
- "particle_output": True
+ "processes": workers,
+ "particle_output": False
  }

- if post_hoc_analysis is None:
- # best_features, best_target = pso(**arguments)
- best_features, best_target, _particle_positions, _target_values_per_position = pso(**arguments)
+ os.makedirs(save_results_dir, exist_ok=True)
+
+ if post_hoc_analysis is None or post_hoc_analysis == 1:
+ best_features, best_target, *_ = _pso(**arguments)
+ # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)

  # inverse transformation
  best_features = np.array(best_features).reshape(1, -1)
@@ -209,9 +244,9 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
  else:
  all_best_targets = list()
  all_best_features = [[] for _ in range(len(lower_boundaries))]
- for _ in range(post_hoc_analysis):
- # best_features, best_target = pso(**arguments)
- best_features, best_target, _particle_positions, _target_values_per_position = pso(**arguments)
+ for _ in range(post_hoc_analysis):
+ best_features, best_target, *_ = _pso(**arguments)
+ # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)

  # inverse transformation
  best_features = np.array(best_features).reshape(1, -1)
@@ -231,6 +266,8 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
  return all_best_features_named, all_best_targets_named # type: ignore


+ def info():
+ _script_info(__all__)


  ### SOURCE CODE FOR PSO ###
@@ -249,7 +286,7 @@ def _cons_ieqcons_wrapper(ieqcons, args, kwargs, x):
  def _cons_f_ieqcons_wrapper(f_ieqcons, args, kwargs, x):
  return np.array(f_ieqcons(x, *args, **kwargs))

- def pso(func, lb, ub, ieqcons=[], f_ieqcons=None, args=(), kwargs={},
+ def _pso(func, lb, ub, ieqcons=[], f_ieqcons=None, args=(), kwargs={},
  swarmsize=100, omega=0.5, phip=0.5, phig=0.5, maxiter=100,
  minstep=1e-8, minfunc=1e-8, debug=False, processes=1,
  particle_output=False):
@@ -377,7 +414,7 @@ def pso(func, lb, ub, ieqcons=[], f_ieqcons=None, args=(), kwargs={},
  for i in range(S):
  fx[i] = obj(x[i, :])
  fs[i] = is_feasible(x[i, :])
-
+
  # Store particle's best position (if constraints are satisfied)
  i_update = np.logical_and((fx < fp), fs)
  p[i_update, :] = x[i_update, :].copy()
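Two behavioral changes above are easy to miss: `_handle_hybrid` now treats the *last* `binary_features` entries of the feature vector as the binary block, and `run_pso(auto_binary_boundaries=True)` appends 0/1 bounds for those features automatically. A standalone sketch of just that logic (NumPy only, no model involved; the numbers are illustrative):

```python
# Standalone sketch of the hybrid-feature handling and automatic binary bounds in 1.4.1.
import numpy as np

binary_features = 2                          # number of trailing binary features
features = np.array([0.7, 3.2, 0.4, 0.9])    # 2 continuous values followed by 2 binary candidates

# _handle_hybrid-style split: continuous head kept, binary tail thresholded at 0.5
continuous = features[:-binary_features]
binary = (features[-binary_features:] > 0.5).astype(int)
print(np.concatenate([continuous, binary]))  # continuous unchanged, tail becomes 0/1

# auto_binary_boundaries-style extension of the search space
lower, upper = [0.0, 1.0], [1.0, 5.0]        # bounds for the continuous features only
lower.extend([0] * binary_features)
upper.extend([1] * binary_features)
print(lower, upper)                          # [0.0, 1.0, 0, 0] [1.0, 5.0, 1, 1]
```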
ml_tools/pytorch_models.py CHANGED
@@ -1,5 +1,12 @@
  import torch
  from torch import nn
+ from .utilities import _script_info
+
+
+ __all__ = [
+ "MyNeuralNetwork",
+ "MyLSTMNetwork"
+ ]


  class MyNeuralNetwork(nn.Module):
@@ -73,9 +80,11 @@ class MyNeuralNetwork(nn.Module):
  return X


- class MyConvolutionalNetwork(nn.Module):
+ class _MyConvolutionalNetwork(nn.Module):
  def __init__(self, outputs: int, color_channels: int=3, img_size: int=256, drop_out: float=0.2):
  """
+ - EDUCATIONAL PURPOSES ONLY, not optimized and requires lots of memory.
+
  Create a basic Convolutional Neural Network with two convolution layers with a pooling layer after each convolution.

  Args:
@@ -225,3 +234,6 @@ class MyLSTMNetwork(nn.Module):
  else:
  return output

+
+ def info():
+ _script_info(__all__)
ml_tools/trainer.py CHANGED
@@ -6,6 +6,12 @@ import matplotlib.pyplot as plt
  import torch
  from torch import nn
  from sklearn.metrics import mean_squared_error, classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score, r2_score, median_absolute_error
+ from .utilities import _script_info
+
+
+ __all__ = [
+ "MyTrainer"
+ ]


  class MyTrainer():
@@ -288,36 +294,6 @@ class MyTrainer():
  print(f"Area under the curve score: {area_under_curve:4.2f}")
  else:
  print("Error encountered while retrieving 'model.kind' attribute.")
-
-
- def forecast(self, samples_list: list[torch.Tensor], view_as: tuple[int,int]=(1,-1)):
- """
- DEPRECATED - Use `helpers.model_predict()` instead
-
- Returns a list containing lists of predicted values, one for each sample.
-
- Each sample must be a tensor and have the same shape and normalization expected by the model
- (this method will add the batch dimension automatically).
-
- Args:
- `samples_list`: list of tensors.
-
- `view_as`: reshape each output, default is (1,-1).
-
- Returns: List of lists.
- """
- self.model.eval()
- results = list()
- with torch.no_grad():
- for data_point in samples_list:
- data_point = data_point.unsqueeze(0).to(self.device)
- output = self.model(data_point)
- if self.kind == "classification":
- results.append(output.argmax(dim=1).view(view_as).cpu().tolist())
- else: #regression
- results.append(output.view(view_as).cpu().tolist())
-
- return results


  def rnn_forecast(self, sequence: torch.Tensor, steps: int):
@@ -364,3 +340,7 @@ class MyTrainer():
  # Cast to array and return
  predictions = numpy.array(predictions)
  return predictions
+
+
+ def info():
+ _script_info(__all__)
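`MyTrainer.forecast()` was removed; its own docstring already marked it deprecated in favor of `model_predict()`, which `ml_tools.vision_helpers` now exports (see its diff below). For readers migrating, a sketch mirroring the removed method body as a free function; `model`, `device`, and `kind` are placeholders for your own setup:

```python
# Sketch of the logic the removed MyTrainer.forecast() implemented, written as a free function.
import torch

def forecast(model: torch.nn.Module, samples_list: list[torch.Tensor], device: str = "cpu",
             kind: str = "regression", view_as: tuple[int, int] = (1, -1)) -> list:
    model.eval()
    results = []
    with torch.no_grad():
        for data_point in samples_list:
            data_point = data_point.unsqueeze(0).to(device)  # add the batch dimension
            output = model(data_point)
            if kind == "classification":
                results.append(output.argmax(dim=1).view(view_as).cpu().tolist())
            else:  # regression
                results.append(output.view(view_as).cpu().tolist())
    return results
```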
ml_tools/utilities.py CHANGED
@@ -4,6 +4,19 @@ import pandas as pd
  import os
  from pathlib import Path
  import re
+ from typing import Literal
+
+
+ # Keep track of available tools
+ __all__ = [
+ "list_csv_paths",
+ "load_dataframe",
+ "yield_dataframes_from_dir",
+ "merge_dataframes",
+ "save_dataframe",
+ "normalize_mixed_list",
+ "sanitize_filename"
+ ]


  def list_csv_paths(directory: str) -> dict[str, str]:
@@ -76,11 +89,93 @@ def yield_dataframes_from_dir(datasets_dir: str):
  for df_name, df_path in list_csv_paths(datasets_dir).items():
  df, _ = load_dataframe(df_path)
  yield df, df_name
+
+
+ def merge_dataframes(
+ *dfs: pd.DataFrame,
+ reset_index: bool = False,
+ direction: Literal["horizontal", "vertical"] = "horizontal"
+ ) -> pd.DataFrame:
+ """
+ Merges multiple DataFrames either horizontally or vertically.
+
+ Parameters:
+ *dfs (pd.DataFrame): Variable number of DataFrames to merge.
+ reset_index (bool): Whether to reset index in the final merged DataFrame.
+ direction (["horizontal" | "vertical"]):
+ - "horizontal": Merge on index, adding columns.
+ - "vertical": Append rows; all DataFrames must have identical columns.
+
+ Returns:
+ pd.DataFrame: A single merged DataFrame.
+
+ Raises:
+ ValueError:
+ - If fewer than 2 DataFrames are provided.
+ - If indexes do not match for horizontal merge.
+ - If column names or order differ for vertical merge.
+ """
+ if len(dfs) < 2:
+ raise ValueError("At least 2 DataFrames must be provided.")
+
+ for i, df in enumerate(dfs, start=1):
+ print(f"DataFrame {i} shape: {df.shape}")
+
+
+ if direction == "horizontal":
+ reference_index = dfs[0].index
+ for i, df in enumerate(dfs, start=1):
+ if not df.index.equals(reference_index):
+ raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
+ merged_df = pd.concat(dfs, axis=1)
+
+ elif direction == "vertical":
+ reference_columns = dfs[0].columns
+ for i, df in enumerate(dfs, start=1):
+ if not df.columns.equals(reference_columns):
+ raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
+ merged_df = pd.concat(dfs, axis=0)
+
+ else:
+ raise ValueError(f"Invalid merge direction: {direction}")
+
+ if reset_index:
+ merged_df = merged_df.reset_index(drop=True)
+
+ print(f"Merged DataFrame shape: {merged_df.shape}")
+
+ return merged_df
+
+
+ def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
+ """
+ Save a pandas DataFrame to a CSV file.
+
+ Parameters:
+ df: pandas.DataFrame to save
+ save_dir: str, directory where the CSV file will be saved.
+ filename: str, CSV filename, extension will be added if missing.
+ """
+ if df.empty:
+ print(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
+ return
+
+ os.makedirs(save_dir, exist_ok=True)
+
+ filename = sanitize_filename(filename)
+
+ if not filename.endswith('.csv'):
+ filename += '.csv'

+ output_path = os.path.join(save_dir, filename)

+ df.to_csv(output_path, index=False, encoding='utf-8')
+ print(f"✅ Saved file: '{filename}'")
+
+
  def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
  """
- Normalize a mixed list of numeric values and strings so that the sum of the values equals 1.0,
+ Normalize a mixed list of numeric values and strings casted to floats so that the sum of the values equals 1.0,
  applying heuristic adjustments to correct for potential data entry scale mismatches.

  Parameters:
@@ -168,27 +263,14 @@ def sanitize_filename(filename: str) -> str:
  return sanitized


- def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
+ def _script_info(all_data: list[str]):
  """
- Save a pandas DataFrame to a CSV file.
-
- Parameters:
- df: pandas.DataFrame to save
- save_dir: str, directory where the CSV file will be saved.
- filename: str, CSV filename, extension will be added if missing.
+ List available names.
  """
- if df.empty:
- print(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
- return
-
- os.makedirs(save_dir, exist_ok=True)
-
- filename = sanitize_filename(filename)
-
- if not filename.endswith('.csv'):
- filename += '.csv'
-
- output_path = os.path.join(save_dir, filename)
-
- df.to_csv(output_path, index=False, encoding='utf-8')
- print(f"✅ Saved file: '{filename}'")
+ print("Available functions and objects:")
+ for i, name in enumerate(all_data, start=1):
+ print(f"{i} - {name}")
+
+
+ def info():
+ _script_info(__all__)
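`merge_dataframes` moved here from `data_exploration`, and `save_dataframe` was relocated ahead of `normalize_mixed_list`. A small usage sketch with toy data; the output directory and filename are made up:

```python
# Illustrative sketch of the relocated utilities; data and paths are fabricated.
import pandas as pd
from ml_tools.utilities import merge_dataframes, save_dataframe

features = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6]})
targets = pd.DataFrame({"y": [0, 1, 0]})

merged = merge_dataframes(features, targets, direction="horizontal", reset_index=True)
save_dataframe(merged, save_dir="output", filename="merged dataset")  # name sanitized, '.csv' appended
```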
ml_tools/vision_helpers.py CHANGED
@@ -4,9 +4,18 @@ from PIL import Image, ImageOps
  from typing import Literal
  from torchvision import transforms
  import torch
+ from .utilities import _script_info
+
+
+ __all__ = [
+ "inspect_images",
+ "image_augmentation",
+ "ResizeAspectFill",
+ "is_image",
+ "model_predict"
+ ]


- # --- Helper Functions ---
  def inspect_images(path: str):
  """
  Prints out the types, sizes and channels of image files found in the directory and its subdirectories.
@@ -216,3 +225,7 @@ def model_predict(model: torch.nn.Module, kind: Literal["regression", "classific
  results.append(output.view(view_as).cpu().tolist())

  return results
+
+
+ def info():
+ _script_info(__all__)
@@ -1,19 +0,0 @@
- dragon_ml_toolbox-1.4.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
- dragon_ml_toolbox-1.4.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
- ml_tools/MICE_imputation.py,sha256=4kqZiesk8vyh4MBLnNE9grflG4fDusqzuYBElsbk4LY,9484
- ml_tools/VIF_factor.py,sha256=rHSAxQcXLrG8dIjCXBAvETsSkCBfYus9NqimOnm2Bvk,9559
- ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- ml_tools/data_exploration.py,sha256=qtkGumckC2PmTpj3brVFi072ewX0OI6dwUF4Or7Yikg,21341
- ml_tools/datasetmaster.py,sha256=VUneKshnmjOGbtqVVGTFcIMRKF3s6ZDYrosIYKDjD80,28956
- ml_tools/ensemble_learning.py,sha256=wK6mtOE4v9AWlxkcWhJj5XZjREChxb46kE0i2IxS-OE,28372
- ml_tools/handle_excel.py,sha256=IR0VQc3hYdmjwC31E5YxDnRcWig4jSIx7Y_7to-KZz4,11969
- ml_tools/logger.py,sha256=XwSpCUzw2Le24fJHyljBxNLgw63SwjZ0pMjTJqf0ylI,4622
- ml_tools/particle_swarm_optimization.py,sha256=jpkje4OETC9fyISxxUTx4XGrImSU6gDEcwz46ZDs2bQ,19250
- ml_tools/pytorch_models.py,sha256=Oykw02sOZLCjvSadQd64UGesBN7kq0x1EGXHusvYiQI,9908
- ml_tools/trainer.py,sha256=Zd7AaHeoNd8dEas2JChWoHaCUpWUVRDUMybuHaKJ0XY,16740
- ml_tools/utilities.py,sha256=gr1cyRUfZcRo9fjWpCaQkrvWY0-xJnDJdrE8JEsOi8o,6309
- ml_tools/vision_helpers.py,sha256=lBAW6dzAK-HOswAt1fU_tfP9hkNLY5D8c_I_7hhEXno,7528
- dragon_ml_toolbox-1.4.0.dist-info/METADATA,sha256=V7Y96iAbgX6Xl6RWzEt4nGfKMZe4cuLs0BrFQghXxX8,2335
- dragon_ml_toolbox-1.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-1.4.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-1.4.0.dist-info/RECORD,,