dragon-ml-toolbox 1.3.2__tar.gz → 1.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (24) hide show
  1. {dragon_ml_toolbox-1.3.2/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-1.4.1}/PKG-INFO +19 -2
  2. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/README.md +17 -1
  3. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1/dragon_ml_toolbox.egg-info}/PKG-INFO +19 -2
  4. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/dragon_ml_toolbox.egg-info/SOURCES.txt +1 -0
  5. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/dragon_ml_toolbox.egg-info/requires.txt +1 -0
  6. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/ml_tools/MICE_imputation.py +24 -6
  7. dragon_ml_toolbox-1.4.1/ml_tools/VIF_factor.py +224 -0
  8. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/ml_tools/data_exploration.py +74 -286
  9. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/ml_tools/datasetmaster.py +13 -1
  10. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/ml_tools/ensemble_learning.py +128 -129
  11. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/ml_tools/handle_excel.py +32 -9
  12. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/ml_tools/logger.py +10 -1
  13. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/ml_tools/particle_swarm_optimization.py +71 -34
  14. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/ml_tools/pytorch_models.py +13 -1
  15. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/ml_tools/trainer.py +10 -30
  16. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/ml_tools/utilities.py +122 -14
  17. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/ml_tools/vision_helpers.py +14 -1
  18. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/pyproject.toml +2 -1
  19. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/LICENSE +0 -0
  20. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/LICENSE-THIRD-PARTY.md +0 -0
  21. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  22. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  23. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/ml_tools/__init__.py +0 -0
  24. {dragon_ml_toolbox-1.3.2 → dragon_ml_toolbox-1.4.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 1.3.2
3
+ Version: 1.4.1
4
4
  Summary: A collection of tools for data science and machine learning projects
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -27,6 +27,7 @@ Requires-Dist: ipython
27
27
  Requires-Dist: ipykernel
28
28
  Requires-Dist: notebook
29
29
  Requires-Dist: jupyterlab
30
+ Requires-Dist: ipywidgets
30
31
  Requires-Dist: joblib
31
32
  Requires-Dist: xgboost
32
33
  Requires-Dist: lightgbm<=4.5.0
@@ -79,7 +80,7 @@ Clone the repository and install in editable mode with optional dependencies:
79
80
  ```bash
80
81
  git clone https://github.com/DrAg0n-BoRn/ML_tools.git
81
82
  cd ML_tools
82
- pip install -e '.[pytorch]'
83
+ pip install -e .
83
84
  ```
84
85
 
85
86
  ## Usage
@@ -90,3 +91,19 @@ After installation, import modules like this:
90
91
  from ml_tools.utilities import sanitize_filename
91
92
  from ml_tools.logger import custom_logger
92
93
  ```
94
+
95
+ ## Available modules
96
+
97
+ ```bash
98
+ data_exploration
99
+ datasetmaster
100
+ ensemble_learning
101
+ handle_excel
102
+ logger
103
+ MICE_imputation
104
+ particle_swarm_optimization
105
+ trainer
106
+ utilities
107
+ VIF_factor
108
+ vision_helpers
109
+ ```
@@ -40,7 +40,7 @@ Clone the repository and install in editable mode with optional dependencies:
40
40
  ```bash
41
41
  git clone https://github.com/DrAg0n-BoRn/ML_tools.git
42
42
  cd ML_tools
43
- pip install -e '.[pytorch]'
43
+ pip install -e .
44
44
  ```
45
45
 
46
46
  ## Usage
@@ -51,3 +51,19 @@ After installation, import modules like this:
51
51
  from ml_tools.utilities import sanitize_filename
52
52
  from ml_tools.logger import custom_logger
53
53
  ```
54
+
55
+ ## Available modules
56
+
57
+ ```bash
58
+ data_exploration
59
+ datasetmaster
60
+ ensemble_learning
61
+ handle_excel
62
+ logger
63
+ MICE_imputation
64
+ particle_swarm_optimization
65
+ trainer
66
+ utilities
67
+ VIF_factor
68
+ vision_helpers
69
+ ```
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 1.3.2
3
+ Version: 1.4.1
4
4
  Summary: A collection of tools for data science and machine learning projects
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -27,6 +27,7 @@ Requires-Dist: ipython
27
27
  Requires-Dist: ipykernel
28
28
  Requires-Dist: notebook
29
29
  Requires-Dist: jupyterlab
30
+ Requires-Dist: ipywidgets
30
31
  Requires-Dist: joblib
31
32
  Requires-Dist: xgboost
32
33
  Requires-Dist: lightgbm<=4.5.0
@@ -79,7 +80,7 @@ Clone the repository and install in editable mode with optional dependencies:
79
80
  ```bash
80
81
  git clone https://github.com/DrAg0n-BoRn/ML_tools.git
81
82
  cd ML_tools
82
- pip install -e '.[pytorch]'
83
+ pip install -e .
83
84
  ```
84
85
 
85
86
  ## Usage
@@ -90,3 +91,19 @@ After installation, import modules like this:
90
91
  from ml_tools.utilities import sanitize_filename
91
92
  from ml_tools.logger import custom_logger
92
93
  ```
94
+
95
+ ## Available modules
96
+
97
+ ```bash
98
+ data_exploration
99
+ datasetmaster
100
+ ensemble_learning
101
+ handle_excel
102
+ logger
103
+ MICE_imputation
104
+ particle_swarm_optimization
105
+ trainer
106
+ utilities
107
+ VIF_factor
108
+ vision_helpers
109
+ ```
@@ -8,6 +8,7 @@ dragon_ml_toolbox.egg-info/dependency_links.txt
8
8
  dragon_ml_toolbox.egg-info/requires.txt
9
9
  dragon_ml_toolbox.egg-info/top_level.txt
10
10
  ml_tools/MICE_imputation.py
11
+ ml_tools/VIF_factor.py
11
12
  ml_tools/__init__.py
12
13
  ml_tools/data_exploration.py
13
14
  ml_tools/datasetmaster.py
@@ -13,6 +13,7 @@ ipython
13
13
  ipykernel
14
14
  notebook
15
15
  jupyterlab
16
+ ipywidgets
16
17
  joblib
17
18
  xgboost
18
19
  lightgbm<=4.5.0
@@ -3,9 +3,20 @@ import miceforest as mf
3
3
  import os
4
4
  import matplotlib.pyplot as plt
5
5
  import numpy as np
6
- from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename
6
+ from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info
7
7
  from plotnine import ggplot, labs, theme, element_blank # type: ignore
8
8
 
9
+
10
+ __all__ = [
11
+ "apply_mice",
12
+ "save_imputed_datasets",
13
+ "get_na_column_names",
14
+ "get_convergence_diagnostic",
15
+ "get_imputed_distributions",
16
+ "run_mice_pipeline"
17
+ ]
18
+
19
+
9
20
  def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
10
21
 
11
22
  # Initialize kernel with number of imputed datasets to generate
@@ -120,7 +131,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
120
131
  '''
121
132
  # Check path
122
133
  os.makedirs(root_dir, exist_ok=True)
123
- local_save_dir = os.path.join(root_dir, f"Distribution_Metrics_{df_name}")
134
+ local_save_dir = os.path.join(root_dir, f"Distribution_Metrics_{df_name}_imputed")
124
135
  if not os.path.isdir(local_save_dir):
125
136
  os.makedirs(local_save_dir)
126
137
 
@@ -169,8 +180,12 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
169
180
  # Adjust layout and save
170
181
  # fig.tight_layout()
171
182
  # fig.subplots_adjust(bottom=0.2, left=0.2) # Optional, depending on overflow
183
+
184
+ # sanitize savename
185
+ feature_save_name = sanitize_filename(filename)
186
+
172
187
  fig.savefig(
173
- os.path.join(local_save_dir, filename + ".svg"),
188
+ os.path.join(local_save_dir, feature_save_name + ".svg"),
174
189
  format='svg',
175
190
  bbox_inches='tight',
176
191
  pad_inches=0.1
@@ -185,8 +200,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
185
200
  else:
186
201
  for feature in column_names:
187
202
  fig = kernel.plot_imputed_distributions(variables=[feature])
188
- feature_save_name = sanitize_filename(feature)
189
- _process_figure(fig, feature_save_name)
203
+ _process_figure(fig, feature)
190
204
 
191
205
  print("\tImputed distributions saved successfully.")
192
206
 
@@ -207,7 +221,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
207
221
  if os.path.isfile(df_path_or_dir):
208
222
  all_file_paths = [df_path_or_dir]
209
223
  elif os.path.isdir(df_path_or_dir):
210
- all_file_paths, _ = list_csv_paths(df_path_or_dir)
224
+ all_file_paths = list(list_csv_paths(df_path_or_dir).values())
211
225
  else:
212
226
  raise ValueError(f"Invalid path or directory: {df_path_or_dir}")
213
227
 
@@ -223,3 +237,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
223
237
  get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_dir)
224
238
 
225
239
  get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)
240
+
241
+
242
+ def info():
243
+ _script_info(__all__)
@@ -0,0 +1,224 @@
1
+
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ from typing import Optional
6
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
7
+ from statsmodels.tools.tools import add_constant
8
+ import warnings
9
+ import os
10
+ from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info
11
+
12
+
13
+ __all__ = [
14
+ "compute_vif",
15
+ "drop_vif_based",
16
+ "compute_vif_multi"
17
+ ]
18
+
19
+
20
+ def compute_vif(
21
+ df: pd.DataFrame,
22
+ use_columns: Optional[list[str]] = None,
23
+ ignore_columns: Optional[list[str]] = None,
24
+ max_features_to_plot: int = 20,
25
+ save_dir: Optional[str] = None,
26
+ filename: Optional[str] = None,
27
+ fontsize: int = 14,
28
+ show_plot: bool = True,
29
+ ) -> pd.DataFrame:
30
+ """
31
+ Computes Variance Inflation Factors (VIF) for numeric columns in a DataFrame. Optionally, generates a bar plot of VIF values.
32
+
33
+ Args:
34
+ df (pd.DataFrame): The input DataFrame.
35
+ use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
36
+ ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `use_columns` is provided.
37
+ max_features_to_plot (int): Adjust the number of features shown in the plot.
38
+ save_dir (str | None): Directory to save the plot as SVG. If None, the plot is not saved.
39
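+ filename (str | None): Optional base filename for saving the plot. Defaults to "VIF_plot.svg". The file is always written with a "VIF_" prefix, and a ".svg" extension is appended if missing.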
40
+ fontsize (int): Base fontsize to scale title and labels on the plot.
41
+ show_plot (bool): Display plot.
42
+
43
+ Returns:
44
+ pd.DataFrame: DataFrame with features and their corresponding VIF values.
45
+
46
+ NOTE:
47
+ **Variance Inflation Factor (VIF)** quantifies the degree of multicollinearity among features in a dataset.
48
+ A VIF value indicates how much the variance of a regression coefficient is inflated due to linear dependence with other features.
49
+ A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
50
+ """
51
+ ground_truth_cols = df.columns.to_list()
52
+ if use_columns is None:
53
+ sanitized_columns = df.select_dtypes(include='number').columns.tolist()
54
+ missing_features = set(ground_truth_cols) - set(sanitized_columns)
55
+ if missing_features:
56
+ print(f"⚠️ These columns are not Numeric:\n{missing_features}")
57
+ else:
58
+ sanitized_columns = list()
59
+ for feature in use_columns:
60
+ if feature not in ground_truth_cols:
61
+ print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
62
+ else:
63
+ sanitized_columns.append(feature)
64
+
65
+ if ignore_columns is not None and use_columns is None:
66
+ missing_ignore = set(ignore_columns) - set(ground_truth_cols)
67
+ if missing_ignore:
68
+ print(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
69
+ sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]
70
+
71
+ X = df[sanitized_columns].copy()
72
+ X = add_constant(X, has_constant='add')
73
+
74
+ vif_data = pd.DataFrame()
75
+ vif_data["feature"] = X.columns # type: ignore
76
+
77
+ with warnings.catch_warnings():
78
+ warnings.simplefilter("ignore", category=RuntimeWarning)
79
+
80
+ vif_data["VIF"] = [
81
+ variance_inflation_factor(X.values, i) for i in range(X.shape[1]) # type: ignore
82
+ ]
83
+
84
+ # Replace infinite values (perfect multicollinearity)
85
+ vif_data["VIF"] = vif_data["VIF"].replace([np.inf, -np.inf], 999.0)
86
+
87
+ # Drop the constant column
88
+ vif_data = vif_data[vif_data["feature"] != "const"]
89
+
90
+ # Add color coding
91
+ def vif_color(v: float) -> str:
92
+ if v >= 10:
93
+ return "red"
94
+ elif v >= 5:
95
+ return "gold"
96
+ else:
97
+ return "green"
98
+
99
+ vif_data["color"] = vif_data["VIF"].apply(vif_color)
100
+
101
+ # Sort by VIF descending
102
+ vif_data = vif_data.sort_values(by="VIF", ascending=False).reset_index(drop=True)
103
+
104
+ # Filter for plotting
105
+ plot_data = vif_data.head(max_features_to_plot)
106
+
107
+ if save_dir or show_plot:
108
+ if not plot_data.empty:
109
+ plt.figure(figsize=(10, 6))
110
+ plt.barh(
111
+ plot_data["feature"],
112
+ plot_data["VIF"],
113
+ color=plot_data["color"],
114
+ edgecolor='black'
115
+ )
116
+ plt.title("Variance Inflation Factor (VIF) per Feature", fontsize=fontsize+1)
117
+ plt.xlabel("VIF value", fontsize=fontsize)
118
+ plt.xticks(fontsize=fontsize)
119
+ plt.yticks(fontsize=fontsize)
120
+ plt.axvline(x=5, color='gold', linestyle='--', label='VIF = 5')
121
+ plt.axvline(x=10, color='red', linestyle='--', label='VIF = 10')
122
+ plt.xlim(0, 12)
123
+ plt.legend(loc='lower right', fontsize=fontsize-1)
124
+ plt.gca().invert_yaxis()
125
+ plt.grid(axis='x', linestyle='--', alpha=0.5)
126
+ plt.tight_layout()
127
+
128
+ if save_dir:
129
+ os.makedirs(save_dir, exist_ok=True)
130
+ if filename is None:
131
+ filename = "VIF_plot.svg"
132
+ else:
133
+ filename = sanitize_filename(filename)
134
+ if not filename.endswith(".svg"):
135
+ filename += ".svg"
136
+ save_path = os.path.join(save_dir, "VIF_" + filename)
137
+ plt.savefig(save_path, format='svg', bbox_inches='tight')
138
+ print(f"\tSaved VIF plot: '{filename}'")
139
+
140
+ if show_plot:
141
+ plt.show()
142
+ plt.close()
143
+
144
+ return vif_data.drop(columns="color")
145
+
146
+
147
+ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10.0) -> tuple[pd.DataFrame, list[str]]:
148
+ """
149
+ Drops columns from the original DataFrame based on their VIF values exceeding a given threshold.
150
+
151
+ Args:
152
+ df (pd.DataFrame): Original DataFrame containing the columns to test.
153
+ vif_df (pd.DataFrame): DataFrame with 'feature' and 'VIF' columns as returned by `compute_vif()`.
154
+ threshold (float): VIF threshold above which columns will be dropped.
155
+
156
+ Returns:
157
+ (tuple[pd.DataFrame, list[str]]):
158
+ - A new DataFrame with high-VIF columns removed.
159
+ - A list with dropped column names.
160
+ """
161
+ # Ensure expected structure
162
+ if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
163
+ raise ValueError("`vif_df` must contain 'feature' and 'VIF' columns.")
164
+
165
+ # Identify features to drop
166
+ to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
167
+ print(f"\tDropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")
168
+
169
+ result_df = df.drop(columns=to_drop)
170
+
171
+ if result_df.empty:
172
+ print(f"\t⚠️ Warning: All columns were dropped.")
173
+
174
+ return result_df, to_drop
175
+
176
+
177
+ def compute_vif_multi(input_directory: str,
178
+ output_plot_directory: str,
179
+ output_dataset_directory: Optional[str] = None,
180
+ use_columns: Optional[list[str]] = None,
181
+ ignore_columns: Optional[list[str]] = None,
182
+ max_features_to_plot: int = 20,
183
+ fontsize: int = 14):
184
+ """
185
+ Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames).
186
+ Generates a bar plot of VIF values. Optionally drops columns with VIF > 10 and, if at least one column was dropped, saves the result as a new CSV file.
187
+
188
+ Args:
189
+ input_directory (str): Target directory with CSV files able to be loaded as DataFrame.
190
+ output_plot_directory (str): Save plots to this directory.
191
+ output_dataset_directory (str | None): If provided, saves new CSV files to this directory.
192
+ use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
193
+ ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `use_columns` is provided.
194
+ max_features_to_plot (int): Adjust the number of features shown in the plot.
195
+ fontsize (int): Base fontsize to scale title and labels on the plot.
196
+
197
+ NOTE:
198
+ **Variance Inflation Factor (VIF)** quantifies the degree of multicollinearity among features in a dataset.
199
+ A VIF value indicates how much the variance of a regression coefficient is inflated due to linear dependence with other features.
200
+ A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
201
+ """
202
+ if output_dataset_directory is not None:
203
+ os.makedirs(output_dataset_directory, exist_ok=True)
204
+
205
+ for df, df_name in yield_dataframes_from_dir(datasets_dir=input_directory):
206
+ vif_dataframe = compute_vif(df=df,
207
+ use_columns=use_columns,
208
+ ignore_columns=ignore_columns,
209
+ max_features_to_plot=max_features_to_plot,
210
+ fontsize=fontsize,
211
+ save_dir=output_plot_directory,
212
+ filename=df_name,
213
+ show_plot=False)
214
+
215
+ if output_dataset_directory is not None:
216
+ new_filename = 'VIF_' + df_name
217
+ result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)
218
+
219
+ if len(dropped_cols) > 0:
220
+ save_dataframe(df=result_df, save_dir=output_dataset_directory, filename=new_filename)
221
+
222
+
223
+ def info():
224
+ _script_info(__all__)