dragon-ml-toolbox 1.3.2__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/METADATA +19 -2
- dragon_ml_toolbox-1.4.1.dist-info/RECORD +19 -0
- ml_tools/MICE_imputation.py +24 -6
- ml_tools/VIF_factor.py +224 -0
- ml_tools/data_exploration.py +74 -286
- ml_tools/datasetmaster.py +13 -1
- ml_tools/ensemble_learning.py +128 -129
- ml_tools/handle_excel.py +32 -9
- ml_tools/logger.py +10 -1
- ml_tools/particle_swarm_optimization.py +71 -34
- ml_tools/pytorch_models.py +13 -1
- ml_tools/trainer.py +10 -30
- ml_tools/utilities.py +122 -14
- ml_tools/vision_helpers.py +14 -1
- dragon_ml_toolbox-1.3.2.dist-info/RECORD +0 -18
- {dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 1.3.2
|
|
3
|
+
Version: 1.4.1
|
|
4
4
|
Summary: A collection of tools for data science and machine learning projects
|
|
5
5
|
Author-email: Karl Loza <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -27,6 +27,7 @@ Requires-Dist: ipython
|
|
|
27
27
|
Requires-Dist: ipykernel
|
|
28
28
|
Requires-Dist: notebook
|
|
29
29
|
Requires-Dist: jupyterlab
|
|
30
|
+
Requires-Dist: ipywidgets
|
|
30
31
|
Requires-Dist: joblib
|
|
31
32
|
Requires-Dist: xgboost
|
|
32
33
|
Requires-Dist: lightgbm<=4.5.0
|
|
@@ -79,7 +80,7 @@ Clone the repository and install in editable mode with optional dependencies:
|
|
|
79
80
|
```bash
|
|
80
81
|
git clone https://github.com/DrAg0n-BoRn/ML_tools.git
|
|
81
82
|
cd ML_tools
|
|
82
|
-
pip install -e
|
|
83
|
+
pip install -e .
|
|
83
84
|
```
|
|
84
85
|
|
|
85
86
|
## Usage
|
|
@@ -90,3 +91,19 @@ After installation, import modules like this:
|
|
|
90
91
|
from ml_tools.utilities import sanitize_filename
|
|
91
92
|
from ml_tools.logger import custom_logger
|
|
92
93
|
```
|
|
94
|
+
|
|
95
|
+
## Available modules
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
data_exploration
|
|
99
|
+
datasetmaster
|
|
100
|
+
ensemble_learning
|
|
101
|
+
handle_excel
|
|
102
|
+
logger
|
|
103
|
+
MICE_imputation
|
|
104
|
+
particle_swarm_optimization
|
|
105
|
+
trainer
|
|
106
|
+
utilities
|
|
107
|
+
VIF_factor
|
|
108
|
+
vision_helpers
|
|
109
|
+
```
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
dragon_ml_toolbox-1.4.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
+
dragon_ml_toolbox-1.4.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
|
|
3
|
+
ml_tools/MICE_imputation.py,sha256=CK0tYZ_kQkdETohOlhI7RP7oFkJTXrP-XtIxb--dzpU,9726
|
|
4
|
+
ml_tools/VIF_factor.py,sha256=LQWr1P8WYij07FX_3RZC6Rr22bfAMnrt0Lhvi7SbBpY,9846
|
|
5
|
+
ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
ml_tools/data_exploration.py,sha256=FXP5i6bQo8J3RCyLRmlX-qJVh4VH8DbMjrdUmyd1mF0,18708
|
|
7
|
+
ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
|
|
8
|
+
ml_tools/ensemble_learning.py,sha256=khXXRiR7boWwI4CAvb2bxzS3fhLADNETMOiRe3ihZ4Y,28821
|
|
9
|
+
ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
|
|
10
|
+
ml_tools/logger.py,sha256=NOtL3YSuffAGmpTpXjY-uJjqFLdRG_jpL7MDyloBw9c,4712
|
|
11
|
+
ml_tools/particle_swarm_optimization.py,sha256=714kZo6lvUvRaPTtj6kJGecZwHcehcSkLysokXAf3No,20706
|
|
12
|
+
ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
|
|
13
|
+
ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
|
|
14
|
+
ml_tools/utilities.py,sha256=z2JPy4GM2YBLUC0sPq7aNLuesPFAQu5KNcsgmuOywdU,8738
|
|
15
|
+
ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
|
|
16
|
+
dragon_ml_toolbox-1.4.1.dist-info/METADATA,sha256=0XdPwNWe81rCvJLJfSS5XvB2ZdJKpBLLoqMU5uxYLMc,2516
|
|
17
|
+
dragon_ml_toolbox-1.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
18
|
+
dragon_ml_toolbox-1.4.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
19
|
+
dragon_ml_toolbox-1.4.1.dist-info/RECORD,,
|
ml_tools/MICE_imputation.py
CHANGED
|
@@ -3,9 +3,20 @@ import miceforest as mf
|
|
|
3
3
|
import os
|
|
4
4
|
import matplotlib.pyplot as plt
|
|
5
5
|
import numpy as np
|
|
6
|
-
from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename
|
|
6
|
+
from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info
|
|
7
7
|
from plotnine import ggplot, labs, theme, element_blank # type: ignore
|
|
8
8
|
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"apply_mice",
|
|
12
|
+
"save_imputed_datasets",
|
|
13
|
+
"get_na_column_names",
|
|
14
|
+
"get_convergence_diagnostic",
|
|
15
|
+
"get_imputed_distributions",
|
|
16
|
+
"run_mice_pipeline"
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
|
|
9
20
|
def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
|
|
10
21
|
|
|
11
22
|
# Initialize kernel with number of imputed datasets to generate
|
|
@@ -120,7 +131,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
|
|
|
120
131
|
'''
|
|
121
132
|
# Check path
|
|
122
133
|
os.makedirs(root_dir, exist_ok=True)
|
|
123
|
-
local_save_dir = os.path.join(root_dir, f"Distribution_Metrics_{df_name}")
|
|
134
|
+
local_save_dir = os.path.join(root_dir, f"Distribution_Metrics_{df_name}_imputed")
|
|
124
135
|
if not os.path.isdir(local_save_dir):
|
|
125
136
|
os.makedirs(local_save_dir)
|
|
126
137
|
|
|
@@ -169,8 +180,12 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
|
|
|
169
180
|
# Adjust layout and save
|
|
170
181
|
# fig.tight_layout()
|
|
171
182
|
# fig.subplots_adjust(bottom=0.2, left=0.2) # Optional, depending on overflow
|
|
183
|
+
|
|
184
|
+
# sanitize savename
|
|
185
|
+
feature_save_name = sanitize_filename(filename)
|
|
186
|
+
|
|
172
187
|
fig.savefig(
|
|
173
|
-
os.path.join(local_save_dir,
|
|
188
|
+
os.path.join(local_save_dir, feature_save_name + ".svg"),
|
|
174
189
|
format='svg',
|
|
175
190
|
bbox_inches='tight',
|
|
176
191
|
pad_inches=0.1
|
|
@@ -185,8 +200,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
|
|
|
185
200
|
else:
|
|
186
201
|
for feature in column_names:
|
|
187
202
|
fig = kernel.plot_imputed_distributions(variables=[feature])
|
|
188
|
-
|
|
189
|
-
_process_figure(fig, feature_save_name)
|
|
203
|
+
_process_figure(fig, feature)
|
|
190
204
|
|
|
191
205
|
print("\tImputed distributions saved successfully.")
|
|
192
206
|
|
|
@@ -207,7 +221,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
|
|
|
207
221
|
if os.path.isfile(df_path_or_dir):
|
|
208
222
|
all_file_paths = [df_path_or_dir]
|
|
209
223
|
elif os.path.isdir(df_path_or_dir):
|
|
210
|
-
all_file_paths
|
|
224
|
+
all_file_paths = list(list_csv_paths(df_path_or_dir).values())
|
|
211
225
|
else:
|
|
212
226
|
raise ValueError(f"Invalid path or directory: {df_path_or_dir}")
|
|
213
227
|
|
|
@@ -223,3 +237,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
|
|
|
223
237
|
get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_dir)
|
|
224
238
|
|
|
225
239
|
get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def info():
|
|
243
|
+
_script_info(__all__)
|
ml_tools/VIF_factor.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import numpy as np
|
|
4
|
+
import matplotlib.pyplot as plt
|
|
5
|
+
from typing import Optional
|
|
6
|
+
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
|
7
|
+
from statsmodels.tools.tools import add_constant
|
|
8
|
+
import warnings
|
|
9
|
+
import os
|
|
10
|
+
from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"compute_vif",
|
|
15
|
+
"drop_vif_based",
|
|
16
|
+
"compute_vif_multi"
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def compute_vif(
|
|
21
|
+
df: pd.DataFrame,
|
|
22
|
+
use_columns: Optional[list[str]] = None,
|
|
23
|
+
ignore_columns: Optional[list[str]] = None,
|
|
24
|
+
max_features_to_plot: int = 20,
|
|
25
|
+
save_dir: Optional[str] = None,
|
|
26
|
+
filename: Optional[str] = None,
|
|
27
|
+
fontsize: int = 14,
|
|
28
|
+
show_plot: bool = True,
|
|
29
|
+
) -> pd.DataFrame:
|
|
30
|
+
"""
|
|
31
|
+
Computes Variance Inflation Factors (VIF) for numeric columns in a DataFrame. Optionally, generates a bar plot of VIF values.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
df (pd.DataFrame): The input DataFrame.
|
|
35
|
+
use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
|
|
36
|
+
ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `use_columns` is provided.
|
|
37
|
+
max_features_to_plot (int): Adjust the number of features shown in the plot.
|
|
38
|
+
save_dir (str | None): Directory to save the plot as SVG. If None, the plot is not saved.
|
|
39
|
+
filename (str | None): Optional filename for saving the plot. Defaults to "VIF_plot.svg".
|
|
40
|
+
fontsize (int): Base fontsize to scale title and labels on the plot.
|
|
41
|
+
show_plot (bool): Display plot.
|
|
42
|
+
|
|
43
|
+
Returns:
|
|
44
|
+
pd.DataFrame: DataFrame with features and their corresponding VIF values.
|
|
45
|
+
|
|
46
|
+
NOTE:
|
|
47
|
+
**Variance Inflation Factor (VIF)** quantifies the degree of multicollinearity among features in a dataset.
|
|
48
|
+
A VIF value indicates how much the variance of a regression coefficient is inflated due to linear dependence with other features.
|
|
49
|
+
A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
|
|
50
|
+
"""
|
|
51
|
+
ground_truth_cols = df.columns.to_list()
|
|
52
|
+
if use_columns is None:
|
|
53
|
+
sanitized_columns = df.select_dtypes(include='number').columns.tolist()
|
|
54
|
+
missing_features = set(ground_truth_cols) - set(sanitized_columns)
|
|
55
|
+
if missing_features:
|
|
56
|
+
print(f"⚠️ These columns are not Numeric:\n{missing_features}")
|
|
57
|
+
else:
|
|
58
|
+
sanitized_columns = list()
|
|
59
|
+
for feature in use_columns:
|
|
60
|
+
if feature not in ground_truth_cols:
|
|
61
|
+
print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
|
|
62
|
+
else:
|
|
63
|
+
sanitized_columns.append(feature)
|
|
64
|
+
|
|
65
|
+
if ignore_columns is not None and use_columns is None:
|
|
66
|
+
missing_ignore = set(ignore_columns) - set(ground_truth_cols)
|
|
67
|
+
if missing_ignore:
|
|
68
|
+
print(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
|
|
69
|
+
sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]
|
|
70
|
+
|
|
71
|
+
X = df[sanitized_columns].copy()
|
|
72
|
+
X = add_constant(X, has_constant='add')
|
|
73
|
+
|
|
74
|
+
vif_data = pd.DataFrame()
|
|
75
|
+
vif_data["feature"] = X.columns # type: ignore
|
|
76
|
+
|
|
77
|
+
with warnings.catch_warnings():
|
|
78
|
+
warnings.simplefilter("ignore", category=RuntimeWarning)
|
|
79
|
+
|
|
80
|
+
vif_data["VIF"] = [
|
|
81
|
+
variance_inflation_factor(X.values, i) for i in range(X.shape[1]) # type: ignore
|
|
82
|
+
]
|
|
83
|
+
|
|
84
|
+
# Replace infinite values (perfect multicollinearity)
|
|
85
|
+
vif_data["VIF"] = vif_data["VIF"].replace([np.inf, -np.inf], 999.0)
|
|
86
|
+
|
|
87
|
+
# Drop the constant column
|
|
88
|
+
vif_data = vif_data[vif_data["feature"] != "const"]
|
|
89
|
+
|
|
90
|
+
# Add color coding
|
|
91
|
+
def vif_color(v: float) -> str:
|
|
92
|
+
if v >= 10:
|
|
93
|
+
return "red"
|
|
94
|
+
elif v >= 5:
|
|
95
|
+
return "gold"
|
|
96
|
+
else:
|
|
97
|
+
return "green"
|
|
98
|
+
|
|
99
|
+
vif_data["color"] = vif_data["VIF"].apply(vif_color)
|
|
100
|
+
|
|
101
|
+
# Sort by VIF descending
|
|
102
|
+
vif_data = vif_data.sort_values(by="VIF", ascending=False).reset_index(drop=True)
|
|
103
|
+
|
|
104
|
+
# Filter for plotting
|
|
105
|
+
plot_data = vif_data.head(max_features_to_plot)
|
|
106
|
+
|
|
107
|
+
if save_dir or show_plot:
|
|
108
|
+
if not plot_data.empty:
|
|
109
|
+
plt.figure(figsize=(10, 6))
|
|
110
|
+
plt.barh(
|
|
111
|
+
plot_data["feature"],
|
|
112
|
+
plot_data["VIF"],
|
|
113
|
+
color=plot_data["color"],
|
|
114
|
+
edgecolor='black'
|
|
115
|
+
)
|
|
116
|
+
plt.title("Variance Inflation Factor (VIF) per Feature", fontsize=fontsize+1)
|
|
117
|
+
plt.xlabel("VIF value", fontsize=fontsize)
|
|
118
|
+
plt.xticks(fontsize=fontsize)
|
|
119
|
+
plt.yticks(fontsize=fontsize)
|
|
120
|
+
plt.axvline(x=5, color='gold', linestyle='--', label='VIF = 5')
|
|
121
|
+
plt.axvline(x=10, color='red', linestyle='--', label='VIF = 10')
|
|
122
|
+
plt.xlim(0, 12)
|
|
123
|
+
plt.legend(loc='lower right', fontsize=fontsize-1)
|
|
124
|
+
plt.gca().invert_yaxis()
|
|
125
|
+
plt.grid(axis='x', linestyle='--', alpha=0.5)
|
|
126
|
+
plt.tight_layout()
|
|
127
|
+
|
|
128
|
+
if save_dir:
|
|
129
|
+
os.makedirs(save_dir, exist_ok=True)
|
|
130
|
+
if filename is None:
|
|
131
|
+
filename = "VIF_plot.svg"
|
|
132
|
+
else:
|
|
133
|
+
filename = sanitize_filename(filename)
|
|
134
|
+
if not filename.endswith(".svg"):
|
|
135
|
+
filename += ".svg"
|
|
136
|
+
save_path = os.path.join(save_dir, "VIF_" + filename)
|
|
137
|
+
plt.savefig(save_path, format='svg', bbox_inches='tight')
|
|
138
|
+
print(f"\tSaved VIF plot: '{filename}'")
|
|
139
|
+
|
|
140
|
+
if show_plot:
|
|
141
|
+
plt.show()
|
|
142
|
+
plt.close()
|
|
143
|
+
|
|
144
|
+
return vif_data.drop(columns="color")
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10.0) -> tuple[pd.DataFrame, list[str]]:
|
|
148
|
+
"""
|
|
149
|
+
Drops columns from the original DataFrame based on their VIF values exceeding a given threshold.
|
|
150
|
+
|
|
151
|
+
Args:
|
|
152
|
+
df (pd.DataFrame): Original DataFrame containing the columns to test.
|
|
153
|
+
vif_df (pd.DataFrame): DataFrame with 'feature' and 'VIF' columns as returned by `compute_vif()`.
|
|
154
|
+
threshold (float): VIF threshold above which columns will be dropped.
|
|
155
|
+
|
|
156
|
+
Returns:
|
|
157
|
+
(tuple[pd.DataFrame, list[str]]):
|
|
158
|
+
- A new DataFrame with high-VIF columns removed.
|
|
159
|
+
- A list with dropped column names.
|
|
160
|
+
"""
|
|
161
|
+
# Ensure expected structure
|
|
162
|
+
if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
|
|
163
|
+
raise ValueError("`vif_df` must contain 'feature' and 'VIF' columns.")
|
|
164
|
+
|
|
165
|
+
# Identify features to drop
|
|
166
|
+
to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
|
|
167
|
+
print(f"\tDropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")
|
|
168
|
+
|
|
169
|
+
result_df = df.drop(columns=to_drop)
|
|
170
|
+
|
|
171
|
+
if result_df.empty:
|
|
172
|
+
print(f"\t⚠️ Warning: All columns were dropped.")
|
|
173
|
+
|
|
174
|
+
return result_df, to_drop
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def compute_vif_multi(input_directory: str,
|
|
178
|
+
output_plot_directory: str,
|
|
179
|
+
output_dataset_directory: Optional[str] = None,
|
|
180
|
+
use_columns: Optional[list[str]] = None,
|
|
181
|
+
ignore_columns: Optional[list[str]] = None,
|
|
182
|
+
max_features_to_plot: int = 20,
|
|
183
|
+
fontsize: int = 14):
|
|
184
|
+
"""
|
|
185
|
+
Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames).
|
|
186
|
+
Generates a bar plot of VIF values. Optionally drops columns with VIF > 10 and saves the result as a new CSV file.
|
|
187
|
+
|
|
188
|
+
Args:
|
|
189
|
+
input_directory (str): Target directory with CSV files able to be loaded as DataFrame.
|
|
190
|
+
output_plot_directory (str): Save plots to this directory.
|
|
191
|
+
output_dataset_directory (str | None): If provided, saves new CSV files to this directory.
|
|
192
|
+
use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
|
|
193
|
+
ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `use_columns` is provided.
|
|
194
|
+
max_features_to_plot (int): Adjust the number of features shown in the plot.
|
|
195
|
+
fontsize (int): Base fontsize to scale title and labels on the plot.
|
|
196
|
+
|
|
197
|
+
NOTE:
|
|
198
|
+
**Variance Inflation Factor (VIF)** quantifies the degree of multicollinearity among features in a dataset.
|
|
199
|
+
A VIF value indicates how much the variance of a regression coefficient is inflated due to linear dependence with other features.
|
|
200
|
+
A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
|
|
201
|
+
"""
|
|
202
|
+
if output_dataset_directory is not None:
|
|
203
|
+
os.makedirs(output_dataset_directory, exist_ok=True)
|
|
204
|
+
|
|
205
|
+
for df, df_name in yield_dataframes_from_dir(datasets_dir=input_directory):
|
|
206
|
+
vif_dataframe = compute_vif(df=df,
|
|
207
|
+
use_columns=use_columns,
|
|
208
|
+
ignore_columns=ignore_columns,
|
|
209
|
+
max_features_to_plot=max_features_to_plot,
|
|
210
|
+
fontsize=fontsize,
|
|
211
|
+
save_dir=output_plot_directory,
|
|
212
|
+
filename=df_name,
|
|
213
|
+
show_plot=False)
|
|
214
|
+
|
|
215
|
+
if output_dataset_directory is not None:
|
|
216
|
+
new_filename = 'VIF_' + df_name
|
|
217
|
+
result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)
|
|
218
|
+
|
|
219
|
+
if len(dropped_cols) > 0:
|
|
220
|
+
save_dataframe(df=result_df, save_dir=output_dataset_directory, filename=new_filename)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def info():
|
|
224
|
+
_script_info(__all__)
|