dragon-ml-toolbox 1.3.0__py3-none-any.whl → 1.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-1.3.0.dist-info → dragon_ml_toolbox-1.3.2.dist-info}/METADATA +8 -4
- {dragon_ml_toolbox-1.3.0.dist-info → dragon_ml_toolbox-1.3.2.dist-info}/RECORD +7 -7
- ml_tools/MICE_imputation.py +75 -28
- {dragon_ml_toolbox-1.3.0.dist-info → dragon_ml_toolbox-1.3.2.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-1.3.0.dist-info → dragon_ml_toolbox-1.3.2.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-1.3.0.dist-info → dragon_ml_toolbox-1.3.2.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-1.3.0.dist-info → dragon_ml_toolbox-1.3.2.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 1.3.0
|
|
3
|
+
Version: 1.3.2
|
|
4
4
|
Summary: A collection of tools for data science and machine learning projects
|
|
5
5
|
Author-email: Karl Loza <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -15,17 +15,21 @@ License-File: LICENSE-THIRD-PARTY.md
|
|
|
15
15
|
Requires-Dist: numpy<2.0
|
|
16
16
|
Requires-Dist: scikit-learn
|
|
17
17
|
Requires-Dist: openpyxl
|
|
18
|
-
Requires-Dist: miceforest
|
|
18
|
+
Requires-Dist: miceforest<7.0.0,>=6.0.0
|
|
19
|
+
Requires-Dist: plotnine<0.13,>=0.12
|
|
19
20
|
Requires-Dist: matplotlib
|
|
20
21
|
Requires-Dist: seaborn
|
|
21
22
|
Requires-Dist: pandas
|
|
22
23
|
Requires-Dist: polars
|
|
23
|
-
Requires-Dist:
|
|
24
|
+
Requires-Dist: imbalanced-learn
|
|
24
25
|
Requires-Dist: statsmodels
|
|
25
26
|
Requires-Dist: ipython
|
|
27
|
+
Requires-Dist: ipykernel
|
|
28
|
+
Requires-Dist: notebook
|
|
29
|
+
Requires-Dist: jupyterlab
|
|
26
30
|
Requires-Dist: joblib
|
|
27
31
|
Requires-Dist: xgboost
|
|
28
|
-
Requires-Dist: lightgbm
|
|
32
|
+
Requires-Dist: lightgbm<=4.5.0
|
|
29
33
|
Requires-Dist: shap
|
|
30
34
|
Provides-Extra: pytorch
|
|
31
35
|
Requires-Dist: torch; extra == "pytorch"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
dragon_ml_toolbox-1.3.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
-
dragon_ml_toolbox-1.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
|
|
3
|
-
ml_tools/MICE_imputation.py,sha256=
|
|
1
|
+
dragon_ml_toolbox-1.3.2.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
+
dragon_ml_toolbox-1.3.2.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
|
|
3
|
+
ml_tools/MICE_imputation.py,sha256=71Kdi5rhPePIT5rJKIyRCM7ORPSjeujQCzKcLIwXs90,9428
|
|
4
4
|
ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
5
|
ml_tools/data_exploration.py,sha256=laTNbN5_xlhqWiKfF-cJ9yMZ8zAM2a-AryqgiIQBBLg,26649
|
|
6
6
|
ml_tools/datasetmaster.py,sha256=VUneKshnmjOGbtqVVGTFcIMRKF3s6ZDYrosIYKDjD80,28956
|
|
@@ -12,7 +12,7 @@ ml_tools/pytorch_models.py,sha256=Oykw02sOZLCjvSadQd64UGesBN7kq0x1EGXHusvYiQI,99
|
|
|
12
12
|
ml_tools/trainer.py,sha256=Zd7AaHeoNd8dEas2JChWoHaCUpWUVRDUMybuHaKJ0XY,16740
|
|
13
13
|
ml_tools/utilities.py,sha256=mG_--EFplfI9H7OhrWI8VkdNJtTbs4Wbz32xvcFWps8,5518
|
|
14
14
|
ml_tools/vision_helpers.py,sha256=lBAW6dzAK-HOswAt1fU_tfP9hkNLY5D8c_I_7hhEXno,7528
|
|
15
|
-
dragon_ml_toolbox-1.3.
|
|
16
|
-
dragon_ml_toolbox-1.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
17
|
-
dragon_ml_toolbox-1.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
18
|
-
dragon_ml_toolbox-1.3.0.dist-info/RECORD,,
|
|
15
|
+
dragon_ml_toolbox-1.3.2.dist-info/METADATA,sha256=NgNKZD1v97kBBdE96OJELolvlAXviJ-DgJvZAjjy5Ik,2309
|
|
16
|
+
dragon_ml_toolbox-1.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
17
|
+
dragon_ml_toolbox-1.3.2.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
18
|
+
dragon_ml_toolbox-1.3.2.dist-info/RECORD,,
|
ml_tools/MICE_imputation.py
CHANGED
|
@@ -3,15 +3,15 @@ import miceforest as mf
|
|
|
3
3
|
import os
|
|
4
4
|
import matplotlib.pyplot as plt
|
|
5
5
|
import numpy as np
|
|
6
|
-
from ml_tools.utilities import load_dataframe, list_csv_paths
|
|
7
|
-
|
|
6
|
+
from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename
|
|
7
|
+
from plotnine import ggplot, labs, theme, element_blank # type: ignore
|
|
8
8
|
|
|
9
9
|
def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
|
|
10
10
|
|
|
11
11
|
# Initialize kernel with number of imputed datasets to generate
|
|
12
12
|
kernel = mf.ImputationKernel(
|
|
13
13
|
data=df,
|
|
14
|
-
|
|
14
|
+
num_datasets=resulting_datasets,
|
|
15
15
|
random_state=random_state
|
|
16
16
|
)
|
|
17
17
|
|
|
@@ -21,6 +21,9 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
|
|
|
21
21
|
# Retrieve the imputed datasets
|
|
22
22
|
imputed_datasets = [kernel.complete_data(dataset=i) for i in range(resulting_datasets)]
|
|
23
23
|
|
|
24
|
+
if imputed_datasets is None or len(imputed_datasets) == 0:
|
|
25
|
+
raise ValueError("No imputed datasets were generated. Check the MICE process.")
|
|
26
|
+
|
|
24
27
|
if resulting_datasets == 1:
|
|
25
28
|
imputed_dataset_names = [f"{df_name}_imputed"]
|
|
26
29
|
else:
|
|
@@ -28,8 +31,8 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
|
|
|
28
31
|
|
|
29
32
|
# Ensure indexes match
|
|
30
33
|
for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
|
|
31
|
-
assert imputed_df.shape[0] == df.shape[0], f"Row count mismatch in dataset {subname}"
|
|
32
|
-
assert all(imputed_df.index == df.index), f"Index mismatch in dataset {subname}"
|
|
34
|
+
assert imputed_df.shape[0] == df.shape[0], f"Row count mismatch in dataset {subname}" # type: ignore
|
|
35
|
+
assert all(imputed_df.index == df.index), f"Index mismatch in dataset {subname}" # type: ignore
|
|
33
36
|
# print("✅ All imputed datasets match the original DataFrame indexes.")
|
|
34
37
|
|
|
35
38
|
return kernel, imputed_datasets, imputed_dataset_names
|
|
@@ -51,46 +54,65 @@ def get_na_column_names(df: pd.DataFrame):
|
|
|
51
54
|
|
|
52
55
|
|
|
53
56
|
#Convergence diagnostic
|
|
54
|
-
def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_names: list[str], column_names: list[str], root_dir: str):
|
|
57
|
+
def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_names: list[str], column_names: list[str], root_dir: str, fontsize: int=16):
|
|
58
|
+
"""
|
|
59
|
+
Generate and save convergence diagnostic plots for imputed variables.
|
|
60
|
+
|
|
61
|
+
Parameters:
|
|
62
|
+
- kernel: Trained miceforest.ImputationKernel.
|
|
63
|
+
- imputed_dataset_names: Names assigned to each imputed dataset.
|
|
64
|
+
- column_names: List of feature names to track over iterations.
|
|
65
|
+
- root_dir: Directory to save convergence plots.
|
|
66
|
+
"""
|
|
55
67
|
# get number of iterations used
|
|
56
68
|
iterations_cap = kernel.iteration_count()
|
|
69
|
+
dataset_count = kernel.num_datasets
|
|
70
|
+
|
|
71
|
+
if dataset_count != len(imputed_dataset_names):
|
|
72
|
+
raise ValueError(f"Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")
|
|
57
73
|
|
|
58
74
|
# Check path
|
|
59
75
|
os.makedirs(root_dir, exist_ok=True)
|
|
60
76
|
|
|
77
|
+
# Styling parameters
|
|
78
|
+
label_font = {'size': fontsize, 'weight': 'bold'}
|
|
79
|
+
|
|
61
80
|
# iterate over each imputed dataset
|
|
62
|
-
for dataset_id, imputed_dataset_name in zip(range(
|
|
81
|
+
for dataset_id, imputed_dataset_name in zip(range(dataset_count), imputed_dataset_names):
|
|
63
82
|
#Check directory for current dataset
|
|
64
83
|
dataset_file_dir = f"Convergence_Metrics_{imputed_dataset_name}"
|
|
65
84
|
local_save_dir = os.path.join(root_dir, dataset_file_dir)
|
|
66
|
-
|
|
67
|
-
os.makedirs(local_save_dir)
|
|
85
|
+
os.makedirs(local_save_dir, exist_ok=True)
|
|
68
86
|
|
|
69
87
|
for feature_name in column_names:
|
|
70
88
|
means_per_iteration = []
|
|
71
89
|
for iteration in range(iterations_cap):
|
|
72
90
|
current_imputed = kernel.complete_data(dataset=dataset_id, iteration=iteration)
|
|
73
|
-
means_per_iteration.append(np.mean(current_imputed[feature_name]))
|
|
74
|
-
|
|
91
|
+
means_per_iteration.append(np.mean(current_imputed[feature_name])) # type: ignore
|
|
92
|
+
|
|
93
|
+
plt.figure(figsize=(10, 8))
|
|
75
94
|
plt.plot(means_per_iteration, marker='o')
|
|
76
|
-
plt.xlabel("Iteration")
|
|
77
|
-
plt.ylabel("Mean of Imputed Values")
|
|
78
|
-
plt.title(f"Mean Convergence for '{feature_name}'")
|
|
95
|
+
plt.xlabel("Iteration", **label_font)
|
|
96
|
+
plt.ylabel("Mean of Imputed Values", **label_font)
|
|
97
|
+
plt.title(f"Mean Convergence for '{feature_name}'", **label_font)
|
|
79
98
|
|
|
80
99
|
# Adjust plot display for the X axis
|
|
81
100
|
_ticks = np.arange(iterations_cap)
|
|
82
101
|
_labels = np.arange(1, iterations_cap + 1)
|
|
83
102
|
plt.xticks(ticks=_ticks, labels=_labels)
|
|
103
|
+
plt.grid(True)
|
|
84
104
|
|
|
85
|
-
|
|
105
|
+
feature_save_name = sanitize_filename(feature_name)
|
|
106
|
+
|
|
107
|
+
save_path = os.path.join(local_save_dir, feature_save_name + ".svg")
|
|
86
108
|
plt.savefig(save_path, bbox_inches='tight', format="svg")
|
|
87
109
|
plt.close()
|
|
88
110
|
|
|
89
|
-
print(f"{dataset_file_dir} completed.")
|
|
111
|
+
print(f"\t{dataset_file_dir} completed.")
|
|
90
112
|
|
|
91
113
|
|
|
92
114
|
# Imputed distributions
|
|
93
|
-
def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_dir: str, column_names: list[str], one_plot: bool=False, fontsize: int=
|
|
115
|
+
def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_dir: str, column_names: list[str], one_plot: bool=False, fontsize: int=14):
|
|
94
116
|
'''
|
|
95
117
|
It works using miceforest's authors implementation of the method `.plot_imputed_distributions()`.
|
|
96
118
|
|
|
@@ -106,12 +128,35 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
|
|
|
106
128
|
legend_kwargs = {'frameon': True, 'facecolor': 'white', 'framealpha': 0.8}
|
|
107
129
|
label_font = {'size': fontsize, 'weight': 'bold'}
|
|
108
130
|
|
|
109
|
-
def _process_figure(fig, filename):
|
|
131
|
+
def _process_figure(fig, filename: str):
|
|
110
132
|
"""Helper function to add labels and legends to a figure"""
|
|
111
|
-
|
|
133
|
+
|
|
134
|
+
if not isinstance(fig, ggplot):
|
|
135
|
+
raise TypeError("Expected a plotnine.ggplot object")
|
|
136
|
+
|
|
137
|
+
# Edit labels and title
|
|
138
|
+
fig = fig + theme(
|
|
139
|
+
plot_title=element_blank(), # removes labs(title=...)
|
|
140
|
+
strip_text=element_blank() # removes facet_wrap labels
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
fig = fig + labs(y="", x="")
|
|
144
|
+
|
|
145
|
+
# Render to matplotlib figure
|
|
146
|
+
fig = fig.draw()
|
|
147
|
+
|
|
148
|
+
if not hasattr(fig, 'axes') or len(fig.axes) == 0:
|
|
149
|
+
raise RuntimeError("Rendered figure has no axes to modify")
|
|
150
|
+
|
|
151
|
+
if filename == "Combined_Distributions":
|
|
152
|
+
custom_xlabel = "Feature Values"
|
|
153
|
+
else:
|
|
154
|
+
custom_xlabel = filename
|
|
155
|
+
|
|
156
|
+
for ax in fig.axes:
|
|
112
157
|
# Set axis labels
|
|
113
|
-
ax.set_xlabel(
|
|
114
|
-
ax.set_ylabel('
|
|
158
|
+
ax.set_xlabel(custom_xlabel, **label_font)
|
|
159
|
+
ax.set_ylabel('Distribution', **label_font)
|
|
115
160
|
|
|
116
161
|
# Add legend based on line colors
|
|
117
162
|
lines = ax.get_lines()
|
|
@@ -122,26 +167,28 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
|
|
|
122
167
|
ax.legend(**legend_kwargs)
|
|
123
168
|
|
|
124
169
|
# Adjust layout and save
|
|
125
|
-
fig.tight_layout()
|
|
170
|
+
# fig.tight_layout()
|
|
171
|
+
# fig.subplots_adjust(bottom=0.2, left=0.2) # Optional, depending on overflow
|
|
126
172
|
fig.savefig(
|
|
127
|
-
os.path.join(local_save_dir, filename),
|
|
173
|
+
os.path.join(local_save_dir, filename + ".svg"),
|
|
128
174
|
format='svg',
|
|
129
175
|
bbox_inches='tight',
|
|
130
|
-
pad_inches=0
|
|
176
|
+
pad_inches=0.1
|
|
131
177
|
)
|
|
132
178
|
plt.close(fig)
|
|
133
|
-
|
|
179
|
+
|
|
134
180
|
if one_plot:
|
|
135
181
|
# Generate combined plot
|
|
136
182
|
fig = kernel.plot_imputed_distributions(variables=column_names)
|
|
137
|
-
_process_figure(fig, "Combined_Distributions
|
|
183
|
+
_process_figure(fig, "Combined_Distributions")
|
|
138
184
|
# Generate individual plots per feature
|
|
139
185
|
else:
|
|
140
186
|
for feature in column_names:
|
|
141
187
|
fig = kernel.plot_imputed_distributions(variables=[feature])
|
|
142
|
-
|
|
188
|
+
feature_save_name = sanitize_filename(feature)
|
|
189
|
+
_process_figure(fig, feature_save_name)
|
|
143
190
|
|
|
144
|
-
print("
|
|
191
|
+
print("\tImputed distributions saved successfully.")
|
|
145
192
|
|
|
146
193
|
|
|
147
194
|
def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|