dragon-ml-toolbox 1.4.1__py3-none-any.whl → 1.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/METADATA +1 -1
- dragon_ml_toolbox-1.4.3.dist-info/RECORD +19 -0
- {dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/licenses/LICENSE-THIRD-PARTY.md +6 -1
- ml_tools/MICE_imputation.py +22 -14
- ml_tools/data_exploration.py +41 -8
- ml_tools/ensemble_learning.py +446 -187
- ml_tools/particle_swarm_optimization.py +43 -52
- ml_tools/utilities.py +44 -8
- dragon_ml_toolbox-1.4.1.dist-info/RECORD +0 -19
- {dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/top_level.txt +0 -0
dragon_ml_toolbox-1.4.3.dist-info/RECORD
ADDED
@@ -0,0 +1,19 @@
+dragon_ml_toolbox-1.4.3.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-1.4.3.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=jDnniT0tgD0uw1NpjibsPF-qK3wmOKgTykLG2iNQU7E,1840
+ml_tools/MICE_imputation.py,sha256=3CN_Z5NnQnr9BQOBcccIV13BcV-zRSvWUpYXoMZpPt8,10142
+ml_tools/VIF_factor.py,sha256=LQWr1P8WYij07FX_3RZC6Rr22bfAMnrt0Lhvi7SbBpY,9846
+ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ml_tools/data_exploration.py,sha256=iRMyn-H0ffjhLkL-B5zKSb1tlyT4bKm0H4vE_GMaXP0,19903
+ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
+ml_tools/ensemble_learning.py,sha256=5CCd8w0j-uDkf7ToN2ENT_KdZbB8ZQUFYlrKN-OHUxA,37533
+ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
+ml_tools/logger.py,sha256=NOtL3YSuffAGmpTpXjY-uJjqFLdRG_jpL7MDyloBw9c,4712
+ml_tools/particle_swarm_optimization.py,sha256=g_KwPQL77HuVwceABP17RsF__qLmNAp2YVsXOxmFEOM,20034
+ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
+ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
+ml_tools/utilities.py,sha256=r7uEmo38Imly56BP3-Jv6dFJvLsbGipeBlkiZx2fcNQ,10189
+ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
+dragon_ml_toolbox-1.4.3.dist-info/METADATA,sha256=l0uOaYlimIH_YCT89C2mOkHaiKDEtq9XxhHpbcMCppU,2516
+dragon_ml_toolbox-1.4.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-1.4.3.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-1.4.3.dist-info/RECORD,,
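Each RECORD entry has the form path,sha256=<digest>,<size in bytes>, where the digest is an unpadded URL-safe base64 SHA-256 hash of the file (per the wheel RECORD format). A minimal sketch for recomputing one digest to compare against the values above; the file path in the comment is only an example:

import base64
import hashlib

def record_digest(path: str) -> str:
    """Return a file's sha256 value in the format used by wheel RECORD files."""
    with open(path, "rb") as fh:
        digest = hashlib.sha256(fh.read()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# e.g. record_digest("ml_tools/utilities.py") should reproduce the value after "sha256=".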
{dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/licenses/LICENSE-THIRD-PARTY.md
CHANGED
@@ -8,7 +8,12 @@ This project depends on the following third-party packages. Each is governed by
 - [seaborn](https://github.com/mwaskom/seaborn/blob/main/LICENSE)
 - [statsmodels](https://github.com/statsmodels/statsmodels/blob/main/LICENSE.txt)
 - [ipython](https://github.com/ipython/ipython/blob/main/COPYING.rst)
+- [ipykernel](https://github.com/ipython/ipykernel/blob/main/COPYING.rst)
+- [notebook](https://github.com/jupyter/notebook/blob/main/LICENSE)
+- [jupyterlab](https://github.com/jupyterlab/jupyterlab/blob/main/LICENSE)
+- [ipywidgets](https://github.com/jupyter-widgets/ipywidgets/blob/main/LICENSE)
 - [torch](https://github.com/pytorch/pytorch/blob/main/LICENSE)
+- [torchvision](https://github.com/pytorch/vision/blob/main/LICENSE)
 - [scikit-learn](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)
 - [imblearn](https://github.com/scikit-learn-contrib/imbalanced-learn/blob/main/LICENSE)
 - [Pillow](https://github.com/python-pillow/Pillow/blob/main/LICENSE)
@@ -19,5 +24,5 @@ This project depends on the following third-party packages. Each is governed by
 - [openpyxl](https://github.com/chronossc/openpyxl/blob/main/LICENSE)
 - [miceforest](https://github.com/AnotherSamWilson/miceforest/blob/main/LICENSE)
 - [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
-- [
+- [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE.txt)
 - [pyswarm](https://pythonhosted.org/pyswarm/#license)
ml_tools/MICE_imputation.py
CHANGED
@@ -3,7 +3,7 @@ import miceforest as mf
 import os
 import matplotlib.pyplot as plt
 import numpy as np
-from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info
+from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe
 from plotnine import ggplot, labs, theme, element_blank # type: ignore
 
 
@@ -49,15 +49,11 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
     return kernel, imputed_datasets, imputed_dataset_names
 
 
-def save_imputed_datasets(save_dir: str, imputed_datasets: list, imputed_dataset_names: list[str]):
-    # Check path
-    os.makedirs(save_dir, exist_ok=True)
-
+def save_imputed_datasets(save_dir: str, imputed_datasets: list, df_targets: pd.DataFrame, imputed_dataset_names: list[str]):
     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
-
-
-
-
+        merged_df = merge_dataframes(imputed_df, df_targets, direction="horizontal", verbose=False)
+        save_dataframe(df=merged_df, save_dir=save_dir, filename=subname)
+
 
 
 #Get names of features that had missing values before imputation
 def get_na_column_names(df: pd.DataFrame):
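The reworked save_imputed_datasets re-attaches the untouched target columns to each completed imputation before writing. A hedged usage sketch of the new signature (the CSV path and column names are placeholders, not taken from the package docs):

import pandas as pd
from ml_tools.MICE_imputation import apply_mice, save_imputed_datasets

df = pd.read_csv("data/demo.csv")                 # placeholder dataset
df_targets = df[["target"]]                       # kept out of the imputation
df_features = df.drop(columns=["target"])

kernel, datasets, names = apply_mice(df=df_features, df_name="demo", resulting_datasets=2)
save_imputed_datasets(save_dir="imputed", imputed_datasets=datasets,
                      df_targets=df_targets, imputed_dataset_names=names)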
@@ -119,7 +115,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
     plt.savefig(save_path, bbox_inches='tight', format="svg")
     plt.close()
 
-    print(f"
+    print(f"{dataset_file_dir} completed.")
 
 
 # Imputed distributions
@@ -131,7 +127,8 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     '''
     # Check path
     os.makedirs(root_dir, exist_ok=True)
-
+    local_dir_name = f"Distribution_Metrics_{df_name}_imputed"
+    local_save_dir = os.path.join(root_dir, local_dir_name)
     if not os.path.isdir(local_save_dir):
         os.makedirs(local_save_dir)
 
@@ -202,10 +199,10 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
         fig = kernel.plot_imputed_distributions(variables=[feature])
         _process_figure(fig, feature)
 
-    print("
+    print(f"{local_dir_name} completed.")
 
 
-def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
+def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
     """
     Call functions in sequence for each dataset in the provided path or directory:
     1. Load dataframe
@@ -213,6 +210,8 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     3. Save imputed dataset(s)
     4. Save convergence metrics
     5. Save distribution metrics
+
+    Target columns must be skipped from the imputation.
     """
     # Check paths
     os.makedirs(save_datasets_dir, exist_ok=True)
@@ -228,9 +227,11 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     for df_path in all_file_paths:
         df, df_name = load_dataframe(df_path=df_path)
 
+        df, df_targets = _skip_targets(df, target_columns)
+
         kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)
 
-        save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, imputed_dataset_names=imputed_dataset_names)
+        save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
 
         imputed_column_names = get_na_column_names(df=df)
 
@@ -239,5 +240,12 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)
 
 
+def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
+    valid_targets = [col for col in target_cols if col in df.columns]
+    df_targets = df[valid_targets]
+    df_feats = df.drop(columns=valid_targets)
+    return df_feats, df_targets
+
+
 def info():
     _script_info(__all__)
ml_tools/data_exploration.py
CHANGED
@@ -5,10 +5,8 @@ import seaborn as sns
 from IPython import get_ipython
 from IPython.display import clear_output
 import time
-from typing import Union, Literal, Dict, Tuple
+from typing import Union, Literal, Dict, Tuple, Iterator
 import os
-import sys
-import textwrap
 from ml_tools.utilities import sanitize_filename, _script_info
 
 
@@ -24,7 +22,8 @@ __all__ = [
     "check_value_distributions",
     "plot_value_distributions",
     "clip_outliers_single",
-    "clip_outliers_multi"
+    "clip_outliers_multi",
+    "distribute_datasets_by_target"
 ]
 
 
@@ -113,7 +112,7 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     Parameters:
         df (pd.DataFrame): The input DataFrame.
         round_digits (int): Number of decimal places for the percentage.
-
+
     Returns:
         pd.DataFrame: A DataFrame summarizing missing values in each column.
     """
@@ -133,13 +132,14 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     return null_summary
 
 
-def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
+def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True) -> pd.DataFrame:
     """
     Drops columns with more than `threshold` fraction of missing values.
 
     Parameters:
         df (pd.DataFrame): The input DataFrame.
         threshold (float): Fraction of missing values above which columns are dropped.
+        show_nulls_after (bool): Prints `show_null_columns` after dropping columns.
 
     Returns:
         pd.DataFrame: A new DataFrame without the dropped columns.
@@ -150,10 +150,15 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) ->
     if len(cols_to_drop) > 0:
         print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
         print(list(cols_to_drop))
+
+        result_df = df.drop(columns=cols_to_drop)
+        if show_nulls_after:
+            show_null_columns(df=result_df).head(20)
+
+        return result_df
     else:
         print(f"No columns have more than {threshold*100:.0f}% missing data.")
-
-    return df.drop(columns=cols_to_drop)
+        return df
 
 
 def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
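A brief sketch of the new behaviour, assuming the documented threshold semantics (the DataFrame and column names here are invented): when columns are dropped, a preview of the remaining null counts is printed unless show_nulls_after=False.

import numpy as np
import pandas as pd
from ml_tools.data_exploration import drop_columns_with_missing_data

df = pd.DataFrame({
    "mostly_missing": [np.nan, np.nan, np.nan, 1.0],  # 75% missing -> above the 0.7 threshold
    "complete": [1, 2, 3, 4],
})
cleaned = drop_columns_with_missing_data(df, threshold=0.7, show_nulls_after=False)
print(cleaned.columns.tolist())  # expected: ['complete']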
@@ -514,6 +519,34 @@ def clip_outliers_multi(
     return new_df
 
 
+def distribute_datasets_by_target(
+    df: pd.DataFrame,
+    target_columns: list[str]
+) -> Iterator[Tuple[str, pd.DataFrame]]:
+    """
+    Yields cleaned DataFrames for each target column, where rows with missing
+    target values are removed. The target column is placed at the end.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Preprocessed dataframe with all feature and target columns ready to train.
+    target_columns : List[str]
+        List of target column names to generate per-target DataFrames.
+
+    Yields
+    ------
+    Tuple[str, pd.DataFrame]
+        * First element is the target column name.
+        * Second element is the corresponding cleaned DataFrame.
+    """
+    feature_columns = [col for col in df.columns if col not in target_columns]
+
+    for target in target_columns:
+        subset = df[feature_columns + [target]].dropna(subset=[target])
+        yield target, subset
+
+
 def _is_notebook():
     return get_ipython() is not None
 
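The new generator pairs naturally with per-target training loops. A hedged example of iterating it (the DataFrame and column names are invented):

import numpy as np
import pandas as pd
from ml_tools.data_exploration import distribute_datasets_by_target

df = pd.DataFrame({
    "feat_1": [0.1, 0.2, 0.3, 0.4],
    "feat_2": [1, 2, 3, 4],
    "target_a": [1.0, np.nan, 0.0, 1.0],
    "target_b": [np.nan, 5.0, 6.0, np.nan],
})

for target_name, subset in distribute_datasets_by_target(df, ["target_a", "target_b"]):
    # Each subset keeps the features plus one target, with rows missing that target dropped.
    print(target_name, subset.shape)  # target_a (3, 3), then target_b (2, 3)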