dragon-ml-toolbox 1.4.8__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-1.4.8.dist-info → dragon_ml_toolbox-2.1.0.dist-info}/METADATA +24 -14
- dragon_ml_toolbox-2.1.0.dist-info/RECORD +20 -0
- {dragon_ml_toolbox-1.4.8.dist-info → dragon_ml_toolbox-2.1.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +5 -4
- ml_tools/MICE_imputation.py +27 -28
- ml_tools/PSO_optimization.py +490 -0
- ml_tools/VIF_factor.py +20 -17
- ml_tools/{particle_swarm_optimization.py → _particle_swarm_optimization.py} +5 -0
- ml_tools/data_exploration.py +58 -32
- ml_tools/ensemble_learning.py +40 -42
- ml_tools/handle_excel.py +98 -78
- ml_tools/logger.py +13 -11
- ml_tools/utilities.py +134 -46
- dragon_ml_toolbox-1.4.8.dist-info/RECORD +0 -19
- {dragon_ml_toolbox-1.4.8.dist-info → dragon_ml_toolbox-2.1.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-1.4.8.dist-info → dragon_ml_toolbox-2.1.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.8.dist-info → dragon_ml_toolbox-2.1.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 1.4.8
|
|
3
|
+
Version: 2.1.0
|
|
4
4
|
Summary: A collection of tools for data science and machine learning projects
|
|
5
5
|
Author-email: Karl Loza <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,7 +8,7 @@ Project-URL: Homepage, https://github.com/DrAg0n-BoRn/ML_tools
|
|
|
8
8
|
Project-URL: Changelog, https://github.com/DrAg0n-BoRn/ML_tools/blob/master/CHANGELOG.md
|
|
9
9
|
Classifier: Programming Language :: Python :: 3
|
|
10
10
|
Classifier: Operating System :: OS Independent
|
|
11
|
-
Requires-Python: >=3.
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENSE
|
|
14
14
|
License-File: LICENSE-THIRD-PARTY.md
|
|
@@ -32,9 +32,10 @@ Requires-Dist: joblib
|
|
|
32
32
|
Requires-Dist: xgboost
|
|
33
33
|
Requires-Dist: lightgbm<=4.5.0
|
|
34
34
|
Requires-Dist: shap
|
|
35
|
+
Requires-Dist: tqdm>=4.0
|
|
36
|
+
Requires-Dist: Pillow
|
|
35
37
|
Provides-Extra: pytorch
|
|
36
38
|
Requires-Dist: torch; extra == "pytorch"
|
|
37
|
-
Requires-Dist: Pillow; extra == "pytorch"
|
|
38
39
|
Requires-Dist: torchvision; extra == "pytorch"
|
|
39
40
|
Dynamic: license-file
|
|
40
41
|
|
|
@@ -49,7 +50,7 @@ A collection of Python utilities for data science and machine learning, structur
|
|
|
49
50
|
|
|
50
51
|
## Installation
|
|
51
52
|
|
|
52
|
-
**Python 3.
|
|
53
|
+
**Python 3.10+ recommended.**
|
|
53
54
|
|
|
54
55
|
### Via PyPI
|
|
55
56
|
|
|
@@ -59,6 +60,16 @@ Install the latest stable release from PyPI:
|
|
|
59
60
|
pip install dragon-ml-toolbox
|
|
60
61
|
```
|
|
61
62
|
|
|
63
|
+
### Via GitHub (Editable)
|
|
64
|
+
|
|
65
|
+
Clone the repository and install in editable mode with optional dependencies:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
git clone https://github.com/DrAg0n-BoRn/ML_tools.git
|
|
69
|
+
cd ML_tools
|
|
70
|
+
pip install -e .
|
|
71
|
+
```
|
|
72
|
+
|
|
62
73
|
### Via conda-forge
|
|
63
74
|
|
|
64
75
|
Install from the conda-forge channel:
|
|
@@ -66,22 +77,21 @@ Install from the conda-forge channel:
|
|
|
66
77
|
```bash
|
|
67
78
|
conda install -c conda-forge dragon-ml-toolbox
|
|
68
79
|
```
|
|
80
|
+
**Note:** The conda-forge version is outdated or broken due to dependency incompatibilities.
|
|
69
81
|
|
|
70
|
-
|
|
82
|
+
## Optional dependencies
|
|
83
|
+
|
|
84
|
+
**PyTorch** provides different builds depending on the **platform** and **hardware acceleration** (e.g., CUDA for NVIDIA GPUs on Linux/Windows, or MPS for Apple Silicon on macOS).
|
|
85
|
+
|
|
86
|
+
Install the default CPU-only version with:
|
|
71
87
|
|
|
72
88
|
```bash
|
|
73
89
|
pip install dragon-ml-toolbox[pytorch]
|
|
74
90
|
```
|
|
75
91
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
Clone the repository and install in editable mode with optional dependencies:
|
|
92
|
+
To make use of GPU acceleration use the official PyTorch installation instructions:
|
|
79
93
|
|
|
80
|
-
|
|
81
|
-
git clone https://github.com/DrAg0n-BoRn/ML_tools.git
|
|
82
|
-
cd ML_tools
|
|
83
|
-
pip install -e .
|
|
84
|
-
```
|
|
94
|
+
[PyTorch Instructions](https://pytorch.org/get-started/locally/)
|
|
85
95
|
|
|
86
96
|
## Usage
|
|
87
97
|
|
|
@@ -101,7 +111,7 @@ ensemble_learning
|
|
|
101
111
|
handle_excel
|
|
102
112
|
logger
|
|
103
113
|
MICE_imputation
|
|
104
|
-
|
|
114
|
+
PSO_optimization
|
|
105
115
|
trainer
|
|
106
116
|
utilities
|
|
107
117
|
VIF_factor
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
dragon_ml_toolbox-2.1.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
+
dragon_ml_toolbox-2.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
|
|
3
|
+
ml_tools/MICE_imputation.py,sha256=1fovHycZMdZ6OgVh_bk8-r3wGi4rqf6rS10LOEWYaQo,11177
|
|
4
|
+
ml_tools/PSO_optimization.py,sha256=vty1dZDY7P2iGUuE_oojyGdgM1EkDj5kXCfCxRMdk28,20957
|
|
5
|
+
ml_tools/VIF_factor.py,sha256=lpM3Z2X_iZfXUWbCbURoeI0Tb196lU0bAsRo7q6AzBM,10235
|
|
6
|
+
ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
|
|
8
|
+
ml_tools/data_exploration.py,sha256=CDUVRTHfww105IXDRpBQ81KZWx5HXSsA-FVsVYBzNw8,21298
|
|
9
|
+
ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
|
|
10
|
+
ml_tools/ensemble_learning.py,sha256=q9jbu7SupvXz61sURFQ9V2-7gUsLbA3cSgyb2MQFyyc,37351
|
|
11
|
+
ml_tools/handle_excel.py,sha256=Uasx-DX7RNVQSzGHVJhX7UQ9RgBbX5H1ud1Hw_y8Kp4,12944
|
|
12
|
+
ml_tools/logger.py,sha256=_k7WJdpFJj3IsjOgvjLJgUFZyF8RK3Jlgp5tAu_dLQU,4767
|
|
13
|
+
ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
|
|
14
|
+
ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
|
|
15
|
+
ml_tools/utilities.py,sha256=5vVXqIH-jiY4PHUAoDI1o26mZYPsmrWO6I97Fs3oC90,18661
|
|
16
|
+
ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
|
|
17
|
+
dragon_ml_toolbox-2.1.0.dist-info/METADATA,sha256=LDXrXkR1nm6WiEVHudCy7wI0dwkMejT0NzPuYptGSmw,2974
|
|
18
|
+
dragon_ml_toolbox-2.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
19
|
+
dragon_ml_toolbox-2.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
20
|
+
dragon_ml_toolbox-2.1.0.dist-info/RECORD,,
|
|
@@ -5,10 +5,10 @@ This project depends on the following third-party packages. Each is governed by
|
|
|
5
5
|
- [pandas](https://github.com/pandas-dev/pandas/blob/main/LICENSE)
|
|
6
6
|
- [numpy](https://github.com/numpy/numpy/blob/main/LICENSE.txt)
|
|
7
7
|
- [matplotlib](https://github.com/matplotlib/matplotlib/blob/main/LICENSE/LICENSE)
|
|
8
|
-
- [seaborn](https://github.com/mwaskom/seaborn/blob/
|
|
8
|
+
- [seaborn](https://github.com/mwaskom/seaborn/blob/master/LICENSE.md)
|
|
9
9
|
- [statsmodels](https://github.com/statsmodels/statsmodels/blob/main/LICENSE.txt)
|
|
10
|
-
- [ipython](https://github.com/ipython/ipython/blob/main/
|
|
11
|
-
- [ipykernel](https://github.com/ipython/ipykernel/blob/main/
|
|
10
|
+
- [ipython](https://github.com/ipython/ipython/blob/main/LICENSE)
|
|
11
|
+
- [ipykernel](https://github.com/ipython/ipykernel/blob/main/LICENSE)
|
|
12
12
|
- [notebook](https://github.com/jupyter/notebook/blob/main/LICENSE)
|
|
13
13
|
- [jupyterlab](https://github.com/jupyterlab/jupyterlab/blob/main/LICENSE)
|
|
14
14
|
- [ipywidgets](https://github.com/jupyter-widgets/ipywidgets/blob/main/LICENSE)
|
|
@@ -24,5 +24,6 @@ This project depends on the following third-party packages. Each is governed by
|
|
|
24
24
|
- [openpyxl](https://github.com/chronossc/openpyxl/blob/main/LICENSE)
|
|
25
25
|
- [miceforest](https://github.com/AnotherSamWilson/miceforest/blob/main/LICENSE)
|
|
26
26
|
- [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
|
|
27
|
-
- [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE
|
|
27
|
+
- [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE)
|
|
28
28
|
- [pyswarm](https://pythonhosted.org/pyswarm/#license)
|
|
29
|
+
- [tqdm](https://github.com/tqdm/tqdm/blob/master/LICENSE)
|
ml_tools/MICE_imputation.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
import miceforest as mf
|
|
3
|
-
import os
|
|
3
|
+
from pathlib import Path
|
|
4
4
|
import matplotlib.pyplot as plt
|
|
5
5
|
import numpy as np
|
|
6
|
-
from .utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe, threshold_binary_values
|
|
6
|
+
from .utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe, threshold_binary_values, make_fullpath
|
|
7
7
|
from plotnine import ggplot, labs, theme, element_blank # type: ignore
|
|
8
|
-
from typing import Optional
|
|
8
|
+
from typing import Optional, Union
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
__all__ = [
|
|
@@ -60,7 +60,7 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
|
|
|
60
60
|
return kernel, imputed_datasets, imputed_dataset_names
|
|
61
61
|
|
|
62
62
|
|
|
63
|
-
def save_imputed_datasets(save_dir: str, imputed_datasets: list, df_targets: pd.DataFrame, imputed_dataset_names: list[str]):
|
|
63
|
+
def save_imputed_datasets(save_dir: Union[str, Path], imputed_datasets: list, df_targets: pd.DataFrame, imputed_dataset_names: list[str]):
|
|
64
64
|
for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
|
|
65
65
|
merged_df = merge_dataframes(imputed_df, df_targets, direction="horizontal", verbose=False)
|
|
66
66
|
save_dataframe(df=merged_df, save_dir=save_dir, filename=subname)
|
|
@@ -72,7 +72,7 @@ def get_na_column_names(df: pd.DataFrame):
|
|
|
72
72
|
|
|
73
73
|
|
|
74
74
|
#Convergence diagnostic
|
|
75
|
-
def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_names: list[str], column_names: list[str], root_dir: str, fontsize: int=16):
|
|
75
|
+
def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_names: list[str], column_names: list[str], root_dir: Union[str,Path], fontsize: int=16):
|
|
76
76
|
"""
|
|
77
77
|
Generate and save convergence diagnostic plots for imputed variables.
|
|
78
78
|
|
|
@@ -90,7 +90,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
|
|
|
90
90
|
raise ValueError(f"Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")
|
|
91
91
|
|
|
92
92
|
# Check path
|
|
93
|
-
|
|
93
|
+
root_path = make_fullpath(root_dir, make=True)
|
|
94
94
|
|
|
95
95
|
# Styling parameters
|
|
96
96
|
label_font = {'size': fontsize, 'weight': 'bold'}
|
|
@@ -99,8 +99,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
|
|
|
99
99
|
for dataset_id, imputed_dataset_name in zip(range(dataset_count), imputed_dataset_names):
|
|
100
100
|
#Check directory for current dataset
|
|
101
101
|
dataset_file_dir = f"Convergence_Metrics_{imputed_dataset_name}"
|
|
102
|
-
local_save_dir =
|
|
103
|
-
os.makedirs(local_save_dir, exist_ok=True)
|
|
102
|
+
local_save_dir = make_fullpath(input_path=root_path / dataset_file_dir, make=True)
|
|
104
103
|
|
|
105
104
|
for feature_name in column_names:
|
|
106
105
|
means_per_iteration = []
|
|
@@ -121,8 +120,8 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
|
|
|
121
120
|
plt.grid(True)
|
|
122
121
|
|
|
123
122
|
feature_save_name = sanitize_filename(feature_name)
|
|
124
|
-
|
|
125
|
-
save_path =
|
|
123
|
+
feature_save_name = feature_save_name + ".svg"
|
|
124
|
+
save_path = local_save_dir / feature_save_name
|
|
126
125
|
plt.savefig(save_path, bbox_inches='tight', format="svg")
|
|
127
126
|
plt.close()
|
|
128
127
|
|
|
@@ -130,18 +129,17 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
|
|
|
130
129
|
|
|
131
130
|
|
|
132
131
|
# Imputed distributions
|
|
133
|
-
def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_dir: str, column_names: list[str], one_plot: bool=False, fontsize: int=14):
|
|
132
|
+
def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_dir: Union[str, Path], column_names: list[str], one_plot: bool=False, fontsize: int=14):
|
|
134
133
|
'''
|
|
135
134
|
It works using miceforest's authors implementation of the method `.plot_imputed_distributions()`.
|
|
136
135
|
|
|
137
136
|
Set `one_plot=True` to save a single image including all feature distribution plots instead.
|
|
138
137
|
'''
|
|
139
138
|
# Check path
|
|
140
|
-
|
|
139
|
+
root_path = make_fullpath(root_dir, make=True)
|
|
140
|
+
|
|
141
141
|
local_dir_name = f"Distribution_Metrics_{df_name}_imputed"
|
|
142
|
-
local_save_dir =
|
|
143
|
-
if not os.path.isdir(local_save_dir):
|
|
144
|
-
os.makedirs(local_save_dir)
|
|
142
|
+
local_save_dir = make_fullpath(root_path / local_dir_name, make=True)
|
|
145
143
|
|
|
146
144
|
# Styling parameters
|
|
147
145
|
legend_kwargs = {'frameon': True, 'facecolor': 'white', 'framealpha': 0.8}
|
|
@@ -191,9 +189,11 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
|
|
|
191
189
|
|
|
192
190
|
# sanitize savename
|
|
193
191
|
feature_save_name = sanitize_filename(filename)
|
|
192
|
+
feature_save_name = feature_save_name + ".svg"
|
|
193
|
+
new_save_path = local_save_dir / feature_save_name
|
|
194
194
|
|
|
195
195
|
fig.savefig(
|
|
196
|
-
|
|
196
|
+
new_save_path,
|
|
197
197
|
format='svg',
|
|
198
198
|
bbox_inches='tight',
|
|
199
199
|
pad_inches=0.1
|
|
@@ -213,8 +213,8 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
|
|
|
213
213
|
print(f"{local_dir_name} completed.")
|
|
214
214
|
|
|
215
215
|
|
|
216
|
-
def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
|
|
217
|
-
save_datasets_dir: str, save_metrics_dir: str,
|
|
216
|
+
def run_mice_pipeline(df_path_or_dir: Union[str,Path], target_columns: list[str],
|
|
217
|
+
save_datasets_dir: Union[str,Path], save_metrics_dir: Union[str,Path],
|
|
218
218
|
binary_columns: Optional[list[str]]=None,
|
|
219
219
|
resulting_datasets: int=1,
|
|
220
220
|
iterations: int=20,
|
|
@@ -230,15 +230,14 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
|
|
|
230
230
|
Target columns must be skipped from the imputation. Binary columns will be thresholded after imputation.
|
|
231
231
|
"""
|
|
232
232
|
# Check paths
|
|
233
|
-
|
|
234
|
-
|
|
233
|
+
save_datasets_path = make_fullpath(save_datasets_dir, make=True)
|
|
234
|
+
save_metrics_path = make_fullpath(save_metrics_dir, make=True)
|
|
235
235
|
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
all_file_paths = list(list_csv_paths(df_path_or_dir).values())
|
|
236
|
+
input_path = make_fullpath(df_path_or_dir)
|
|
237
|
+
if input_path.is_file():
|
|
238
|
+
all_file_paths = [input_path]
|
|
240
239
|
else:
|
|
241
|
-
|
|
240
|
+
all_file_paths = list(list_csv_paths(input_path).values())
|
|
242
241
|
|
|
243
242
|
for df_path in all_file_paths:
|
|
244
243
|
df, df_name = load_dataframe(df_path=df_path)
|
|
@@ -247,13 +246,13 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
|
|
|
247
246
|
|
|
248
247
|
kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, binary_columns=binary_columns, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)
|
|
249
248
|
|
|
250
|
-
save_imputed_datasets(save_dir=
|
|
249
|
+
save_imputed_datasets(save_dir=save_datasets_path, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
|
|
251
250
|
|
|
252
251
|
imputed_column_names = get_na_column_names(df=df)
|
|
253
252
|
|
|
254
|
-
get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=
|
|
253
|
+
get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_path)
|
|
255
254
|
|
|
256
|
-
get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=
|
|
255
|
+
get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_path, column_names=imputed_column_names)
|
|
257
256
|
|
|
258
257
|
|
|
259
258
|
def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
|