dragon_ml_toolbox-1.1.2-py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in the supported public registries; it is provided for informational purposes only.
Note: this version of dragon-ml-toolbox has been marked as a potentially problematic release.
- dragon_ml_toolbox-1.1.2.dist-info/METADATA +114 -0
- dragon_ml_toolbox-1.1.2.dist-info/RECORD +16 -0
- dragon_ml_toolbox-1.1.2.dist-info/WHEEL +5 -0
- dragon_ml_toolbox-1.1.2.dist-info/top_level.txt +1 -0
- ml_tools/MICE_imputation.py +178 -0
- ml_tools/__init__.py +0 -0
- ml_tools/data_exploration.py +751 -0
- ml_tools/datasetmaster.py +595 -0
- ml_tools/ensemble_learning.py +701 -0
- ml_tools/handle_excel.py +310 -0
- ml_tools/logger.py +145 -0
- ml_tools/particle_swarm_optimization.py +467 -0
- ml_tools/pytorch_models.py +227 -0
- ml_tools/trainer.py +366 -0
- ml_tools/utilities.py +168 -0
- ml_tools/vision_helpers.py +218 -0
dragon_ml_toolbox-1.1.2.dist-info/METADATA
ADDED
@@ -0,0 +1,114 @@
+Metadata-Version: 2.4
+Name: dragon-ml-toolbox
+Version: 1.1.2
+Summary: A collection of tools for machine learning projects
+Author-email: Karl Loza <luigiloza@gmail.com>
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+Provides-Extra: data-exploration
+Requires-Dist: pandas; extra == "data-exploration"
+Requires-Dist: numpy; extra == "data-exploration"
+Requires-Dist: matplotlib; extra == "data-exploration"
+Requires-Dist: seaborn; extra == "data-exploration"
+Requires-Dist: statsmodels; extra == "data-exploration"
+Requires-Dist: ipython; extra == "data-exploration"
+Provides-Extra: datasetmaster
+Requires-Dist: torch; extra == "datasetmaster"
+Requires-Dist: pandas; extra == "datasetmaster"
+Requires-Dist: numpy; extra == "datasetmaster"
+Requires-Dist: scikit-learn; extra == "datasetmaster"
+Requires-Dist: imblearn; extra == "datasetmaster"
+Requires-Dist: Pillow; extra == "datasetmaster"
+Requires-Dist: matplotlib; extra == "datasetmaster"
+Provides-Extra: ensemble-learning
+Requires-Dist: pandas; extra == "ensemble-learning"
+Requires-Dist: numpy; extra == "ensemble-learning"
+Requires-Dist: seaborn; extra == "ensemble-learning"
+Requires-Dist: matplotlib; extra == "ensemble-learning"
+Requires-Dist: joblib; extra == "ensemble-learning"
+Requires-Dist: imblearn; extra == "ensemble-learning"
+Requires-Dist: scikit-learn; extra == "ensemble-learning"
+Requires-Dist: xgboost; extra == "ensemble-learning"
+Requires-Dist: lightgbm; extra == "ensemble-learning"
+Requires-Dist: shap; extra == "ensemble-learning"
+Provides-Extra: handle-excel
+Requires-Dist: openpyxl; extra == "handle-excel"
+Requires-Dist: pandas; extra == "handle-excel"
+Provides-Extra: logger
+Requires-Dist: pandas; extra == "logger"
+Requires-Dist: openpyxl; extra == "logger"
+Provides-Extra: mice-imputation
+Requires-Dist: pandas; extra == "mice-imputation"
+Requires-Dist: miceforest; extra == "mice-imputation"
+Requires-Dist: matplotlib; extra == "mice-imputation"
+Requires-Dist: numpy; extra == "mice-imputation"
+Provides-Extra: particle-swarm-optimization
+Requires-Dist: numpy; extra == "particle-swarm-optimization"
+Requires-Dist: joblib; extra == "particle-swarm-optimization"
+Requires-Dist: xgboost; extra == "particle-swarm-optimization"
+Requires-Dist: lightgbm; extra == "particle-swarm-optimization"
+Requires-Dist: scikit-learn; extra == "particle-swarm-optimization"
+Requires-Dist: polars; extra == "particle-swarm-optimization"
+Provides-Extra: pytorch-models
+Requires-Dist: torch; extra == "pytorch-models"
+Provides-Extra: trainer
+Requires-Dist: numpy; extra == "trainer"
+Requires-Dist: torch; extra == "trainer"
+Requires-Dist: matplotlib; extra == "trainer"
+Requires-Dist: scikit-learn; extra == "trainer"
+Provides-Extra: utilities
+Requires-Dist: numpy; extra == "utilities"
+Requires-Dist: pandas; extra == "utilities"
+Provides-Extra: vision-helpers
+Requires-Dist: Pillow; extra == "vision-helpers"
+Requires-Dist: torch; extra == "vision-helpers"
+Requires-Dist: torchvision; extra == "vision-helpers"
+
+# ml_tools
+
+A collection of Python utilities and machine learning tools, structured as a modular package for easy reuse and installation.
+
+## Features
+
+- Modular scripts for data exploration, logging, machine learning, and more.
+- Optional dependencies grouped by functionality for lightweight installs.
+- Designed for seamless integration as a Git submodule or installable Python package.
+
+
+## Installation
+
+### Via GitHub (Editable / Development Mode)
+
+Clone the repository and install in editable mode with optional dependencies:
+
+```bash
+git clone https://github.com/DrAg0n-BoRn/ML_tools.git
+cd ML_tools
+pip install -e '.[utilities]'
+```
+
+### Via PyPI (Stable Releases)
+
+Install the latest stable release from PyPI with optional dependencies:
+
+    pip install "dragon-ml-toolbox[utilities]"
+
+## Usage
+
+After installation, import modules like this:
+
+```python
+from ml_tools.utilities import sanitize_filename
+from ml_tools.logger import custom_logger
+```
+
+## Development
+
+Python 3.9+ is recommended.
+
+To install all dependencies, including development tools:
+
+```bash
+pip install -e '.[dev]'
+```
+
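The `Provides-Extra`/`Requires-Dist` pairs above drive the optional installs described in the README. As a minimal sketch (not part of the package), the declared extras can be listed at runtime with the standard library's `importlib.metadata`, assuming the wheel is installed:

```python
# Sketch: enumerate the optional-dependency groups declared in the
# METADATA above. importlib.metadata is in the standard library (3.8+).
from importlib.metadata import metadata

meta = metadata("dragon-ml-toolbox")
for extra in meta.get_all("Provides-Extra") or []:
    print(extra)        # e.g. data-exploration, datasetmaster, trainer, ...
for requirement in meta.get_all("Requires-Dist") or []:
    print(requirement)  # e.g. pandas; extra == "data-exploration"
```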
dragon_ml_toolbox-1.1.2.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+ml_tools/MICE_imputation.py,sha256=wLM4DJTs-CxuGzEmuTj7Tmb7AoKGs16cdxQD2Ne8Dv0,7340
+ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ml_tools/data_exploration.py,sha256=Nx8V6xYmh2XqMF3WXg0BdAQnDAFq5cFd36JHFIf56vc,26989
+ml_tools/datasetmaster.py,sha256=VUneKshnmjOGbtqVVGTFcIMRKF3s6ZDYrosIYKDjD80,28956
+ml_tools/ensemble_learning.py,sha256=uA7A94CLv8o2l125oTEi0cjHusZkB-7Mnrtn7SGTfjs,29714
+ml_tools/handle_excel.py,sha256=IR0VQc3hYdmjwC31E5YxDnRcWig4jSIx7Y_7to-KZz4,11969
+ml_tools/logger.py,sha256=yQ5v8e2UnkKgQDszpg5zihpLPI8ehEci7p_2PKkshls,4613
+ml_tools/particle_swarm_optimization.py,sha256=jpkje4OETC9fyISxxUTx4XGrImSU6gDEcwz46ZDs2bQ,19250
+ml_tools/pytorch_models.py,sha256=Oykw02sOZLCjvSadQd64UGesBN7kq0x1EGXHusvYiQI,9908
+ml_tools/trainer.py,sha256=Zd7AaHeoNd8dEas2JChWoHaCUpWUVRDUMybuHaKJ0XY,16740
+ml_tools/utilities.py,sha256=mG_--EFplfI9H7OhrWI8VkdNJtTbs4Wbz32xvcFWps8,5518
+ml_tools/vision_helpers.py,sha256=lBAW6dzAK-HOswAt1fU_tfP9hkNLY5D8c_I_7hhEXno,7528
+dragon_ml_toolbox-1.1.2.dist-info/METADATA,sha256=PaAoR_VbZS6USH1kzYFkWM6DqXEP_mDsCtprwFN7IIs,4076
+dragon_ml_toolbox-1.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-1.1.2.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-1.1.2.dist-info/RECORD,,
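Each RECORD row has the form `path,sha256=<digest>,<size>`, where the digest is the file's SHA-256 hash encoded as unpadded URL-safe Base64 (the standard wheel RECORD format). A minimal verification sketch, using `ml_tools/logger.py` from the table above as the example path:

```python
# Sketch: recompute a RECORD-style digest for one file of the wheel.
# RECORD digests are SHA-256, URL-safe Base64, with '=' padding stripped.
import base64
import hashlib

def record_digest(path: str) -> str:
    with open(path, "rb") as fh:
        raw = hashlib.sha256(fh.read()).digest()
    return base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii")

# Should print yQ5v8e2UnkKgQDszpg5zihpLPI8ehEci7p_2PKkshls for an
# unmodified copy of ml_tools/logger.py (4613 bytes per the RECORD).
print(record_digest("ml_tools/logger.py"))
```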
dragon_ml_toolbox-1.1.2.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+ml_tools
ml_tools/MICE_imputation.py
ADDED
@@ -0,0 +1,178 @@
+import pandas as pd
+import miceforest as mf
+import os
+import matplotlib.pyplot as plt
+import numpy as np
+from .utilities import load_dataframe, list_csv_paths
+
+
+def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
+
+    # Initialize kernel with number of imputed datasets to generate
+    kernel = mf.ImputationKernel(
+        data=df,
+        datasets=resulting_datasets,
+        random_state=random_state
+    )
+
+    # Perform MICE with n iterations per dataset
+    kernel.mice(iterations)
+
+    # Retrieve the imputed datasets
+    imputed_datasets = [kernel.complete_data(dataset=i) for i in range(resulting_datasets)]
+
+    if resulting_datasets == 1:
+        imputed_dataset_names = [f"{df_name}_imputed"]
+    else:
+        imputed_dataset_names = [f"{df_name}_imputed_{i+1}" for i in range(resulting_datasets)]
+
+    # Ensure indexes match
+    for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
+        assert imputed_df.shape[0] == df.shape[0], f"Row count mismatch in dataset {subname}"
+        assert all(imputed_df.index == df.index), f"Index mismatch in dataset {subname}"
+        # print("✅ All imputed datasets match the original DataFrame indexes.")
+
+    return kernel, imputed_datasets, imputed_dataset_names
+
+
+def save_imputed_datasets(save_dir: str, imputed_datasets: list, imputed_dataset_names: list[str]):
+    # Check path
+    os.makedirs(save_dir, exist_ok=True)
+
+    for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
+        output_path = os.path.join(save_dir, subname + ".csv")
+        imputed_df.to_csv(output_path, index=False, encoding='utf-8')
+        print(f"\tSaved {subname} with shape {imputed_df.shape}")
+
+
+# Get names of features that had missing values before imputation
+def get_na_column_names(df: pd.DataFrame):
+    return [col for col in df.columns if df[col].isna().any()]
+
+
+# Convergence diagnostic
+def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_names: list[str], column_names: list[str], root_dir: str):
+    # Get the number of iterations used
+    iterations_cap = kernel.iteration_count()
+
+    # Check path
+    os.makedirs(root_dir, exist_ok=True)
+
+    # Iterate over each imputed dataset
+    for dataset_id, imputed_dataset_name in zip(range(kernel.num_datasets), imputed_dataset_names):
+        # Check directory for current dataset
+        dataset_file_dir = f"Convergence_Metrics_{imputed_dataset_name}"
+        local_save_dir = os.path.join(root_dir, dataset_file_dir)
+        if not os.path.isdir(local_save_dir):
+            os.makedirs(local_save_dir)
+
+        for feature_name in column_names:
+            means_per_iteration = []
+            for iteration in range(iterations_cap):
+                current_imputed = kernel.complete_data(dataset=dataset_id, iteration=iteration)
+                means_per_iteration.append(np.mean(current_imputed[feature_name]))
+
+            plt.plot(means_per_iteration, marker='o')
+            plt.xlabel("Iteration")
+            plt.ylabel("Mean of Imputed Values")
+            plt.title(f"Mean Convergence for '{feature_name}'")
+
+            # Adjust plot display for the X axis
+            _ticks = np.arange(iterations_cap)
+            _labels = np.arange(1, iterations_cap + 1)
+            plt.xticks(ticks=_ticks, labels=_labels)
+
+            save_path = os.path.join(local_save_dir, feature_name + ".svg")
+            plt.savefig(save_path, bbox_inches='tight', format="svg")
+            plt.close()
+
+        print(f"{dataset_file_dir} completed.")
+
+
+# Imputed distributions
+def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_dir: str, column_names: list[str], one_plot: bool=False, fontsize: int=18):
+    '''
+    Uses the miceforest authors' implementation of `.plot_imputed_distributions()`.
+
+    Set `one_plot=True` to save a single image containing all feature distribution plots instead.
+    '''
+    # Check path
+    os.makedirs(root_dir, exist_ok=True)
+    local_save_dir = os.path.join(root_dir, f"Distribution_Metrics_{df_name}")
+    if not os.path.isdir(local_save_dir):
+        os.makedirs(local_save_dir)
+
+    # Styling parameters
+    legend_kwargs = {'frameon': True, 'facecolor': 'white', 'framealpha': 0.8}
+    label_font = {'size': fontsize, 'weight': 'bold'}
+
+    def _process_figure(fig, filename):
+        """Helper function to add labels and legends to a figure"""
+        for ax in fig.axes:
+            # Set axis labels
+            ax.set_xlabel('Value', **label_font)
+            ax.set_ylabel('Density', **label_font)
+
+            # Add legend based on line colors
+            lines = ax.get_lines()
+            if len(lines) >= 1:
+                lines[0].set_label('Original Data')
+            if len(lines) > 1:
+                lines[1].set_label('Imputed Data')
+            ax.legend(**legend_kwargs)
+
+        # Adjust layout and save
+        fig.tight_layout()
+        fig.savefig(
+            os.path.join(local_save_dir, filename),
+            format='svg',
+            bbox_inches='tight',
+            pad_inches=0
+        )
+        plt.close(fig)
+
+    if one_plot:
+        # Generate combined plot
+        fig = kernel.plot_imputed_distributions(variables=column_names)
+        _process_figure(fig, "Combined_Distributions.svg")
+    else:
+        # Generate individual plots per feature
+        for feature in column_names:
+            fig = kernel.plot_imputed_distributions(variables=[feature])
+            _process_figure(fig, f"{feature}.svg")
+
+    print("Imputed distributions saved successfully.")
+
+
+def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
+    """
+    Calls the functions above in sequence for each dataset in the provided path or directory:
+    1. Load dataframe
+    2. Apply MICE
+    3. Save imputed dataset(s)
+    4. Save convergence metrics
+    5. Save distribution metrics
+    """
+    # Check paths
+    os.makedirs(save_datasets_dir, exist_ok=True)
+    os.makedirs(save_metrics_dir, exist_ok=True)
+
+    if os.path.isfile(df_path_or_dir):
+        all_file_paths = [df_path_or_dir]
+    elif os.path.isdir(df_path_or_dir):
+        all_file_paths, _ = list_csv_paths(df_path_or_dir)
+    else:
+        raise ValueError(f"Invalid path or directory: {df_path_or_dir}")
+
+    for df_path in all_file_paths:
+        df, df_name = load_dataframe(df_path=df_path)
+
+        kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)
+
+        save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, imputed_dataset_names=imputed_dataset_names)
+
+        imputed_column_names = get_na_column_names(df=df)
+
+        get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_dir)
+
+        get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)
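For orientation, a minimal driver for the module above; the directory paths are hypothetical placeholders, while the keyword arguments mirror `run_mice_pipeline`'s signature:

```python
# Hypothetical usage sketch for run_mice_pipeline; all paths are placeholders.
from ml_tools.MICE_imputation import run_mice_pipeline

run_mice_pipeline(
    df_path_or_dir="data/raw",         # a single CSV file or a directory of CSVs
    save_datasets_dir="data/imputed",  # imputed dataset(s) are written here as CSV
    save_metrics_dir="data/metrics",   # convergence and distribution plots (SVG)
    resulting_datasets=3,              # one kernel yielding three imputed datasets
    iterations=20,
    random_state=101,
)
```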
ml_tools/__init__.py
ADDED
File without changes