dragon_ml_toolbox-1.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -0,0 +1,114 @@
+ Metadata-Version: 2.4
+ Name: dragon-ml-toolbox
+ Version: 1.1.2
+ Summary: A collection of tools for machine learning projects
+ Author-email: Karl Loza <luigiloza@gmail.com>
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ Provides-Extra: data-exploration
+ Requires-Dist: pandas; extra == "data-exploration"
+ Requires-Dist: numpy; extra == "data-exploration"
+ Requires-Dist: matplotlib; extra == "data-exploration"
+ Requires-Dist: seaborn; extra == "data-exploration"
+ Requires-Dist: statsmodels; extra == "data-exploration"
+ Requires-Dist: ipython; extra == "data-exploration"
+ Provides-Extra: datasetmaster
+ Requires-Dist: torch; extra == "datasetmaster"
+ Requires-Dist: pandas; extra == "datasetmaster"
+ Requires-Dist: numpy; extra == "datasetmaster"
+ Requires-Dist: scikit-learn; extra == "datasetmaster"
+ Requires-Dist: imblearn; extra == "datasetmaster"
+ Requires-Dist: Pillow; extra == "datasetmaster"
+ Requires-Dist: matplotlib; extra == "datasetmaster"
+ Provides-Extra: ensemble-learning
+ Requires-Dist: pandas; extra == "ensemble-learning"
+ Requires-Dist: numpy; extra == "ensemble-learning"
+ Requires-Dist: seaborn; extra == "ensemble-learning"
+ Requires-Dist: matplotlib; extra == "ensemble-learning"
+ Requires-Dist: joblib; extra == "ensemble-learning"
+ Requires-Dist: imblearn; extra == "ensemble-learning"
+ Requires-Dist: scikit-learn; extra == "ensemble-learning"
+ Requires-Dist: xgboost; extra == "ensemble-learning"
+ Requires-Dist: lightgbm; extra == "ensemble-learning"
+ Requires-Dist: shap; extra == "ensemble-learning"
+ Provides-Extra: handle-excel
+ Requires-Dist: openpyxl; extra == "handle-excel"
+ Requires-Dist: pandas; extra == "handle-excel"
+ Provides-Extra: logger
+ Requires-Dist: pandas; extra == "logger"
+ Requires-Dist: openpyxl; extra == "logger"
+ Provides-Extra: mice-imputation
+ Requires-Dist: pandas; extra == "mice-imputation"
+ Requires-Dist: miceforest; extra == "mice-imputation"
+ Requires-Dist: matplotlib; extra == "mice-imputation"
+ Requires-Dist: numpy; extra == "mice-imputation"
+ Provides-Extra: particle-swarm-optimization
+ Requires-Dist: numpy; extra == "particle-swarm-optimization"
+ Requires-Dist: joblib; extra == "particle-swarm-optimization"
+ Requires-Dist: xgboost; extra == "particle-swarm-optimization"
+ Requires-Dist: lightgbm; extra == "particle-swarm-optimization"
+ Requires-Dist: scikit-learn; extra == "particle-swarm-optimization"
+ Requires-Dist: polars; extra == "particle-swarm-optimization"
+ Provides-Extra: pytorch-models
+ Requires-Dist: torch; extra == "pytorch-models"
+ Provides-Extra: trainer
+ Requires-Dist: numpy; extra == "trainer"
+ Requires-Dist: torch; extra == "trainer"
+ Requires-Dist: matplotlib; extra == "trainer"
+ Requires-Dist: scikit-learn; extra == "trainer"
+ Provides-Extra: utilities
+ Requires-Dist: numpy; extra == "utilities"
+ Requires-Dist: pandas; extra == "utilities"
+ Provides-Extra: vision-helpers
+ Requires-Dist: Pillow; extra == "vision-helpers"
+ Requires-Dist: torch; extra == "vision-helpers"
+ Requires-Dist: torchvision; extra == "vision-helpers"
+
+ # ml_tools
+
+ A collection of Python utilities and machine learning tools, structured as a modular package for easy reuse and installation.
+
+ ## Features
+
+ - Modular scripts for data exploration, logging, machine learning, and more.
+ - Optional dependencies grouped by functionality for lightweight installs.
+ - Designed for seamless integration as a Git submodule or installable Python package.
+
+ ## Installation
+
+ ### Via GitHub (Editable / Development Mode)
+
+ Clone the repository and install in editable mode with optional dependencies:
+
+ ```bash
+ git clone https://github.com/DrAg0n-BoRn/ML_tools.git
+ cd ML_tools
+ pip install -e '.[utilities]'
+ ```
89
+
90
+ ### Via PyPI (Stable Releases)
91
+
92
+ Install the latest stable release from PyPI with optional dependencies:
93
+
94
+ pip install ml_tools[utilities]
95
+
+ ## Usage
+
+ After installation, import modules like this:
+
+ ```python
+ from ml_tools.utilities import sanitize_filename
+ from ml_tools.logger import custom_logger
+ ```
+
+ ## Development
+
+ Python 3.9+ is recommended.
+
+ To install all dependencies, including development tools:
+
+ ```bash
+ pip install -e '.[dev]'
+ ```
+
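A side note on the extras above: pip accepts several extras in one bracket, so related dependency groups can be installed together. A minimal sketch (the extra names come from the `Provides-Extra` fields in the METADATA; the quotes guard against shell globbing):

```bash
# Hypothetical combined install of two of the extras declared in METADATA
pip install 'dragon-ml-toolbox[mice-imputation,handle-excel]'
```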
@@ -0,0 +1,16 @@
+ ml_tools/MICE_imputation.py,sha256=wLM4DJTs-CxuGzEmuTj7Tmb7AoKGs16cdxQD2Ne8Dv0,7340
+ ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ml_tools/data_exploration.py,sha256=Nx8V6xYmh2XqMF3WXg0BdAQnDAFq5cFd36JHFIf56vc,26989
+ ml_tools/datasetmaster.py,sha256=VUneKshnmjOGbtqVVGTFcIMRKF3s6ZDYrosIYKDjD80,28956
+ ml_tools/ensemble_learning.py,sha256=uA7A94CLv8o2l125oTEi0cjHusZkB-7Mnrtn7SGTfjs,29714
+ ml_tools/handle_excel.py,sha256=IR0VQc3hYdmjwC31E5YxDnRcWig4jSIx7Y_7to-KZz4,11969
+ ml_tools/logger.py,sha256=yQ5v8e2UnkKgQDszpg5zihpLPI8ehEci7p_2PKkshls,4613
+ ml_tools/particle_swarm_optimization.py,sha256=jpkje4OETC9fyISxxUTx4XGrImSU6gDEcwz46ZDs2bQ,19250
+ ml_tools/pytorch_models.py,sha256=Oykw02sOZLCjvSadQd64UGesBN7kq0x1EGXHusvYiQI,9908
+ ml_tools/trainer.py,sha256=Zd7AaHeoNd8dEas2JChWoHaCUpWUVRDUMybuHaKJ0XY,16740
+ ml_tools/utilities.py,sha256=mG_--EFplfI9H7OhrWI8VkdNJtTbs4Wbz32xvcFWps8,5518
+ ml_tools/vision_helpers.py,sha256=lBAW6dzAK-HOswAt1fU_tfP9hkNLY5D8c_I_7hhEXno,7528
+ dragon_ml_toolbox-1.1.2.dist-info/METADATA,sha256=PaAoR_VbZS6USH1kzYFkWM6DqXEP_mDsCtprwFN7IIs,4076
+ dragon_ml_toolbox-1.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-1.1.2.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-1.1.2.dist-info/RECORD,,
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
@@ -0,0 +1 @@
+ ml_tools
@@ -0,0 +1,178 @@
+ import pandas as pd
+ import miceforest as mf
+ import os
+ import matplotlib.pyplot as plt
+ import numpy as np
+ # Relative import so the module resolves when installed as part of the ml_tools package
+ from .utilities import load_dataframe, list_csv_paths
+
+
+ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
+
+     # Initialize kernel with number of imputed datasets to generate
+     kernel = mf.ImputationKernel(
+         data=df,
+         datasets=resulting_datasets,
+         random_state=random_state
+     )
+
+     # Perform MICE with n iterations per dataset
+     kernel.mice(iterations)
+
+     # Retrieve the imputed datasets
+     imputed_datasets = [kernel.complete_data(dataset=i) for i in range(resulting_datasets)]
+
+     if resulting_datasets == 1:
+         imputed_dataset_names = [f"{df_name}_imputed"]
+     else:
+         imputed_dataset_names = [f"{df_name}_imputed_{i+1}" for i in range(resulting_datasets)]
+
+     # Ensure indexes match
+     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
+         assert imputed_df.shape[0] == df.shape[0], f"Row count mismatch in dataset {subname}"
+         assert all(imputed_df.index == df.index), f"Index mismatch in dataset {subname}"
+         # print("✅ All imputed datasets match the original DataFrame indexes.")
+
+     return kernel, imputed_datasets, imputed_dataset_names
+
+
+ def save_imputed_datasets(save_dir: str, imputed_datasets: list, imputed_dataset_names: list[str]):
+     # Check path
+     os.makedirs(save_dir, exist_ok=True)
+
+     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
+         output_path = os.path.join(save_dir, subname + ".csv")
+         imputed_df.to_csv(output_path, index=False, encoding='utf-8')
+         print(f"\tSaved {subname} with shape {imputed_df.shape}")
+
+
+ # Get names of features that had missing values before imputation
+ def get_na_column_names(df: pd.DataFrame):
+     return [col for col in df.columns if df[col].isna().any()]
+
+
+ # Convergence diagnostic
+ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_names: list[str], column_names: list[str], root_dir: str):
+     # Get number of iterations used
+     iterations_cap = kernel.iteration_count()
+
+     # Check path
+     os.makedirs(root_dir, exist_ok=True)
+
+     # Iterate over each imputed dataset
+     for dataset_id, imputed_dataset_name in zip(range(kernel.num_datasets), imputed_dataset_names):
+         # Check directory for current dataset
+         dataset_file_dir = f"Convergence_Metrics_{imputed_dataset_name}"
+         local_save_dir = os.path.join(root_dir, dataset_file_dir)
+         if not os.path.isdir(local_save_dir):
+             os.makedirs(local_save_dir)
+
+         for feature_name in column_names:
+             means_per_iteration = []
+             for iteration in range(iterations_cap):
+                 current_imputed = kernel.complete_data(dataset=dataset_id, iteration=iteration)
+                 means_per_iteration.append(np.mean(current_imputed[feature_name]))
+
+             plt.plot(means_per_iteration, marker='o')
+             plt.xlabel("Iteration")
+             plt.ylabel("Mean of Imputed Values")
+             plt.title(f"Mean Convergence for '{feature_name}'")
+
+             # Adjust plot display for the X axis
+             _ticks = np.arange(iterations_cap)
+             _labels = np.arange(1, iterations_cap + 1)
+             plt.xticks(ticks=_ticks, labels=_labels)
+
+             save_path = os.path.join(local_save_dir, feature_name + ".svg")
+             plt.savefig(save_path, bbox_inches='tight', format="svg")
+             plt.close()
+
+         print(f"{dataset_file_dir} completed.")
+
+
+ # Imputed distributions
+ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_dir: str, column_names: list[str], one_plot: bool=False, fontsize: int=18):
+     '''
+     Uses the miceforest authors' implementation of the method `.plot_imputed_distributions()`.
+
+     Set `one_plot=True` to save a single image including all feature distribution plots instead.
+     '''
+     # Check path
+     os.makedirs(root_dir, exist_ok=True)
+     local_save_dir = os.path.join(root_dir, f"Distribution_Metrics_{df_name}")
+     if not os.path.isdir(local_save_dir):
+         os.makedirs(local_save_dir)
+
+     # Styling parameters
+     legend_kwargs = {'frameon': True, 'facecolor': 'white', 'framealpha': 0.8}
+     label_font = {'size': fontsize, 'weight': 'bold'}
+
+     def _process_figure(fig, filename):
+         """Helper function to add labels and legends to a figure"""
+         for ax in fig.axes:
+             # Set axis labels
+             ax.set_xlabel('Value', **label_font)
+             ax.set_ylabel('Density', **label_font)
+
+             # Add legend based on line colors
+             lines = ax.get_lines()
+             if len(lines) >= 1:
+                 lines[0].set_label('Original Data')
+             if len(lines) > 1:
+                 lines[1].set_label('Imputed Data')
+             ax.legend(**legend_kwargs)
+
+         # Adjust layout and save
+         fig.tight_layout()
+         fig.savefig(
+             os.path.join(local_save_dir, filename),
+             format='svg',
+             bbox_inches='tight',
+             pad_inches=0
+         )
+         plt.close(fig)
+
+     if one_plot:
+         # Generate combined plot
+         fig = kernel.plot_imputed_distributions(variables=column_names)
+         _process_figure(fig, "Combined_Distributions.svg")
+     else:
+         # Generate individual plots per feature
+         for feature in column_names:
+             fig = kernel.plot_imputed_distributions(variables=[feature])
+             _process_figure(fig, f"{feature}.svg")
+
+     print("Imputed distributions saved successfully.")
+
+
146
+
147
+ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
148
+ """
149
+ Call functions in sequence for each dataset in the provided path or directory:
150
+ 1. Load dataframe
151
+ 2. Apply MICE
152
+ 3. Save imputed dataset(s)
153
+ 4. Save convergence metrics
154
+ 5. Save distribution metrics
155
+ """
156
+ # Check paths
157
+ os.makedirs(save_datasets_dir, exist_ok=True)
158
+ os.makedirs(save_metrics_dir, exist_ok=True)
159
+
160
+ if os.path.isfile(df_path_or_dir):
161
+ all_file_paths = [df_path_or_dir]
162
+ elif os.path.isdir(df_path_or_dir):
163
+ all_file_paths, _ = list_csv_paths(df_path_or_dir)
164
+ else:
165
+ raise ValueError(f"Invalid path or directory: {df_path_or_dir}")
166
+
167
+ for df_path in all_file_paths:
168
+ df, df_name = load_dataframe(df_path=df_path)
169
+
170
+ kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)
171
+
172
+ save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, imputed_dataset_names=imputed_dataset_names)
173
+
174
+ imputed_column_names = get_na_column_names(df=df)
175
+
176
+ get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_dir)
177
+
178
+ get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)
ml_tools/__init__.py ADDED
File without changes
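For reference, a minimal sketch of driving the module above end to end. The paths are hypothetical placeholders, and it assumes the package is installed with the `mice-imputation` extra so that pandas, miceforest, matplotlib, and numpy are available:

```python
# Sketch: impute every CSV in a folder and save the imputed datasets
# plus convergence/distribution diagnostics. Paths are placeholders.
from ml_tools.MICE_imputation import run_mice_pipeline

run_mice_pipeline(
    df_path_or_dir="data/raw",          # a single CSV file or a directory of CSVs
    save_datasets_dir="data/imputed",   # imputed CSVs are written here
    save_metrics_dir="reports/mice",    # convergence and distribution SVGs go here
    resulting_datasets=3,               # number of imputed datasets per input
    iterations=20,
    random_state=101,
)
```

With `resulting_datasets=3`, each input CSV yields `<name>_imputed_1.csv` through `<name>_imputed_3.csv`, following the naming scheme in `apply_mice`.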