dragon-ml-toolbox 1.4.0__py3-none-any.whl → 1.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/METADATA +18 -2
- dragon_ml_toolbox-1.4.2.dist-info/RECORD +19 -0
- ml_tools/MICE_imputation.py +17 -2
- ml_tools/VIF_factor.py +29 -14
- ml_tools/data_exploration.py +68 -140
- ml_tools/datasetmaster.py +13 -1
- ml_tools/ensemble_learning.py +83 -82
- ml_tools/handle_excel.py +32 -9
- ml_tools/logger.py +10 -1
- ml_tools/particle_swarm_optimization.py +92 -64
- ml_tools/pytorch_models.py +13 -1
- ml_tools/trainer.py +10 -30
- ml_tools/utilities.py +133 -18
- ml_tools/vision_helpers.py +14 -1
- dragon_ml_toolbox-1.4.0.dist-info/RECORD +0 -19
- {dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/top_level.txt +0 -0

{dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/METADATA CHANGED

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.4.0
+Version: 1.4.2
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -80,7 +80,7 @@ Clone the repository and install in editable mode with optional dependencies:
 ```bash
 git clone https://github.com/DrAg0n-BoRn/ML_tools.git
 cd ML_tools
-pip install -e
+pip install -e .
 ```
 
 ## Usage
@@ -91,3 +91,19 @@ After installation, import modules like this:
 from ml_tools.utilities import sanitize_filename
 from ml_tools.logger import custom_logger
 ```
+
+## Available modules
+
+```bash
+data_exploration
+datasetmaster
+ensemble_learning
+handle_excel
+logger
+MICE_imputation
+particle_swarm_optimization
+trainer
+utilities
+VIF_factor
+vision_helpers
+```
````
dragon_ml_toolbox-1.4.2.dist-info/RECORD ADDED

```diff
@@ -0,0 +1,19 @@
+dragon_ml_toolbox-1.4.2.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-1.4.2.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
+ml_tools/MICE_imputation.py,sha256=CK0tYZ_kQkdETohOlhI7RP7oFkJTXrP-XtIxb--dzpU,9726
+ml_tools/VIF_factor.py,sha256=LQWr1P8WYij07FX_3RZC6Rr22bfAMnrt0Lhvi7SbBpY,9846
+ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ml_tools/data_exploration.py,sha256=FXP5i6bQo8J3RCyLRmlX-qJVh4VH8DbMjrdUmyd1mF0,18708
+ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
+ml_tools/ensemble_learning.py,sha256=p8t5PI63N3G0ZgvOKmvFOvwJ24qqPdZCvyiDAx4ggXY,27670
+ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
+ml_tools/logger.py,sha256=NOtL3YSuffAGmpTpXjY-uJjqFLdRG_jpL7MDyloBw9c,4712
+ml_tools/particle_swarm_optimization.py,sha256=3xsc6sg-5o3cPbG_dWUyF3HdRVxgL4k_kRuPMU11NnM,20020
+ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
+ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
+ml_tools/utilities.py,sha256=Pou-8IZsZj9NiZ_shhLt552yaKNvbnQ1Ztoj6VMHIeE,10091
+ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
+dragon_ml_toolbox-1.4.2.dist-info/METADATA,sha256=c95w_AETVdAwMYWrowJKxkC0wYCsgRrTmxyekPz7WBE,2516
+dragon_ml_toolbox-1.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-1.4.2.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-1.4.2.dist-info/RECORD,,
```
ml_tools/MICE_imputation.py CHANGED

```diff
@@ -3,9 +3,20 @@ import miceforest as mf
 import os
 import matplotlib.pyplot as plt
 import numpy as np
-from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename
+from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info
 from plotnine import ggplot, labs, theme, element_blank # type: ignore
 
+
+__all__ = [
+    "apply_mice",
+    "save_imputed_datasets",
+    "get_na_column_names",
+    "get_convergence_diagnostic",
+    "get_imputed_distributions",
+    "run_mice_pipeline"
+]
+
+
 def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
 
     # Initialize kernel with number of imputed datasets to generate
@@ -210,7 +221,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     if os.path.isfile(df_path_or_dir):
         all_file_paths = [df_path_or_dir]
     elif os.path.isdir(df_path_or_dir):
-        all_file_paths = list_csv_paths(df_path_or_dir).values()
+        all_file_paths = list(list_csv_paths(df_path_or_dir).values())
     else:
         raise ValueError(f"Invalid path or directory: {df_path_or_dir}")
 
@@ -226,3 +237,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_dir)
 
     get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)
+
+
+def info():
+    _script_info(__all__)
```
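The `list(...)` wrapper is the substantive fix in the second hunk: `list_csv_paths` returns a dict, and calling `.values()` on it yields a view object rather than a list, which breaks downstream code expecting list semantics. A minimal sketch of the difference, with a hypothetical `paths` dict standing in for `list_csv_paths` output:

```python
# dict.values() returns a read-only view, not a list.
paths = {"df_a": "data/df_a.csv", "df_b": "data/df_b.csv"}  # hypothetical stand-in

vals = paths.values()
# vals[0]            # would raise TypeError: 'dict_values' object is not subscriptable
# vals + ["x.csv"]   # would raise TypeError: views do not support concatenation

all_file_paths = list(vals)  # what the fixed line does
print(all_file_paths[0])     # data/df_a.csv
```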
ml_tools/VIF_factor.py CHANGED

```diff
@@ -7,12 +7,19 @@ from statsmodels.stats.outliers_influence import variance_inflation_factor
 from statsmodels.tools.tools import add_constant
 import warnings
 import os
-from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe
+from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info
+
+
+__all__ = [
+    "compute_vif",
+    "drop_vif_based",
+    "compute_vif_multi"
+]
 
 
 def compute_vif(
     df: pd.DataFrame,
-    target_columns: Optional[list[str]] = None,
+    use_columns: Optional[list[str]] = None,
     ignore_columns: Optional[list[str]] = None,
     max_features_to_plot: int = 20,
     save_dir: Optional[str] = None,
@@ -25,7 +32,7 @@ def compute_vif(
 
     Args:
         df (pd.DataFrame): The input DataFrame.
-        target_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
+        use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
         ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
         max_features_to_plot (int): Adjust the number of features shown in the plot.
         save_dir (str | None): Directory to save the plot as SVG. If None, the plot is not saved.
@@ -42,20 +49,20 @@
     A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
     """
     ground_truth_cols = df.columns.to_list()
-    if target_columns is None:
+    if use_columns is None:
         sanitized_columns = df.select_dtypes(include='number').columns.tolist()
         missing_features = set(ground_truth_cols) - set(sanitized_columns)
         if missing_features:
             print(f"⚠️ These columns are not Numeric:\n{missing_features}")
     else:
         sanitized_columns = list()
-        for feature in target_columns:
+        for feature in use_columns:
            if feature not in ground_truth_cols:
                print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
            else:
                sanitized_columns.append(feature)
 
-    if ignore_columns is not None and target_columns is None:
+    if ignore_columns is not None and use_columns is None:
         missing_ignore = set(ignore_columns) - set(ground_truth_cols)
         if missing_ignore:
             print(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
@@ -137,7 +144,7 @@
     return vif_data.drop(columns="color")
 
 
-def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10.0) -> pd.DataFrame:
+def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10.0) -> tuple[pd.DataFrame, list[str]]:
     """
     Drops columns from the original DataFrame based on their VIF values exceeding a given threshold.
 
@@ -147,7 +154,9 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
         threshold (float): VIF threshold above which columns will be dropped.
 
     Returns:
-        pd.DataFrame: A new DataFrame with high-VIF columns removed.
+        (tuple[pd.DataFrame, list[str]]):
+            - A new DataFrame with high-VIF columns removed.
+            - A list with dropped column names.
     """
     # Ensure expected structure
     if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
@@ -162,13 +171,13 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
     if result_df.empty:
         print(f"\t⚠️ Warning: All columns were dropped.")
 
-    return result_df
+    return result_df, to_drop
 
 
 def compute_vif_multi(input_directory: str,
                       output_plot_directory: str,
                       output_dataset_directory: Optional[str] = None,
-                      target_columns: Optional[list[str]] = None,
+                      use_columns: Optional[list[str]] = None,
                       ignore_columns: Optional[list[str]] = None,
                       max_features_to_plot: int = 20,
                       fontsize: int = 14):
@@ -180,7 +189,7 @@ def compute_vif_multi(input_directory: str,
         input_directory (str): Target directory with CSV files able to be loaded as DataFrame.
         output_plot_directory (str): Save plots to this directory.
         output_dataset_directory (str | None): If provided, saves new CSV files to this directory.
-        target_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
+        use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
         ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
         max_features_to_plot (int): Adjust the number of features shown in the plot.
         fontsize (int): Base fontsize to scale title and labels on hte plot.
@@ -195,7 +204,7 @@
 
     for df, df_name in yield_dataframes_from_dir(datasets_dir=input_directory):
         vif_dataframe = compute_vif(df=df,
-                                    target_columns=target_columns,
+                                    use_columns=use_columns,
                                     ignore_columns=ignore_columns,
                                     max_features_to_plot=max_features_to_plot,
                                     fontsize=fontsize,
@@ -205,5 +214,11 @@
 
         if output_dataset_directory is not None:
             new_filename = 'VIF_' + df_name
-            result_df = drop_vif_based(df=df, vif_df=vif_dataframe)
-            save_dataframe(df=result_df, save_dir=output_dataset_directory, filename=new_filename)
+            result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)
+
+            if len(dropped_cols) > 0:
+                save_dataframe(df=result_df, save_dir=output_dataset_directory, filename=new_filename)
+
+
+def info():
+    _script_info(__all__)
```
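The changed `drop_vif_based` signature is the breaking change here: callers now receive a `(DataFrame, list)` tuple instead of a bare DataFrame, which is what lets `compute_vif_multi` skip saving a CSV when nothing was dropped. The following is a self-contained sketch of that drop-above-threshold pattern built on the same statsmodels helpers the module imports; it is an illustration with synthetic data, not the package's actual implementation:

```python
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Synthetic data: "b" is nearly a linear copy of "a", so both get huge VIFs.
rng = np.random.default_rng(42)
df = pd.DataFrame({"a": rng.normal(size=200), "c": rng.normal(size=200)})
df["b"] = df["a"] * 2.0 + rng.normal(scale=0.01, size=200)

# VIF per feature; add_constant puts the intercept at column 0, hence i + 1.
X = add_constant(df)
vif_df = pd.DataFrame({
    "feature": df.columns,
    "VIF": [variance_inflation_factor(X.values, i + 1) for i in range(df.shape[1])],
})

# The pattern the new tuple return exposes: the reduced frame AND what was dropped.
threshold = 10.0
to_drop = vif_df.loc[vif_df["VIF"] > threshold, "feature"].tolist()
result_df = df.drop(columns=to_drop)
print(to_drop)  # ['a', 'b']: both near-duplicates exceed the threshold
```

Returning the dropped names alongside the frame is what the last hunk above relies on: a file is written only when `len(dropped_cols) > 0`.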
ml_tools/data_exploration.py CHANGED

```diff
@@ -9,22 +9,23 @@ from typing import Union, Literal, Dict, Tuple
 import os
 import sys
 import textwrap
-from ml_tools.utilities import sanitize_filename
+from ml_tools.utilities import sanitize_filename, _script_info
 
 
-# Keep track of all available
-__all__ = [
-
-
-
-
-
-
-
-
-
-
-
+# Keep track of all available tools, show using `info()`
+__all__ = [
+    "summarize_dataframe",
+    "drop_rows_with_missing_data",
+    "split_features_targets",
+    "show_null_columns",
+    "drop_columns_with_missing_data",
+    "split_continuous_binary",
+    "plot_correlation_heatmap",
+    "check_value_distributions",
+    "plot_value_distributions",
+    "clip_outliers_single",
+    "clip_outliers_multi"
+]
 
 
 def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
@@ -58,34 +59,6 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
     return summary
 
 
-def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
-    """
-    Displays a table of columns with missing values, showing both the count and
-    percentage of missing entries per column.
-
-    Parameters:
-        df (pd.DataFrame): The input DataFrame.
-        round_digits (int): Number of decimal places for the percentage.
-
-    Returns:
-        pd.DataFrame: A DataFrame summarizing missing values in each column.
-    """
-    null_counts = df.isnull().sum()
-    null_percent = df.isnull().mean() * 100
-
-    # Filter only columns with at least one null
-    mask = null_counts > 0
-    null_summary = pd.DataFrame({
-        'Missing Count': null_counts[mask],
-        'Missing %': null_percent[mask].round(round_digits)
-    })
-
-    # Sort by descending percentage of missing values
-    null_summary = null_summary.sort_values(by='Missing %', ascending=False)
-    # print(null_summary)
-    return null_summary
-
-
 def drop_rows_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
     """
     Drops rows with more than `threshold` fraction of missing values.
@@ -132,6 +105,57 @@ def split_features_targets(df: pd.DataFrame, targets: list[str]):
     return df_targets, df_features
 
 
+def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
+    """
+    Displays a table of columns with missing values, showing both the count and
+    percentage of missing entries per column.
+
+    Parameters:
+        df (pd.DataFrame): The input DataFrame.
+        round_digits (int): Number of decimal places for the percentage.
+
+    Returns:
+        pd.DataFrame: A DataFrame summarizing missing values in each column.
+    """
+    null_counts = df.isnull().sum()
+    null_percent = df.isnull().mean() * 100
+
+    # Filter only columns with at least one null
+    mask = null_counts > 0
+    null_summary = pd.DataFrame({
+        'Missing Count': null_counts[mask],
+        'Missing %': null_percent[mask].round(round_digits)
+    })
+
+    # Sort by descending percentage of missing values
+    null_summary = null_summary.sort_values(by='Missing %', ascending=False)
+    # print(null_summary)
+    return null_summary
+
+
+def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
+    """
+    Drops columns with more than `threshold` fraction of missing values.
+
+    Parameters:
+        df (pd.DataFrame): The input DataFrame.
+        threshold (float): Fraction of missing values above which columns are dropped.
+
+    Returns:
+        pd.DataFrame: A new DataFrame without the dropped columns.
+    """
+    missing_fraction = df.isnull().mean()
+    cols_to_drop = missing_fraction[missing_fraction > threshold].index
+
+    if len(cols_to_drop) > 0:
+        print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
+        print(list(cols_to_drop))
+    else:
+        print(f"No columns have more than {threshold*100:.0f}% missing data.")
+
+    return df.drop(columns=cols_to_drop)
+
+
 def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """
     Split DataFrame into two DataFrames: one with continuous columns, one with binary columns.
@@ -174,29 +198,6 @@ def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFram
 
     return df_cont, df_bin # type: ignore
 
-
-def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
-    """
-    Drops columns with more than `threshold` fraction of missing values.
-
-    Parameters:
-        df (pd.DataFrame): The input DataFrame.
-        threshold (float): Fraction of missing values above which columns are dropped.
-
-    Returns:
-        pd.DataFrame: A new DataFrame without the dropped columns.
-    """
-    missing_fraction = df.isnull().mean()
-    cols_to_drop = missing_fraction[missing_fraction > threshold].index
-
-    if len(cols_to_drop) > 0:
-        print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
-        print(list(cols_to_drop))
-    else:
-        print(f"No columns have more than {threshold*100:.0f}% missing data.")
-
-    return df.drop(columns=cols_to_drop)
-
 
 def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None, method: Literal["pearson", "kendall", "spearman"]="pearson", plot_title: str="Correlation Heatmap"):
     """
@@ -513,83 +514,10 @@ def clip_outliers_multi(
     return new_df
 
 
-def merge_dataframes(
-    *dfs: pd.DataFrame,
-    reset_index: bool = False,
-    direction: Literal["horizontal", "vertical"] = "horizontal"
-) -> pd.DataFrame:
-    """
-    Merges multiple DataFrames either horizontally or vertically.
-
-    Parameters:
-        *dfs (pd.DataFrame): Variable number of DataFrames to merge.
-        reset_index (bool): Whether to reset index in the final merged DataFrame.
-        direction (["horizontal" | "vertical"]):
-            - "horizontal": Merge on index, adding columns.
-            - "vertical": Append rows; all DataFrames must have identical columns.
-
-    Returns:
-        pd.DataFrame: A single merged DataFrame.
-
-    Raises:
-        ValueError:
-            - If fewer than 2 DataFrames are provided.
-            - If indexes do not match for horizontal merge.
-            - If column names or order differ for vertical merge.
-    """
-    if len(dfs) < 2:
-        raise ValueError("At least 2 DataFrames must be provided.")
-
-    for i, df in enumerate(dfs, start=1):
-        print(f"DataFrame {i} shape: {df.shape}")
-
-
-    if direction == "horizontal":
-        reference_index = dfs[0].index
-        for i, df in enumerate(dfs, start=1):
-            if not df.index.equals(reference_index):
-                raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
-        merged_df = pd.concat(dfs, axis=1)
-
-    elif direction == "vertical":
-        reference_columns = dfs[0].columns
-        for i, df in enumerate(dfs, start=1):
-            if not df.columns.equals(reference_columns):
-                raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
-        merged_df = pd.concat(dfs, axis=0)
-
-    else:
-        raise ValueError(f"Invalid merge direction: {direction}")
-
-    if reset_index:
-        merged_df = merged_df.reset_index(drop=True)
-
-    print(f"Merged DataFrame shape: {merged_df.shape}")
-
-    return merged_df
-
-
 def _is_notebook():
     return get_ipython() is not None
 
 
-def info(full_info: bool = False):
-    """
-    List available functions and their descriptions.
-    """
-    print("Available functions for data exploration:")
-    if full_info:
-        module = sys.modules[__name__]
-        for name in __all__:
-            obj = getattr(module, name, None)
-            if callable(obj):
-                doc = obj.__doc__ or "No docstring provided."
-                formatted_doc = textwrap.indent(textwrap.dedent(doc.strip()), prefix="    ")
-                print(f"\n{name}:\n{formatted_doc}")
-    else:
-        for i, name in enumerate(__all__, start=1):
-            print(f"{i} - {name}")
+def info():
+    _script_info(__all__)
 
-if __name__ == "__main__":
-    info()
```
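The removed `info()` above is representative: each module used to carry its own listing logic, and this release replaces it with a shared private helper, `_script_info`, imported from `ml_tools.utilities`. That helper's body is not part of this diff; judging from the fallback branch of the deleted function, a hypothetical stand-in would be:

```python
# Hypothetical sketch of ml_tools.utilities._script_info; the real
# implementation lives in utilities.py, which is not shown in this diff.
def _script_info(all_names: list[str]) -> None:
    # Mirror the removed info() fallback: enumerate the module's __all__.
    print("Available functions:")
    for i, name in enumerate(all_names, start=1):
        print(f"{i} - {name}")


_script_info(["summarize_dataframe", "show_null_columns", "clip_outliers_multi"])
```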
ml_tools/datasetmaster.py CHANGED

```diff
@@ -11,6 +11,15 @@ from PIL import Image
 from torchvision.datasets import ImageFolder
 from torchvision import transforms
 import matplotlib.pyplot as plt
+from .utilities import _script_info
+
+
+__all__ = [
+    "DatasetMaker",
+    "PytorchDataset",
+    "make_vision_dataset",
+    "SequenceDataset",
+]
 
 
 class DatasetMaker():
@@ -592,4 +601,7 @@ class SequenceDataset():
 
     def __len__(self):
         return f"Train: {len(self.train_dataset)}, Test: {len(self.test_dataset)}"
-
+
+
+def info():
+    _script_info(__all__)
```
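With the same two-line `info()` shim now present in MICE_imputation, VIF_factor, data_exploration, and datasetmaster, discovery is uniform across modules. Assuming `_script_info` simply prints the registered names, usage would look like:

```python
from ml_tools import data_exploration, datasetmaster

data_exploration.info()  # enumerates the names in data_exploration.__all__
datasetmaster.info()     # DatasetMaker, PytorchDataset, make_vision_dataset, SequenceDataset
```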