dragon-ml-toolbox 1.4.0__tar.gz → 1.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-1.4.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-1.4.1}/PKG-INFO +18 -2
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/README.md +17 -1
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1/dragon_ml_toolbox.egg-info}/PKG-INFO +18 -2
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/ml_tools/MICE_imputation.py +17 -2
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/ml_tools/VIF_factor.py +29 -14
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/ml_tools/data_exploration.py +68 -140
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/ml_tools/datasetmaster.py +13 -1
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/ml_tools/ensemble_learning.py +21 -13
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/ml_tools/handle_excel.py +32 -9
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/ml_tools/logger.py +10 -1
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/ml_tools/particle_swarm_optimization.py +71 -34
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/ml_tools/pytorch_models.py +13 -1
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/ml_tools/trainer.py +10 -30
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/ml_tools/utilities.py +105 -23
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/ml_tools/vision_helpers.py +14 -1
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/pyproject.toml +1 -1
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-1.4.0 → dragon_ml_toolbox-1.4.1}/setup.cfg +0 -0
--- dragon_ml_toolbox-1.4.0/dragon_ml_toolbox.egg-info/PKG-INFO
+++ dragon_ml_toolbox-1.4.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.4.0
+Version: 1.4.1
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -80,7 +80,7 @@ Clone the repository and install in editable mode with optional dependencies:
 ```bash
 git clone https://github.com/DrAg0n-BoRn/ML_tools.git
 cd ML_tools
-pip install -e
+pip install -e .
 ```
 
 ## Usage
@@ -91,3 +91,19 @@ After installation, import modules like this:
 from ml_tools.utilities import sanitize_filename
 from ml_tools.logger import custom_logger
 ```
+
+## Available modules
+
+```bash
+data_exploration
+datasetmaster
+ensemble_learning
+handle_excel
+logger
+MICE_imputation
+particle_swarm_optimization
+trainer
+utilities
+VIF_factor
+vision_helpers
+```
--- dragon_ml_toolbox-1.4.0/README.md
+++ dragon_ml_toolbox-1.4.1/README.md
@@ -40,7 +40,7 @@ Clone the repository and install in editable mode with optional dependencies:
 ```bash
 git clone https://github.com/DrAg0n-BoRn/ML_tools.git
 cd ML_tools
-pip install -e
+pip install -e .
 ```
 
 ## Usage
@@ -51,3 +51,19 @@ After installation, import modules like this:
 from ml_tools.utilities import sanitize_filename
 from ml_tools.logger import custom_logger
 ```
+
+## Available modules
+
+```bash
+data_exploration
+datasetmaster
+ensemble_learning
+handle_excel
+logger
+MICE_imputation
+particle_swarm_optimization
+trainer
+utilities
+VIF_factor
+vision_helpers
+```
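The new `## Available modules` list pairs with the `info()` helper that 1.4.1 adds to every module (visible in the per-module diffs below). A short sketch of how a user would discover a module's exports; the printed names simply mirror that module's `__all__`:

```python
# Each ml_tools module in 1.4.1 gains an info() function that prints its __all__ list.
from ml_tools import data_exploration

data_exploration.info()
# Available functions and objects:
# 1 - summarize_dataframe
# 2 - drop_rows_with_missing_data
# ... (one numbered entry per exported name)
```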
--- dragon_ml_toolbox-1.4.0/PKG-INFO
+++ dragon_ml_toolbox-1.4.1/dragon_ml_toolbox.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.4.0
+Version: 1.4.1
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -80,7 +80,7 @@ Clone the repository and install in editable mode with optional dependencies:
 ```bash
 git clone https://github.com/DrAg0n-BoRn/ML_tools.git
 cd ML_tools
-pip install -e
+pip install -e .
 ```
 
 ## Usage
@@ -91,3 +91,19 @@ After installation, import modules like this:
 from ml_tools.utilities import sanitize_filename
 from ml_tools.logger import custom_logger
 ```
+
+## Available modules
+
+```bash
+data_exploration
+datasetmaster
+ensemble_learning
+handle_excel
+logger
+MICE_imputation
+particle_swarm_optimization
+trainer
+utilities
+VIF_factor
+vision_helpers
+```
--- dragon_ml_toolbox-1.4.0/ml_tools/MICE_imputation.py
+++ dragon_ml_toolbox-1.4.1/ml_tools/MICE_imputation.py
@@ -3,9 +3,20 @@ import miceforest as mf
 import os
 import matplotlib.pyplot as plt
 import numpy as np
-from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename
+from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info
 from plotnine import ggplot, labs, theme, element_blank # type: ignore
 
+
+__all__ = [
+    "apply_mice",
+    "save_imputed_datasets",
+    "get_na_column_names",
+    "get_convergence_diagnostic",
+    "get_imputed_distributions",
+    "run_mice_pipeline"
+]
+
+
 def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
 
     # Initialize kernel with number of imputed datasets to generate
@@ -210,7 +221,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     if os.path.isfile(df_path_or_dir):
         all_file_paths = [df_path_or_dir]
     elif os.path.isdir(df_path_or_dir):
-        all_file_paths = list_csv_paths(df_path_or_dir).values()
+        all_file_paths = list(list_csv_paths(df_path_or_dir).values())
     else:
        raise ValueError(f"Invalid path or directory: {df_path_or_dir}")
 
@@ -226,3 +237,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_dir)
 
     get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)
+
+
+def info():
+    _script_info(__all__)
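For context on the `list(...)` fix above: `list_csv_paths` returns a `dict[str, str]` (see the utilities diff further down), so 1.4.0 handed downstream code a `dict_values` view rather than a list. A minimal sketch of the difference, using a stand-in dictionary:

```python
# Stand-in for the output of list_csv_paths(): {dataframe name: csv path}
paths = {"df_a": "data/df_a.csv", "df_b": "data/df_b.csv"}

view = paths.values()                  # dict_values([...]) is a view, not a list
# view[0]                              # TypeError: 'dict_values' object is not subscriptable

all_file_paths = list(paths.values())  # the 1.4.1 fix materializes a real list
print(all_file_paths[0])               # 'data/df_a.csv'
```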
--- dragon_ml_toolbox-1.4.0/ml_tools/VIF_factor.py
+++ dragon_ml_toolbox-1.4.1/ml_tools/VIF_factor.py
@@ -7,12 +7,19 @@ from statsmodels.stats.outliers_influence import variance_inflation_factor
 from statsmodels.tools.tools import add_constant
 import warnings
 import os
-from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe
+from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info
+
+
+__all__ = [
+    "compute_vif",
+    "drop_vif_based",
+    "compute_vif_multi"
+]
 
 
 def compute_vif(
     df: pd.DataFrame,
-
+    use_columns: Optional[list[str]] = None,
     ignore_columns: Optional[list[str]] = None,
     max_features_to_plot: int = 20,
     save_dir: Optional[str] = None,
@@ -25,7 +32,7 @@ def compute_vif(
 
     Args:
         df (pd.DataFrame): The input DataFrame.
-
+        use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
         ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
         max_features_to_plot (int): Adjust the number of features shown in the plot.
         save_dir (str | None): Directory to save the plot as SVG. If None, the plot is not saved.
@@ -42,20 +49,20 @@ def compute_vif(
     A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
     """
     ground_truth_cols = df.columns.to_list()
-    if
+    if use_columns is None:
         sanitized_columns = df.select_dtypes(include='number').columns.tolist()
         missing_features = set(ground_truth_cols) - set(sanitized_columns)
         if missing_features:
             print(f"⚠️ These columns are not Numeric:\n{missing_features}")
     else:
         sanitized_columns = list()
-        for feature in
+        for feature in use_columns:
             if feature not in ground_truth_cols:
                 print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
             else:
                 sanitized_columns.append(feature)
 
-    if ignore_columns is not None and
+    if ignore_columns is not None and use_columns is None:
         missing_ignore = set(ignore_columns) - set(ground_truth_cols)
         if missing_ignore:
             print(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
@@ -137,7 +144,7 @@ def compute_vif(
     return vif_data.drop(columns="color")
 
 
-def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10.0) -> pd.DataFrame:
+def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10.0) -> tuple[pd.DataFrame, list[str]]:
     """
     Drops columns from the original DataFrame based on their VIF values exceeding a given threshold.
 
@@ -147,7 +154,9 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
         threshold (float): VIF threshold above which columns will be dropped.
 
     Returns:
-        pd.DataFrame:
+        (tuple[pd.DataFrame, list[str]]):
+            - A new DataFrame with high-VIF columns removed.
+            - A list with dropped column names.
     """
     # Ensure expected structure
     if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
@@ -162,13 +171,13 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
     if result_df.empty:
         print(f"\t⚠️ Warning: All columns were dropped.")
 
-    return result_df
+    return result_df, to_drop
 
 
 def compute_vif_multi(input_directory: str,
                       output_plot_directory: str,
                       output_dataset_directory: Optional[str] = None,
-
+                      use_columns: Optional[list[str]] = None,
                       ignore_columns: Optional[list[str]] = None,
                       max_features_to_plot: int = 20,
                       fontsize: int = 14):
@@ -180,7 +189,7 @@ def compute_vif_multi(input_directory: str,
        input_directory (str): Target directory with CSV files able to be loaded as DataFrame.
        output_plot_directory (str): Save plots to this directory.
        output_dataset_directory (str | None): If provided, saves new CSV files to this directory.
-
+        use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
        max_features_to_plot (int): Adjust the number of features shown in the plot.
        fontsize (int): Base fontsize to scale title and labels on hte plot.
@@ -195,7 +204,7 @@ def compute_vif_multi(input_directory: str,
 
     for df, df_name in yield_dataframes_from_dir(datasets_dir=input_directory):
         vif_dataframe = compute_vif(df=df,
-
+                                    use_columns=use_columns,
                                     ignore_columns=ignore_columns,
                                     max_features_to_plot=max_features_to_plot,
                                     fontsize=fontsize,
@@ -205,5 +214,11 @@ def compute_vif_multi(input_directory: str,
 
         if output_dataset_directory is not None:
             new_filename = 'VIF_' + df_name
-            result_df = drop_vif_based(df=df, vif_df=vif_dataframe)
-
+            result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)
+
+            if len(dropped_cols) > 0:
+                save_dataframe(df=result_df, save_dir=output_dataset_directory, filename=new_filename)
+
+
+def info():
+    _script_info(__all__)
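Because `drop_vif_based` now returns the filtered DataFrame together with the names it dropped, callers written against 1.4.0 need a small unpacking change. A hedged sketch (the toy DataFrame is invented; a real VIF table comes from `compute_vif`):

```python
import pandas as pd
from ml_tools.VIF_factor import compute_vif, drop_vif_based

# Toy data for illustration only; x1 and x2 are deliberately collinear.
df = pd.DataFrame({"x1": [1, 2, 3, 4], "x2": [2, 4, 6, 8], "y": [5, 3, 6, 2]})

vif_table = compute_vif(df)  # DataFrame with 'feature' and 'VIF' columns
# 1.4.0: result_df = drop_vif_based(df, vif_table)
# 1.4.1: the dropped column names are reported as well
result_df, dropped_cols = drop_vif_based(df, vif_table, threshold=10.0)
print(dropped_cols)          # e.g. ['x2'] if its VIF exceeded the threshold
```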
--- dragon_ml_toolbox-1.4.0/ml_tools/data_exploration.py
+++ dragon_ml_toolbox-1.4.1/ml_tools/data_exploration.py
@@ -9,22 +9,23 @@ from typing import Union, Literal, Dict, Tuple
 import os
 import sys
 import textwrap
-from ml_tools.utilities import sanitize_filename
+from ml_tools.utilities import sanitize_filename, _script_info
 
 
-# Keep track of all available
-__all__ = [
-
-
-
-
-
-
-
-
-
-
-
+# Keep track of all available tools, show using `info()`
+__all__ = [
+    "summarize_dataframe",
+    "drop_rows_with_missing_data",
+    "split_features_targets",
+    "show_null_columns",
+    "drop_columns_with_missing_data",
+    "split_continuous_binary",
+    "plot_correlation_heatmap",
+    "check_value_distributions",
+    "plot_value_distributions",
+    "clip_outliers_single",
+    "clip_outliers_multi"
+]
 
 
 def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
@@ -58,34 +59,6 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
     return summary
 
 
-def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
-    """
-    Displays a table of columns with missing values, showing both the count and
-    percentage of missing entries per column.
-
-    Parameters:
-        df (pd.DataFrame): The input DataFrame.
-        round_digits (int): Number of decimal places for the percentage.
-
-    Returns:
-        pd.DataFrame: A DataFrame summarizing missing values in each column.
-    """
-    null_counts = df.isnull().sum()
-    null_percent = df.isnull().mean() * 100
-
-    # Filter only columns with at least one null
-    mask = null_counts > 0
-    null_summary = pd.DataFrame({
-        'Missing Count': null_counts[mask],
-        'Missing %': null_percent[mask].round(round_digits)
-    })
-
-    # Sort by descending percentage of missing values
-    null_summary = null_summary.sort_values(by='Missing %', ascending=False)
-    # print(null_summary)
-    return null_summary
-
-
 def drop_rows_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
     """
     Drops rows with more than `threshold` fraction of missing values.
@@ -132,6 +105,57 @@ def split_features_targets(df: pd.DataFrame, targets: list[str]):
     return df_targets, df_features
 
 
+def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
+    """
+    Displays a table of columns with missing values, showing both the count and
+    percentage of missing entries per column.
+
+    Parameters:
+        df (pd.DataFrame): The input DataFrame.
+        round_digits (int): Number of decimal places for the percentage.
+
+    Returns:
+        pd.DataFrame: A DataFrame summarizing missing values in each column.
+    """
+    null_counts = df.isnull().sum()
+    null_percent = df.isnull().mean() * 100
+
+    # Filter only columns with at least one null
+    mask = null_counts > 0
+    null_summary = pd.DataFrame({
+        'Missing Count': null_counts[mask],
+        'Missing %': null_percent[mask].round(round_digits)
+    })
+
+    # Sort by descending percentage of missing values
+    null_summary = null_summary.sort_values(by='Missing %', ascending=False)
+    # print(null_summary)
+    return null_summary
+
+
+def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
+    """
+    Drops columns with more than `threshold` fraction of missing values.
+
+    Parameters:
+        df (pd.DataFrame): The input DataFrame.
+        threshold (float): Fraction of missing values above which columns are dropped.
+
+    Returns:
+        pd.DataFrame: A new DataFrame without the dropped columns.
+    """
+    missing_fraction = df.isnull().mean()
+    cols_to_drop = missing_fraction[missing_fraction > threshold].index
+
+    if len(cols_to_drop) > 0:
+        print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
+        print(list(cols_to_drop))
+    else:
+        print(f"No columns have more than {threshold*100:.0f}% missing data.")
+
+    return df.drop(columns=cols_to_drop)
+
+
 def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """
     Split DataFrame into two DataFrames: one with continuous columns, one with binary columns.
@@ -174,29 +198,6 @@ def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFram
 
     return df_cont, df_bin # type: ignore
 
-
-def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
-    """
-    Drops columns with more than `threshold` fraction of missing values.
-
-    Parameters:
-        df (pd.DataFrame): The input DataFrame.
-        threshold (float): Fraction of missing values above which columns are dropped.
-
-    Returns:
-        pd.DataFrame: A new DataFrame without the dropped columns.
-    """
-    missing_fraction = df.isnull().mean()
-    cols_to_drop = missing_fraction[missing_fraction > threshold].index
-
-    if len(cols_to_drop) > 0:
-        print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
-        print(list(cols_to_drop))
-    else:
-        print(f"No columns have more than {threshold*100:.0f}% missing data.")
-
-    return df.drop(columns=cols_to_drop)
-
 
 def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None, method: Literal["pearson", "kendall", "spearman"]="pearson", plot_title: str="Correlation Heatmap"):
     """
@@ -513,83 +514,10 @@ def clip_outliers_multi(
     return new_df
 
 
-def merge_dataframes(
-    *dfs: pd.DataFrame,
-    reset_index: bool = False,
-    direction: Literal["horizontal", "vertical"] = "horizontal"
-) -> pd.DataFrame:
-    """
-    Merges multiple DataFrames either horizontally or vertically.
-
-    Parameters:
-        *dfs (pd.DataFrame): Variable number of DataFrames to merge.
-        reset_index (bool): Whether to reset index in the final merged DataFrame.
-        direction (["horizontal" | "vertical"]):
-            - "horizontal": Merge on index, adding columns.
-            - "vertical": Append rows; all DataFrames must have identical columns.
-
-    Returns:
-        pd.DataFrame: A single merged DataFrame.
-
-    Raises:
-        ValueError:
-            - If fewer than 2 DataFrames are provided.
-            - If indexes do not match for horizontal merge.
-            - If column names or order differ for vertical merge.
-    """
-    if len(dfs) < 2:
-        raise ValueError("At least 2 DataFrames must be provided.")
-
-    for i, df in enumerate(dfs, start=1):
-        print(f"DataFrame {i} shape: {df.shape}")
-
-
-    if direction == "horizontal":
-        reference_index = dfs[0].index
-        for i, df in enumerate(dfs, start=1):
-            if not df.index.equals(reference_index):
-                raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
-        merged_df = pd.concat(dfs, axis=1)
-
-    elif direction == "vertical":
-        reference_columns = dfs[0].columns
-        for i, df in enumerate(dfs, start=1):
-            if not df.columns.equals(reference_columns):
-                raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
-        merged_df = pd.concat(dfs, axis=0)
-
-    else:
-        raise ValueError(f"Invalid merge direction: {direction}")
-
-    if reset_index:
-        merged_df = merged_df.reset_index(drop=True)
-
-    print(f"Merged DataFrame shape: {merged_df.shape}")
-
-    return merged_df
-
-
 def _is_notebook():
     return get_ipython() is not None
 
 
-def info(
-
-    List available functions and their descriptions.
-    """
-    print("Available functions for data exploration:")
-    if full_info:
-        module = sys.modules[__name__]
-        for name in __all__:
-            obj = getattr(module, name, None)
-            if callable(obj):
-                doc = obj.__doc__ or "No docstring provided."
-                formatted_doc = textwrap.indent(textwrap.dedent(doc.strip()), prefix=" ")
-                print(f"\n{name}:\n{formatted_doc}")
-    else:
-        for i, name in enumerate(__all__, start=1):
-            print(f"{i} - {name}")
-
+def info():
+    _script_info(__all__)
 
-if __name__ == "__main__":
-    info()
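The bodies of `show_null_columns` and `drop_columns_with_missing_data` are reproduced in full above (they were only moved within the module), so a short usage sketch is safe; the toy DataFrame is invented for illustration:

```python
import numpy as np
import pandas as pd
from ml_tools.data_exploration import show_null_columns, drop_columns_with_missing_data

# 'mostly_missing' is 80% NaN, 'age' has a single gap.
df = pd.DataFrame({
    "age": [25, 32, np.nan, 41, 29],
    "mostly_missing": [np.nan, np.nan, np.nan, np.nan, 1.0],
})

print(show_null_columns(df))                 # missing count and % per affected column
cleaned = drop_columns_with_missing_data(df, threshold=0.7)
print(cleaned.columns.tolist())              # ['age'] since 80% missing exceeds the 70% threshold
```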
--- dragon_ml_toolbox-1.4.0/ml_tools/datasetmaster.py
+++ dragon_ml_toolbox-1.4.1/ml_tools/datasetmaster.py
@@ -11,6 +11,15 @@ from PIL import Image
 from torchvision.datasets import ImageFolder
 from torchvision import transforms
 import matplotlib.pyplot as plt
+from .utilities import _script_info
+
+
+__all__ = [
+    "DatasetMaker",
+    "PytorchDataset",
+    "make_vision_dataset",
+    "SequenceDataset",
+]
 
 
 class DatasetMaker():
@@ -592,4 +601,7 @@ class SequenceDataset():
 
     def __len__(self):
         return f"Train: {len(self.train_dataset)}, Test: {len(self.test_dataset)}"
-
+
+
+def info():
+    _script_info(__all__)
--- dragon_ml_toolbox-1.4.0/ml_tools/ensemble_learning.py
+++ dragon_ml_toolbox-1.4.1/ml_tools/ensemble_learning.py
@@ -21,7 +21,7 @@ from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
 from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
 import shap
 
-from .utilities import yield_dataframes_from_dir
+from .utilities import yield_dataframes_from_dir, sanitize_filename
 
 import warnings # Ignore warnings
 warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -245,7 +245,9 @@ def _local_directories(model_name: str, dataset_id: str, save_dir: str):
 
 # save model
 def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str, scaler_object: Union[StandardScaler, MinMaxScaler, MaxAbsScaler]):
-
+    #Sanitize filenames to save
+    sanitized_target_name = sanitize_filename(target_name)
+    full_path = os.path.join(save_directory, f"{model_name}_{sanitized_target_name}.joblib")
     joblib.dump({'model': trained_model, 'scaler':scaler_object, 'feature_names': feature_names, 'target_name':target_name}, full_path)
 
 # function to evaluate the model and save metrics (Classification)
@@ -298,7 +300,8 @@ def evaluate_model_classification(
     )
 
     # Save text report
-
+    sanitized_target_id = sanitize_filename(target_id)
+    report_path = os.path.join(save_dir, f"Classification_Report_{sanitized_target_id}.txt")
     with open(report_path, "w") as f:
         f.write(f"{model_name} - {target_id}\t\tAccuracy: {accuracy:.2f}\n")
         f.write("Classification Report:\n")
@@ -328,7 +331,7 @@ def evaluate_model_classification(
         text.set_fontsize(title_fontsize+4)
 
     fig.tight_layout()
-    fig_path = os.path.join(save_dir, f"Confusion_Matrix_{
+    fig_path = os.path.join(save_dir, f"Confusion_Matrix_{sanitized_target_id}.svg")
     fig.savefig(fig_path, format="svg", bbox_inches="tight")
     plt.close(fig)
 
@@ -411,7 +414,8 @@ def plot_roc_curve(
 
     # Save figure
     os.makedirs(save_directory, exist_ok=True)
-
+    sanitized_target_name = sanitize_filename(target_name)
+    save_path = os.path.join(save_directory, f"ROC_{sanitized_target_name}.svg")
     fig.savefig(save_path, bbox_inches="tight", format="svg")
 
     return fig
@@ -435,7 +439,8 @@ def evaluate_model_regression(model, model_name: str,
     r2 = r2_score(single_y_test, y_pred)
 
     # Create formatted report
-
+    sanitized_target_id = sanitize_filename(target_id)
+    report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_id}.txt")
     with open(report_path, "w") as f:
         f.write(f"{model_name} - {target_id} Regression Performance\n")
         f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
@@ -453,7 +458,7 @@ def evaluate_model_regression(model, model_name: str,
     plt.title(f"{model_name} - Residual Plot for {target_id}", fontsize=base_fontsize)
     plt.grid(True)
     plt.tight_layout()
-    plt.savefig(os.path.join(save_dir, f"Residual_Plot_{
+    plt.savefig(os.path.join(save_dir, f"Residual_Plot_{sanitized_target_id}.svg"), bbox_inches='tight', format="svg")
     plt.close()
 
     # Create true vs predicted values plot
@@ -466,12 +471,13 @@ def evaluate_model_regression(model, model_name: str,
     plt.ylabel('Predictions', fontsize=base_fontsize)
     plt.title(f"{model_name} - True vs Predicted for {target_id}", fontsize=base_fontsize)
     plt.grid(True)
-    plot_path = os.path.join(save_dir, f"Regression_Plot_{
+    plot_path = os.path.join(save_dir, f"Regression_Plot_{sanitized_target_id}.svg")
     plt.savefig(plot_path, bbox_inches='tight', format="svg")
     plt.close()
 
     return y_pred
 
+
 # Get SHAP values
 def get_shap_values(
     model,
@@ -498,7 +504,8 @@ def get_shap_values(
         features_to_explain: Should match the model's training data format, including scaling.
         save_dir: Directory to save visualizations
     """
-
+    sanitized_target_id = sanitize_filename(target_id)
+
     def _apply_plot_style():
         styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
         for style in styles:
@@ -560,7 +567,7 @@ def get_shap_values(
             _create_shap_plot(
                 shap_values=class_shap,
                 features=features_to_explain,
-                save_path=os.path.join(save_dir, f"SHAP_{
+                save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_Class{class_name}_{plot_type}.svg"),
                 plot_type=plot_type,
                 title=f"{model_name} - {target_id} (Class {class_name})"
             )
@@ -570,7 +577,7 @@ def get_shap_values(
             _create_shap_plot(
                 shap_values=values,
                 features=features_to_explain,
-                save_path=os.path.join(save_dir, f"SHAP_{
+                save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_{plot_type}.svg"),
                 plot_type=plot_type,
                 title=f"{model_name} - {target_id}"
             )
@@ -580,10 +587,11 @@ def get_shap_values(
         _create_shap_plot(
             shap_values=shap_values,
             features=features_to_explain,
-            save_path=os.path.join(save_dir, f"SHAP_{
+            save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_{plot_type}.svg"),
             plot_type=plot_type,
             title=f"{model_name} - {target_id}"
         )
+    #START_O
 
     explainer = shap.TreeExplainer(model)
     shap_values = explainer.shap_values(features_to_explain)
@@ -672,6 +680,6 @@ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list
 
 def _check_paths(datasets_dir: str, save_dir:str):
     if not os.path.isdir(save_dir):
-        os.makedirs(save_dir)
+        os.makedirs(save_dir)
     if not os.path.isdir(datasets_dir):
         raise IOError(f"Datasets directory '{datasets_dir}' not found.")
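The recurring change in this file is that every output artifact name (model dumps, reports, confusion matrices, ROC curves, SHAP plots) now passes through `sanitize_filename` before being joined into a path. A hedged sketch of the pattern; the example target name is hypothetical and the exact sanitized form depends on `sanitize_filename`, whose body is not shown in this diff:

```python
import os
from ml_tools.utilities import sanitize_filename

target_id = "yield (g/L)"                          # hypothetical target name with path-hostile characters
sanitized_target_id = sanitize_filename(target_id)

# 1.4.1 builds file paths from the sanitized name but keeps the original
# name inside the report text and plot titles.
report_path = os.path.join("results", f"Classification_Report_{sanitized_target_id}.txt")
print(report_path)   # e.g. 'results/Classification_Report_yield_gL.txt' (exact form depends on sanitize_filename)
```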
--- dragon_ml_toolbox-1.4.0/ml_tools/handle_excel.py
+++ dragon_ml_toolbox-1.4.1/ml_tools/handle_excel.py
@@ -2,6 +2,16 @@ import os
 from openpyxl import load_workbook, Workbook
 import pandas as pd
 from typing import List, Optional
+from utilities import _script_info, sanitize_filename
+
+
+__all__ = [
+    "unmerge_and_split_excel",
+    "unmerge_and_split_from_directory",
+    "validate_excel_schema",
+    "vertical_merge_transform_excel",
+    "horizontal_merge_transform_excel"
+]
 
 
 def unmerge_and_split_excel(filepath: str) -> None:
@@ -25,12 +35,12 @@ def unmerge_and_split_excel(filepath: str) -> None:
         ws = wb[sheet_name]
         new_wb = Workbook()
         new_ws = new_wb.active
-        new_ws.title = sheet_name
+        new_ws.title = sheet_name # type: ignore
 
         # Copy all cell values
         for row in ws.iter_rows():
             for cell in row:
-                new_ws.cell(row=cell.row, column=cell.column, value=cell.value)
+                new_ws.cell(row=cell.row, column=cell.column, value=cell.value) # type: ignore
 
         # Fill and unmerge merged regions
         for merged_range in list(ws.merged_cells.ranges):
@@ -41,10 +51,10 @@ def unmerge_and_split_excel(filepath: str) -> None:
             value = ws.cell(row=min_row, column=min_col).value
             for row in range(min_row, max_row + 1):
                 for col in range(min_col, max_col + 1):
-                    new_ws.cell(row=row, column=col, value=value)
+                    new_ws.cell(row=row, column=col, value=value) # type: ignore
 
         # Construct flat output file name
-        sanitized_sheet_name = sheet_name
+        sanitized_sheet_name = sanitize_filename(sheet_name)
         output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
         output_path = os.path.join(base_dir, output_filename)
         new_wb.save(output_path)
@@ -85,12 +95,12 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
             ws = wb[sheet_name]
             new_wb = Workbook()
             new_ws = new_wb.active
-            new_ws.title = sheet_name
+            new_ws.title = sheet_name # type: ignore
 
             # Copy all cell values
             for row in ws.iter_rows():
                 for cell in row:
-                    new_ws.cell(row=cell.row, column=cell.column, value=cell.value)
+                    new_ws.cell(row=cell.row, column=cell.column, value=cell.value) # type: ignore
 
             # Fill and unmerge merged regions
             for merged_range in list(ws.merged_cells.ranges):
@@ -101,10 +111,10 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
                 value = ws.cell(row=min_row, column=min_col).value
                 for row in range(min_row, max_row + 1):
                     for col in range(min_col, max_col + 1):
-                        new_ws.cell(row=row, column=col, value=value)
+                        new_ws.cell(row=row, column=col, value=value) # type: ignore
 
             # Construct flat output file name
-            sanitized_sheet_name = sheet_name
+            sanitized_sheet_name = sanitize_filename(sheet_name)
             output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
             output_path = os.path.join(output_dir, output_filename)
             new_wb.save(output_path)
@@ -151,7 +161,7 @@ def validate_excel_schema(
         wb = load_workbook(file_path, read_only=True)
         ws = wb.active # Only check the first worksheet
 
-        header = [cell.value for cell in next(ws.iter_rows(max_row=1))]
+        header = [cell.value for cell in next(ws.iter_rows(max_row=1))] # type: ignore
 
         if strict:
             if header != expected_columns:
@@ -202,6 +212,11 @@ def vertical_merge_transform_excel(
 
     if not excel_files:
         raise ValueError("No Excel files found in the target directory.")
+
+    # sanitize filename
+    csv_filename = sanitize_filename(csv_filename)
+    # make directory
+    os.makedirs(output_dir, exist_ok=True)
 
     csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
     csv_path = os.path.join(output_dir, csv_filename)
@@ -260,6 +275,11 @@ def horizontal_merge_transform_excel(
     excel_files = [f for f in raw_excel_files if not f.startswith('~')] # Exclude temporary files
     if not excel_files:
         raise ValueError("No Excel files found in the target directory.")
+
+    # sanitize filename
+    csv_filename = sanitize_filename(csv_filename)
+    # make directory
+    os.makedirs(output_dir, exist_ok=True)
 
     csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
     csv_path = os.path.join(output_dir, csv_filename)
@@ -308,3 +328,6 @@ def horizontal_merge_transform_excel(
     if duplicate_columns:
         print(f"⚠️ Duplicate columns: {duplicate_columns}")
 
+
+def info():
+    _script_info(__all__)
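The public entry points collected in the new `__all__` keep their 1.4.0 signatures; only the imports, `# type: ignore` hints, and filename sanitization changed. A minimal usage sketch of the unmerge helpers (the paths are hypothetical):

```python
from ml_tools.handle_excel import unmerge_and_split_excel, unmerge_and_split_from_directory

# Split every sheet of one workbook into its own .xlsx, filling formerly merged
# regions with the top-left value; sheet names are now sanitized before they
# become part of the output filename.
unmerge_and_split_excel("data/raw/survey_results.xlsx")

# Same operation for every workbook found in a directory.
unmerge_and_split_from_directory(input_dir="data/raw", output_dir="data/unmerged")
```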
--- dragon_ml_toolbox-1.4.0/ml_tools/logger.py
+++ dragon_ml_toolbox-1.4.1/ml_tools/logger.py
@@ -5,7 +5,12 @@ import pandas as pd
 from openpyxl.styles import Font, PatternFill
 import traceback
 import json
-from ml_tools.utilities import sanitize_filename
+from ml_tools.utilities import sanitize_filename, _script_info
+
+
+__all__ = [
+    "custom_logger"
+]
 
 
 def custom_logger(
@@ -143,3 +148,7 @@ def _log_exception_to_log(exc: BaseException, path: str) -> None:
 def _log_dict_to_json(data: Dict[Any, Any], path: str) -> None:
     with open(path, 'w', encoding='utf-8') as f:
         json.dump(data, f, indent=4, ensure_ascii=False)
+
+
+def info():
+    _script_info(__all__)
--- dragon_ml_toolbox-1.4.0/ml_tools/particle_swarm_optimization.py
+++ dragon_ml_toolbox-1.4.1/ml_tools/particle_swarm_optimization.py
@@ -5,23 +5,29 @@ import xgboost as xgb
 import lightgbm as lgb
 from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
 from sklearn.base import ClassifierMixin
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
 from typing import Literal, Union, Tuple, Dict
-from collections.abc import Sequence
 import polars as pl
 from functools import partial
+from .utilities import sanitize_filename, _script_info
+
+
+__all__ = [
+    "ObjectiveFunction",
+    "run_pso"
+]
 
 
 class ObjectiveFunction():
     """
     Callable objective function designed for optimizing continuous outputs from regression models.
 
-    The
+    The target serialized file (joblib) must include a 'model' and a 'scaler'. Additionally 'feature_names' and 'target_name' will be parsed if present.
 
     Parameters
     ----------
     trained_model_path : str
-        Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
+        Path to a serialized model and its scaler (joblib) compatible with scikit-learn-like `.predict`.
     add_noise : bool
         Whether to apply multiplicative noise to the input features during evaluation.
     binary_features : int, default=0
@@ -67,8 +73,18 @@ class ObjectiveFunction():
         return new_feature_values
 
     def _handle_hybrid(self, features_array):
-
-
+        total_features = features_array.shape[0]
+        if self.binary_features > total_features:
+            raise ValueError("self.binary_features exceeds total number of features.")
+
+        # Handle corner case where all features are binary
+        if self.binary_features == total_features:
+            feat_binary = (features_array > 0.5).astype(int)
+            return feat_binary
+
+        # Normal case: split into continuous and binary parts
+        feat_continuous = features_array[:-self.binary_features]
+        feat_binary = (features_array[-self.binary_features:] > 0.5).astype(int) #threshold binary values
         new_feature_values = np.concatenate([feat_continuous, feat_binary])
         return new_feature_values
 
@@ -92,7 +108,7 @@ class ObjectiveFunction():
         return (f"<ObjectiveFunction(model={type(self.model).__name__}, scaler={type(self.scaler).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")
 
 
-def _set_boundaries(lower_boundaries:
+def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
     assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
     assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
     lower = np.array(lower_boundaries)
@@ -112,31 +128,40 @@ def _save_results(*dicts, save_dir: str, target_name: str):
     combined_dict = dict()
     for single_dict in dicts:
         combined_dict.update(single_dict)
-
-
+
+    sanitized_target_name = sanitize_filename(target_name)
+
+    full_path = os.path.join(save_dir, f"Optimization_{sanitized_target_name}.csv")
     pl.DataFrame(combined_dict).write_csv(full_path)
 
 
-def run_pso(lower_boundaries:
-
+def run_pso(lower_boundaries: list[float],
+            upper_boundaries: list[float],
+            objective_function: ObjectiveFunction,
+            save_results_dir: str,
+            auto_binary_boundaries: bool=True,
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
-            swarm_size: int=100,
+            swarm_size: int=100,
+            max_iterations: int=100,
             inequality_constrain_function=None,
-            post_hoc_analysis: Union[int, None]=None
+            post_hoc_analysis: Union[int, None]=None,
+            workers: int=5) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
-    Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results.
+    Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
 
     Parameters
     ----------
-    lower_boundaries :
-        Lower bounds for each feature in the search space.
-    upper_boundaries :
-        Upper bounds for each feature in the search space.
+    lower_boundaries : list[float]
+        Lower bounds for each feature in the search space (as many as features expected by the model).
+    upper_boundaries : list[float]
+        Upper bounds for each feature in the search space (as many as features expected by the model).
     objective_function : ObjectiveFunction
         A callable object encapsulating a regression model and its scaler.
     save_results_dir : str
         Directory path to save the results CSV file.
+    auto_binary_boundaries : bool
+        Use `ObjectiveFunction.binary_features` to append as many binary boundaries as needed to `lower_boundaries` and `upper_boundaries` automatically.
     target_name : str or None, optional
         Name of the target variable. If None, attempts to retrieve from the ObjectiveFunction object.
     feature_names : list[str] or None, optional
@@ -149,30 +174,38 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
         Optional function defining inequality constraints to be respected by the optimization.
     post_hoc_analysis : int or None, optional
         If specified, runs the optimization multiple times to perform post hoc analysis. The value indicates the number of repetitions.
+    workers : int
+        Number of parallel processes to use.
 
     Returns
     -------
     Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]
         If `post_hoc_analysis` is None, returns two dictionaries:
-        -
-        -
+        - feature_names: Feature values (after inverse scaling) that yield the best result.
+        - target_name: Best result obtained for the target variable.
 
         If `post_hoc_analysis` is an integer, returns two dictionaries:
-        -
-        -
+        - feature_names: Lists of best feature values (after inverse scaling) for each repetition.
+        - target_name: List of best target values across repetitions.
 
     Notes
     -----
     - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
     - Feature values are scaled before being passed to the model and inverse-transformed before result saving.
     """
+    # Append binary boundaries
+    binary_number = objective_function.binary_features
+    if auto_binary_boundaries and binary_number > 0:
+        lower_boundaries.extend([0] * binary_number)
+        upper_boundaries.extend([1] * binary_number)
+
     lower, upper = _set_boundaries(lower_boundaries, upper_boundaries)
-
+
     # feature names
     if feature_names is None and objective_function.feature_names is not None:
         feature_names = objective_function.feature_names
     names = _set_feature_names(size=len(lower_boundaries), names=feature_names)
-
+
     # target name
     if target_name is None and objective_function.target_name is not None:
         target_name = objective_function.target_name
@@ -186,13 +219,15 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
         "f_ieqcons": inequality_constrain_function,
         "swarmsize": swarm_size,
         "maxiter": max_iterations,
-        "processes":
-        "particle_output":
+        "processes": workers,
+        "particle_output": False
     }
 
-
-
-
+    os.makedirs(save_results_dir, exist_ok=True)
+
+    if post_hoc_analysis is None or post_hoc_analysis == 1:
+        best_features, best_target, *_ = _pso(**arguments)
+        # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
 
         # inverse transformation
         best_features = np.array(best_features).reshape(1, -1)
@@ -209,9 +244,9 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
     else:
         all_best_targets = list()
         all_best_features = [[] for _ in range(len(lower_boundaries))]
-        for
-
-            best_features, best_target, _particle_positions, _target_values_per_position =
+        for _ in range(post_hoc_analysis):
+            best_features, best_target, *_ = _pso(**arguments)
+            # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
 
             # inverse transformation
             best_features = np.array(best_features).reshape(1, -1)
@@ -231,6 +266,8 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
         return all_best_features_named, all_best_targets_named # type: ignore
 
 
+def info():
+    _script_info(__all__)
 
 
 ### SOURCE CODE FOR PSO ###
@@ -249,7 +286,7 @@ def _cons_ieqcons_wrapper(ieqcons, args, kwargs, x):
 def _cons_f_ieqcons_wrapper(f_ieqcons, args, kwargs, x):
     return np.array(f_ieqcons(x, *args, **kwargs))
 
-def pso(func, lb, ub, ieqcons=[], f_ieqcons=None, args=(), kwargs={},
+def _pso(func, lb, ub, ieqcons=[], f_ieqcons=None, args=(), kwargs={},
         swarmsize=100, omega=0.5, phip=0.5, phig=0.5, maxiter=100,
         minstep=1e-8, minfunc=1e-8, debug=False, processes=1,
         particle_output=False):
@@ -377,7 +414,7 @@ def pso(func, lb, ub, ieqcons=[], f_ieqcons=None, args=(), kwargs={},
     for i in range(S):
         fx[i] = obj(x[i, :])
         fs[i] = is_feasible(x[i, :])
-
+
     # Store particle's best position (if constraints are satisfied)
     i_update = np.logical_and((fx < fp), fs)
     p[i_update, :] = x[i_update, :].copy()
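When `auto_binary_boundaries=True`, `run_pso` now reads `ObjectiveFunction.binary_features` and appends one `[0, 1]` bound pair per binary feature, so the caller only supplies bounds for the continuous features. A hedged sketch: the model path and bounds are hypothetical, and `ObjectiveFunction` may require constructor arguments beyond the ones documented in this diff:

```python
from ml_tools.particle_swarm_optimization import ObjectiveFunction, run_pso

# Hypothetical regressor trained on 3 continuous + 2 binary features and saved
# as a joblib file containing 'model' and 'scaler' (as _save_model produces).
objective = ObjectiveFunction(
    trained_model_path="models/XGBoost_yield.joblib",
    add_noise=False,
    binary_features=2,
)

best_features, best_target = run_pso(
    lower_boundaries=[0.0, 10.0, 0.5],   # continuous features only
    upper_boundaries=[1.0, 90.0, 5.0],
    objective_function=objective,
    save_results_dir="results/pso",
    auto_binary_boundaries=True,         # appends [0, 1] bounds for the 2 binary features
    workers=5,
)
```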
--- dragon_ml_toolbox-1.4.0/ml_tools/pytorch_models.py
+++ dragon_ml_toolbox-1.4.1/ml_tools/pytorch_models.py
@@ -1,5 +1,12 @@
 import torch
 from torch import nn
+from .utilities import _script_info
+
+
+__all__ = [
+    "MyNeuralNetwork",
+    "MyLSTMNetwork"
+]
 
 
 class MyNeuralNetwork(nn.Module):
@@ -73,9 +80,11 @@ class MyNeuralNetwork(nn.Module):
         return X
 
 
-class
+class _MyConvolutionalNetwork(nn.Module):
     def __init__(self, outputs: int, color_channels: int=3, img_size: int=256, drop_out: float=0.2):
         """
+        - EDUCATIONAL PURPOSES ONLY, not optimized and requires lots of memory.
+
         Create a basic Convolutional Neural Network with two convolution layers with a pooling layer after each convolution.
 
         Args:
@@ -225,3 +234,6 @@ class MyLSTMNetwork(nn.Module):
         else:
             return output
 
+
+def info():
+    _script_info(__all__)
--- dragon_ml_toolbox-1.4.0/ml_tools/trainer.py
+++ dragon_ml_toolbox-1.4.1/ml_tools/trainer.py
@@ -6,6 +6,12 @@ import matplotlib.pyplot as plt
 import torch
 from torch import nn
 from sklearn.metrics import mean_squared_error, classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score, r2_score, median_absolute_error
+from .utilities import _script_info
+
+
+__all__ = [
+    "MyTrainer"
+]
 
 
 class MyTrainer():
@@ -288,36 +294,6 @@ class MyTrainer():
             print(f"Area under the curve score: {area_under_curve:4.2f}")
         else:
             print("Error encountered while retrieving 'model.kind' attribute.")
-
-
-    def forecast(self, samples_list: list[torch.Tensor], view_as: tuple[int,int]=(1,-1)):
-        """
-        DEPRECATED - Use `helpers.model_predict()` instead
-
-        Returns a list containing lists of predicted values, one for each sample.
-
-        Each sample must be a tensor and have the same shape and normalization expected by the model
-        (this method will add the batch dimension automatically).
-
-        Args:
-            `samples_list`: list of tensors.
-
-            `view_as`: reshape each output, default is (1,-1).
-
-        Returns: List of lists.
-        """
-        self.model.eval()
-        results = list()
-        with torch.no_grad():
-            for data_point in samples_list:
-                data_point = data_point.unsqueeze(0).to(self.device)
-                output = self.model(data_point)
-                if self.kind == "classification":
-                    results.append(output.argmax(dim=1).view(view_as).cpu().tolist())
-                else: #regression
-                    results.append(output.view(view_as).cpu().tolist())
-
-        return results
 
 
     def rnn_forecast(self, sequence: torch.Tensor, steps: int):
@@ -364,3 +340,7 @@ class MyTrainer():
         # Cast to array and return
         predictions = numpy.array(predictions)
         return predictions
+
+
+def info():
+    _script_info(__all__)
--- dragon_ml_toolbox-1.4.0/ml_tools/utilities.py
+++ dragon_ml_toolbox-1.4.1/ml_tools/utilities.py
@@ -4,6 +4,19 @@ import pandas as pd
 import os
 from pathlib import Path
 import re
+from typing import Literal
+
+
+# Keep track of available tools
+__all__ = [
+    "list_csv_paths",
+    "load_dataframe",
+    "yield_dataframes_from_dir",
+    "merge_dataframes",
+    "save_dataframe",
+    "normalize_mixed_list",
+    "sanitize_filename"
+]
 
 
 def list_csv_paths(directory: str) -> dict[str, str]:
@@ -76,11 +89,93 @@ def yield_dataframes_from_dir(datasets_dir: str):
     for df_name, df_path in list_csv_paths(datasets_dir).items():
         df, _ = load_dataframe(df_path)
         yield df, df_name
+
+
+def merge_dataframes(
+    *dfs: pd.DataFrame,
+    reset_index: bool = False,
+    direction: Literal["horizontal", "vertical"] = "horizontal"
+) -> pd.DataFrame:
+    """
+    Merges multiple DataFrames either horizontally or vertically.
+
+    Parameters:
+        *dfs (pd.DataFrame): Variable number of DataFrames to merge.
+        reset_index (bool): Whether to reset index in the final merged DataFrame.
+        direction (["horizontal" | "vertical"]):
+            - "horizontal": Merge on index, adding columns.
+            - "vertical": Append rows; all DataFrames must have identical columns.
+
+    Returns:
+        pd.DataFrame: A single merged DataFrame.
+
+    Raises:
+        ValueError:
+            - If fewer than 2 DataFrames are provided.
+            - If indexes do not match for horizontal merge.
+            - If column names or order differ for vertical merge.
+    """
+    if len(dfs) < 2:
+        raise ValueError("At least 2 DataFrames must be provided.")
+
+    for i, df in enumerate(dfs, start=1):
+        print(f"DataFrame {i} shape: {df.shape}")
+
+
+    if direction == "horizontal":
+        reference_index = dfs[0].index
+        for i, df in enumerate(dfs, start=1):
+            if not df.index.equals(reference_index):
+                raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
+        merged_df = pd.concat(dfs, axis=1)
+
+    elif direction == "vertical":
+        reference_columns = dfs[0].columns
+        for i, df in enumerate(dfs, start=1):
+            if not df.columns.equals(reference_columns):
+                raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
+        merged_df = pd.concat(dfs, axis=0)
+
+    else:
+        raise ValueError(f"Invalid merge direction: {direction}")
+
+    if reset_index:
+        merged_df = merged_df.reset_index(drop=True)
+
+    print(f"Merged DataFrame shape: {merged_df.shape}")
+
+    return merged_df
+
+
+def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
+    """
+    Save a pandas DataFrame to a CSV file.
+
+    Parameters:
+        df: pandas.DataFrame to save
+        save_dir: str, directory where the CSV file will be saved.
+        filename: str, CSV filename, extension will be added if missing.
+    """
+    if df.empty:
+        print(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
+        return
+
+    os.makedirs(save_dir, exist_ok=True)
+
+    filename = sanitize_filename(filename)
+
+    if not filename.endswith('.csv'):
+        filename += '.csv'
 
+    output_path = os.path.join(save_dir, filename)
 
+    df.to_csv(output_path, index=False, encoding='utf-8')
+    print(f"✅ Saved file: '{filename}'")
+
+
 def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
     """
-    Normalize a mixed list of numeric values and strings so that the sum of the values equals 1.0,
+    Normalize a mixed list of numeric values and strings casted to floats so that the sum of the values equals 1.0,
     applying heuristic adjustments to correct for potential data entry scale mismatches.
 
     Parameters:
@@ -168,27 +263,14 @@ def sanitize_filename(filename: str) -> str:
     return sanitized
 
 
-def
+def _script_info(all_data: list[str]):
     """
-
-
-    Parameters:
-        df: pandas.DataFrame to save
-        save_dir: str, directory where the CSV file will be saved.
-        filename: str, CSV filename, extension will be added if missing.
+    List available names.
     """
-
-
-
-
-
-
-
-
-    if not filename.endswith('.csv'):
-        filename += '.csv'
-
-    output_path = os.path.join(save_dir, filename)
-
-    df.to_csv(output_path, index=False, encoding='utf-8')
-    print(f"✅ Saved file: '{filename}'")
+    print("Available functions and objects:")
+    for i, name in enumerate(all_data, start=1):
+        print(f"{i} - {name}")
+
+
+def info():
+    _script_info(__all__)
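`merge_dataframes` and `save_dataframe` are shown in full above (they now live in `utilities`), so a short usage sketch is safe; the frames and output directory are invented for illustration:

```python
import pandas as pd
from ml_tools.utilities import merge_dataframes, save_dataframe, info

features = pd.DataFrame({"x1": [1, 2], "x2": [3, 4]})
targets = pd.DataFrame({"y": [0, 1]})

# Horizontal merge joins on the (matching) index and adds columns.
merged = merge_dataframes(features, targets, direction="horizontal")

# save_dataframe sanitizes the filename and appends '.csv' if it is missing.
save_dataframe(merged, save_dir="output", filename="train set")

info()  # new in 1.4.1: prints the numbered contents of __all__
```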
--- dragon_ml_toolbox-1.4.0/ml_tools/vision_helpers.py
+++ dragon_ml_toolbox-1.4.1/ml_tools/vision_helpers.py
@@ -4,9 +4,18 @@ from PIL import Image, ImageOps
 from typing import Literal
 from torchvision import transforms
 import torch
+from .utilities import _script_info
+
+
+__all__ = [
+    "inspect_images",
+    "image_augmentation",
+    "ResizeAspectFill",
+    "is_image",
+    "model_predict"
+]
 
 
-# --- Helper Functions ---
 def inspect_images(path: str):
     """
     Prints out the types, sizes and channels of image files found in the directory and its subdirectories.
@@ -216,3 +225,7 @@ def model_predict(model: torch.nn.Module, kind: Literal["regression", "classific
                 results.append(output.view(view_as).cpu().tolist())
 
     return results
+
+
+def info():
+    _script_info(__all__)