workbench 0.8.198__py3-none-any.whl → 0.8.203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/proximity.py +11 -4
- workbench/api/__init__.py +2 -1
- workbench/api/df_store.py +17 -108
- workbench/api/feature_set.py +48 -11
- workbench/api/model.py +1 -1
- workbench/api/parameter_store.py +3 -52
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +5 -5
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +261 -78
- workbench/core/artifacts/feature_set_core.py +69 -1
- workbench/core/artifacts/model_core.py +48 -14
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/transforms/features_to_model/features_to_model.py +50 -33
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
- workbench/core/views/view.py +2 -2
- workbench/model_scripts/chemprop/chemprop.template +933 -0
- workbench/model_scripts/chemprop/generated_model_script.py +933 -0
- workbench/model_scripts/chemprop/requirements.txt +11 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
- workbench/model_scripts/custom_models/proximity/proximity.py +11 -4
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +11 -5
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +11 -5
- workbench/model_scripts/custom_models/uq_models/ngboost.template +11 -5
- workbench/model_scripts/custom_models/uq_models/proximity.py +11 -4
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +11 -5
- workbench/model_scripts/pytorch_model/generated_model_script.py +365 -173
- workbench/model_scripts/pytorch_model/pytorch.template +362 -170
- workbench/model_scripts/scikit_learn/generated_model_script.py +302 -0
- workbench/model_scripts/script_generation.py +10 -7
- workbench/model_scripts/uq_models/generated_model_script.py +43 -27
- workbench/model_scripts/uq_models/mapie.template +40 -24
- workbench/model_scripts/xgb_model/generated_model_script.py +36 -7
- workbench/model_scripts/xgb_model/xgb_model.template +36 -7
- workbench/repl/workbench_shell.py +14 -5
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/{lambda_launcher.py → lambda_test.py} +10 -0
- workbench/utils/chemprop_utils.py +761 -0
- workbench/utils/pytorch_utils.py +527 -0
- workbench/utils/xgboost_model_utils.py +10 -5
- workbench/web_interface/components/model_plot.py +7 -1
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/METADATA +3 -3
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/RECORD +49 -43
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/entry_points.txt +2 -1
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
- workbench/model_scripts/__pycache__/script_generation.cpython-312.pyc +0 -0
- workbench/model_scripts/__pycache__/script_generation.cpython-313.pyc +0 -0
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/WHEEL +0 -0
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/top_level.txt +0 -0
|
@@ -13,39 +13,38 @@ from pytorch_tabular.models import CategoryEmbeddingModelConfig
|
|
|
13
13
|
# Model Performance Scores
|
|
14
14
|
from sklearn.metrics import (
|
|
15
15
|
mean_absolute_error,
|
|
16
|
+
median_absolute_error,
|
|
16
17
|
r2_score,
|
|
17
18
|
root_mean_squared_error,
|
|
18
19
|
precision_recall_fscore_support,
|
|
19
20
|
confusion_matrix,
|
|
20
21
|
)
|
|
22
|
+
from scipy.stats import spearmanr
|
|
21
23
|
|
|
22
24
|
# Classification Encoder
|
|
23
25
|
from sklearn.preprocessing import LabelEncoder
|
|
24
26
|
|
|
25
27
|
# Scikit Learn Imports
|
|
26
|
-
from sklearn.model_selection import train_test_split
|
|
28
|
+
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
|
|
27
29
|
|
|
28
30
|
from io import StringIO
|
|
29
31
|
import json
|
|
30
32
|
import argparse
|
|
31
33
|
import joblib
|
|
32
|
-
import os
|
|
33
34
|
import pandas as pd
|
|
34
|
-
from typing import List, Tuple
|
|
35
35
|
|
|
36
36
|
# Template Parameters
|
|
37
37
|
TEMPLATE_PARAMS = {
|
|
38
|
-
"model_type": "
|
|
39
|
-
"target": "
|
|
40
|
-
"features": ['
|
|
38
|
+
"model_type": "uq_regressor",
|
|
39
|
+
"target": "mppb",
|
|
40
|
+
"features": ['mollogp', 'mi', 'fr_benzene', 'smr_vsa3', 'fr_halogen', 'c2sp2', 'peoe_vsa6', 'bcut2d_mwhi', 'vsa_estate1', 'mv', 'numaromaticcarbocycles', 'vsa_estate5', 'fr_nh0', 'mm', 'smr_vsa7', 'tpsa', 'c1sp2', 'mz', 'vsa_estate2', 'peoe_vsa7', 'vsa_estate10', 'vsa_estate7', 'vsa_estate6', 'smr_vsa10', 'slogp_vsa2', 'bcut2d_logphi', 'naromatom', 'axp_2dv', 'bcut2d_mrhi', 'vsa_estate8', 'slogp_vsa3', 'vsa_estate4', 'xpc_6dv', 'slogp_vsa12', 'peoe_vsa9', 'mp', 'slogp_vsa1', 'peoe_vsa1', 'xch_5dv', 'qed', 'vsa_estate3', 'fpdensitymorgan3', 'axp_2d', 'axp_0d', 'mse', 'numhacceptors', 'bertzct', 'estate_vsa8', 'minestateindex', 'estate_vsa3', 'fpdensitymorgan2', 'smr_vsa6', 'peoe_vsa8', 'slogp_vsa6', 'xp_5dv', 'hallkieralpha', 'avgipc', 'fr_arn', 'xp_7d', 'mare', 'xp_6d', 'bcut2d_mrlow', 'estate_vsa4', 'bcut2d_logplow', 'peoe_vsa10', 'maxabspartialcharge', 'peoe_vsa3', 'bcut2d_mwlow', 'axp_7d', 'minpartialcharge', 'xpc_4d', 'axp_1d', 'estate_vsa9', 'vsa_estate9', 'estate_vsa7', 'maxestateindex', 'estate_vsa6', 'smr_vsa1', 'xpc_6d', 'xch_7d', 'xc_5d', 'phi', 'axp_0dv', 'axp_3dv', 'mpe', 'xc_3d', 'xch_5d', 'xc_5dv', 'xch_6d', 'chi4n', 'axp_7dv', 'slogp_vsa5', 'axp_1dv', 'xch_6dv', 'minabsestateindex', 'numrotatablebonds', 'peoe_vsa2', 'estate_vsa2', 'slogp_vsa8', 'bcut2d_chglo', 'xch_7dv', 'kappa2', 'axp_4dv', 'xc_3dv', 'kappa1', 'nbase', 'xpc_5dv', 'maxpartialcharge', 'bcut2d_chghi', 'axp_5d', 'balabanj', 'xpc_5d', 'fpdensitymorgan1', 'xp_5d', 'smr_vsa5', 'axp_4d', 'kappa3', 'fr_morpholine', 'estate_vsa5', 'chi2n', 'labuteasa', 'axp_5dv', 'molwt', 'smr_vsa9', 'maxabsestateindex', 'xp_7dv', 'fr_bicyclic', 'numaliphaticheterocycles', 'axp_6dv', 'slogp_vsa4', 'axp_3d', 'xp_6dv', 'nocount', 'axp_6d', 'fr_aniline', 'xpc_4dv', 'xp_1d', 'c3sp2', 'numheterocycles', 'nhohcount', 'molmr', 'numaromaticheterocycles', 'chi0', 'minabspartialcharge', 'fr_ar_n', 'xp_3d', 'chi2v', 'fr_ether', 'chi1v', 'chi1', 'xp_2d', 'xp_4dv', 'xp_4d', 'chi4v', 'fr_pyridine', 'smr_vsa4', 'sps', 'chi3n', 'heavyatommolwt', 'slogp_vsa11', 'fr_aryl_methyl', 'si', 'fractioncsp3', 'sse', 'fr_para_hydroxylation', 'slogp_vsa10', 'c1sp3', 'exactmolwt', 'numsaturatedheterocycles', 'chi1n', 'chi0n', 'fcsp3'],
|
|
41
|
+
"id_column": "molecule_name",
|
|
41
42
|
"compressed_features": [],
|
|
42
|
-
"model_metrics_s3_path": "s3://
|
|
43
|
-
"
|
|
44
|
-
"hyperparameters": {},
|
|
43
|
+
"model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/mppb-reg-pytorch/training",
|
|
44
|
+
"hyperparameters": {'n_folds': 5},
|
|
45
45
|
}
|
|
46
46
|
|
|
47
47
|
|
|
48
|
-
# Function to check if dataframe is empty
|
|
49
48
|
def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
|
|
50
49
|
"""
|
|
51
50
|
Check if the provided dataframe is empty and raise an exception if it is.
|
|
@@ -60,19 +59,17 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
|
|
|
60
59
|
raise ValueError(msg)
|
|
61
60
|
|
|
62
61
|
|
|
63
|
-
def expand_proba_column(df: pd.DataFrame, class_labels:
|
|
62
|
+
def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFrame:
|
|
64
63
|
"""
|
|
65
64
|
Expands a column in a DataFrame containing a list of probabilities into separate columns.
|
|
66
65
|
|
|
67
66
|
Args:
|
|
68
67
|
df (pd.DataFrame): DataFrame containing a "pred_proba" column
|
|
69
|
-
class_labels (
|
|
68
|
+
class_labels (list[str]): List of class labels
|
|
70
69
|
|
|
71
70
|
Returns:
|
|
72
71
|
pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
|
|
73
72
|
"""
|
|
74
|
-
|
|
75
|
-
# Sanity check
|
|
76
73
|
proba_column = "pred_proba"
|
|
77
74
|
if proba_column not in df.columns:
|
|
78
75
|
raise ValueError('DataFrame does not contain a "pred_proba" column')
|
|
@@ -89,11 +86,10 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
|
|
|
89
86
|
|
|
90
87
|
# Concatenate the new columns with the original DataFrame
|
|
91
88
|
df = pd.concat([df, proba_df], axis=1)
|
|
92
|
-
print(df)
|
|
93
89
|
return df
|
|
94
90
|
|
|
95
91
|
|
|
96
|
-
def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
|
|
92
|
+
def match_features_case_insensitive(df: pd.DataFrame, model_features: list[str]) -> pd.DataFrame:
|
|
97
93
|
"""
|
|
98
94
|
Matches and renames DataFrame columns to match model feature names (case-insensitive).
|
|
99
95
|
Prioritizes exact matches, then case-insensitive matches.
|
|
@@ -118,55 +114,60 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
|
|
|
118
114
|
return df.rename(columns=rename_dict)
|
|
119
115
|
|
|
120
116
|
|
|
121
|
-
def convert_categorical_types(
|
|
117
|
+
def convert_categorical_types(
|
|
118
|
+
df: pd.DataFrame, features: list[str], category_mappings: dict[str, list[str]] | None = None
|
|
119
|
+
) -> tuple[pd.DataFrame, dict[str, list[str]]]:
|
|
122
120
|
"""
|
|
123
121
|
Converts appropriate columns to categorical type with consistent mappings.
|
|
124
122
|
|
|
125
123
|
Args:
|
|
126
124
|
df (pd.DataFrame): The DataFrame to process.
|
|
127
125
|
features (list): List of feature names to consider for conversion.
|
|
128
|
-
category_mappings (dict, optional): Existing category mappings. If empty
|
|
129
|
-
training mode. If populated, we're in
|
|
126
|
+
category_mappings (dict, optional): Existing category mappings. If None or empty,
|
|
127
|
+
we're in training mode. If populated, we're in
|
|
128
|
+
inference mode.
|
|
130
129
|
|
|
131
130
|
Returns:
|
|
132
131
|
tuple: (processed DataFrame, category mappings dictionary)
|
|
133
132
|
"""
|
|
133
|
+
if category_mappings is None:
|
|
134
|
+
category_mappings = {}
|
|
135
|
+
|
|
134
136
|
# Training mode
|
|
135
|
-
if category_mappings
|
|
137
|
+
if not category_mappings:
|
|
136
138
|
for col in df.select_dtypes(include=["object", "string"]):
|
|
137
139
|
if col in features and df[col].nunique() < 20:
|
|
138
140
|
print(f"Training mode: Converting {col} to category")
|
|
139
141
|
df[col] = df[col].astype("category")
|
|
140
|
-
category_mappings[col] = df[col].cat.categories.tolist()
|
|
142
|
+
category_mappings[col] = df[col].cat.categories.tolist()
|
|
141
143
|
|
|
142
144
|
# Inference mode
|
|
143
145
|
else:
|
|
144
146
|
for col, categories in category_mappings.items():
|
|
145
147
|
if col in df.columns:
|
|
146
148
|
print(f"Inference mode: Applying categorical mapping for {col}")
|
|
147
|
-
df[col] = pd.Categorical(df[col], categories=categories)
|
|
149
|
+
df[col] = pd.Categorical(df[col], categories=categories)
|
|
148
150
|
|
|
149
151
|
return df, category_mappings
|
|
150
152
|
|
|
151
153
|
|
|
152
154
|
def decompress_features(
|
|
153
|
-
df: pd.DataFrame, features:
|
|
154
|
-
) ->
|
|
155
|
+
df: pd.DataFrame, features: list[str], compressed_features: list[str]
|
|
156
|
+
) -> tuple[pd.DataFrame, list[str]]:
|
|
155
157
|
"""Prepare features for the model
|
|
156
158
|
|
|
157
159
|
Args:
|
|
158
160
|
df (pd.DataFrame): The features DataFrame
|
|
159
|
-
features (
|
|
160
|
-
compressed_features (
|
|
161
|
+
features (list[str]): Full list of feature names
|
|
162
|
+
compressed_features (list[str]): List of feature names to decompress (bitstrings)
|
|
161
163
|
|
|
162
164
|
Returns:
|
|
163
165
|
pd.DataFrame: DataFrame with the decompressed features
|
|
164
|
-
|
|
166
|
+
list[str]: Updated list of feature names after decompression
|
|
165
167
|
|
|
166
168
|
Raises:
|
|
167
169
|
ValueError: If any missing values are found in the specified features
|
|
168
170
|
"""
|
|
169
|
-
|
|
170
171
|
# Check for any missing values in the required features
|
|
171
172
|
missing_counts = df[features].isna().sum()
|
|
172
173
|
if missing_counts.any():
|
|
@@ -176,10 +177,11 @@ def decompress_features(
|
|
|
176
177
|
"WARNING: You might want to remove/replace all NaN values before processing."
|
|
177
178
|
)
|
|
178
179
|
|
|
179
|
-
#
|
|
180
|
-
decompressed_features = features
|
|
180
|
+
# Make a copy to avoid mutating the original list
|
|
181
|
+
decompressed_features = features.copy()
|
|
182
|
+
|
|
181
183
|
for feature in compressed_features:
|
|
182
|
-
if (feature not in df.columns) or (feature not in
|
|
184
|
+
if (feature not in df.columns) or (feature not in decompressed_features):
|
|
183
185
|
print(f"Feature '{feature}' not in the features list, skipping decompression.")
|
|
184
186
|
continue
|
|
185
187
|
|
|
@@ -204,26 +206,60 @@ def decompress_features(
|
|
|
204
206
|
return df, decompressed_features
|
|
205
207
|
|
|
206
208
|
|
|
207
|
-
def model_fn(model_dir):
|
|
209
|
+
def model_fn(model_dir: str) -> dict:
|
|
210
|
+
"""Load the PyTorch Tabular ensemble models from the specified directory.
|
|
211
|
+
|
|
212
|
+
Args:
|
|
213
|
+
model_dir: Directory containing the saved model(s)
|
|
214
|
+
|
|
215
|
+
Returns:
|
|
216
|
+
Dictionary with ensemble models and metadata
|
|
217
|
+
"""
|
|
218
|
+
import torch
|
|
219
|
+
from functools import partial
|
|
220
|
+
|
|
221
|
+
# Load ensemble metadata if present
|
|
222
|
+
ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
|
|
223
|
+
if os.path.exists(ensemble_metadata_path):
|
|
224
|
+
ensemble_metadata = joblib.load(ensemble_metadata_path)
|
|
225
|
+
n_ensemble = ensemble_metadata["n_ensemble"]
|
|
226
|
+
else:
|
|
227
|
+
n_ensemble = 1
|
|
228
|
+
|
|
229
|
+
# Determine map_location for loading models (handle CUDA trained models on CPU inference)
|
|
230
|
+
map_location = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
231
|
+
|
|
232
|
+
# Patch torch.load globally to use map_location (needed for joblib-loaded callbacks)
|
|
233
|
+
# This handles the case where pytorch-tabular loads callbacks.sav via joblib,
|
|
234
|
+
# which internally calls torch.load without map_location
|
|
235
|
+
original_torch_load = torch.load
|
|
236
|
+
torch.load = partial(original_torch_load, map_location=map_location)
|
|
208
237
|
|
|
209
238
|
# Save current working directory
|
|
210
239
|
original_cwd = os.getcwd()
|
|
240
|
+
ensemble_models = []
|
|
241
|
+
|
|
211
242
|
try:
|
|
212
243
|
# Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
|
|
213
244
|
os.chdir("/tmp")
|
|
214
245
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
246
|
+
for ens_idx in range(n_ensemble):
|
|
247
|
+
# Try numbered model path first, fall back to legacy path
|
|
248
|
+
model_path = os.path.join(model_dir, f"tabular_model_{ens_idx}")
|
|
249
|
+
if not os.path.exists(model_path):
|
|
250
|
+
model_path = os.path.join(model_dir, "tabular_model")
|
|
251
|
+
model = TabularModel.load_model(model_path, map_location=map_location)
|
|
252
|
+
ensemble_models.append(model)
|
|
218
253
|
|
|
219
|
-
# Restore the original working directory
|
|
220
254
|
finally:
|
|
255
|
+
# Restore torch.load and working directory
|
|
256
|
+
torch.load = original_torch_load
|
|
221
257
|
os.chdir(original_cwd)
|
|
222
258
|
|
|
223
|
-
return
|
|
259
|
+
return {"ensemble_models": ensemble_models, "n_ensemble": n_ensemble}
|
|
224
260
|
|
|
225
261
|
|
|
226
|
-
def input_fn(input_data, content_type):
|
|
262
|
+
def input_fn(input_data, content_type: str) -> pd.DataFrame:
|
|
227
263
|
"""Parse input data and return a DataFrame."""
|
|
228
264
|
if not input_data:
|
|
229
265
|
raise ValueError("Empty input data is not supported!")
|
|
@@ -240,29 +276,34 @@ def input_fn(input_data, content_type):
|
|
|
240
276
|
raise ValueError(f"{content_type} not supported!")
|
|
241
277
|
|
|
242
278
|
|
|
243
|
-
def output_fn(output_df, accept_type):
|
|
279
|
+
def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
|
|
244
280
|
"""Supports both CSV and JSON output formats."""
|
|
245
281
|
if "text/csv" in accept_type:
|
|
246
|
-
csv_output = output_df.fillna("N/A").to_csv(index=False)
|
|
282
|
+
csv_output = output_df.fillna("N/A").to_csv(index=False)
|
|
247
283
|
return csv_output, "text/csv"
|
|
248
284
|
elif "application/json" in accept_type:
|
|
249
|
-
return output_df.to_json(orient="records"), "application/json"
|
|
285
|
+
return output_df.to_json(orient="records"), "application/json"
|
|
250
286
|
else:
|
|
251
287
|
raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
|
|
252
288
|
|
|
253
289
|
|
|
254
|
-
def predict_fn(df,
|
|
255
|
-
"""Make Predictions with our PyTorch Tabular Model
|
|
290
|
+
def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
|
|
291
|
+
"""Make Predictions with our PyTorch Tabular Model ensemble.
|
|
256
292
|
|
|
257
293
|
Args:
|
|
258
294
|
df (pd.DataFrame): The input DataFrame
|
|
259
|
-
|
|
295
|
+
model_dict: Dictionary containing ensemble models and metadata
|
|
260
296
|
|
|
261
297
|
Returns:
|
|
262
|
-
pd.DataFrame: The DataFrame with
|
|
298
|
+
pd.DataFrame: The DataFrame with predictions (and prediction_std for ensembles)
|
|
263
299
|
"""
|
|
300
|
+
model_type = TEMPLATE_PARAMS["model_type"]
|
|
264
301
|
compressed_features = TEMPLATE_PARAMS["compressed_features"]
|
|
265
302
|
|
|
303
|
+
# Extract ensemble models
|
|
304
|
+
ensemble_models = model_dict["ensemble_models"]
|
|
305
|
+
n_ensemble = model_dict["n_ensemble"]
|
|
306
|
+
|
|
266
307
|
# Grab our feature columns (from training)
|
|
267
308
|
model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
|
|
268
309
|
with open(os.path.join(model_dir, "feature_columns.json")) as fp:
|
|
@@ -275,12 +316,11 @@ def predict_fn(df, model) -> pd.DataFrame:
|
|
|
275
316
|
|
|
276
317
|
# Load our Label Encoder if we have one
|
|
277
318
|
label_encoder = None
|
|
278
|
-
|
|
279
|
-
|
|
319
|
+
label_encoder_path = os.path.join(model_dir, "label_encoder.joblib")
|
|
320
|
+
if os.path.exists(label_encoder_path):
|
|
321
|
+
label_encoder = joblib.load(label_encoder_path)
|
|
280
322
|
|
|
281
|
-
#
|
|
282
|
-
# - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
|
|
283
|
-
# - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
|
|
323
|
+
# Match features in a case-insensitive manner
|
|
284
324
|
matched_df = match_features_case_insensitive(df, features)
|
|
285
325
|
|
|
286
326
|
# Detect categorical types in the incoming DataFrame
|
|
@@ -291,36 +331,80 @@ def predict_fn(df, model) -> pd.DataFrame:
|
|
|
291
331
|
print("Decompressing features for prediction...")
|
|
292
332
|
matched_df, features = decompress_features(matched_df, features, compressed_features)
|
|
293
333
|
|
|
294
|
-
#
|
|
295
|
-
|
|
334
|
+
# Track rows with missing features
|
|
335
|
+
missing_mask = matched_df[features].isna().any(axis=1)
|
|
336
|
+
if missing_mask.any():
|
|
337
|
+
print(f"Warning: {missing_mask.sum()} rows have missing features, will return NaN predictions")
|
|
338
|
+
|
|
339
|
+
# Initialize prediction columns
|
|
340
|
+
df["prediction"] = np.nan
|
|
341
|
+
if model_type in ["regressor", "uq_regressor"]:
|
|
342
|
+
df["prediction_std"] = np.nan
|
|
343
|
+
|
|
344
|
+
# Only predict on complete rows
|
|
345
|
+
complete_df = matched_df[~missing_mask]
|
|
346
|
+
if len(complete_df) == 0:
|
|
347
|
+
print("Warning: No complete rows to predict on")
|
|
348
|
+
return df
|
|
296
349
|
|
|
297
350
|
# pytorch-tabular returns predictions using f"{target}_prediction" column
|
|
298
|
-
|
|
299
|
-
target = TEMPLATE_PARAMS["target_column"]
|
|
351
|
+
target = TEMPLATE_PARAMS["target"]
|
|
300
352
|
prediction_column = f"{target}_prediction"
|
|
301
|
-
if prediction_column in result.columns:
|
|
302
|
-
predictions = result[prediction_column].values
|
|
303
|
-
else:
|
|
304
|
-
raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
|
|
305
353
|
|
|
306
|
-
#
|
|
307
|
-
|
|
308
|
-
|
|
354
|
+
# Collect predictions from all ensemble members
|
|
355
|
+
all_ensemble_preds = []
|
|
356
|
+
all_ensemble_probs = []
|
|
357
|
+
|
|
358
|
+
for ens_idx, ens_model in enumerate(ensemble_models):
|
|
359
|
+
result = ens_model.predict(complete_df[features])
|
|
360
|
+
|
|
361
|
+
if prediction_column in result.columns:
|
|
362
|
+
ens_preds = result[prediction_column].values
|
|
363
|
+
else:
|
|
364
|
+
raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
|
|
365
|
+
|
|
366
|
+
all_ensemble_preds.append(ens_preds)
|
|
309
367
|
|
|
310
|
-
|
|
311
|
-
|
|
368
|
+
# For classification, collect probabilities
|
|
369
|
+
if label_encoder is not None:
|
|
370
|
+
prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
|
|
371
|
+
if prob_cols:
|
|
372
|
+
all_ensemble_probs.append(result[prob_cols].values)
|
|
312
373
|
|
|
313
|
-
#
|
|
374
|
+
# Stack and compute mean/std (std is 0 for single model)
|
|
375
|
+
ensemble_preds = np.stack(all_ensemble_preds, axis=0) # (n_ensemble, n_samples)
|
|
376
|
+
preds = np.mean(ensemble_preds, axis=0)
|
|
377
|
+
preds_std = np.std(ensemble_preds, axis=0) # Will be 0s for n_ensemble=1
|
|
378
|
+
|
|
379
|
+
print(f"Inference: Ensemble predictions shape: {preds.shape}, n_ensemble: {n_ensemble}")
|
|
380
|
+
|
|
381
|
+
# Handle classification vs regression
|
|
314
382
|
if label_encoder is not None:
|
|
315
|
-
|
|
316
|
-
if
|
|
317
|
-
|
|
318
|
-
|
|
383
|
+
# For classification, average probabilities then take argmax
|
|
384
|
+
if all_ensemble_probs:
|
|
385
|
+
ensemble_probs = np.stack(all_ensemble_probs, axis=0) # (n_ensemble, n_samples, n_classes)
|
|
386
|
+
avg_probs = np.mean(ensemble_probs, axis=0) # (n_samples, n_classes)
|
|
387
|
+
class_preds = np.argmax(avg_probs, axis=1)
|
|
388
|
+
predictions = label_encoder.inverse_transform(class_preds)
|
|
389
|
+
|
|
390
|
+
# Build full proba Series with None for missing rows
|
|
391
|
+
all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
|
|
392
|
+
all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
|
|
393
|
+
df["pred_proba"] = all_proba
|
|
319
394
|
|
|
320
395
|
# Expand the pred_proba column into separate columns for each class
|
|
321
396
|
df = expand_proba_column(df, label_encoder.classes_)
|
|
397
|
+
else:
|
|
398
|
+
# No probabilities, use averaged predictions
|
|
399
|
+
predictions = label_encoder.inverse_transform(preds.astype(int))
|
|
400
|
+
else:
|
|
401
|
+
# Regression (includes uq_regressor)
|
|
402
|
+
predictions = preds
|
|
403
|
+
df.loc[~missing_mask, "prediction_std"] = preds_std
|
|
404
|
+
|
|
405
|
+
# Set predictions only for complete rows
|
|
406
|
+
df.loc[~missing_mask, "prediction"] = predictions
|
|
322
407
|
|
|
323
|
-
# All done, return the DataFrame with new columns for the predictions
|
|
324
408
|
return df
|
|
325
409
|
|
|
326
410
|
|
|
@@ -331,12 +415,11 @@ if __name__ == "__main__":
|
|
|
331
415
|
target = TEMPLATE_PARAMS["target"]
|
|
332
416
|
features = TEMPLATE_PARAMS["features"]
|
|
333
417
|
orig_features = features.copy()
|
|
418
|
+
id_column = TEMPLATE_PARAMS["id_column"]
|
|
334
419
|
compressed_features = TEMPLATE_PARAMS["compressed_features"]
|
|
335
420
|
model_type = TEMPLATE_PARAMS["model_type"]
|
|
336
421
|
model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
|
|
337
|
-
train_all_data = TEMPLATE_PARAMS["train_all_data"]
|
|
338
422
|
hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
|
|
339
|
-
validation_split = 0.2
|
|
340
423
|
|
|
341
424
|
# Script arguments for input/output directories
|
|
342
425
|
parser = argparse.ArgumentParser()
|
|
@@ -354,9 +437,21 @@ if __name__ == "__main__":
|
|
|
354
437
|
# Combine files and read them all into a single pandas dataframe
|
|
355
438
|
all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
|
|
356
439
|
|
|
440
|
+
# Print out some info about the dataframe
|
|
441
|
+
print(f"All Data Shape: {all_df.shape}")
|
|
442
|
+
print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
|
|
443
|
+
print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
|
|
444
|
+
|
|
357
445
|
# Check if the dataframe is empty
|
|
358
446
|
check_dataframe(all_df, "training_df")
|
|
359
447
|
|
|
448
|
+
# Drop any rows with missing feature values
|
|
449
|
+
initial_row_count = all_df.shape[0]
|
|
450
|
+
all_df = all_df.dropna(subset=features)
|
|
451
|
+
dropped_rows = initial_row_count - all_df.shape[0]
|
|
452
|
+
if dropped_rows > 0:
|
|
453
|
+
print(f"Dropped {dropped_rows} rows due to missing feature values.")
|
|
454
|
+
|
|
360
455
|
# Features/Target output
|
|
361
456
|
print(f"Target: {target}")
|
|
362
457
|
print(f"Features: {str(features)}")
|
|
@@ -364,82 +459,88 @@ if __name__ == "__main__":
|
|
|
364
459
|
# Convert any features that might be categorical to 'category' type
|
|
365
460
|
all_df, category_mappings = convert_categorical_types(all_df, features)
|
|
366
461
|
|
|
462
|
+
# Print out some info about the dataframe
|
|
463
|
+
print(f"All Data Shape: {all_df.shape}")
|
|
464
|
+
print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
|
|
465
|
+
print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
|
|
466
|
+
|
|
367
467
|
# If we have compressed features, decompress them
|
|
368
468
|
if compressed_features:
|
|
369
469
|
print(f"Decompressing features {compressed_features}...")
|
|
370
470
|
all_df, features = decompress_features(all_df, features, compressed_features)
|
|
371
471
|
|
|
372
|
-
# Do we want to train on all the data?
|
|
373
|
-
if train_all_data:
|
|
374
|
-
print("Training on ALL of the data")
|
|
375
|
-
df_train = all_df.copy()
|
|
376
|
-
df_val = all_df.copy()
|
|
377
|
-
|
|
378
|
-
# Does the dataframe have a training column?
|
|
379
|
-
elif "training" in all_df.columns:
|
|
380
|
-
print("Found training column, splitting data based on training column")
|
|
381
|
-
df_train = all_df[all_df["training"]]
|
|
382
|
-
df_val = all_df[~all_df["training"]]
|
|
383
|
-
else:
|
|
384
|
-
# Just do a random training Split
|
|
385
|
-
print("WARNING: No training column found, splitting data with random state=42")
|
|
386
|
-
df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
|
|
387
|
-
print(f"FIT/TRAIN: {df_train.shape}")
|
|
388
|
-
print(f"VALIDATION: {df_val.shape}")
|
|
389
|
-
|
|
390
472
|
# Determine categorical and continuous columns
|
|
391
|
-
categorical_cols = [col for col in features if
|
|
473
|
+
categorical_cols = [col for col in features if all_df[col].dtype.name == "category"]
|
|
392
474
|
continuous_cols = [col for col in features if col not in categorical_cols]
|
|
393
|
-
|
|
394
475
|
print(f"Categorical columns: {categorical_cols}")
|
|
395
476
|
print(f"Continuous columns: {continuous_cols}")
|
|
396
477
|
|
|
397
|
-
#
|
|
398
|
-
|
|
399
|
-
target=[target],
|
|
400
|
-
continuous_cols=continuous_cols,
|
|
401
|
-
categorical_cols=categorical_cols,
|
|
402
|
-
)
|
|
478
|
+
# Cast continuous columns to float
|
|
479
|
+
all_df[continuous_cols] = all_df[continuous_cols].astype("float64")
|
|
403
480
|
|
|
404
|
-
# Choose the 'task' based on model type
|
|
481
|
+
# Choose the 'task' based on model type and set up the label encoder if needed
|
|
405
482
|
if model_type == "classifier":
|
|
406
483
|
task = "classification"
|
|
407
|
-
# Encode the target column
|
|
484
|
+
# Encode the target column on full dataset for consistent encoding
|
|
408
485
|
label_encoder = LabelEncoder()
|
|
409
|
-
|
|
410
|
-
|
|
486
|
+
all_df[target] = label_encoder.fit_transform(all_df[target])
|
|
487
|
+
num_classes = len(label_encoder.classes_)
|
|
411
488
|
else:
|
|
412
489
|
task = "regression"
|
|
413
490
|
label_encoder = None
|
|
491
|
+
num_classes = None
|
|
414
492
|
|
|
415
493
|
# Use any hyperparameters to set up both the trainer and model configurations
|
|
416
494
|
print(f"Hyperparameters: {hyperparameters}")
|
|
495
|
+
n_folds = hyperparameters.get("n_folds", 5) # Number of CV folds (default: 5)
|
|
496
|
+
|
|
497
|
+
# =========================================================================
|
|
498
|
+
# UNIFIED TRAINING: Works for n_folds=1 (single model) or n_folds>1 (K-fold CV)
|
|
499
|
+
# =========================================================================
|
|
500
|
+
print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
|
|
501
|
+
|
|
502
|
+
# Create fold splits
|
|
503
|
+
if n_folds == 1:
|
|
504
|
+
# Single fold: use train/val split from "training" column or random split
|
|
505
|
+
if "training" in all_df.columns:
|
|
506
|
+
print("Found training column, splitting data based on training column")
|
|
507
|
+
train_idx = np.where(all_df["training"])[0]
|
|
508
|
+
val_idx = np.where(~all_df["training"])[0]
|
|
509
|
+
else:
|
|
510
|
+
print("WARNING: No training column found, splitting data with random 80/20 split")
|
|
511
|
+
indices = np.arange(len(all_df))
|
|
512
|
+
train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
|
|
513
|
+
folds = [(train_idx, val_idx)]
|
|
514
|
+
else:
|
|
515
|
+
# K-Fold CV
|
|
516
|
+
if model_type == "classifier":
|
|
517
|
+
kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
|
|
518
|
+
split_target = all_df[target]
|
|
519
|
+
else:
|
|
520
|
+
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
|
|
521
|
+
split_target = None
|
|
522
|
+
folds = list(kfold.split(all_df, split_target))
|
|
523
|
+
|
|
524
|
+
# Initialize storage for out-of-fold predictions
|
|
525
|
+
oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
|
|
526
|
+
if model_type == "classifier" and num_classes and num_classes > 1:
|
|
527
|
+
oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
|
|
528
|
+
else:
|
|
529
|
+
oof_proba = None
|
|
417
530
|
|
|
418
|
-
|
|
419
|
-
trainer_defaults = {
|
|
420
|
-
"auto_lr_find": True,
|
|
421
|
-
"batch_size": min(1024, max(32, len(df_train) // 4)),
|
|
422
|
-
"max_epochs": 100,
|
|
423
|
-
"early_stopping": "valid_loss",
|
|
424
|
-
"early_stopping_patience": 15,
|
|
425
|
-
"checkpoints": "valid_loss",
|
|
426
|
-
"accelerator": "auto",
|
|
427
|
-
"progress_bar": "none",
|
|
428
|
-
"gradient_clip_val": 1.0,
|
|
429
|
-
}
|
|
531
|
+
ensemble_models = []
|
|
430
532
|
|
|
431
|
-
#
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
trainer_config = TrainerConfig(**trainer_params)
|
|
533
|
+
# Set up PyTorch Tabular data configuration (shared across folds)
|
|
534
|
+
data_config = DataConfig(
|
|
535
|
+
target=[target],
|
|
536
|
+
continuous_cols=continuous_cols,
|
|
537
|
+
categorical_cols=categorical_cols,
|
|
538
|
+
)
|
|
438
539
|
|
|
439
540
|
# Model config defaults
|
|
440
541
|
model_defaults = {
|
|
441
|
-
"layers": "
|
|
442
|
-
"activation": "
|
|
542
|
+
"layers": "256-128-64",
|
|
543
|
+
"activation": "LeakyReLU",
|
|
443
544
|
"learning_rate": 1e-3,
|
|
444
545
|
"dropout": 0.1,
|
|
445
546
|
"use_batch_norm": True,
|
|
@@ -447,63 +548,139 @@ if __name__ == "__main__":
|
|
|
447
548
|
}
|
|
448
549
|
# Override defaults with model_config if present
|
|
449
550
|
model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
|
|
450
|
-
# Print overwrites
|
|
451
551
|
for key, value in model_overrides.items():
|
|
452
552
|
print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
|
|
453
553
|
model_params = {**model_defaults, **model_overrides}
|
|
454
554
|
|
|
455
|
-
# Use CategoryEmbedding model configuration for general-purpose tabular modeling.
|
|
456
|
-
# Works effectively for both regression and classification as the foundational
|
|
457
|
-
# architecture in PyTorch Tabular
|
|
458
555
|
model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
|
|
459
556
|
optimizer_config = OptimizerConfig()
|
|
460
557
|
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
558
|
+
for fold_idx, (train_idx, val_idx) in enumerate(folds):
|
|
559
|
+
print(f"\n{'='*50}")
|
|
560
|
+
print(f"Training Fold {fold_idx + 1}/{len(folds)}")
|
|
561
|
+
print(f"{'='*50}")
|
|
562
|
+
|
|
563
|
+
# Split data for this fold
|
|
564
|
+
df_train = all_df.iloc[train_idx].reset_index(drop=True)
|
|
565
|
+
df_val = all_df.iloc[val_idx].reset_index(drop=True)
|
|
566
|
+
|
|
567
|
+
print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
|
|
568
|
+
|
|
569
|
+
# Set up PyTorch Tabular trainer configuration (per-fold for batch_size)
|
|
570
|
+
# Calculate batch size that avoids single-sample last batch (batch norm requires >1)
|
|
571
|
+
batch_size = min(128, max(32, len(df_train) // 16))
|
|
572
|
+
if len(df_train) % batch_size == 1:
|
|
573
|
+
batch_size += 1 # Adjust to avoid last batch of size 1
|
|
574
|
+
trainer_defaults = {
|
|
575
|
+
"auto_lr_find": False,
|
|
576
|
+
"batch_size": batch_size,
|
|
577
|
+
"max_epochs": 200,
|
|
578
|
+
"min_epochs": 10,
|
|
579
|
+
"early_stopping": "valid_loss",
|
|
580
|
+
"early_stopping_patience": 20,
|
|
581
|
+
"checkpoints": "valid_loss",
|
|
582
|
+
"accelerator": "auto",
|
|
583
|
+
"progress_bar": "none",
|
|
584
|
+
"gradient_clip_val": 1.0,
|
|
585
|
+
"seed": 42 + fold_idx,
|
|
586
|
+
}
|
|
587
|
+
|
|
588
|
+
# Override defaults with training_config if present
|
|
589
|
+
training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
|
|
590
|
+
if fold_idx == 0: # Only print overrides once
|
|
591
|
+
for key, value in training_overrides.items():
|
|
592
|
+
print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
|
|
593
|
+
trainer_params = {**trainer_defaults, **training_overrides}
|
|
594
|
+
trainer_config = TrainerConfig(**trainer_params)
|
|
595
|
+
|
|
596
|
+
# Create and train the TabularModel for this fold
|
|
597
|
+
tabular_model = TabularModel(
|
|
598
|
+
data_config=data_config,
|
|
599
|
+
model_config=model_config,
|
|
600
|
+
optimizer_config=optimizer_config,
|
|
601
|
+
trainer_config=trainer_config,
|
|
602
|
+
)
|
|
603
|
+
tabular_model.fit(train=df_train, validation=df_val)
|
|
604
|
+
ensemble_models.append(tabular_model)
|
|
605
|
+
|
|
606
|
+
# Make out-of-fold predictions
|
|
607
|
+
result = tabular_model.predict(df_val, include_input_features=False)
|
|
608
|
+
fold_preds = result[f"{target}_prediction"].values
|
|
609
|
+
|
|
610
|
+
# Store out-of-fold predictions
|
|
611
|
+
if model_type == "classifier":
|
|
612
|
+
oof_predictions[val_idx] = fold_preds.astype(int)
|
|
613
|
+
prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
|
|
614
|
+
if prob_cols and oof_proba is not None:
|
|
615
|
+
oof_proba[val_idx] = result[prob_cols].values
|
|
616
|
+
else:
|
|
617
|
+
oof_predictions[val_idx] = fold_preds.flatten()
|
|
471
618
|
|
|
472
|
-
|
|
473
|
-
print("Making Predictions on Validation Set...")
|
|
474
|
-
result = tabular_model.predict(df_val, include_input_features=False)
|
|
619
|
+
print(f"Fold {fold_idx + 1} complete!")
|
|
475
620
|
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
621
|
+
print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
|
|
622
|
+
|
|
623
|
+
# Use out-of-fold predictions for metrics
|
|
624
|
+
# For n_folds=1, we only have predictions for val_idx, so filter to those rows
|
|
625
|
+
if n_folds == 1:
|
|
626
|
+
val_mask = ~np.isnan(oof_predictions)
|
|
627
|
+
preds = oof_predictions[val_mask]
|
|
628
|
+
df_val = all_df[val_mask].copy()
|
|
629
|
+
if oof_proba is not None:
|
|
630
|
+
oof_proba = oof_proba[val_mask]
|
|
480
631
|
else:
|
|
481
|
-
|
|
482
|
-
|
|
632
|
+
preds = oof_predictions
|
|
633
|
+
df_val = all_df.copy()
|
|
634
|
+
|
|
635
|
+
# Compute prediction_std by running all ensemble models on validation data
|
|
636
|
+
# For n_folds=1, std will be 0 (only one model). For n_folds>1, std shows ensemble disagreement.
|
|
637
|
+
preds_std = None
|
|
638
|
+
if model_type in ["regressor", "uq_regressor"] and len(ensemble_models) > 0:
|
|
639
|
+
print("Computing prediction_std from ensemble predictions on validation data...")
|
|
640
|
+
all_ensemble_preds_for_std = []
|
|
641
|
+
for ens_model in ensemble_models:
|
|
642
|
+
result = ens_model.predict(df_val[features], include_input_features=False)
|
|
643
|
+
ens_preds = result[f"{target}_prediction"].values.flatten()
|
|
644
|
+
all_ensemble_preds_for_std.append(ens_preds)
|
|
645
|
+
|
|
646
|
+
ensemble_preds_stacked = np.stack(all_ensemble_preds_for_std, axis=0)
|
|
647
|
+
preds_std = np.std(ensemble_preds_stacked, axis=0)
|
|
648
|
+
print(f"Ensemble prediction_std - mean: {np.mean(preds_std):.4f}, max: {np.max(preds_std):.4f}")
|
|
483
649
|
|
|
484
650
|
if model_type == "classifier":
|
|
485
651
|
# Get probabilities for classification
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
probs = result[prob_cols].values
|
|
490
|
-
df_val["pred_proba"] = [p.tolist() for p in probs]
|
|
491
|
-
|
|
492
|
-
# Expand the pred_proba column into separate columns for each class
|
|
493
|
-
print(df_val.columns)
|
|
652
|
+
if oof_proba is not None:
|
|
653
|
+
df_val = df_val.copy()
|
|
654
|
+
df_val["pred_proba"] = [p.tolist() for p in oof_proba]
|
|
494
655
|
df_val = expand_proba_column(df_val, label_encoder.classes_)
|
|
495
|
-
print(df_val.columns)
|
|
496
656
|
|
|
497
657
|
# Decode the target and prediction labels
|
|
498
658
|
y_validate = label_encoder.inverse_transform(df_val[target])
|
|
499
|
-
|
|
659
|
+
preds_decoded = label_encoder.inverse_transform(preds.astype(int))
|
|
500
660
|
else:
|
|
501
661
|
y_validate = df_val[target].values
|
|
662
|
+
preds_decoded = preds
|
|
663
|
+
|
|
664
|
+
# Save predictions to S3
|
|
665
|
+
df_val = df_val.copy()
|
|
666
|
+
df_val["prediction"] = preds_decoded
|
|
667
|
+
|
|
668
|
+
# Build output columns - include id_column if it exists
|
|
669
|
+
output_columns = []
|
|
670
|
+
if id_column in df_val.columns:
|
|
671
|
+
output_columns.append(id_column)
|
|
672
|
+
output_columns += [target, "prediction"]
|
|
673
|
+
|
|
674
|
+
# Add prediction_std for regression models (always present, 0 for single model)
|
|
675
|
+
if model_type in ["regressor", "uq_regressor"]:
|
|
676
|
+
if preds_std is not None:
|
|
677
|
+
df_val["prediction_std"] = preds_std
|
|
678
|
+
else:
|
|
679
|
+
df_val["prediction_std"] = 0.0
|
|
680
|
+
output_columns.append("prediction_std")
|
|
681
|
+
print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
|
|
502
682
|
|
|
503
|
-
|
|
504
|
-
df_val["prediction"] = preds
|
|
505
|
-
output_columns = [target, "prediction"]
|
|
506
|
-
output_columns += [col for col in df_val.columns if col.endswith("_probability")]
|
|
683
|
+
output_columns += [col for col in df_val.columns if col.endswith("_proba")]
|
|
507
684
|
wr.s3.to_csv(
|
|
508
685
|
df_val[output_columns],
|
|
509
686
|
path=f"{model_metrics_s3_path}/validation_predictions.csv",
|
|
@@ -516,7 +693,7 @@ if __name__ == "__main__":
|
|
|
516
693
|
label_names = label_encoder.classes_
|
|
517
694
|
|
|
518
695
|
# Calculate various model performance metrics
|
|
519
|
-
scores = precision_recall_fscore_support(y_validate,
|
|
696
|
+
scores = precision_recall_fscore_support(y_validate, preds_decoded, average=None, labels=label_names)
|
|
520
697
|
|
|
521
698
|
# Put the scores into a dataframe
|
|
522
699
|
score_df = pd.DataFrame(
|
|
@@ -529,7 +706,7 @@ if __name__ == "__main__":
|
|
|
529
706
|
}
|
|
530
707
|
)
|
|
531
708
|
|
|
532
|
-
#
|
|
709
|
+
# Output metrics per class
|
|
533
710
|
metrics = ["precision", "recall", "f1", "support"]
|
|
534
711
|
for t in label_names:
|
|
535
712
|
for m in metrics:
|
|
@@ -537,7 +714,7 @@ if __name__ == "__main__":
|
|
|
537
714
|
print(f"Metrics:{t}:{m} {value}")
|
|
538
715
|
|
|
539
716
|
# Compute and output the confusion matrix
|
|
540
|
-
conf_mtx = confusion_matrix(y_validate,
|
|
717
|
+
conf_mtx = confusion_matrix(y_validate, preds_decoded, labels=label_names)
|
|
541
718
|
for i, row_name in enumerate(label_names):
|
|
542
719
|
for j, col_name in enumerate(label_names):
|
|
543
720
|
value = conf_mtx[i, j]
|
|
@@ -545,22 +722,37 @@ if __name__ == "__main__":
|
|
|
545
722
|
|
|
546
723
|
else:
|
|
547
724
|
# Calculate various model performance metrics (regression)
|
|
548
|
-
rmse = root_mean_squared_error(y_validate,
|
|
549
|
-
mae = mean_absolute_error(y_validate,
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
print(f"
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
725
|
+
rmse = root_mean_squared_error(y_validate, preds_decoded)
|
|
726
|
+
mae = mean_absolute_error(y_validate, preds_decoded)
|
|
727
|
+
medae = median_absolute_error(y_validate, preds_decoded)
|
|
728
|
+
r2 = r2_score(y_validate, preds_decoded)
|
|
729
|
+
spearman_corr = spearmanr(y_validate, preds_decoded).correlation
|
|
730
|
+
support = len(df_val)
|
|
731
|
+
print(f"rmse: {rmse:.3f}")
|
|
732
|
+
print(f"mae: {mae:.3f}")
|
|
733
|
+
print(f"medae: {medae:.3f}")
|
|
734
|
+
print(f"r2: {r2:.3f}")
|
|
735
|
+
print(f"spearmanr: {spearman_corr:.3f}")
|
|
736
|
+
print(f"support: {support}")
|
|
737
|
+
|
|
738
|
+
# Save ensemble models
|
|
739
|
+
for model_idx, ens_model in enumerate(ensemble_models):
|
|
740
|
+
model_path = os.path.join(args.model_dir, f"tabular_model_{model_idx}")
|
|
741
|
+
ens_model.save_model(model_path)
|
|
742
|
+
print(f"Saved model {model_idx + 1} to {model_path}")
|
|
743
|
+
|
|
744
|
+
# Save ensemble metadata
|
|
745
|
+
n_ensemble = len(ensemble_models)
|
|
746
|
+
ensemble_metadata = {"n_ensemble": n_ensemble, "n_folds": n_folds}
|
|
747
|
+
joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
|
|
748
|
+
print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")
|
|
749
|
+
|
|
558
750
|
if label_encoder:
|
|
559
751
|
joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
|
|
560
752
|
|
|
561
753
|
# Save the features (this will validate input during predictions)
|
|
562
754
|
with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
|
|
563
|
-
json.dump(orig_features, fp)
|
|
755
|
+
json.dump(orig_features, fp)
|
|
564
756
|
|
|
565
757
|
# Save the category mappings
|
|
566
758
|
with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
|