workbench 0.8.205__py3-none-any.whl → 0.8.212__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/models/noise_model.py +388 -0
- workbench/api/endpoint.py +3 -6
- workbench/api/feature_set.py +1 -1
- workbench/api/model.py +5 -11
- workbench/cached/cached_model.py +4 -4
- workbench/core/artifacts/endpoint_core.py +57 -145
- workbench/core/artifacts/model_core.py +21 -19
- workbench/core/transforms/features_to_model/features_to_model.py +2 -2
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +1 -1
- workbench/model_script_utils/model_script_utils.py +335 -0
- workbench/model_script_utils/pytorch_utils.py +395 -0
- workbench/model_script_utils/uq_harness.py +278 -0
- workbench/model_scripts/chemprop/chemprop.template +289 -666
- workbench/model_scripts/chemprop/generated_model_script.py +292 -669
- workbench/model_scripts/chemprop/model_script_utils.py +335 -0
- workbench/model_scripts/chemprop/requirements.txt +2 -10
- workbench/model_scripts/pytorch_model/generated_model_script.py +355 -612
- workbench/model_scripts/pytorch_model/model_script_utils.py +335 -0
- workbench/model_scripts/pytorch_model/pytorch.template +350 -607
- workbench/model_scripts/pytorch_model/pytorch_utils.py +395 -0
- workbench/model_scripts/pytorch_model/requirements.txt +1 -1
- workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
- workbench/model_scripts/script_generation.py +2 -5
- workbench/model_scripts/uq_models/generated_model_script.py +65 -422
- workbench/model_scripts/xgb_model/generated_model_script.py +349 -412
- workbench/model_scripts/xgb_model/model_script_utils.py +335 -0
- workbench/model_scripts/xgb_model/uq_harness.py +278 -0
- workbench/model_scripts/xgb_model/xgb_model.template +344 -407
- workbench/scripts/training_test.py +85 -0
- workbench/utils/chemprop_utils.py +18 -656
- workbench/utils/metrics_utils.py +172 -0
- workbench/utils/model_utils.py +104 -47
- workbench/utils/pytorch_utils.py +32 -472
- workbench/utils/xgboost_local_crossfold.py +267 -0
- workbench/utils/xgboost_model_utils.py +49 -356
- workbench/web_interface/components/plugins/model_details.py +30 -68
- {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/METADATA +5 -5
- {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/RECORD +42 -31
- {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/entry_points.txt +1 -0
- workbench/model_scripts/uq_models/mapie.template +0 -605
- workbench/model_scripts/uq_models/requirements.txt +0 -1
- {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/WHEEL +0 -0
- {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/top_level.txt +0 -0
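As context for the regenerated PyTorch model script diffed below, the script now folds template-supplied hyperparameters over a new DEFAULT_HYPERPARAMETERS dictionary, with the template values taking precedence. A minimal sketch of that merge, assuming illustrative override values (only the dictionary names and the merge expression come from the script itself; the values shown here are hypothetical):

# Sketch of the override-over-defaults merge used by the regenerated script.
# The override values below are hypothetical; only the names mirror the diff.
DEFAULT_HYPERPARAMETERS = {"n_folds": 5, "max_epochs": 200, "batch_size": 128, "seed": 42}
TEMPLATE_PARAMS = {"hyperparameters": {"n_folds": 1, "batch_size": 64}}

hyperparameters = {**DEFAULT_HYPERPARAMETERS, **(TEMPLATE_PARAMS["hyperparameters"] or {})}
print(hyperparameters)  # {'n_folds': 1, 'max_epochs': 200, 'batch_size': 64, 'seed': 42}

An empty or missing "hyperparameters" entry (the `or {}` guard) falls back to the defaults unchanged.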
workbench/model_scripts/pytorch_model/generated_model_script.py

@@ -1,417 +1,248 @@
-#
+# PyTorch Tabular Model Template for Workbench
+#
+# This template handles both classification and regression models with:
+# - K-fold cross-validation ensemble training (or single train/val split)
+# - Out-of-fold predictions for validation metrics
+# - Categorical feature embedding via TabularMLP
+# - Compressed feature decompression
+
+import argparse
+import json
 import os
+
 import awswrangler as wr
+import joblib
 import numpy as np
-
-
-
-os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
-from pytorch_tabular import TabularModel
-from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
-from pytorch_tabular.models import CategoryEmbeddingModelConfig
-
-# Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    median_absolute_error,
-    r2_score,
-    root_mean_squared_error,
-    precision_recall_fscore_support,
-    confusion_matrix,
-)
-from scipy.stats import spearmanr
-
-# Classification Encoder
+import pandas as pd
+import torch
+from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
 from sklearn.preprocessing import LabelEncoder

-#
-
+# Enable Tensor Core optimization for GPUs that support it
+torch.set_float32_matmul_precision("medium")
+
+from model_script_utils import (
+    check_dataframe,
+    compute_classification_metrics,
+    compute_regression_metrics,
+    convert_categorical_types,
+    decompress_features,
+    expand_proba_column,
+    input_fn,
+    match_features_case_insensitive,
+    output_fn,
+    print_classification_metrics,
+    print_confusion_matrix,
+    print_regression_metrics,
+)
+from pytorch_utils import (
+    FeatureScaler,
+    create_model,
+    load_model,
+    predict,
+    prepare_data,
+    save_model,
+    train_model,
+)
+from uq_harness import (
+    compute_confidence,
+    load_uq_models,
+    predict_intervals,
+    save_uq_models,
+    train_uq_models,
+)

-
-
-
-
-
+# =============================================================================
+# Default Hyperparameters
+# =============================================================================
+DEFAULT_HYPERPARAMETERS = {
+    # Training parameters
+    "n_folds": 5,
+    "max_epochs": 200,
+    "early_stopping_patience": 20,
+    "batch_size": 128,
+    # Model architecture
+    "layers": "256-128-64",
+    "learning_rate": 1e-3,
+    "dropout": 0.1,
+    "use_batch_norm": True,
+    # Random seed
+    "seed": 42,
+}

-# Template
+# Template parameters (filled in by Workbench)
 TEMPLATE_PARAMS = {
     "model_type": "uq_regressor",
-    "target": "
-    "features": ['
-    "id_column": "
+    "target": "udm_asy_res_efflux_ratio",
+    "features": ['smr_vsa4', 'tpsa', 'numhdonors', 'nhohcount', 'nbase', 'vsa_estate3', 'fr_guanido', 'mollogp', 'peoe_vsa8', 'peoe_vsa1', 'fr_imine', 'vsa_estate2', 'estate_vsa10', 'asphericity', 'xc_3dv', 'smr_vsa3', 'charge_centroid_distance', 'c3sp3', 'nitrogen_span', 'estate_vsa2', 'minpartialcharge', 'hba_hbd_ratio', 'slogp_vsa1', 'axp_7d', 'nocount', 'vsa_estate4', 'vsa_estate6', 'estate_vsa4', 'xc_4dv', 'xc_4d', 'num_s_centers', 'vsa_estate9', 'chi2v', 'axp_5d', 'mi', 'mse', 'bcut2d_mrhi', 'smr_vsa6', 'hallkieralpha', 'balabanj', 'amphiphilic_moment', 'type_ii_pattern_count', 'minabsestateindex', 'bcut2d_mwlow', 'axp_0dv', 'slogp_vsa5', 'axp_2d', 'axp_1dv', 'xch_5d', 'peoe_vsa10', 'molecular_asymmetry', 'kappa3', 'estate_vsa3', 'sse', 'bcut2d_logphi', 'fr_imidazole', 'molecular_volume_3d', 'bertzct', 'maxestateindex', 'aromatic_interaction_score', 'axp_3d', 'radius_of_gyration', 'vsa_estate7', 'si', 'axp_5dv', 'molecular_axis_length', 'estate_vsa6', 'fpdensitymorgan1', 'axp_6d', 'estate_vsa9', 'fpdensitymorgan2', 'xp_0dv', 'xp_6dv', 'molmr', 'qed', 'estate_vsa8', 'peoe_vsa9', 'xch_6dv', 'xp_7d', 'slogp_vsa2', 'xp_5dv', 'bcut2d_chghi', 'xch_6d', 'chi0n', 'slogp_vsa3', 'chi1v', 'chi3v', 'bcut2d_chglo', 'axp_1d', 'mp', 'num_defined_stereocenters', 'xp_3dv', 'bcut2d_mrlow', 'fr_al_oh', 'peoe_vsa7', 'chi2n', 'axp_6dv', 'axp_2dv', 'chi4n', 'xc_3d', 'axp_7dv', 'vsa_estate8', 'xch_7d', 'maxpartialcharge', 'chi1n', 'peoe_vsa2', 'axp_3dv', 'bcut2d_logplow', 'mv', 'xpc_5dv', 'kappa2', 'vsa_estate5', 'xp_5d', 'mm', 'maxabspartialcharge', 'axp_4dv', 'maxabsestateindex', 'axp_4d', 'xch_4dv', 'xp_2dv', 'heavyatommolwt', 'numatomstereocenters', 'xp_7dv', 'numsaturatedheterocycles', 'xp_3d', 'kappa1', 'mz', 'axp_0d', 'chi1', 'xch_4d', 'smr_vsa1', 'xp_2d', 'estate_vsa5', 'phi', 'fr_ether', 'xc_5d', 'c1sp3', 'estate_vsa7', 'estate_vsa1', 'vsa_estate1', 'slogp_vsa4', 'avgipc', 'smr_vsa10', 'numvalenceelectrons', 'xc_5dv', 'peoe_vsa12', 'peoe_vsa6', 'xpc_5d', 'xpc_6d', 'minestateindex', 'chi3n', 'smr_vsa5', 'xp_4d', 'numheteroatoms', 'fpdensitymorgan3', 'xpc_4d', 'sps', 'xp_1d', 'sv', 'fr_ar_n', 'slogp_vsa10', 'c2sp3', 'xpc_4dv', 'chi0v', 'xpc_6dv', 'xp_1dv', 'vsa_estate10', 'sare', 'c2sp2', 'mpe', 'xch_7dv', 'chi4v', 'type_i_pattern_count', 'sp', 'slogp_vsa8', 'amide_count', 'num_stereocenters', 'num_r_centers', 'tertiary_amine_count', 'spe', 'xp_4dv', 'numsaturatedrings', 'mare', 'numhacceptors', 'chi0', 'fractioncsp3', 'fr_nh0', 'xch_5dv', 'fr_aniline', 'smr_vsa7', 'labuteasa', 'c3sp2', 'xp_0d', 'xp_6d', 'peoe_vsa11', 'fr_ar_nh', 'molwt', 'intramolecular_hbond_potential', 'peoe_vsa3', 'fr_nhpyrrole', 'numaliphaticrings', 'hybratio', 'smr_vsa9', 'peoe_vsa13', 'bcut2d_mwhi', 'c1sp2', 'slogp_vsa11', 'numrotatablebonds', 'numaliphaticcarbocycles', 'slogp_vsa6', 'peoe_vsa4', 'numunspecifiedatomstereocenters', 'xc_6d', 'xc_6dv', 'num_unspecified_stereocenters', 'sz', 'minabspartialcharge', 'fcsp3', 'c1sp1', 'fr_piperzine', 'numaliphaticheterocycles', 'numamidebonds', 'fr_benzene', 'numaromaticheterocycles', 'sm', 'fr_priamide', 'fr_piperdine', 'fr_methoxy', 'c4sp3', 'fr_c_o_nocoo', 'exactmolwt', 'stereo_complexity', 'fr_hoccn', 'numaromaticcarbocycles', 'fr_nh2', 'numheterocycles', 'fr_morpholine', 'fr_ketone', 'fr_nh1', 'frac_defined_stereo', 'fr_aryl_methyl', 'fr_alkyl_halide', 'fr_phenol', 'fr_al_oh_notert', 'fr_ar_oh', 'fr_pyridine', 'fr_amide', 'slogp_vsa7', 'fr_halogen', 'numsaturatedcarbocycles', 'slogp_vsa12', 'fr_ndealkylation1', 'xch_3d', 'fr_bicyclic', 'naromatom', 'narombond'],
+    "id_column": "udm_mol_bat_id",
     "compressed_features": [],
-    "model_metrics_s3_path": "s3://
-    "hyperparameters": {
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-pytorch/training",
+    "hyperparameters": {},
 }


-
-
-
-
-    Args:
-        df (pd.DataFrame): DataFrame to check
-        df_name (str): Name of the DataFrame
-    """
-    if df.empty:
-        msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
-        print(msg)
-        raise ValueError(msg)
-
-
-def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFrame:
-    """
-    Expands a column in a DataFrame containing a list of probabilities into separate columns.
-
-    Args:
-        df (pd.DataFrame): DataFrame containing a "pred_proba" column
-        class_labels (list[str]): List of class labels
-
-    Returns:
-        pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
-    """
-    proba_column = "pred_proba"
-    if proba_column not in df.columns:
-        raise ValueError('DataFrame does not contain a "pred_proba" column')
-
-    # Construct new column names with '_proba' suffix
-    proba_splits = [f"{label}_proba" for label in class_labels]
-
-    # Expand the proba_column into separate columns for each probability
-    proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
-
-    # Drop any proba columns and reset the index in prep for the concat
-    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
-    df = df.reset_index(drop=True)
-
-    # Concatenate the new columns with the original DataFrame
-    df = pd.concat([df, proba_df], axis=1)
-    return df
-
-
-def match_features_case_insensitive(df: pd.DataFrame, model_features: list[str]) -> pd.DataFrame:
-    """
-    Matches and renames DataFrame columns to match model feature names (case-insensitive).
-    Prioritizes exact matches, then case-insensitive matches.
-
-    Raises ValueError if any model features cannot be matched.
-    """
-    df_columns_lower = {col.lower(): col for col in df.columns}
-    rename_dict = {}
-    missing = []
-    for feature in model_features:
-        if feature in df.columns:
-            continue  # Exact match
-        elif feature.lower() in df_columns_lower:
-            rename_dict[df_columns_lower[feature.lower()]] = feature
-        else:
-            missing.append(feature)
-
-    if missing:
-        raise ValueError(f"Features not found: {missing}")
-
-    # Rename the DataFrame columns to match the model features
-    return df.rename(columns=rename_dict)
-
-
-def convert_categorical_types(
-    df: pd.DataFrame, features: list[str], category_mappings: dict[str, list[str]] | None = None
-) -> tuple[pd.DataFrame, dict[str, list[str]]]:
-    """
-    Converts appropriate columns to categorical type with consistent mappings.
-
-    Args:
-        df (pd.DataFrame): The DataFrame to process.
-        features (list): List of feature names to consider for conversion.
-        category_mappings (dict, optional): Existing category mappings. If None or empty,
-                                            we're in training mode. If populated, we're in
-                                            inference mode.
-
-    Returns:
-        tuple: (processed DataFrame, category mappings dictionary)
-    """
-    if category_mappings is None:
-        category_mappings = {}
-
-    # Training mode
-    if not category_mappings:
-        for col in df.select_dtypes(include=["object", "string"]):
-            if col in features and df[col].nunique() < 20:
-                print(f"Training mode: Converting {col} to category")
-                df[col] = df[col].astype("category")
-                category_mappings[col] = df[col].cat.categories.tolist()
-
-    # Inference mode
-    else:
-        for col, categories in category_mappings.items():
-            if col in df.columns:
-                print(f"Inference mode: Applying categorical mapping for {col}")
-                df[col] = pd.Categorical(df[col], categories=categories)
-
-    return df, category_mappings
-
-
-def decompress_features(
-    df: pd.DataFrame, features: list[str], compressed_features: list[str]
-) -> tuple[pd.DataFrame, list[str]]:
-    """Prepare features for the model
-
-    Args:
-        df (pd.DataFrame): The features DataFrame
-        features (list[str]): Full list of feature names
-        compressed_features (list[str]): List of feature names to decompress (bitstrings)
-
-    Returns:
-        pd.DataFrame: DataFrame with the decompressed features
-        list[str]: Updated list of feature names after decompression
-
-    Raises:
-        ValueError: If any missing values are found in the specified features
-    """
-    # Check for any missing values in the required features
-    missing_counts = df[features].isna().sum()
-    if missing_counts.any():
-        missing_features = missing_counts[missing_counts > 0]
-        print(
-            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
-            "WARNING: You might want to remove/replace all NaN values before processing."
-        )
-
-    # Make a copy to avoid mutating the original list
-    decompressed_features = features.copy()
-
-    for feature in compressed_features:
-        if (feature not in df.columns) or (feature not in decompressed_features):
-            print(f"Feature '{feature}' not in the features list, skipping decompression.")
-            continue
-
-        # Remove the feature from the list of features to avoid duplication
-        decompressed_features.remove(feature)
-
-        # Handle all compressed features as bitstrings
-        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
-        prefix = feature[:3]
-
-        # Create all new columns at once - avoids fragmentation
-        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
-        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
-
-        # Add to features list
-        decompressed_features.extend(new_col_names)
-
-        # Drop original column and concatenate new ones
-        df = df.drop(columns=[feature])
-        df = pd.concat([df, new_df], axis=1)
-
-    return df, decompressed_features
-
-
+# =============================================================================
+# Model Loading (for SageMaker inference)
+# =============================================================================
 def model_fn(model_dir: str) -> dict:
-    """Load
-
-
-
-
-
-        Dictionary with ensemble models and metadata
-    """
-    import torch
-    from functools import partial
-
-    # Load ensemble metadata if present
-    ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
-    if os.path.exists(ensemble_metadata_path):
-        ensemble_metadata = joblib.load(ensemble_metadata_path)
-        n_ensemble = ensemble_metadata["n_ensemble"]
+    """Load TabularMLP ensemble from the specified directory."""
+    # Load ensemble metadata
+    metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
+    if os.path.exists(metadata_path):
+        metadata = joblib.load(metadata_path)
+        n_ensemble = metadata["n_ensemble"]
     else:
         n_ensemble = 1

-    # Determine
-
+    # Determine device
+    device = "cuda" if torch.cuda.is_available() else "cpu"

-    #
-    # This handles the case where pytorch-tabular loads callbacks.sav via joblib,
-    # which internally calls torch.load without map_location
-    original_torch_load = torch.load
-    torch.load = partial(original_torch_load, map_location=map_location)
-
-    # Save current working directory
-    original_cwd = os.getcwd()
+    # Load ensemble models
     ensemble_models = []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    if not input_data:
-        raise ValueError("Empty input data is not supported!")
-
-    # Decode bytes to string if necessary
-    if isinstance(input_data, bytes):
-        input_data = input_data.decode("utf-8")
-
-    if "text/csv" in content_type:
-        return pd.read_csv(StringIO(input_data))
-    elif "application/json" in content_type:
-        return pd.DataFrame(json.loads(input_data))  # Assumes JSON array of records
-    else:
-        raise ValueError(f"{content_type} not supported!")
-
-
-def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
-    """Supports both CSV and JSON output formats."""
-    if "text/csv" in accept_type:
-        csv_output = output_df.fillna("N/A").to_csv(index=False)
-        return csv_output, "text/csv"
-    elif "application/json" in accept_type:
-        return output_df.to_json(orient="records"), "application/json"
-    else:
-        raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
+    for i in range(n_ensemble):
+        model_path = os.path.join(model_dir, f"model_{i}")
+        model = load_model(model_path, device=device)
+        ensemble_models.append(model)
+
+    print(f"Loaded {len(ensemble_models)} model(s)")
+
+    # Load feature scaler
+    scaler = FeatureScaler.load(os.path.join(model_dir, "scaler.joblib"))
+
+    # Load UQ models (regression only)
+    uq_models, uq_metadata = None, None
+    uq_path = os.path.join(model_dir, "uq_metadata.json")
+    if os.path.exists(uq_path):
+        uq_models, uq_metadata = load_uq_models(model_dir)
+
+    return {
+        "ensemble_models": ensemble_models,
+        "n_ensemble": n_ensemble,
+        "scaler": scaler,
+        "uq_models": uq_models,
+        "uq_metadata": uq_metadata,
+    }


+# =============================================================================
+# Inference (for SageMaker inference)
+# =============================================================================
 def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
-    """Make
-
-    Args:
-        df (pd.DataFrame): The input DataFrame
-        model_dict: Dictionary containing ensemble models and metadata
-
-    Returns:
-        pd.DataFrame: The DataFrame with predictions (and prediction_std for ensembles)
-    """
+    """Make predictions with TabularMLP ensemble."""
     model_type = TEMPLATE_PARAMS["model_type"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")

-    #
+    # Load artifacts
     ensemble_models = model_dict["ensemble_models"]
-
+    scaler = model_dict["scaler"]
+    uq_models = model_dict.get("uq_models")
+    uq_metadata = model_dict.get("uq_metadata")

-
-
-    with open(os.path.join(model_dir, "
-
-
+    with open(os.path.join(model_dir, "feature_columns.json")) as f:
+        features = json.load(f)
+    with open(os.path.join(model_dir, "category_mappings.json")) as f:
+        category_mappings = json.load(f)
+    with open(os.path.join(model_dir, "feature_metadata.json")) as f:
+        feature_metadata = json.load(f)

-
-
-        category_mappings = json.load(fp)
+    continuous_cols = feature_metadata["continuous_cols"]
+    categorical_cols = feature_metadata["categorical_cols"]

-    # Load our Label Encoder if we have one
     label_encoder = None
-
-    if os.path.exists(
-        label_encoder = joblib.load(
+    encoder_path = os.path.join(model_dir, "label_encoder.joblib")
+    if os.path.exists(encoder_path):
+        label_encoder = joblib.load(encoder_path)

-
-    matched_df = match_features_case_insensitive(df, features)
+    print(f"Model Features: {features}")

-    #
+    # Prepare features
+    matched_df = match_features_case_insensitive(df, features)
     matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)

-    # If we have compressed features, decompress them
     if compressed_features:
         print("Decompressing features for prediction...")
         matched_df, features = decompress_features(matched_df, features, compressed_features)

-    # Track
+    # Track missing features
     missing_mask = matched_df[features].isna().any(axis=1)
     if missing_mask.any():
-        print(f"Warning: {missing_mask.sum()} rows have missing features
+        print(f"Warning: {missing_mask.sum()} rows have missing features")

-    # Initialize
+    # Initialize output columns
     df["prediction"] = np.nan
     if model_type in ["regressor", "uq_regressor"]:
         df["prediction_std"] = np.nan

-
-    complete_df = matched_df[~missing_mask]
+    complete_df = matched_df[~missing_mask].copy()
     if len(complete_df) == 0:
         print("Warning: No complete rows to predict on")
         return df

-    #
-
-
-
-    # Collect predictions from all ensemble members
-    all_ensemble_preds = []
-    all_ensemble_probs = []
-
-    for ens_idx, ens_model in enumerate(ensemble_models):
-        result = ens_model.predict(complete_df[features])
-
-        if prediction_column in result.columns:
-            ens_preds = result[prediction_column].values
-        else:
-            raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
-
-        all_ensemble_preds.append(ens_preds)
+    # Prepare data for inference (with standardization)
+    x_cont, x_cat, _, _, _ = prepare_data(
+        complete_df, continuous_cols, categorical_cols, category_mappings=category_mappings, scaler=scaler
+    )

-
-
-
-
-
+    # Collect ensemble predictions
+    all_preds = []
+    for model in ensemble_models:
+        preds = predict(model, x_cont, x_cat)
+        all_preds.append(preds)

-    #
-    ensemble_preds = np.stack(
+    # Aggregate predictions
+    ensemble_preds = np.stack(all_preds, axis=0)
     preds = np.mean(ensemble_preds, axis=0)
-    preds_std = np.std(ensemble_preds, axis=0)
+    preds_std = np.std(ensemble_preds, axis=0)

-    print(f"Inference
+    print(f"Inference complete: {len(preds)} predictions, {len(ensemble_models)} ensemble members")

-    # Handle classification vs regression
     if label_encoder is not None:
-        #
-
-
-
-
-
-
-
-        all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
-        df["pred_proba"] = all_proba
-
-        # Expand the pred_proba column into separate columns for each class
-        df = expand_proba_column(df, label_encoder.classes_)
-        else:
-            # No probabilities, use averaged predictions
-            predictions = label_encoder.inverse_transform(preds.astype(int))
+        # Classification: average probabilities, then argmax
+        avg_probs = preds  # Already softmax output
+        class_preds = np.argmax(avg_probs, axis=1)
+        predictions = label_encoder.inverse_transform(class_preds)
+
+        all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
+        all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
+        df["pred_proba"] = all_proba
+        df = expand_proba_column(df, label_encoder.classes_)
     else:
-        # Regression
-        predictions = preds
-        df.loc[~missing_mask, "prediction_std"] = preds_std
+        # Regression
+        predictions = preds.flatten()
+        df.loc[~missing_mask, "prediction_std"] = preds_std.flatten()
+
+        # Add UQ intervals if available
+        if uq_models and uq_metadata:
+            X_complete = complete_df[features]
+            df_complete = df.loc[~missing_mask].copy()
+            df_complete["prediction"] = predictions  # Set prediction before compute_confidence
+            df_complete = predict_intervals(df_complete, X_complete, uq_models, uq_metadata)
+            df_complete = compute_confidence(df_complete, uq_metadata["median_interval_width"], "q_10", "q_90")
+            # Copy UQ columns back to main dataframe
+            for col in df_complete.columns:
+                if col.startswith("q_") or col == "confidence":
+                    df.loc[~missing_mask, col] = df_complete[col].values

-    # Set predictions only for complete rows
     df.loc[~missing_mask, "prediction"] = predictions
-
     return df


+# =============================================================================
+# Training
+# =============================================================================
 if __name__ == "__main__":
-
+    # -------------------------------------------------------------------------
+    # Setup: Parse arguments and load data
+    # -------------------------------------------------------------------------
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+    parser.add_argument("--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"))
+    args = parser.parse_args()

-    #
+    # Extract template parameters
     target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
@@ -419,341 +250,253 @@ if __name__ == "__main__":
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
-
-    # Script arguments for input/output directories
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
-    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
-    parser.add_argument(
-        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
-    )
-    args = parser.parse_args()
+    hyperparameters = {**DEFAULT_HYPERPARAMETERS, **(TEMPLATE_PARAMS["hyperparameters"] or {})}

-    #
-    training_files = [os.path.join(args.train,
+    # Load training data
+    training_files = [os.path.join(args.train, f) for f in os.listdir(args.train) if f.endswith(".csv")]
     print(f"Training Files: {training_files}")
-
-    # Combine files and read them all into a single pandas dataframe
-    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
-
-    # Print out some info about the dataframe
-    print(f"All Data Shape: {all_df.shape}")
-    print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
-    print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
-
-    # Check if the dataframe is empty
+    all_df = pd.concat([pd.read_csv(f, engine="python") for f in training_files])
     check_dataframe(all_df, "training_df")

-    # Drop
-
+    # Drop rows with missing features
+    initial_count = len(all_df)
     all_df = all_df.dropna(subset=features)
-
-
-    print(f"Dropped {dropped_rows} rows due to missing feature values.")
+    if len(all_df) < initial_count:
+        print(f"Dropped {initial_count - len(all_df)} rows with missing features")

-    # Features/Target output
     print(f"Target: {target}")
-    print(f"Features: {
+    print(f"Features: {features}")
+    print(f"Hyperparameters: {hyperparameters}")

-    #
+    # -------------------------------------------------------------------------
+    # Preprocessing
+    # -------------------------------------------------------------------------
     all_df, category_mappings = convert_categorical_types(all_df, features)

-    # Print out some info about the dataframe
-    print(f"All Data Shape: {all_df.shape}")
-    print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
-    print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
-
-    # If we have compressed features, decompress them
     if compressed_features:
-        print(f"Decompressing features {compressed_features}
+        print(f"Decompressing features: {compressed_features}")
         all_df, features = decompress_features(all_df, features, compressed_features)

-    # Determine categorical
-    categorical_cols = [
-    continuous_cols = [
-    print(f"Categorical columns: {categorical_cols}")
-    print(f"Continuous columns: {continuous_cols}")
-
-    # Cast continuous columns to float
+    # Determine categorical vs continuous columns
+    categorical_cols = [c for c in features if all_df[c].dtype.name == "category"]
+    continuous_cols = [c for c in features if c not in categorical_cols]
     all_df[continuous_cols] = all_df[continuous_cols].astype("float64")
+    print(f"Categorical: {categorical_cols}")
+    print(f"Continuous: {len(continuous_cols)} columns")

-    #
+    # -------------------------------------------------------------------------
+    # Classification setup
+    # -------------------------------------------------------------------------
+    label_encoder = None
+    n_outputs = 1
     if model_type == "classifier":
-        task = "classification"
-        # Encode the target column on full dataset for consistent encoding
         label_encoder = LabelEncoder()
         all_df[target] = label_encoder.fit_transform(all_df[target])
-
-
-        task = "regression"
-        label_encoder = None
-        num_classes = None
+        n_outputs = len(label_encoder.classes_)
+        print(f"Class labels: {label_encoder.classes_.tolist()}")

-    #
-
-
+    # -------------------------------------------------------------------------
+    # Cross-validation setup
+    # -------------------------------------------------------------------------
+    n_folds = hyperparameters["n_folds"]
+    task = "classification" if model_type == "classifier" else "regression"
+    hidden_layers = [int(x) for x in hyperparameters["layers"].split("-")]

-    #
-
-    # =========================================================================
-    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
+    # Get categorical cardinalities
+    categorical_cardinalities = [len(category_mappings.get(col, {})) for col in categorical_cols]

-    # Create fold splits
     if n_folds == 1:
-        # Single fold: use train/val split from "training" column or random split
         if "training" in all_df.columns:
-            print("
+            print("Using 'training' column for train/val split")
             train_idx = np.where(all_df["training"])[0]
             val_idx = np.where(~all_df["training"])[0]
         else:
-            print("WARNING: No training column found,
-
-            train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+            print("WARNING: No 'training' column found, using random 80/20 split")
+            train_idx, val_idx = train_test_split(np.arange(len(all_df)), test_size=0.2, random_state=42)
         folds = [(train_idx, val_idx)]
     else:
-        # K-Fold CV
         if model_type == "classifier":
             kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
-
+            folds = list(kfold.split(all_df, all_df[target]))
         else:
             kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
-
-            folds = list(kfold.split(all_df, split_target))
-
-    # Initialize storage for out-of-fold predictions
-    oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
-    if model_type == "classifier" and num_classes and num_classes > 1:
-        oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
-    else:
-        oof_proba = None
+            folds = list(kfold.split(all_df))

-
+    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold ensemble'}...")

-    #
-
-
-
-        categorical_cols=categorical_cols,
-    )
+    # Fit scaler on all training data (used across all folds)
+    scaler = FeatureScaler()
+    scaler.fit(all_df, continuous_cols)
+    print(f"Fitted scaler on {len(continuous_cols)} continuous features")

-    #
-
-
-        "activation": "LeakyReLU",
-        "learning_rate": 1e-3,
-        "dropout": 0.1,
-        "use_batch_norm": True,
-        "initialization": "kaiming",
-    }
-    # Override defaults with model_config if present
-    model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
-    for key, value in model_overrides.items():
-        print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
-    model_params = {**model_defaults, **model_overrides}
+    # Determine device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")

-
-
+    # -------------------------------------------------------------------------
+    # Training loop
+    # -------------------------------------------------------------------------
+    oof_predictions = np.full((len(all_df), n_outputs), np.nan, dtype=np.float64)

+    ensemble_models = []
     for fold_idx, (train_idx, val_idx) in enumerate(folds):
         print(f"\n{'='*50}")
-        print(f"
+        print(f"Fold {fold_idx + 1}/{len(folds)} - Train: {len(train_idx)}, Val: {len(val_idx)}")
         print(f"{'='*50}")

-        # Split data for this fold
         df_train = all_df.iloc[train_idx].reset_index(drop=True)
         df_val = all_df.iloc[val_idx].reset_index(drop=True)

-
-
-
-
-
-
-            batch_size += 1  # Adjust to avoid last batch of size 1
-        trainer_defaults = {
-            "auto_lr_find": False,
-            "batch_size": batch_size,
-            "max_epochs": 200,
-            "min_epochs": 10,
-            "early_stopping": "valid_loss",
-            "early_stopping_patience": 20,
-            "checkpoints": "valid_loss",
-            "accelerator": "auto",
-            "progress_bar": "none",
-            "gradient_clip_val": 1.0,
-            "seed": 42 + fold_idx,
-        }
-
-        # Override defaults with training_config if present
-        training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
-        if fold_idx == 0:  # Only print overrides once
-            for key, value in training_overrides.items():
-                print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
-        trainer_params = {**trainer_defaults, **training_overrides}
-        trainer_config = TrainerConfig(**trainer_params)
-
-        # Create and train the TabularModel for this fold
-        tabular_model = TabularModel(
-            data_config=data_config,
-            model_config=model_config,
-            optimizer_config=optimizer_config,
-            trainer_config=trainer_config,
+        # Prepare data (using pre-fitted scaler)
+        train_x_cont, train_x_cat, train_y, _, _ = prepare_data(
+            df_train, continuous_cols, categorical_cols, target, category_mappings, scaler=scaler
+        )
+        val_x_cont, val_x_cat, val_y, _, _ = prepare_data(
+            df_val, continuous_cols, categorical_cols, target, category_mappings, scaler=scaler
         )
-        tabular_model.fit(train=df_train, validation=df_val)
-        ensemble_models.append(tabular_model)

-        #
-
-
+        # Create model
+        torch.manual_seed(hyperparameters["seed"] + fold_idx)
+        model = create_model(
+            n_continuous=len(continuous_cols),
+            categorical_cardinalities=categorical_cardinalities,
+            hidden_layers=hidden_layers,
+            n_outputs=n_outputs,
+            task=task,
+            dropout=hyperparameters["dropout"],
+            use_batch_norm=hyperparameters["use_batch_norm"],
+        )

-        #
-
-
-
-
-
-
-
+        # Train
+        model, history = train_model(
+            model,
+            train_x_cont, train_x_cat, train_y,
+            val_x_cont, val_x_cat, val_y,
+            task=task,
+            max_epochs=hyperparameters["max_epochs"],
+            patience=hyperparameters["early_stopping_patience"],
+            batch_size=hyperparameters["batch_size"],
+            learning_rate=hyperparameters["learning_rate"],
+            device=device,
+        )
+        ensemble_models.append(model)

-
+        # Out-of-fold predictions
+        fold_preds = predict(model, val_x_cont, val_x_cat)
+        oof_predictions[val_idx] = fold_preds

     print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")

-    #
-    #
+    # -------------------------------------------------------------------------
+    # Prepare validation results
+    # -------------------------------------------------------------------------
     if n_folds == 1:
-        val_mask = ~np.isnan(oof_predictions)
-        preds = oof_predictions[val_mask]
+        val_mask = ~np.isnan(oof_predictions[:, 0])
         df_val = all_df[val_mask].copy()
-
-            oof_proba = oof_proba[val_mask]
+        predictions = oof_predictions[val_mask]
     else:
-        preds = oof_predictions
         df_val = all_df.copy()
+        predictions = oof_predictions

-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Decode labels for classification
+    if model_type == "classifier":
+        class_preds = np.argmax(predictions, axis=1)
+        df_val[target] = label_encoder.inverse_transform(df_val[target].astype(int))
+        df_val["prediction"] = label_encoder.inverse_transform(class_preds)
+        df_val["pred_proba"] = [p.tolist() for p in predictions]
+        df_val = expand_proba_column(df_val, label_encoder.classes_)
+    else:
+        df_val["prediction"] = predictions.flatten()
+
+    # -------------------------------------------------------------------------
+    # Compute and print metrics
+    # -------------------------------------------------------------------------
+    y_true = df_val[target].values
+    y_pred = df_val["prediction"].values

     if model_type == "classifier":
-
-
-
-        df_val["pred_proba"] = [p.tolist() for p in oof_proba]
-        df_val = expand_proba_column(df_val, label_encoder.classes_)
-
-        # Decode the target and prediction labels
-        y_validate = label_encoder.inverse_transform(df_val[target])
-        preds_decoded = label_encoder.inverse_transform(preds.astype(int))
+        score_df = compute_classification_metrics(y_true, y_pred, label_encoder.classes_, target)
+        print_classification_metrics(score_df, target, label_encoder.classes_)
+        print_confusion_matrix(y_true, y_pred, label_encoder.classes_)
     else:
-
-
+        metrics = compute_regression_metrics(y_true, y_pred)
+        print_regression_metrics(metrics)
+
+        # Compute ensemble prediction_std
+        if n_folds > 1:
+            # Re-run inference with all models to get std
+            x_cont, x_cat, _, _, _ = prepare_data(
+                df_val, continuous_cols, categorical_cols, category_mappings=category_mappings, scaler=scaler
+            )
+            all_preds = [predict(m, x_cont, x_cat).flatten() for m in ensemble_models]
+            df_val["prediction_std"] = np.std(np.stack(all_preds), axis=0)
+            print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
+        else:
+            df_val["prediction_std"] = 0.0

-
-
-
+        # Train UQ models for uncertainty quantification
+        print("\n" + "=" * 50)
+        print("Training UQ Models")
+        print("=" * 50)
+        uq_models, uq_metadata = train_uq_models(
+            all_df[features], all_df[target], df_val[features], y_true
+        )
+        df_val = predict_intervals(df_val, df_val[features], uq_models, uq_metadata)
+        df_val = compute_confidence(df_val, uq_metadata["median_interval_width"])

-    #
+    # -------------------------------------------------------------------------
+    # Save validation predictions to S3
+    # -------------------------------------------------------------------------
     output_columns = []
     if id_column in df_val.columns:
         output_columns.append(id_column)
     output_columns += [target, "prediction"]

-
-    if model_type in ["regressor", "uq_regressor"]:
-        if preds_std is not None:
-            df_val["prediction_std"] = preds_std
-        else:
-            df_val["prediction_std"] = 0.0
+    if model_type != "classifier":
         output_columns.append("prediction_std")
-
+        output_columns += [c for c in df_val.columns if c.startswith("q_") or c == "confidence"]
+
+    output_columns += [c for c in df_val.columns if c.endswith("_proba")]
+
+    wr.s3.to_csv(df_val[output_columns], f"{model_metrics_s3_path}/validation_predictions.csv", index=False)
+
+    # -------------------------------------------------------------------------
+    # Save model artifacts
+    # -------------------------------------------------------------------------
+    model_config = {
+        "n_continuous": len(continuous_cols),
+        "categorical_cardinalities": categorical_cardinalities,
+        "hidden_layers": hidden_layers,
+        "n_outputs": n_outputs,
+        "task": task,
+        "dropout": hyperparameters["dropout"],
+        "use_batch_norm": hyperparameters["use_batch_norm"],
+    }

-
-
-
-        path=f"{model_metrics_s3_path}/validation_predictions.csv",
-        index=False,
-    )
+    for idx, m in enumerate(ensemble_models):
+        save_model(m, os.path.join(args.model_dir, f"model_{idx}"), model_config)
+    print(f"Saved {len(ensemble_models)} model(s)")

-
-    if model_type == "classifier":
-        # Get the label names and their integer mapping
-        label_names = label_encoder.classes_
-
-        # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(y_validate, preds_decoded, average=None, labels=label_names)
-
-        # Put the scores into a dataframe
-        score_df = pd.DataFrame(
-            {
-                target: label_names,
-                "precision": scores[0],
-                "recall": scores[1],
-                "f1": scores[2],
-                "support": scores[3],
-            }
-        )
+    joblib.dump({"n_ensemble": len(ensemble_models), "n_folds": n_folds}, os.path.join(args.model_dir, "ensemble_metadata.joblib"))

-
-
-        for t in label_names:
-            for m in metrics:
-                value = score_df.loc[score_df[target] == t, m].iloc[0]
-                print(f"Metrics:{t}:{m} {value}")
+    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as f:
+        json.dump(orig_features, f)

-
-
-        for i, row_name in enumerate(label_names):
-            for j, col_name in enumerate(label_names):
-                value = conf_mtx[i, j]
-                print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
+    with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as f:
+        json.dump(category_mappings, f)

-
-
-
-
-
-
-
-        support = len(df_val)
-        print(f"rmse: {rmse:.3f}")
-        print(f"mae: {mae:.3f}")
-        print(f"medae: {medae:.3f}")
-        print(f"r2: {r2:.3f}")
-        print(f"spearmanr: {spearman_corr:.3f}")
-        print(f"support: {support}")
-
-    # Save ensemble models
-    for model_idx, ens_model in enumerate(ensemble_models):
-        model_path = os.path.join(args.model_dir, f"tabular_model_{model_idx}")
-        ens_model.save_model(model_path)
-        print(f"Saved model {model_idx + 1} to {model_path}")
-
-    # Save ensemble metadata
-    n_ensemble = len(ensemble_models)
-    ensemble_metadata = {"n_ensemble": n_ensemble, "n_folds": n_folds}
-    joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
-    print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")
+    with open(os.path.join(args.model_dir, "feature_metadata.json"), "w") as f:
+        json.dump({"continuous_cols": continuous_cols, "categorical_cols": categorical_cols}, f)
+
+    with open(os.path.join(args.model_dir, "hyperparameters.json"), "w") as f:
+        json.dump(hyperparameters, f, indent=2)
+
+    scaler.save(os.path.join(args.model_dir, "scaler.joblib"))

     if label_encoder:
         joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))

-
-
-        json.dump(orig_features, fp)
+    if model_type != "classifier":
+        save_uq_models(uq_models, uq_metadata, args.model_dir)

-
-    with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
-        json.dump(category_mappings, fp)
+    print(f"\nModel training complete! Artifacts saved to {args.model_dir}")
|