workbench 0.8.205__py3-none-any.whl → 0.8.213__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/models/noise_model.py +388 -0
- workbench/api/endpoint.py +3 -6
- workbench/api/feature_set.py +1 -1
- workbench/api/model.py +5 -11
- workbench/cached/cached_model.py +4 -4
- workbench/core/artifacts/endpoint_core.py +63 -153
- workbench/core/artifacts/model_core.py +21 -19
- workbench/core/transforms/features_to_model/features_to_model.py +2 -2
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +1 -1
- workbench/model_script_utils/model_script_utils.py +335 -0
- workbench/model_script_utils/pytorch_utils.py +395 -0
- workbench/model_script_utils/uq_harness.py +278 -0
- workbench/model_scripts/chemprop/chemprop.template +289 -666
- workbench/model_scripts/chemprop/generated_model_script.py +292 -669
- workbench/model_scripts/chemprop/model_script_utils.py +335 -0
- workbench/model_scripts/chemprop/requirements.txt +2 -10
- workbench/model_scripts/pytorch_model/generated_model_script.py +355 -612
- workbench/model_scripts/pytorch_model/model_script_utils.py +335 -0
- workbench/model_scripts/pytorch_model/pytorch.template +350 -607
- workbench/model_scripts/pytorch_model/pytorch_utils.py +395 -0
- workbench/model_scripts/pytorch_model/requirements.txt +1 -1
- workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
- workbench/model_scripts/script_generation.py +2 -5
- workbench/model_scripts/uq_models/generated_model_script.py +65 -422
- workbench/model_scripts/xgb_model/generated_model_script.py +349 -412
- workbench/model_scripts/xgb_model/model_script_utils.py +335 -0
- workbench/model_scripts/xgb_model/uq_harness.py +278 -0
- workbench/model_scripts/xgb_model/xgb_model.template +344 -407
- workbench/scripts/training_test.py +85 -0
- workbench/utils/chemprop_utils.py +18 -656
- workbench/utils/metrics_utils.py +172 -0
- workbench/utils/model_utils.py +104 -47
- workbench/utils/pytorch_utils.py +32 -472
- workbench/utils/xgboost_local_crossfold.py +267 -0
- workbench/utils/xgboost_model_utils.py +49 -356
- workbench/web_interface/components/plugins/model_details.py +30 -68
- {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/METADATA +5 -5
- {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/RECORD +42 -31
- {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/entry_points.txt +1 -0
- workbench/model_scripts/uq_models/mapie.template +0 -605
- workbench/model_scripts/uq_models/requirements.txt +0 -1
- {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/WHEEL +0 -0
- {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/top_level.txt +0 -0
workbench/model_scripts/xgb_model/generated_model_script.py

@@ -1,497 +1,434 @@
-#
-
-
-
-
-#
-
-
-
-    r2_score,
-    root_mean_squared_error,
-    precision_recall_fscore_support,
-    confusion_matrix,
-)
-from scipy.stats import spearmanr
+# XGBoost Model Template for Workbench
+#
+# This template handles both classification and regression models with:
+# - K-fold cross-validation ensemble training (or single train/val split)
+# - Out-of-fold predictions for validation metrics
+# - Uncertainty quantification for regression models
+# - Sample weights support
+# - Categorical feature handling
+# - Compressed feature decompression

-# Classification Encoder
-from sklearn.preprocessing import LabelEncoder
-
-# Scikit Learn Imports
-from sklearn.model_selection import train_test_split
-
-from io import StringIO
-import json
 import argparse
-import
+import json
 import os
+
+import awswrangler as wr
+import joblib
+import numpy as np
 import pandas as pd
-
+import xgboost as xgb
+from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+from sklearn.preprocessing import LabelEncoder

-
+from model_script_utils import (
+    check_dataframe,
+    compute_classification_metrics,
+    compute_regression_metrics,
+    convert_categorical_types,
+    decompress_features,
+    expand_proba_column,
+    input_fn,
+    match_features_case_insensitive,
+    output_fn,
+    print_classification_metrics,
+    print_confusion_matrix,
+    print_regression_metrics,
+)
+from uq_harness import (
+    compute_confidence,
+    load_uq_models,
+    predict_intervals,
+    save_uq_models,
+    train_uq_models,
+)
+
+# =============================================================================
+# Default Hyperparameters
+# =============================================================================
 DEFAULT_HYPERPARAMETERS = {
+    # Training parameters
+    "n_folds": 5,  # Number of CV folds (1 = single train/val split)
     # Core tree parameters
-    "n_estimators": 200,
-    "max_depth": 6,
-    "learning_rate": 0.05,
-
+    "n_estimators": 200,
+    "max_depth": 6,
+    "learning_rate": 0.05,
     # Sampling parameters
-    "subsample": 0.7,
-    "colsample_bytree": 0.6,
-    "colsample_bylevel": 0.8,
-
+    "subsample": 0.7,
+    "colsample_bytree": 0.6,
+    "colsample_bylevel": 0.8,
     # Regularization
-    "min_child_weight": 5,
-    "gamma": 0.2,
-    "reg_alpha": 0.5,
-    "reg_lambda": 2.0,
-
+    "min_child_weight": 5,
+    "gamma": 0.2,
+    "reg_alpha": 0.5,
+    "reg_lambda": 2.0,
     # Random seed
     "random_state": 42,
 }

-#
+# Workbench-specific parameters (not passed to XGBoost)
+WORKBENCH_PARAMS = {"n_folds"}
+
+# Template parameters (filled in by Workbench)
 TEMPLATE_PARAMS = {
-    "model_type": "
-    "target": "
-    "features": ['
+    "model_type": "uq_regressor",
+    "target": "udm_asy_res_efflux_ratio",
+    "features": ['smr_vsa4', 'tpsa', 'numhdonors', 'nhohcount', 'nbase', 'vsa_estate3', 'fr_guanido', 'mollogp', 'peoe_vsa8', 'peoe_vsa1', 'fr_imine', 'vsa_estate2', 'estate_vsa10', 'asphericity', 'xc_3dv', 'smr_vsa3', 'charge_centroid_distance', 'c3sp3', 'nitrogen_span', 'estate_vsa2', 'minpartialcharge', 'hba_hbd_ratio', 'slogp_vsa1', 'axp_7d', 'nocount', 'vsa_estate4', 'vsa_estate6', 'estate_vsa4', 'xc_4dv', 'xc_4d', 'num_s_centers', 'vsa_estate9', 'chi2v', 'axp_5d', 'mi', 'mse', 'bcut2d_mrhi', 'smr_vsa6', 'hallkieralpha', 'balabanj', 'amphiphilic_moment', 'type_ii_pattern_count', 'minabsestateindex', 'bcut2d_mwlow', 'axp_0dv', 'slogp_vsa5', 'axp_2d', 'axp_1dv', 'xch_5d', 'peoe_vsa10', 'molecular_asymmetry', 'kappa3', 'estate_vsa3', 'sse', 'bcut2d_logphi', 'fr_imidazole', 'molecular_volume_3d', 'bertzct', 'maxestateindex', 'aromatic_interaction_score', 'axp_3d', 'radius_of_gyration', 'vsa_estate7', 'si', 'axp_5dv', 'molecular_axis_length', 'estate_vsa6', 'fpdensitymorgan1', 'axp_6d', 'estate_vsa9', 'fpdensitymorgan2', 'xp_0dv', 'xp_6dv', 'molmr', 'qed', 'estate_vsa8', 'peoe_vsa9', 'xch_6dv', 'xp_7d', 'slogp_vsa2', 'xp_5dv', 'bcut2d_chghi', 'xch_6d', 'chi0n', 'slogp_vsa3', 'chi1v', 'chi3v', 'bcut2d_chglo', 'axp_1d', 'mp', 'num_defined_stereocenters', 'xp_3dv', 'bcut2d_mrlow', 'fr_al_oh', 'peoe_vsa7', 'chi2n', 'axp_6dv', 'axp_2dv', 'chi4n', 'xc_3d', 'axp_7dv', 'vsa_estate8', 'xch_7d', 'maxpartialcharge', 'chi1n', 'peoe_vsa2', 'axp_3dv', 'bcut2d_logplow', 'mv', 'xpc_5dv', 'kappa2', 'vsa_estate5', 'xp_5d', 'mm', 'maxabspartialcharge', 'axp_4dv', 'maxabsestateindex', 'axp_4d', 'xch_4dv', 'xp_2dv', 'heavyatommolwt', 'numatomstereocenters', 'xp_7dv', 'numsaturatedheterocycles', 'xp_3d', 'kappa1', 'mz', 'axp_0d', 'chi1', 'xch_4d', 'smr_vsa1', 'xp_2d', 'estate_vsa5', 'phi', 'fr_ether', 'xc_5d', 'c1sp3', 'estate_vsa7', 'estate_vsa1', 'vsa_estate1', 'slogp_vsa4', 'avgipc', 'smr_vsa10', 'numvalenceelectrons', 'xc_5dv', 'peoe_vsa12', 'peoe_vsa6', 'xpc_5d', 'xpc_6d', 'minestateindex', 'chi3n', 'smr_vsa5', 'xp_4d', 'numheteroatoms', 'fpdensitymorgan3', 'xpc_4d', 'sps', 'xp_1d', 'sv', 'fr_ar_n', 'slogp_vsa10', 'c2sp3', 'xpc_4dv', 'chi0v', 'xpc_6dv', 'xp_1dv', 'vsa_estate10', 'sare', 'c2sp2', 'mpe', 'xch_7dv', 'chi4v', 'type_i_pattern_count', 'sp', 'slogp_vsa8', 'amide_count', 'num_stereocenters', 'num_r_centers', 'tertiary_amine_count', 'spe', 'xp_4dv', 'numsaturatedrings', 'mare', 'numhacceptors', 'chi0', 'fractioncsp3', 'fr_nh0', 'xch_5dv', 'fr_aniline', 'smr_vsa7', 'labuteasa', 'c3sp2', 'xp_0d', 'xp_6d', 'peoe_vsa11', 'fr_ar_nh', 'molwt', 'intramolecular_hbond_potential', 'peoe_vsa3', 'fr_nhpyrrole', 'numaliphaticrings', 'hybratio', 'smr_vsa9', 'peoe_vsa13', 'bcut2d_mwhi', 'c1sp2', 'slogp_vsa11', 'numrotatablebonds', 'numaliphaticcarbocycles', 'slogp_vsa6', 'peoe_vsa4', 'numunspecifiedatomstereocenters', 'xc_6d', 'xc_6dv', 'num_unspecified_stereocenters', 'sz', 'minabspartialcharge', 'fcsp3', 'c1sp1', 'fr_piperzine', 'numaliphaticheterocycles', 'numamidebonds', 'fr_benzene', 'numaromaticheterocycles', 'sm', 'fr_priamide', 'fr_piperdine', 'fr_methoxy', 'c4sp3', 'fr_c_o_nocoo', 'exactmolwt', 'stereo_complexity', 'fr_hoccn', 'numaromaticcarbocycles', 'fr_nh2', 'numheterocycles', 'fr_morpholine', 'fr_ketone', 'fr_nh1', 'frac_defined_stereo', 'fr_aryl_methyl', 'fr_alkyl_halide', 'fr_phenol', 'fr_al_oh_notert', 'fr_ar_oh', 'fr_pyridine', 'fr_amide', 'slogp_vsa7', 'fr_halogen', 'numsaturatedcarbocycles', 'slogp_vsa12', 'fr_ndealkylation1', 'xch_3d', 'fr_bicyclic', 'naromatom', 'narombond'],
+    "id_column": "udm_mol_bat_id",
     "compressed_features": [],
-    "model_metrics_s3_path": "s3://
-    "
-    "hyperparameters": {},
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-test-log/training",
+    "hyperparameters": {'target_transform': 'log'},
 }


-#
-
-
-    Check if the provided dataframe is empty and raise an exception if it is.
-
-    Args:
-        df (pd.DataFrame): DataFrame to check
-        df_name (str): Name of the DataFrame
-    """
-    if df.empty:
-        msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
-        print(msg)
-        raise ValueError(msg)
-
-
-def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
-    """
-    Expands a column in a DataFrame containing a list of probabilities into separate columns.
-
-    Args:
-        df (pd.DataFrame): DataFrame containing a "pred_proba" column
-        class_labels (List[str]): List of class labels
-
-    Returns:
-        pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
-    """
-
-    # Sanity check
-    proba_column = "pred_proba"
-    if proba_column not in df.columns:
-        raise ValueError('DataFrame does not contain a "pred_proba" column')
-
-    # Construct new column names with '_proba' suffix
-    proba_splits = [f"{label}_proba" for label in class_labels]
-
-    # Expand the proba_column into separate columns for each probability
-    proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
-
-    # Drop any proba columns and reset the index in prep for the concat
-    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
-    df = df.reset_index(drop=True)
-
-    # Concatenate the new columns with the original DataFrame
-    df = pd.concat([df, proba_df], axis=1)
-    print(df)
-    return df
-
-
-def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
-    """
-    Matches and renames DataFrame columns to match model feature names (case-insensitive).
-    Prioritizes exact matches, then case-insensitive matches.
-
-    Raises ValueError if any model features cannot be matched.
-    """
-    df_columns_lower = {col.lower(): col for col in df.columns}
-    rename_dict = {}
-    missing = []
-    for feature in model_features:
-        if feature in df.columns:
-            continue  # Exact match
-        elif feature.lower() in df_columns_lower:
-            rename_dict[df_columns_lower[feature.lower()]] = feature
-        else:
-            missing.append(feature)
-
-    if missing:
-        raise ValueError(f"Features not found: {missing}")
-
-    # Rename the DataFrame columns to match the model features
-    return df.rename(columns=rename_dict)
-
-
-def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
-    """
-    Converts appropriate columns to categorical type with consistent mappings.
-
-    Args:
-        df (pd.DataFrame): The DataFrame to process.
-        features (list): List of feature names to consider for conversion.
-        category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
-            training mode. If populated, we're in inference mode.
-
-    Returns:
-        tuple: (processed DataFrame, category mappings dictionary)
-    """
-    # Training mode
-    if category_mappings == {}:
-        for col in df.select_dtypes(include=["object", "string"]):
-            if col in features and df[col].nunique() < 20:
-                print(f"Training mode: Converting {col} to category")
-                df[col] = df[col].astype("category")
-                category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
-
-    # Inference mode
-    else:
-        for col, categories in category_mappings.items():
-            if col in df.columns:
-                print(f"Inference mode: Applying categorical mapping for {col}")
-                df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
-
-    return df, category_mappings
-
-
-def decompress_features(
-    df: pd.DataFrame, features: List[str], compressed_features: List[str]
-) -> Tuple[pd.DataFrame, List[str]]:
-    """Prepare features for the model by decompressing bitstring features
-
-    Args:
-        df (pd.DataFrame): The features DataFrame
-        features (List[str]): Full list of feature names
-        compressed_features (List[str]): List of feature names to decompress (bitstrings)
-
-    Returns:
-        pd.DataFrame: DataFrame with the decompressed features
-        List[str]: Updated list of feature names after decompression
-
-    Raises:
-        ValueError: If any missing values are found in the specified features
-    """
-
-    # Check for any missing values in the required features
-    missing_counts = df[features].isna().sum()
-    if missing_counts.any():
-        missing_features = missing_counts[missing_counts > 0]
-        print(
-            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
-            "WARNING: You might want to remove/replace all NaN values before processing."
-        )
-
-    # Decompress the specified compressed features
-    decompressed_features = features.copy()
-    for feature in compressed_features:
-        if (feature not in df.columns) or (feature not in features):
-            print(f"Feature '{feature}' not in the features list, skipping decompression.")
-            continue
-
-        # Remove the feature from the list of features to avoid duplication
-        decompressed_features.remove(feature)
-
-        # Handle all compressed features as bitstrings
-        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
-        prefix = feature[:3]
-
-        # Create all new columns at once - avoids fragmentation
-        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
-        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
-
-        # Add to features list
-        decompressed_features.extend(new_col_names)
-
-        # Drop original column and concatenate new ones
-        df = df.drop(columns=[feature])
-        df = pd.concat([df, new_df], axis=1)
-
-    return df, decompressed_features
-
-
+# =============================================================================
+# Training
+# =============================================================================
 if __name__ == "__main__":
-
+    # -------------------------------------------------------------------------
+    # Setup: Parse arguments and load data
+    # -------------------------------------------------------------------------
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+    parser.add_argument("--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"))
+    args = parser.parse_args()

-    #
+    # Extract template parameters
     target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
+    id_column = TEMPLATE_PARAMS["id_column"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-
-    hyperparameters = {**DEFAULT_HYPERPARAMETERS, **TEMPLATE_PARAMS["hyperparameters"]}
-    validation_split = 0.2
-
-    # Script arguments for input/output directories
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
-    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
-    parser.add_argument(
-        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
-    )
-    args = parser.parse_args()
+    hyperparameters = {**DEFAULT_HYPERPARAMETERS, **(TEMPLATE_PARAMS["hyperparameters"] or {})}

-    #
-    training_files = [os.path.join(args.train,
+    # Load training data
+    training_files = [os.path.join(args.train, f) for f in os.listdir(args.train) if f.endswith(".csv")]
     print(f"Training Files: {training_files}")
-
-    # Combine files and read them all into a single pandas dataframe
-    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
-
-    # Check if the dataframe is empty
+    all_df = pd.concat([pd.read_csv(f, engine="python") for f in training_files])
     check_dataframe(all_df, "training_df")

-    # Features/Target output
     print(f"Target: {target}")
-    print(f"Features: {
+    print(f"Features: {features}")
+    print(f"Hyperparameters: {hyperparameters}")

-    #
+    # -------------------------------------------------------------------------
+    # Preprocessing: Categorical features and decompression
+    # -------------------------------------------------------------------------
     all_df, category_mappings = convert_categorical_types(all_df, features)

-    # If we have compressed features, decompress them
     if compressed_features:
-        print(f"Decompressing features {compressed_features}
+        print(f"Decompressing features: {compressed_features}")
         all_df, features = decompress_features(all_df, features, compressed_features)

-    #
-
-
-
-
-
-
-
-
-
-
+    # -------------------------------------------------------------------------
+    # Classification setup: Encode target labels
+    # -------------------------------------------------------------------------
+    label_encoder = None
+    if model_type == "classifier":
+        label_encoder = LabelEncoder()
+        all_df[target] = label_encoder.fit_transform(all_df[target])
+        print(f"Class labels: {label_encoder.classes_.tolist()}")
+
+    # -------------------------------------------------------------------------
+    # Cross-validation setup
+    # -------------------------------------------------------------------------
+    n_folds = hyperparameters["n_folds"]
+    xgb_params = {k: v for k, v in hyperparameters.items() if k not in WORKBENCH_PARAMS}
+    print(f"XGBoost params: {xgb_params}")
+
+    if n_folds == 1:
+        # Single train/val split
+        if "training" in all_df.columns:
+            print("Using 'training' column for train/val split")
+            train_idx = np.where(all_df["training"])[0]
+            val_idx = np.where(~all_df["training"])[0]
+        else:
+            print("WARNING: No 'training' column found, using random 80/20 split")
+            indices = np.arange(len(all_df))
+            train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+        folds = [(train_idx, val_idx)]
     else:
-        #
-
-
-
-
+        # K-fold cross-validation
+        if model_type == "classifier":
+            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
+            folds = list(kfold.split(all_df, all_df[target]))
+        else:
+            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
+            folds = list(kfold.split(all_df))

-
-    print(f"Hyperparameters: {hyperparameters}")
+    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold ensemble'}...")

-    #
+    # -------------------------------------------------------------------------
+    # Training loop
+    # -------------------------------------------------------------------------
+    # Initialize out-of-fold storage
+    oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
     if model_type == "classifier":
-
-
-        # Encode the target column
-        label_encoder = LabelEncoder()
-        df_train[target] = label_encoder.fit_transform(df_train[target])
-        df_val[target] = label_encoder.transform(df_val[target])
-
+        num_classes = len(label_encoder.classes_)
+        oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
     else:
-
-
-
-
-
-
-
-
-        #
-
-
-
-
+        oof_proba = None
+
+    # Check for sample weights
+    has_sample_weights = "sample_weight" in all_df.columns
+    if has_sample_weights:
+        sw = all_df["sample_weight"]
+        print(f"Using sample weights: min={sw.min():.2f}, max={sw.max():.2f}, mean={sw.mean():.2f}")
+
+    # Train ensemble
+    ensemble_models = []
+    for fold_idx, (train_idx, val_idx) in enumerate(folds):
+        print(f"\n{'='*50}")
+        print(f"Fold {fold_idx + 1}/{len(folds)} - Train: {len(train_idx)}, Val: {len(val_idx)}")
+        print(f"{'='*50}")
+
+        # Prepare fold data
+        X_train = all_df.iloc[train_idx][features]
+        y_train = all_df.iloc[train_idx][target]
+        X_val = all_df.iloc[val_idx][features]
+        sample_weights = all_df.iloc[train_idx]["sample_weight"] if has_sample_weights else None
+
+        # Create model with fold-specific random state for diversity
+        fold_params = {**xgb_params, "random_state": xgb_params.get("random_state", 42) + fold_idx}
+        if model_type == "classifier":
+            model = xgb.XGBClassifier(enable_categorical=True, **fold_params)
+        else:
+            model = xgb.XGBRegressor(enable_categorical=True, **fold_params)
+
+        # Train
+        model.fit(X_train, y_train, sample_weight=sample_weights)
+        ensemble_models.append(model)
+
+        # Out-of-fold predictions
+        oof_predictions[val_idx] = model.predict(X_val)
+        if model_type == "classifier":
+            oof_proba[val_idx] = model.predict_proba(X_val)
+
+    print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
+
+    # -------------------------------------------------------------------------
+    # Prepare validation results
+    # -------------------------------------------------------------------------
+    if n_folds == 1:
+        # Single fold: only validation rows
+        val_mask = ~np.isnan(oof_predictions)
+        df_val = all_df[val_mask].copy()
+        predictions = oof_predictions[val_mask]
+        if oof_proba is not None:
+            oof_proba = oof_proba[val_mask]
+    else:
+        # K-fold: all rows have out-of-fold predictions
+        df_val = all_df.copy()
+        predictions = oof_predictions
+
+    # Decode labels for classification
     if model_type == "classifier":
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        # Save predictions to S3 (just the target, prediction, and '_proba' columns)
-        df_val["prediction"] = preds
-        output_columns = [target, "prediction"]
-        output_columns += [col for col in df_val.columns if col.endswith("_proba")]
-        wr.s3.to_csv(
-            df_val[output_columns],
-            path=f"{model_metrics_s3_path}/validation_predictions.csv",
-            index=False,
-        )
-
-        # Report Performance Metrics
+        df_val[target] = label_encoder.inverse_transform(df_val[target].astype(int))
+        df_val["prediction"] = label_encoder.inverse_transform(predictions.astype(int))
+        if oof_proba is not None:
+            df_val["pred_proba"] = [p.tolist() for p in oof_proba]
+            df_val = expand_proba_column(df_val, label_encoder.classes_)
+    else:
+        df_val["prediction"] = predictions
+
+    # -------------------------------------------------------------------------
+    # Compute and print metrics
+    # -------------------------------------------------------------------------
+    y_true = df_val[target].values
+    y_pred = df_val["prediction"].values

     if model_type == "classifier":
-        # Get the label names and their integer mapping
         label_names = label_encoder.classes_
-
-
-
-
-
-
-
-
-
-
-
-
-
+        score_df = compute_classification_metrics(y_true, y_pred, label_names, target)
+        print_classification_metrics(score_df, target, label_names)
+        print_confusion_matrix(y_true, y_pred, label_names)
+    else:
+        metrics = compute_regression_metrics(y_true, y_pred)
+        print_regression_metrics(metrics)
+
+        # Compute ensemble prediction_std
+        if n_folds > 1:
+            all_preds = np.stack([m.predict(all_df[features]) for m in ensemble_models])
+            df_val["prediction_std"] = np.std(all_preds, axis=0)
+            print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
+        else:
+            df_val["prediction_std"] = 0.0
+
+        # Train UQ models for uncertainty quantification
+        print("\n" + "=" * 50)
+        print("Training UQ Models")
+        print("=" * 50)
+        uq_models, uq_metadata = train_uq_models(
+            all_df[features], all_df[target], df_val[features], y_true
         )
+        df_val = predict_intervals(df_val, df_val[features], uq_models, uq_metadata)
+        df_val = compute_confidence(df_val, uq_metadata["median_interval_width"])

-
-
-
-
-
-
+    # -------------------------------------------------------------------------
+    # Save validation predictions to S3
+    # -------------------------------------------------------------------------
+    output_columns = []
+    if id_column in df_val.columns:
+        output_columns.append(id_column)
+    output_columns += [target, "prediction"]

-
-
-        for
-            for j, col_name in enumerate(label_names):
-                value = conf_mtx[i, j]
-                print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
+    if model_type != "classifier":
+        output_columns.append("prediction_std")
+        output_columns += [c for c in df_val.columns if c.startswith("q_") or c == "confidence"]

-
-        # Calculate various model performance metrics (regression)
-        rmse = root_mean_squared_error(y_validate, preds)
-        mae = mean_absolute_error(y_validate, preds)
-        medae = median_absolute_error(y_validate, preds)
-        r2 = r2_score(y_validate, preds)
-        spearman_corr = spearmanr(y_validate, preds).correlation
-        support = len(df_val)
-        print(f"rmse: {rmse:.3f}")
-        print(f"mae: {mae:.3f}")
-        print(f"medae: {medae:.3f}")
-        print(f"r2: {r2:.3f}")
-        print(f"spearmanr: {spearman_corr:.3f}")
-        print(f"support: {support}")
-
-    # Now save the model to the standard place/name
-    joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
-
-    # Save the label encoder if we have one
-    if label_encoder:
-        joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
+    output_columns += [c for c in df_val.columns if c.endswith("_proba")]

-
-    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(orig_features, fp)  # We save the original features, not the decompressed ones
+    wr.s3.to_csv(df_val[output_columns], f"{model_metrics_s3_path}/validation_predictions.csv", index=False)

-    #
-
-
+    # -------------------------------------------------------------------------
+    # Save model artifacts
+    # -------------------------------------------------------------------------
+    # Ensemble models
+    for idx, ens_model in enumerate(ensemble_models):
+        joblib.dump(ens_model, os.path.join(args.model_dir, f"xgb_model_{idx}.joblib"))
+    print(f"Saved {len(ensemble_models)} XGBoost model(s)")

+    # Metadata files
+    with open(os.path.join(args.model_dir, "ensemble_metadata.json"), "w") as f:
+        json.dump({"n_ensemble": len(ensemble_models), "n_folds": n_folds}, f)

-
-
-    model_path = os.path.join(model_dir, "xgb_model.joblib")
-    model = joblib.load(model_path)
-    return model
+    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as f:
+        json.dump(orig_features, f)

+    with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as f:
+        json.dump(category_mappings, f)

-
-
-    if not input_data:
-        raise ValueError("Empty input data is not supported!")
+    with open(os.path.join(args.model_dir, "hyperparameters.json"), "w") as f:
+        json.dump(hyperparameters, f, indent=2)

-
-
-    input_data = input_data.decode("utf-8")
+    if label_encoder:
+        joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))

-    if "
-
-
-
-    else:
-        raise ValueError(f"{content_type} not supported!")
+    if model_type != "classifier":
+        save_uq_models(uq_models, uq_metadata, args.model_dir)
+
+    print(f"\nModel training complete! Artifacts saved to {args.model_dir}")


-
-
-
-
-
-
-
+# =============================================================================
+# Model Loading (for SageMaker inference)
+# =============================================================================
+def model_fn(model_dir: str) -> dict:
+    """Load XGBoost ensemble and associated artifacts.
+
+    Args:
+        model_dir: Directory containing model artifacts
+
+    Returns:
+        Dictionary with ensemble_models, label_encoder, category_mappings, uq_models, etc.
+    """
+    # Load ensemble metadata
+    metadata_path = os.path.join(model_dir, "ensemble_metadata.json")
+    if os.path.exists(metadata_path):
+        with open(metadata_path) as f:
+            metadata = json.load(f)
+        n_ensemble = metadata["n_ensemble"]
     else:
-
+        n_ensemble = 1  # Legacy single model

+    # Load ensemble models
+    ensemble_models = []
+    for i in range(n_ensemble):
+        model_path = os.path.join(model_dir, f"xgb_model_{i}.joblib")
+        if not os.path.exists(model_path):
+            model_path = os.path.join(model_dir, "xgb_model.joblib")  # Legacy fallback
+        ensemble_models.append(joblib.load(model_path))

-
-
+    # Load label encoder (classifier only)
+    label_encoder = None
+    encoder_path = os.path.join(model_dir, "label_encoder.joblib")
+    if os.path.exists(encoder_path):
+        label_encoder = joblib.load(encoder_path)
+
+    # Load category mappings
+    category_mappings = {}
+    category_path = os.path.join(model_dir, "category_mappings.json")
+    if os.path.exists(category_path):
+        with open(category_path) as f:
+            category_mappings = json.load(f)
+
+    # Load UQ models (regression only)
+    uq_models, uq_metadata = None, None
+    uq_path = os.path.join(model_dir, "uq_metadata.json")
+    if os.path.exists(uq_path):
+        uq_models, uq_metadata = load_uq_models(model_dir)
+
+    return {
+        "ensemble_models": ensemble_models,
+        "n_ensemble": n_ensemble,
+        "label_encoder": label_encoder,
+        "category_mappings": category_mappings,
+        "uq_models": uq_models,
+        "uq_metadata": uq_metadata,
+    }
+
+
+# =============================================================================
+# Inference (for SageMaker inference)
+# =============================================================================
+def predict_fn(df: pd.DataFrame, models: dict) -> pd.DataFrame:
+    """Make predictions with XGBoost ensemble.

     Args:
-        df
-
+        df: Input DataFrame with features
+        models: Dictionary from model_fn containing ensemble and metadata

     Returns:
-
+        DataFrame with predictions added
     """
-
-
-    # Grab our feature columns (from training)
+    # Load feature columns
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-    with open(os.path.join(model_dir, "feature_columns.json")) as
-        features = json.load(
+    with open(os.path.join(model_dir, "feature_columns.json")) as f:
+        features = json.load(f)
     print(f"Model Features: {features}")

-    #
-
-
-
-
-
-
-    label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
+    # Extract model components
+    ensemble_models = models["ensemble_models"]
+    label_encoder = models.get("label_encoder")
+    category_mappings = models.get("category_mappings", {})
+    uq_models = models.get("uq_models")
+    uq_metadata = models.get("uq_metadata")
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]

-    #
-    # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
-    # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
+    # Prepare features
     matched_df = match_features_case_insensitive(df, features)
-
-    # Detect categorical types in the incoming DataFrame
     matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)

-    # If we have compressed features, decompress them
     if compressed_features:
         print("Decompressing features for prediction...")
         matched_df, features = decompress_features(matched_df, features, compressed_features)

-    # Predict the features against our XGB Model
     X = matched_df[features]
-    predictions = model.predict(X)

-    #
-
-
-
-    # Set the predictions on the DataFrame
-    df["prediction"] = predictions
+    # Collect ensemble predictions
+    all_preds = [m.predict(X) for m in ensemble_models]
+    ensemble_preds = np.stack(all_preds, axis=0)

-
-
-
-
+    if label_encoder is not None:
+        # Classification: average probabilities, then argmax
+        all_probs = [m.predict_proba(X) for m in ensemble_models]
+        avg_probs = np.mean(np.stack(all_probs, axis=0), axis=0)
+        class_preds = np.argmax(avg_probs, axis=1)

-
+        df["prediction"] = label_encoder.inverse_transform(class_preds)
+        df["pred_proba"] = [p.tolist() for p in avg_probs]
         df = expand_proba_column(df, label_encoder.classes_)
+    else:
+        # Regression: average predictions
+        df["prediction"] = np.mean(ensemble_preds, axis=0)
+        df["prediction_std"] = np.std(ensemble_preds, axis=0)
+
+    # Add UQ intervals if available
+    if uq_models and uq_metadata:
+        df = predict_intervals(df, X, uq_models, uq_metadata)
+        df = compute_confidence(df, uq_metadata["median_interval_width"], "q_10", "q_90")

-
+    print(f"Inference complete: {len(df)} predictions, {len(ensemble_models)} ensemble members")
     return df
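
The core behavioral change in this diff is that the single train/val-split XGBoost model is replaced by a k-fold ensemble with out-of-fold (OOF) validation predictions. Below is a minimal, self-contained sketch of that pattern, assuming synthetic data and a plain XGBRegressor; the names and data here are illustrative stand-ins, not part of the Workbench or generated-script API:

import numpy as np
import xgboost as xgb
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold

X, y = make_regression(n_samples=500, n_features=10, noise=0.5, random_state=0)

n_folds = 5
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
oof_predictions = np.full(len(X), np.nan)  # one out-of-fold slot per row
ensemble_models = []

for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(X)):
    # Fold-specific random_state gives the ensemble some diversity
    model = xgb.XGBRegressor(n_estimators=200, random_state=42 + fold_idx)
    model.fit(X[train_idx], y[train_idx])
    ensemble_models.append(model)
    # Each row is predicted exactly once, by the one model that never saw it
    oof_predictions[val_idx] = model.predict(X[val_idx])

# Validation metrics come from the OOF predictions, so there is no leakage
rmse = float(np.sqrt(np.mean((oof_predictions - y) ** 2)))
print(f"OOF RMSE: {rmse:.3f}")

# Inference averages all fold models; the spread across members is the
# per-row uncertainty signal
preds = np.stack([m.predict(X[:5]) for m in ensemble_models])
print("prediction:", preds.mean(axis=0))
print("prediction_std:", preds.std(axis=0))

The new predict_fn in the diff applies the same idea at serving time: every fold model predicts, the mean becomes the "prediction" column and the member spread becomes "prediction_std", while classifiers instead average predict_proba across members and take the argmax.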