workbench 0.8.160__py3-none-any.whl → 0.8.202__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/__init__.py +1 -2
- workbench/algorithms/dataframe/fingerprint_proximity.py +2 -2
- workbench/algorithms/dataframe/proximity.py +261 -235
- workbench/algorithms/graph/light/proximity_graph.py +10 -8
- workbench/api/__init__.py +2 -1
- workbench/api/compound.py +1 -1
- workbench/api/endpoint.py +11 -0
- workbench/api/feature_set.py +12 -8
- workbench/api/meta.py +5 -2
- workbench/api/model.py +16 -15
- workbench/api/monitor.py +1 -16
- workbench/api/parameter_store.py +5 -0
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +11 -3
- workbench/core/artifacts/data_capture_core.py +355 -0
- workbench/core/artifacts/endpoint_core.py +256 -118
- workbench/core/artifacts/feature_set_core.py +265 -16
- workbench/core/artifacts/model_core.py +110 -63
- workbench/core/artifacts/monitor_core.py +33 -248
- workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
- workbench/core/cloud_platform/aws/aws_meta.py +12 -5
- workbench/core/cloud_platform/aws/aws_parameter_store.py +18 -2
- workbench/core/cloud_platform/aws/aws_session.py +4 -4
- workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
- workbench/core/transforms/features_to_model/features_to_model.py +45 -33
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
- workbench/core/views/training_view.py +113 -42
- workbench/core/views/view.py +53 -3
- workbench/core/views/view_utils.py +4 -4
- workbench/model_scripts/chemprop/chemprop.template +852 -0
- workbench/model_scripts/chemprop/generated_model_script.py +852 -0
- workbench/model_scripts/chemprop/requirements.txt +11 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
- workbench/model_scripts/custom_models/proximity/proximity.py +261 -235
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +166 -62
- workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
- workbench/model_scripts/custom_models/uq_models/proximity.py +261 -235
- workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
- workbench/model_scripts/pytorch_model/generated_model_script.py +390 -188
- workbench/model_scripts/pytorch_model/pytorch.template +387 -176
- workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +19 -10
- workbench/model_scripts/uq_models/generated_model_script.py +605 -0
- workbench/model_scripts/uq_models/mapie.template +605 -0
- workbench/model_scripts/uq_models/requirements.txt +1 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +37 -46
- workbench/model_scripts/xgb_model/xgb_model.template +44 -46
- workbench/repl/workbench_shell.py +28 -14
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/lambda_test.py +73 -0
- workbench/scripts/ml_pipeline_batch.py +137 -0
- workbench/scripts/ml_pipeline_sqs.py +186 -0
- workbench/scripts/monitor_cloud_watch.py +20 -100
- workbench/utils/aws_utils.py +4 -3
- workbench/utils/chem_utils/__init__.py +0 -0
- workbench/utils/chem_utils/fingerprints.py +134 -0
- workbench/utils/chem_utils/misc.py +194 -0
- workbench/utils/chem_utils/mol_descriptors.py +483 -0
- workbench/utils/chem_utils/mol_standardize.py +450 -0
- workbench/utils/chem_utils/mol_tagging.py +348 -0
- workbench/utils/chem_utils/projections.py +209 -0
- workbench/utils/chem_utils/salts.py +256 -0
- workbench/utils/chem_utils/sdf.py +292 -0
- workbench/utils/chem_utils/toxicity.py +250 -0
- workbench/utils/chem_utils/vis.py +253 -0
- workbench/utils/chemprop_utils.py +760 -0
- workbench/utils/cloudwatch_handler.py +1 -1
- workbench/utils/cloudwatch_utils.py +137 -0
- workbench/utils/config_manager.py +3 -7
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/model_utils.py +95 -34
- workbench/utils/monitor_utils.py +44 -62
- workbench/utils/pandas_utils.py +3 -3
- workbench/utils/pytorch_utils.py +526 -0
- workbench/utils/shap_utils.py +10 -2
- workbench/utils/workbench_logging.py +0 -3
- workbench/utils/workbench_sqs.py +1 -1
- workbench/utils/xgboost_model_utils.py +371 -156
- workbench/web_interface/components/model_plot.py +7 -1
- workbench/web_interface/components/plugin_unit_test.py +5 -2
- workbench/web_interface/components/plugins/dashboard_status.py +3 -1
- workbench/web_interface/components/plugins/generated_compounds.py +1 -1
- workbench/web_interface/components/plugins/model_details.py +9 -7
- workbench/web_interface/components/plugins/scatter_plot.py +3 -3
- {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/METADATA +27 -6
- {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/RECORD +102 -86
- {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/entry_points.txt +4 -0
- {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/licenses/LICENSE +1 -1
- workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
- workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/utils/chem_utils.py +0 -1556
- workbench/utils/execution_environment.py +0 -211
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/WHEEL +0 -0
- {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/top_level.txt +0 -0
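The headline change in this release is the rewritten PyTorch Tabular model script (the `pytorch.template` / `generated_model_script.py` entries above; its diff is excerpted below). The old single train/validation split is replaced by a unified K-fold cross-validation ensemble, driven by a new `hyperparameters` template parameter with `n_folds`, `model_config`, and `training_config` keys. A minimal, self-contained sketch of the override-merge pattern the template applies — the sample `hyperparameters` dict and `typo_key` are made up for illustration; the defaults mirror the diff:

```python
# Sketch of the hyperparameter override pattern used in the template diff below.
# Only keys that exist in the defaults are honored; unknown keys are ignored.
model_defaults = {
    "layers": "256-128-64",
    "activation": "LeakyReLU",
    "learning_rate": 1e-3,
}

# Hypothetical user-supplied hyperparameters
hyperparameters = {"n_folds": 3, "model_config": {"layers": "512-256", "typo_key": 1}}

model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
model_params = {**model_defaults, **model_overrides}
print(model_params)
# {'layers': '512-256', 'activation': 'LeakyReLU', 'learning_rate': 0.001}
```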
@@ -13,38 +13,38 @@ from pytorch_tabular.models import CategoryEmbeddingModelConfig
 # Model Performance Scores
 from sklearn.metrics import (
     mean_absolute_error,
+    median_absolute_error,
     r2_score,
     root_mean_squared_error,
     precision_recall_fscore_support,
     confusion_matrix,
 )
+from scipy.stats import spearmanr
 
 # Classification Encoder
 from sklearn.preprocessing import LabelEncoder
 
 # Scikit Learn Imports
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
 
 from io import StringIO
 import json
 import argparse
 import joblib
-import os
 import pandas as pd
-from typing import List, Tuple
 
 # Template Parameters
 TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
-    "target_column": "{{target_column}}",
+    "target": "{{target_column}}",
     "features": "{{feature_list}}",
+    "id_column": "{{id_column}}",
     "compressed_features": "{{compressed_features}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}",
+    "hyperparameters": "{{hyperparameters}}",
 }
 
 
-# Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """
     Check if the provided dataframe is empty and raise an exception if it is.
@@ -59,19 +59,17 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
         raise ValueError(msg)
 
 
-def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
+def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFrame:
     """
     Expands a column in a DataFrame containing a list of probabilities into separate columns.
 
     Args:
         df (pd.DataFrame): DataFrame containing a "pred_proba" column
-        class_labels (List[str]): List of class labels
+        class_labels (list[str]): List of class labels
 
     Returns:
         pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
     """
-
-    # Sanity check
     proba_column = "pred_proba"
     if proba_column not in df.columns:
         raise ValueError('DataFrame does not contain a "pred_proba" column')
@@ -88,11 +86,10 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
 
     # Concatenate the new columns with the original DataFrame
     df = pd.concat([df, proba_df], axis=1)
-    print(df)
     return df
 
 
-def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
+def match_features_case_insensitive(df: pd.DataFrame, model_features: list[str]) -> pd.DataFrame:
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
@@ -102,7 +99,6 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     df_columns_lower = {col.lower(): col for col in df.columns}
     rename_dict = {}
     missing = []
-
     for feature in model_features:
         if feature in df.columns:
             continue  # Exact match
@@ -114,58 +110,64 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     if missing:
         raise ValueError(f"Features not found: {missing}")
 
+    # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)
 
 
-def convert_categorical_types(
+def convert_categorical_types(
+    df: pd.DataFrame, features: list[str], category_mappings: dict[str, list[str]] | None = None
+) -> tuple[pd.DataFrame, dict[str, list[str]]]:
     """
     Converts appropriate columns to categorical type with consistent mappings.
 
     Args:
         df (pd.DataFrame): The DataFrame to process.
         features (list): List of feature names to consider for conversion.
-        category_mappings (dict, optional): Existing category mappings. If empty
-                                            training mode. If populated, we're in
+        category_mappings (dict, optional): Existing category mappings. If None or empty,
+                                            we're in training mode. If populated, we're in
+                                            inference mode.
 
     Returns:
         tuple: (processed DataFrame, category mappings dictionary)
     """
+    if category_mappings is None:
+        category_mappings = {}
+
     # Training mode
-    if category_mappings
+    if not category_mappings:
         for col in df.select_dtypes(include=["object", "string"]):
             if col in features and df[col].nunique() < 20:
                 print(f"Training mode: Converting {col} to category")
                 df[col] = df[col].astype("category")
-                category_mappings[col] = df[col].cat.categories.tolist()
+                category_mappings[col] = df[col].cat.categories.tolist()
 
     # Inference mode
     else:
         for col, categories in category_mappings.items():
             if col in df.columns:
                 print(f"Inference mode: Applying categorical mapping for {col}")
-                df[col] = pd.Categorical(df[col], categories=categories)
+                df[col] = pd.Categorical(df[col], categories=categories)
 
     return df, category_mappings
 
 
 def decompress_features(
-    df: pd.DataFrame, features: List[str], compressed_features: List[str]
-) -> Tuple[pd.DataFrame, List[str]]:
+    df: pd.DataFrame, features: list[str], compressed_features: list[str]
+) -> tuple[pd.DataFrame, list[str]]:
     """Prepare features for the model
 
     Args:
         df (pd.DataFrame): The features DataFrame
-        features (List[str]): Full list of feature names
-        compressed_features (List[str]): List of feature names to decompress (bitstrings)
+        features (list[str]): Full list of feature names
+        compressed_features (list[str]): List of feature names to decompress (bitstrings)
 
     Returns:
         pd.DataFrame: DataFrame with the decompressed features
-
+        list[str]: Updated list of feature names after decompression
 
     Raises:
         ValueError: If any missing values are found in the specified features
     """
-
     # Check for any missing values in the required features
     missing_counts = df[features].isna().sum()
     if missing_counts.any():
@@ -175,10 +177,11 @@ def decompress_features(
             "WARNING: You might want to remove/replace all NaN values before processing."
         )
 
-    #
-    decompressed_features = features
+    # Make a copy to avoid mutating the original list
+    decompressed_features = features.copy()
+
     for feature in compressed_features:
-        if (feature not in df.columns) or (feature not in features):
+        if (feature not in df.columns) or (feature not in decompressed_features):
             print(f"Feature '{feature}' not in the features list, skipping decompression.")
             continue
 
@@ -203,26 +206,60 @@ def decompress_features(
     return df, decompressed_features
 
 
-def model_fn(model_dir):
+def model_fn(model_dir: str) -> dict:
+    """Load the PyTorch Tabular ensemble models from the specified directory.
+
+    Args:
+        model_dir: Directory containing the saved model(s)
+
+    Returns:
+        Dictionary with ensemble models and metadata
+    """
+    import torch
+    from functools import partial
+
+    # Load ensemble metadata if present
+    ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
+    if os.path.exists(ensemble_metadata_path):
+        ensemble_metadata = joblib.load(ensemble_metadata_path)
+        n_ensemble = ensemble_metadata["n_ensemble"]
+    else:
+        n_ensemble = 1
+
+    # Determine map_location for loading models (handle CUDA trained models on CPU inference)
+    map_location = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Patch torch.load globally to use map_location (needed for joblib-loaded callbacks)
+    # This handles the case where pytorch-tabular loads callbacks.sav via joblib,
+    # which internally calls torch.load without map_location
+    original_torch_load = torch.load
+    torch.load = partial(original_torch_load, map_location=map_location)
 
     # Save current working directory
     original_cwd = os.getcwd()
+    ensemble_models = []
+
     try:
         # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
-        os.chdir("/tmp")
+        os.chdir("/tmp")
 
-
-
-
+        for ens_idx in range(n_ensemble):
+            # Try numbered model path first, fall back to legacy path
+            model_path = os.path.join(model_dir, f"tabular_model_{ens_idx}")
+            if not os.path.exists(model_path):
+                model_path = os.path.join(model_dir, "tabular_model")
+            model = TabularModel.load_model(model_path, map_location=map_location)
+            ensemble_models.append(model)
 
-        # Restore the original working directory
     finally:
+        # Restore torch.load and working directory
+        torch.load = original_torch_load
         os.chdir(original_cwd)
 
-    return model
+    return {"ensemble_models": ensemble_models, "n_ensemble": n_ensemble}
 
 
-def input_fn(input_data, content_type):
+def input_fn(input_data, content_type: str) -> pd.DataFrame:
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
@@ -239,29 +276,34 @@ def input_fn(input_data, content_type):
         raise ValueError(f"{content_type} not supported!")
 
 
-def output_fn(output_df, accept_type):
+def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
     """Supports both CSV and JSON output formats."""
     if "text/csv" in accept_type:
-        csv_output = output_df.fillna("N/A").to_csv(index=False)
+        csv_output = output_df.fillna("N/A").to_csv(index=False)
         return csv_output, "text/csv"
     elif "application/json" in accept_type:
-        return output_df.to_json(orient="records"), "application/json"
+        return output_df.to_json(orient="records"), "application/json"
     else:
         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
 
 
-def predict_fn(df, model) -> pd.DataFrame:
-    """Make Predictions with our PyTorch Tabular Model
+def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
+    """Make Predictions with our PyTorch Tabular Model ensemble.
 
     Args:
         df (pd.DataFrame): The input DataFrame
-
+        model_dict: Dictionary containing ensemble models and metadata
 
     Returns:
-        pd.DataFrame: The DataFrame with
+        pd.DataFrame: The DataFrame with predictions (and prediction_std for ensembles)
     """
+    model_type = TEMPLATE_PARAMS["model_type"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
 
+    # Extract ensemble models
+    ensemble_models = model_dict["ensemble_models"]
+    n_ensemble = model_dict["n_ensemble"]
+
     # Grab our feature columns (from training)
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -274,12 +316,11 @@ def predict_fn(df, model) -> pd.DataFrame:
 
     # Load our Label Encoder if we have one
     label_encoder = None
-
-
+    label_encoder_path = os.path.join(model_dir, "label_encoder.joblib")
+    if os.path.exists(label_encoder_path):
+        label_encoder = joblib.load(label_encoder_path)
 
-    #
-    # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
-    # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
+    # Match features in a case-insensitive manner
     matched_df = match_features_case_insensitive(df, features)
 
     # Detect categorical types in the incoming DataFrame
@@ -290,36 +331,80 @@ def predict_fn(df, model) -> pd.DataFrame:
         print("Decompressing features for prediction...")
         matched_df, features = decompress_features(matched_df, features, compressed_features)
 
-    #
-
+    # Track rows with missing features
+    missing_mask = matched_df[features].isna().any(axis=1)
+    if missing_mask.any():
+        print(f"Warning: {missing_mask.sum()} rows have missing features, will return NaN predictions")
+
+    # Initialize prediction columns
+    df["prediction"] = np.nan
+    if model_type in ["regressor", "uq_regressor"]:
+        df["prediction_std"] = np.nan
+
+    # Only predict on complete rows
+    complete_df = matched_df[~missing_mask]
+    if len(complete_df) == 0:
+        print("Warning: No complete rows to predict on")
+        return df
 
     # pytorch-tabular returns predictions using f"{target}_prediction" column
-
-    target = TEMPLATE_PARAMS["target_column"]
+    target = TEMPLATE_PARAMS["target"]
     prediction_column = f"{target}_prediction"
-    if prediction_column in result.columns:
-        predictions = result[prediction_column].values
-    else:
-        raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
 
-    #
-
-
+    # Collect predictions from all ensemble members
+    all_ensemble_preds = []
+    all_ensemble_probs = []
+
+    for ens_idx, ens_model in enumerate(ensemble_models):
+        result = ens_model.predict(complete_df[features])
+
+        if prediction_column in result.columns:
+            ens_preds = result[prediction_column].values
+        else:
+            raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
+
+        all_ensemble_preds.append(ens_preds)
+
+        # For classification, collect probabilities
+        if label_encoder is not None:
+            prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
+            if prob_cols:
+                all_ensemble_probs.append(result[prob_cols].values)
+
+    # Stack and compute mean/std (std is 0 for single model)
+    ensemble_preds = np.stack(all_ensemble_preds, axis=0)  # (n_ensemble, n_samples)
+    preds = np.mean(ensemble_preds, axis=0)
+    preds_std = np.std(ensemble_preds, axis=0)  # Will be 0s for n_ensemble=1
 
-
-    df["prediction"] = predictions
+    print(f"Inference: Ensemble predictions shape: {preds.shape}, n_ensemble: {n_ensemble}")
 
-    #
+    # Handle classification vs regression
     if label_encoder is not None:
-
-        if
-
-
+        # For classification, average probabilities then take argmax
+        if all_ensemble_probs:
+            ensemble_probs = np.stack(all_ensemble_probs, axis=0)  # (n_ensemble, n_samples, n_classes)
+            avg_probs = np.mean(ensemble_probs, axis=0)  # (n_samples, n_classes)
+            class_preds = np.argmax(avg_probs, axis=1)
+            predictions = label_encoder.inverse_transform(class_preds)
+
+            # Build full proba Series with None for missing rows
+            all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
+            all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
+            df["pred_proba"] = all_proba
 
             # Expand the pred_proba column into separate columns for each class
             df = expand_proba_column(df, label_encoder.classes_)
+        else:
+            # No probabilities, use averaged predictions
+            predictions = label_encoder.inverse_transform(preds.astype(int))
+    else:
+        # Regression (includes uq_regressor)
+        predictions = preds
+        df.loc[~missing_mask, "prediction_std"] = preds_std
+
+    # Set predictions only for complete rows
+    df.loc[~missing_mask, "prediction"] = predictions
 
-    # All done, return the DataFrame with new columns for the predictions
     return df
 
 
@@ -327,14 +412,14 @@ if __name__ == "__main__":
     """The main function is for training the PyTorch Tabular model"""
 
    # Harness Template Parameters
-    target = TEMPLATE_PARAMS["target_column"]
+    target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
+    id_column = TEMPLATE_PARAMS["id_column"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-    train_all_data = TEMPLATE_PARAMS["train_all_data"]
-    validation_split = 0.2
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
 
     # Script arguments for input/output directories
     parser = argparse.ArgumentParser()
@@ -346,19 +431,27 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
+    # Print out some info about the dataframe
+    print(f"All Data Shape: {all_df.shape}")
+    print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
+    print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
+
     # Check if the dataframe is empty
     check_dataframe(all_df, "training_df")
 
+    # Drop any rows with missing feature values
+    initial_row_count = all_df.shape[0]
+    all_df = all_df.dropna(subset=features)
+    dropped_rows = initial_row_count - all_df.shape[0]
+    if dropped_rows > 0:
+        print(f"Dropped {dropped_rows} rows due to missing feature values.")
+
     # Features/Target output
     print(f"Target: {target}")
     print(f"Features: {str(features)}")
@@ -366,125 +459,228 @@ if __name__ == "__main__":
     # Convert any features that might be categorical to 'category' type
     all_df, category_mappings = convert_categorical_types(all_df, features)
 
+    # Print out some info about the dataframe
+    print(f"All Data Shape: {all_df.shape}")
+    print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
+    print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
+
     # If we have compressed features, decompress them
     if compressed_features:
         print(f"Decompressing features {compressed_features}...")
         all_df, features = decompress_features(all_df, features, compressed_features)
 
-    # Do we want to train on all the data?
-    if train_all_data:
-        print("Training on ALL of the data")
-        df_train = all_df.copy()
-        df_val = all_df.copy()
-
-    # Does the dataframe have a training column?
-    elif "training" in all_df.columns:
-        print("Found training column, splitting data based on training column")
-        df_train = all_df[all_df["training"]]
-        df_val = all_df[~all_df["training"]]
-    else:
-        # Just do a random training Split
-        print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
-    print(f"FIT/TRAIN: {df_train.shape}")
-    print(f"VALIDATION: {df_val.shape}")
-
     # Determine categorical and continuous columns
-    categorical_cols = [col for col in features if
+    categorical_cols = [col for col in features if all_df[col].dtype.name == "category"]
     continuous_cols = [col for col in features if col not in categorical_cols]
-
     print(f"Categorical columns: {categorical_cols}")
     print(f"Continuous columns: {continuous_cols}")
 
-    #
-    data_config = DataConfig(
-        target=[target],
-        continuous_cols=continuous_cols,
-        categorical_cols=categorical_cols,
-    )
+    # Cast continuous columns to float
+    all_df[continuous_cols] = all_df[continuous_cols].astype("float64")
 
-    trainer_config = TrainerConfig(
-        auto_lr_find=True,
-        batch_size=min(1024, len(df_train) // 4),
-        max_epochs=100,
-        early_stopping="valid_loss",
-        early_stopping_patience=15,
-        checkpoints="valid_loss",
-        accelerator="auto",
-        progress_bar="none",
-        gradient_clip_val=1.0,
-    )
-
-    optimizer_config = OptimizerConfig()
-
-    # Choose model configuration based on model type
+    # Choose the 'task' based on model type and set up the label encoder if needed
     if model_type == "classifier":
         task = "classification"
-        # Encode the target column
+        # Encode the target column on full dataset for consistent encoding
         label_encoder = LabelEncoder()
-
-
+        all_df[target] = label_encoder.fit_transform(all_df[target])
+        num_classes = len(label_encoder.classes_)
     else:
         task = "regression"
         label_encoder = None
+        num_classes = None
+
+    # Use any hyperparameters to set up both the trainer and model configurations
+    print(f"Hyperparameters: {hyperparameters}")
+    n_folds = hyperparameters.get("n_folds", 5)  # Number of CV folds (default: 5)
+
+    # =========================================================================
+    # UNIFIED TRAINING: Works for n_folds=1 (single model) or n_folds>1 (K-fold CV)
+    # =========================================================================
+    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
+
+    # Create fold splits
+    if n_folds == 1:
+        # Single fold: use train/val split from "training" column or random split
+        if "training" in all_df.columns:
+            print("Found training column, splitting data based on training column")
+            train_idx = np.where(all_df["training"])[0]
+            val_idx = np.where(~all_df["training"])[0]
+        else:
+            print("WARNING: No training column found, splitting data with random 80/20 split")
+            indices = np.arange(len(all_df))
+            train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+        folds = [(train_idx, val_idx)]
+    else:
+        # K-Fold CV
+        if model_type == "classifier":
+            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
+            split_target = all_df[target]
+        else:
+            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
+            split_target = None
+        folds = list(kfold.split(all_df, split_target))
+
+    # Initialize storage for out-of-fold predictions
+    oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
+    if model_type == "classifier" and num_classes and num_classes > 1:
+        oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
+    else:
+        oof_proba = None
 
-
-    model_config = CategoryEmbeddingModelConfig(
-        task=task,
-        layers="1024-512-512",
-        activation="ReLU",
-        learning_rate=1e-3,
-        dropout=0.1,
-        use_batch_norm=True,
-        initialization="kaiming",
-    )
+    ensemble_models = []
 
-    #
-    tabular_model = TabularModel(
-        data_config=data_config,
-        model_config=model_config,
-        optimizer_config=optimizer_config,
-        trainer_config=trainer_config,
+    # Set up PyTorch Tabular data configuration (shared across folds)
+    data_config = DataConfig(
+        target=[target],
+        continuous_cols=continuous_cols,
+        categorical_cols=categorical_cols,
     )
 
-    #
-
+    # Model config defaults
+    model_defaults = {
+        "layers": "256-128-64",
+        "activation": "LeakyReLU",
+        "learning_rate": 1e-3,
+        "dropout": 0.1,
+        "use_batch_norm": True,
+        "initialization": "kaiming",
+    }
+    # Override defaults with model_config if present
+    model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
+    for key, value in model_overrides.items():
+        print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
+    model_params = {**model_defaults, **model_overrides}
+
+    model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
+    optimizer_config = OptimizerConfig()
 
-
-
-
+    for fold_idx, (train_idx, val_idx) in enumerate(folds):
+        print(f"\n{'='*50}")
+        print(f"Training Fold {fold_idx + 1}/{len(folds)}")
+        print(f"{'='*50}")
+
+        # Split data for this fold
+        df_train = all_df.iloc[train_idx].reset_index(drop=True)
+        df_val = all_df.iloc[val_idx].reset_index(drop=True)
+
+        print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
+
+        # Set up PyTorch Tabular trainer configuration (per-fold for batch_size)
+        # Calculate batch size that avoids single-sample last batch (batch norm requires >1)
+        batch_size = min(128, max(32, len(df_train) // 16))
+        if len(df_train) % batch_size == 1:
+            batch_size += 1  # Adjust to avoid last batch of size 1
+        trainer_defaults = {
+            "auto_lr_find": False,
+            "batch_size": batch_size,
+            "max_epochs": 200,
+            "min_epochs": 10,
+            "early_stopping": "valid_loss",
+            "early_stopping_patience": 20,
+            "checkpoints": "valid_loss",
+            "accelerator": "auto",
+            "progress_bar": "none",
+            "gradient_clip_val": 1.0,
+            "seed": 42 + fold_idx,
+        }
+
+        # Override defaults with training_config if present
+        training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
+        if fold_idx == 0:  # Only print overrides once
+            for key, value in training_overrides.items():
+                print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
+        trainer_params = {**trainer_defaults, **training_overrides}
+        trainer_config = TrainerConfig(**trainer_params)
+
+        # Create and train the TabularModel for this fold
+        tabular_model = TabularModel(
+            data_config=data_config,
+            model_config=model_config,
+            optimizer_config=optimizer_config,
+            trainer_config=trainer_config,
+        )
+        tabular_model.fit(train=df_train, validation=df_val)
+        ensemble_models.append(tabular_model)
+
+        # Make out-of-fold predictions
+        result = tabular_model.predict(df_val, include_input_features=False)
+        fold_preds = result[f"{target}_prediction"].values
+
+        # Store out-of-fold predictions
+        if model_type == "classifier":
+            oof_predictions[val_idx] = fold_preds.astype(int)
+            prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
+            if prob_cols and oof_proba is not None:
+                oof_proba[val_idx] = result[prob_cols].values
+        else:
+            oof_predictions[val_idx] = fold_preds.flatten()
 
-
-
-
-
+        print(f"Fold {fold_idx + 1} complete!")
+
+    print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
+
+    # Use out-of-fold predictions for metrics
+    # For n_folds=1, we only have predictions for val_idx, so filter to those rows
+    if n_folds == 1:
+        val_mask = ~np.isnan(oof_predictions)
+        preds = oof_predictions[val_mask]
+        df_val = all_df[val_mask].copy()
+        if oof_proba is not None:
+            oof_proba = oof_proba[val_mask]
     else:
-
-
+        preds = oof_predictions
+        df_val = all_df.copy()
+
+    # Compute prediction_std by running all ensemble models on validation data
+    # For n_folds=1, std will be 0 (only one model). For n_folds>1, std shows ensemble disagreement.
+    preds_std = None
+    if model_type in ["regressor", "uq_regressor"] and len(ensemble_models) > 0:
+        print("Computing prediction_std from ensemble predictions on validation data...")
+        all_ensemble_preds_for_std = []
+        for ens_model in ensemble_models:
+            result = ens_model.predict(df_val[features], include_input_features=False)
+            ens_preds = result[f"{target}_prediction"].values.flatten()
+            all_ensemble_preds_for_std.append(ens_preds)
+
+        ensemble_preds_stacked = np.stack(all_ensemble_preds_for_std, axis=0)
+        preds_std = np.std(ensemble_preds_stacked, axis=0)
+        print(f"Ensemble prediction_std - mean: {np.mean(preds_std):.4f}, max: {np.max(preds_std):.4f}")
 
     if model_type == "classifier":
         # Get probabilities for classification
-
-
-
-        probs = result[prob_cols].values
-        df_val["pred_proba"] = [p.tolist() for p in probs]
-
-        # Expand the pred_proba column into separate columns for each class
-        print(df_val.columns)
+        if oof_proba is not None:
+            df_val = df_val.copy()
+            df_val["pred_proba"] = [p.tolist() for p in oof_proba]
         df_val = expand_proba_column(df_val, label_encoder.classes_)
-        print(df_val.columns)
 
         # Decode the target and prediction labels
         y_validate = label_encoder.inverse_transform(df_val[target])
-
+        preds_decoded = label_encoder.inverse_transform(preds.astype(int))
     else:
         y_validate = df_val[target].values
+        preds_decoded = preds
+
+    # Save predictions to S3
+    df_val = df_val.copy()
+    df_val["prediction"] = preds_decoded
+
+    # Build output columns - include id_column if it exists
+    output_columns = []
+    if id_column in df_val.columns:
+        output_columns.append(id_column)
+    output_columns += [target, "prediction"]
+
+    # Add prediction_std for regression models (always present, 0 for single model)
+    if model_type in ["regressor", "uq_regressor"]:
+        if preds_std is not None:
+            df_val["prediction_std"] = preds_std
+        else:
+            df_val["prediction_std"] = 0.0
+        output_columns.append("prediction_std")
+        print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
 
-
-    df_val["prediction"] = preds
-    output_columns = [target, "prediction"]
-    output_columns += [col for col in df_val.columns if col.endswith("_probability")]
+    output_columns += [col for col in df_val.columns if col.endswith("_proba")]
     wr.s3.to_csv(
         df_val[output_columns],
         path=f"{model_metrics_s3_path}/validation_predictions.csv",
@@ -497,7 +693,7 @@ if __name__ == "__main__":
         label_names = label_encoder.classes_
 
         # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
+        scores = precision_recall_fscore_support(y_validate, preds_decoded, average=None, labels=label_names)
 
         # Put the scores into a dataframe
         score_df = pd.DataFrame(
@@ -505,20 +701,20 @@ if __name__ == "__main__":
                 target: label_names,
                 "precision": scores[0],
                 "recall": scores[1],
-                "fscore": scores[2],
+                "f1": scores[2],
                 "support": scores[3],
             }
         )
 
-        #
-        metrics = ["precision", "recall", "fscore", "support"]
+        # Output metrics per class
+        metrics = ["precision", "recall", "f1", "support"]
         for t in label_names:
             for m in metrics:
                 value = score_df.loc[score_df[target] == t, m].iloc[0]
                 print(f"Metrics:{t}:{m} {value}")
 
         # Compute and output the confusion matrix
-        conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
+        conf_mtx = confusion_matrix(y_validate, preds_decoded, labels=label_names)
         for i, row_name in enumerate(label_names):
             for j, col_name in enumerate(label_names):
                 value = conf_mtx[i, j]
@@ -526,22 +722,37 @@ if __name__ == "__main__":
 
     else:
         # Calculate various model performance metrics (regression)
-        rmse = root_mean_squared_error(y_validate, preds)
-        mae = mean_absolute_error(y_validate, preds)
-
-
-
-
-        print(f"
-
-
-
+        rmse = root_mean_squared_error(y_validate, preds_decoded)
+        mae = mean_absolute_error(y_validate, preds_decoded)
+        medae = median_absolute_error(y_validate, preds_decoded)
+        r2 = r2_score(y_validate, preds_decoded)
+        spearman_corr = spearmanr(y_validate, preds_decoded).correlation
+        support = len(df_val)
+        print(f"rmse: {rmse:.3f}")
+        print(f"mae: {mae:.3f}")
+        print(f"medae: {medae:.3f}")
+        print(f"r2: {r2:.3f}")
+        print(f"spearmanr: {spearman_corr:.3f}")
+        print(f"support: {support}")
+
+    # Save ensemble models
+    for model_idx, ens_model in enumerate(ensemble_models):
+        model_path = os.path.join(args.model_dir, f"tabular_model_{model_idx}")
+        ens_model.save_model(model_path)
+        print(f"Saved model {model_idx + 1} to {model_path}")
+
+    # Save ensemble metadata
+    n_ensemble = len(ensemble_models)
+    ensemble_metadata = {"n_ensemble": n_ensemble, "n_folds": n_folds}
+    joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
+    print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")
+
     if label_encoder:
         joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
 
     # Save the features (this will validate input during predictions)
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(orig_features, fp)
+        json.dump(orig_features, fp)
 
     # Save the category mappings
     with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp: