workbench 0.8.160__py3-none-any.whl → 0.8.202__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of workbench might be problematic.
- workbench/algorithms/dataframe/__init__.py +1 -2
- workbench/algorithms/dataframe/fingerprint_proximity.py +2 -2
- workbench/algorithms/dataframe/proximity.py +261 -235
- workbench/algorithms/graph/light/proximity_graph.py +10 -8
- workbench/api/__init__.py +2 -1
- workbench/api/compound.py +1 -1
- workbench/api/endpoint.py +11 -0
- workbench/api/feature_set.py +12 -8
- workbench/api/meta.py +5 -2
- workbench/api/model.py +16 -15
- workbench/api/monitor.py +1 -16
- workbench/api/parameter_store.py +5 -0
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +11 -3
- workbench/core/artifacts/data_capture_core.py +355 -0
- workbench/core/artifacts/endpoint_core.py +256 -118
- workbench/core/artifacts/feature_set_core.py +265 -16
- workbench/core/artifacts/model_core.py +110 -63
- workbench/core/artifacts/monitor_core.py +33 -248
- workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
- workbench/core/cloud_platform/aws/aws_meta.py +12 -5
- workbench/core/cloud_platform/aws/aws_parameter_store.py +18 -2
- workbench/core/cloud_platform/aws/aws_session.py +4 -4
- workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
- workbench/core/transforms/features_to_model/features_to_model.py +45 -33
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
- workbench/core/views/training_view.py +113 -42
- workbench/core/views/view.py +53 -3
- workbench/core/views/view_utils.py +4 -4
- workbench/model_scripts/chemprop/chemprop.template +852 -0
- workbench/model_scripts/chemprop/generated_model_script.py +852 -0
- workbench/model_scripts/chemprop/requirements.txt +11 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
- workbench/model_scripts/custom_models/proximity/proximity.py +261 -235
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +166 -62
- workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
- workbench/model_scripts/custom_models/uq_models/proximity.py +261 -235
- workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
- workbench/model_scripts/pytorch_model/generated_model_script.py +390 -188
- workbench/model_scripts/pytorch_model/pytorch.template +387 -176
- workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +19 -10
- workbench/model_scripts/uq_models/generated_model_script.py +605 -0
- workbench/model_scripts/uq_models/mapie.template +605 -0
- workbench/model_scripts/uq_models/requirements.txt +1 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +37 -46
- workbench/model_scripts/xgb_model/xgb_model.template +44 -46
- workbench/repl/workbench_shell.py +28 -14
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/lambda_test.py +73 -0
- workbench/scripts/ml_pipeline_batch.py +137 -0
- workbench/scripts/ml_pipeline_sqs.py +186 -0
- workbench/scripts/monitor_cloud_watch.py +20 -100
- workbench/utils/aws_utils.py +4 -3
- workbench/utils/chem_utils/__init__.py +0 -0
- workbench/utils/chem_utils/fingerprints.py +134 -0
- workbench/utils/chem_utils/misc.py +194 -0
- workbench/utils/chem_utils/mol_descriptors.py +483 -0
- workbench/utils/chem_utils/mol_standardize.py +450 -0
- workbench/utils/chem_utils/mol_tagging.py +348 -0
- workbench/utils/chem_utils/projections.py +209 -0
- workbench/utils/chem_utils/salts.py +256 -0
- workbench/utils/chem_utils/sdf.py +292 -0
- workbench/utils/chem_utils/toxicity.py +250 -0
- workbench/utils/chem_utils/vis.py +253 -0
- workbench/utils/chemprop_utils.py +760 -0
- workbench/utils/cloudwatch_handler.py +1 -1
- workbench/utils/cloudwatch_utils.py +137 -0
- workbench/utils/config_manager.py +3 -7
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/model_utils.py +95 -34
- workbench/utils/monitor_utils.py +44 -62
- workbench/utils/pandas_utils.py +3 -3
- workbench/utils/pytorch_utils.py +526 -0
- workbench/utils/shap_utils.py +10 -2
- workbench/utils/workbench_logging.py +0 -3
- workbench/utils/workbench_sqs.py +1 -1
- workbench/utils/xgboost_model_utils.py +371 -156
- workbench/web_interface/components/model_plot.py +7 -1
- workbench/web_interface/components/plugin_unit_test.py +5 -2
- workbench/web_interface/components/plugins/dashboard_status.py +3 -1
- workbench/web_interface/components/plugins/generated_compounds.py +1 -1
- workbench/web_interface/components/plugins/model_details.py +9 -7
- workbench/web_interface/components/plugins/scatter_plot.py +3 -3
- {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/METADATA +27 -6
- {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/RECORD +102 -86
- {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/entry_points.txt +4 -0
- {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/licenses/LICENSE +1 -1
- workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
- workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/utils/chem_utils.py +0 -1556
- workbench/utils/execution_environment.py +0 -211
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/WHEEL +0 -0
- {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/top_level.txt +0 -0
@@ -8,43 +8,43 @@ import numpy as np
 os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
 from pytorch_tabular import TabularModel
 from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
-from pytorch_tabular.models import CategoryEmbeddingModelConfig
+from pytorch_tabular.models import CategoryEmbeddingModelConfig
 
 # Model Performance Scores
 from sklearn.metrics import (
     mean_absolute_error,
+    median_absolute_error,
     r2_score,
     root_mean_squared_error,
     precision_recall_fscore_support,
     confusion_matrix,
 )
+from scipy.stats import spearmanr
 
 # Classification Encoder
 from sklearn.preprocessing import LabelEncoder
 
 # Scikit Learn Imports
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
 
 from io import StringIO
 import json
 import argparse
 import joblib
-import os
 import pandas as pd
-from typing import List, Tuple
 
 # Template Parameters
 TEMPLATE_PARAMS = {
-    "model_type": "
-    "
-    "features": ['
+    "model_type": "uq_regressor",
+    "target": "udm_asy_res_efflux_ratio",
+    "features": ['smr_vsa4', 'tpsa', 'nhohcount', 'peoe_vsa1', 'mollogp', 'numhdonors', 'tertiary_amine_count', 'smr_vsa3', 'nitrogen_span', 'vsa_estate2', 'hba_hbd_ratio', 'minpartialcharge', 'estate_vsa4', 'asphericity', 'charge_centroid_distance', 'peoe_vsa8', 'mi', 'estate_vsa8', 'vsa_estate6', 'vsa_estate3', 'molecular_volume_3d', 'kappa3', 'smr_vsa5', 'sv', 'xp_6dv', 'xc_4dv', 'si', 'molecular_axis_length', 'axp_5d', 'estate_vsa3', 'estate_vsa10', 'axp_7dv', 'slogp_vsa1', 'molecular_asymmetry', 'molmr', 'qed', 'xp_3d', 'axp_0dv', 'fpdensitymorgan1', 'minabsestateindex', 'numatomstereocenters', 'fpdensitymorgan2', 'slogp_vsa2', 'xch_5dv', 'num_s_centers', 'aromatic_interaction_score', 'axp_2dv', 'chi1v', 'hallkieralpha', 'vsa_estate8', 'peoe_vsa9', 'type_ii_pattern_count', 'slogp_vsa5', 'xc_3d', 'amphiphilic_moment', 'bcut2d_logphi', 'estate_vsa6', 'xc_3dv', 'chi0n', 'vsa_estate5', 'xpc_6d', 'vsa_estate7', 'axp_1d', 'axp_7d', 'xch_4dv', 'phi', 'maxestateindex', 'sps', 'bcut2d_mrlow', 'vsa_estate4', 'avgipc', 'bcut2d_mrhi', 'bcut2d_logplow', 'axp_1dv', 'kappa1', 'vsa_estate9', 'fr_imidazole', 'axp_6d', 'radius_of_gyration', 'chi2v', 'chi4n', 'xp_7d', 'smr_vsa6', 'axp_2d', 'num_r_centers', 'xch_7dv', 'estate_vsa2', 'axp_4d', 'maxpartialcharge', 'xpc_5dv', 'xp_5d', 'chi3n', 'chi2n', 'vsa_estate1', 'slogp_vsa11', 'bcut2d_mwlow', 'mm', 'c3sp3', 'numhacceptors', 'fr_nhpyrrole', 'labuteasa', 'fpdensitymorgan3', 'bcut2d_chghi', 'axp_3dv', 'c1sp3', 'kappa2', 'smr_vsa9', 'xp_6d', 'estate_vsa7', 'axp_6dv', 'sp', 'estate_vsa5', 'peoe_vsa2', 'smr_vsa1', 'mp', 'minestateindex', 'axp_3d', 'axp_4dv', 'chi0v', 'slogp_vsa3', 'heavyatommolwt', 'smr_vsa7', 'peoe_vsa6', 'mv', 'xp_4d', 'peoe_vsa7', 'mpe', 'chi4v', 'maxabspartialcharge', 'bcut2d_chglo', 'c1sp2', 'xp_0dv', 'smr_vsa10', 'estate_vsa1', 'fr_pyridine', 'bcut2d_mwhi', 'spe', 'balabanj', 'xch_7d', 'estate_vsa9', 'xp_3dv', 'fr_piperzine', 'xch_6dv', 'slogp_vsa8', 'peoe_vsa10', 'xp_4dv', 'c3sp2', 'fr_al_oh', 'xc_5d', 'fractioncsp3', 'fr_bicyclic', 'fr_piperdine', 'peoe_vsa12', 'peoe_vsa11', 'numheteroatoms', 'mse', 'xp_7dv', 'chi1', 'xpc_6dv', 'numsaturatedcarbocycles', 'chi1n', 'bertzct', 'xc_5dv', 'chi3v', 'intramolecular_hbond_potential', 'peoe_vsa4', 'xpc_5d', 'xp_2d', 'nbase', 'fr_priamide', 'slogp_vsa4', 'naromatom', 'vsa_estate10', 'fr_nitrile', 'molwt', 'peoe_vsa13', 'xch_4d', 'xp_5dv', 'numaromaticheterocycles', 'xpc_4dv', 'fr_hoccn', 'nocount', 'fr_nh1', 'mz', 'xc_6dv', 'hybratio', 'fr_imine', 'fr_morpholine', 'xpc_4d', 'xch_5d', 'numvalenceelectrons', 'numheterocycles', 'fr_aniline', 'fr_nh0', 'frac_defined_stereo', 'fr_benzene', 'xp_2dv', 'type_i_pattern_count', 'fr_ketone_topliss', 'fr_aryl_methyl', 'heavyatomcount', 'mare', 'axp_5dv', 'exactmolwt', 'xch_6d', 'xp_1d', 'xch_3d', 'axp_0d', 'amide_count', 'sse', 'slogp_vsa7', 'c2sp2', 'numrotatablebonds', 'chi0', 'xc_4d', 'slogp_vsa10', 'fr_al_oh_notert', 'numspiroatoms', 'numsaturatedrings', 'minabspartialcharge', 'fr_sulfone', 'slogp_vsa6', 'smr_vsa2', 'num_defined_stereocenters', 'numbridgeheadatoms', 'peoe_vsa3', 'numaliphaticheterocycles', 'fr_ndealkylation1', 'xc_6d'],
+    "id_column": "udm_mol_bat_id",
     "compressed_features": [],
-    "model_metrics_s3_path": "s3://
-    "
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-pytorch-test/training",
+    "hyperparameters": {'n_folds': 5, 'training_config': {'max_epochs': 200, 'early_stopping_patience': 20}, 'model_config': {'layers': '256-128-64', 'dropout': 0.1, 'learning_rate': 0.001, 'activation': 'LeakyReLU'}},
 }
 
 
-# Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """
     Check if the provided dataframe is empty and raise an exception if it is.
@@ -59,19 +59,17 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
         raise ValueError(msg)
 
 
-def expand_proba_column(df: pd.DataFrame, class_labels:
+def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFrame:
     """
     Expands a column in a DataFrame containing a list of probabilities into separate columns.
 
     Args:
         df (pd.DataFrame): DataFrame containing a "pred_proba" column
-        class_labels (
+        class_labels (list[str]): List of class labels
 
     Returns:
         pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
     """
-
-    # Sanity check
     proba_column = "pred_proba"
     if proba_column not in df.columns:
         raise ValueError('DataFrame does not contain a "pred_proba" column')
@@ -88,11 +86,10 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
 
     # Concatenate the new columns with the original DataFrame
     df = pd.concat([df, proba_df], axis=1)
-    print(df)
     return df
 
 
-def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
+def match_features_case_insensitive(df: pd.DataFrame, model_features: list[str]) -> pd.DataFrame:
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
@@ -102,7 +99,6 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     df_columns_lower = {col.lower(): col for col in df.columns}
     rename_dict = {}
     missing = []
-
     for feature in model_features:
         if feature in df.columns:
             continue  # Exact match
@@ -114,58 +110,64 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     if missing:
         raise ValueError(f"Features not found: {missing}")
 
+    # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)
 
 
-def convert_categorical_types(
+def convert_categorical_types(
+    df: pd.DataFrame, features: list[str], category_mappings: dict[str, list[str]] | None = None
+) -> tuple[pd.DataFrame, dict[str, list[str]]]:
     """
     Converts appropriate columns to categorical type with consistent mappings.
 
     Args:
         df (pd.DataFrame): The DataFrame to process.
         features (list): List of feature names to consider for conversion.
-        category_mappings (dict, optional): Existing category mappings. If empty
-            training mode. If populated, we're in
+        category_mappings (dict, optional): Existing category mappings. If None or empty,
+            we're in training mode. If populated, we're in
+            inference mode.
 
     Returns:
         tuple: (processed DataFrame, category mappings dictionary)
     """
+    if category_mappings is None:
+        category_mappings = {}
+
     # Training mode
-    if category_mappings
+    if not category_mappings:
         for col in df.select_dtypes(include=["object", "string"]):
             if col in features and df[col].nunique() < 20:
                 print(f"Training mode: Converting {col} to category")
                 df[col] = df[col].astype("category")
-                category_mappings[col] = df[col].cat.categories.tolist()
+                category_mappings[col] = df[col].cat.categories.tolist()
 
     # Inference mode
     else:
         for col, categories in category_mappings.items():
             if col in df.columns:
                 print(f"Inference mode: Applying categorical mapping for {col}")
-                df[col] = pd.Categorical(df[col], categories=categories)
+                df[col] = pd.Categorical(df[col], categories=categories)
 
     return df, category_mappings
 
 
 def decompress_features(
-    df: pd.DataFrame, features:
-) ->
+    df: pd.DataFrame, features: list[str], compressed_features: list[str]
+) -> tuple[pd.DataFrame, list[str]]:
     """Prepare features for the model
 
     Args:
         df (pd.DataFrame): The features DataFrame
-        features (
-        compressed_features (
+        features (list[str]): Full list of feature names
+        compressed_features (list[str]): List of feature names to decompress (bitstrings)
 
     Returns:
         pd.DataFrame: DataFrame with the decompressed features
-
+        list[str]: Updated list of feature names after decompression
 
     Raises:
         ValueError: If any missing values are found in the specified features
     """
-
     # Check for any missing values in the required features
     missing_counts = df[features].isna().sum()
     if missing_counts.any():
@@ -175,10 +177,11 @@ def decompress_features(
             "WARNING: You might want to remove/replace all NaN values before processing."
         )
 
-    #
-    decompressed_features = features
+    # Make a copy to avoid mutating the original list
+    decompressed_features = features.copy()
+
     for feature in compressed_features:
-        if (feature not in df.columns) or (feature not in
+        if (feature not in df.columns) or (feature not in decompressed_features):
             print(f"Feature '{feature}' not in the features list, skipping decompression.")
             continue
 
@@ -203,35 +206,60 @@ def decompress_features(
     return df, decompressed_features
 
 
-def model_fn(model_dir):
-    """
-    #
-    os.environ['TEMP'] = '/tmp'
-    model_path = os.path.join(model_dir, "tabular_model")
-    model = TabularModel.load_model(model_path)
-    return model
+def model_fn(model_dir: str) -> dict:
+    """Load the PyTorch Tabular ensemble models from the specified directory.
 
+    Args:
+        model_dir: Directory containing the saved model(s)
 
-
+    Returns:
+        Dictionary with ensemble models and metadata
+    """
+    import torch
+    from functools import partial
+
+    # Load ensemble metadata if present
+    ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
+    if os.path.exists(ensemble_metadata_path):
+        ensemble_metadata = joblib.load(ensemble_metadata_path)
+        n_ensemble = ensemble_metadata["n_ensemble"]
+    else:
+        n_ensemble = 1
+
+    # Determine map_location for loading models (handle CUDA trained models on CPU inference)
+    map_location = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Patch torch.load globally to use map_location (needed for joblib-loaded callbacks)
+    # This handles the case where pytorch-tabular loads callbacks.sav via joblib,
+    # which internally calls torch.load without map_location
+    original_torch_load = torch.load
+    torch.load = partial(original_torch_load, map_location=map_location)
 
     # Save current working directory
     original_cwd = os.getcwd()
+    ensemble_models = []
+
     try:
         # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
-        os.chdir(
+        os.chdir("/tmp")
 
-
-
-
+        for ens_idx in range(n_ensemble):
+            # Try numbered model path first, fall back to legacy path
+            model_path = os.path.join(model_dir, f"tabular_model_{ens_idx}")
+            if not os.path.exists(model_path):
+                model_path = os.path.join(model_dir, "tabular_model")
+            model = TabularModel.load_model(model_path, map_location=map_location)
+            ensemble_models.append(model)
 
-        # Restore the original working directory
     finally:
+        # Restore torch.load and working directory
+        torch.load = original_torch_load
         os.chdir(original_cwd)
 
-    return
+    return {"ensemble_models": ensemble_models, "n_ensemble": n_ensemble}
 
 
-def input_fn(input_data, content_type):
+def input_fn(input_data, content_type: str) -> pd.DataFrame:
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
@@ -248,29 +276,34 @@ def input_fn(input_data, content_type):
         raise ValueError(f"{content_type} not supported!")
 
 
-def output_fn(output_df, accept_type):
+def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
     """Supports both CSV and JSON output formats."""
     if "text/csv" in accept_type:
-        csv_output = output_df.fillna("N/A").to_csv(index=False)
+        csv_output = output_df.fillna("N/A").to_csv(index=False)
         return csv_output, "text/csv"
     elif "application/json" in accept_type:
-        return output_df.to_json(orient="records"), "application/json"
+        return output_df.to_json(orient="records"), "application/json"
     else:
         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
 
 
-def predict_fn(df,
-    """Make Predictions with our PyTorch Tabular Model
+def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
+    """Make Predictions with our PyTorch Tabular Model ensemble.
 
     Args:
         df (pd.DataFrame): The input DataFrame
-
+        model_dict: Dictionary containing ensemble models and metadata
 
     Returns:
-        pd.DataFrame: The DataFrame with
+        pd.DataFrame: The DataFrame with predictions (and prediction_std for ensembles)
     """
+    model_type = TEMPLATE_PARAMS["model_type"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
 
+    # Extract ensemble models
+    ensemble_models = model_dict["ensemble_models"]
+    n_ensemble = model_dict["n_ensemble"]
+
     # Grab our feature columns (from training)
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -283,12 +316,11 @@ def predict_fn(df, model) -> pd.DataFrame:
 
     # Load our Label Encoder if we have one
     label_encoder = None
-
-
+    label_encoder_path = os.path.join(model_dir, "label_encoder.joblib")
+    if os.path.exists(label_encoder_path):
+        label_encoder = joblib.load(label_encoder_path)
 
-    #
-    # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
-    # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
+    # Match features in a case-insensitive manner
     matched_df = match_features_case_insensitive(df, features)
 
     # Detect categorical types in the incoming DataFrame
@@ -299,36 +331,80 @@ def predict_fn(df, model) -> pd.DataFrame:
         print("Decompressing features for prediction...")
         matched_df, features = decompress_features(matched_df, features, compressed_features)
 
-    #
-
+    # Track rows with missing features
+    missing_mask = matched_df[features].isna().any(axis=1)
+    if missing_mask.any():
+        print(f"Warning: {missing_mask.sum()} rows have missing features, will return NaN predictions")
+
+    # Initialize prediction columns
+    df["prediction"] = np.nan
+    if model_type in ["regressor", "uq_regressor"]:
+        df["prediction_std"] = np.nan
+
+    # Only predict on complete rows
+    complete_df = matched_df[~missing_mask]
+    if len(complete_df) == 0:
+        print("Warning: No complete rows to predict on")
+        return df
 
     # pytorch-tabular returns predictions using f"{target}_prediction" column
-
-    target = TEMPLATE_PARAMS["target_column"]
+    target = TEMPLATE_PARAMS["target"]
     prediction_column = f"{target}_prediction"
-    if prediction_column in result.columns:
-        predictions = result[prediction_column].values
-    else:
-        raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
 
-    #
-
-
+    # Collect predictions from all ensemble members
+    all_ensemble_preds = []
+    all_ensemble_probs = []
+
+    for ens_idx, ens_model in enumerate(ensemble_models):
+        result = ens_model.predict(complete_df[features])
+
+        if prediction_column in result.columns:
+            ens_preds = result[prediction_column].values
+        else:
+            raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
 
-
-    df["prediction"] = predictions
+        all_ensemble_preds.append(ens_preds)
 
-
+        # For classification, collect probabilities
+        if label_encoder is not None:
+            prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
+            if prob_cols:
+                all_ensemble_probs.append(result[prob_cols].values)
+
+    # Stack and compute mean/std (std is 0 for single model)
+    ensemble_preds = np.stack(all_ensemble_preds, axis=0)  # (n_ensemble, n_samples)
+    preds = np.mean(ensemble_preds, axis=0)
+    preds_std = np.std(ensemble_preds, axis=0)  # Will be 0s for n_ensemble=1
+
+    print(f"Inference: Ensemble predictions shape: {preds.shape}, n_ensemble: {n_ensemble}")
+
+    # Handle classification vs regression
     if label_encoder is not None:
-
-        if
-
-
+        # For classification, average probabilities then take argmax
+        if all_ensemble_probs:
+            ensemble_probs = np.stack(all_ensemble_probs, axis=0)  # (n_ensemble, n_samples, n_classes)
+            avg_probs = np.mean(ensemble_probs, axis=0)  # (n_samples, n_classes)
+            class_preds = np.argmax(avg_probs, axis=1)
+            predictions = label_encoder.inverse_transform(class_preds)
+
+            # Build full proba Series with None for missing rows
+            all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
+            all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
+            df["pred_proba"] = all_proba
 
             # Expand the pred_proba column into separate columns for each class
             df = expand_proba_column(df, label_encoder.classes_)
+        else:
+            # No probabilities, use averaged predictions
+            predictions = label_encoder.inverse_transform(preds.astype(int))
+    else:
+        # Regression (includes uq_regressor)
+        predictions = preds
+        df.loc[~missing_mask, "prediction_std"] = preds_std
+
+    # Set predictions only for complete rows
+    df.loc[~missing_mask, "prediction"] = predictions
 
-    # All done, return the DataFrame with new columns for the predictions
     return df
 
 
@@ -336,14 +412,14 @@ if __name__ == "__main__":
     """The main function is for training the PyTorch Tabular model"""
 
     # Harness Template Parameters
-    target = TEMPLATE_PARAMS["
+    target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
+    id_column = TEMPLATE_PARAMS["id_column"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-
-    validation_split = 0.2
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
 
     # Script arguments for input/output directories
    parser = argparse.ArgumentParser()
@@ -355,19 +431,27 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
+    # Print out some info about the dataframe
+    print(f"All Data Shape: {all_df.shape}")
+    print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
+    print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
+
     # Check if the dataframe is empty
     check_dataframe(all_df, "training_df")
 
+    # Drop any rows with missing feature values
+    initial_row_count = all_df.shape[0]
+    all_df = all_df.dropna(subset=features)
+    dropped_rows = initial_row_count - all_df.shape[0]
+    if dropped_rows > 0:
+        print(f"Dropped {dropped_rows} rows due to missing feature values.")
+
     # Features/Target output
     print(f"Target: {target}")
     print(f"Features: {str(features)}")
@@ -375,125 +459,228 @@ if __name__ == "__main__":
     # Convert any features that might be categorical to 'category' type
     all_df, category_mappings = convert_categorical_types(all_df, features)
 
+    # Print out some info about the dataframe
+    print(f"All Data Shape: {all_df.shape}")
+    print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
+    print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
+
     # If we have compressed features, decompress them
     if compressed_features:
         print(f"Decompressing features {compressed_features}...")
         all_df, features = decompress_features(all_df, features, compressed_features)
 
-    # Do we want to train on all the data?
-    if train_all_data:
-        print("Training on ALL of the data")
-        df_train = all_df.copy()
-        df_val = all_df.copy()
-
-    # Does the dataframe have a training column?
-    elif "training" in all_df.columns:
-        print("Found training column, splitting data based on training column")
-        df_train = all_df[all_df["training"]]
-        df_val = all_df[~all_df["training"]]
-    else:
-        # Just do a random training Split
-        print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
-    print(f"FIT/TRAIN: {df_train.shape}")
-    print(f"VALIDATION: {df_val.shape}")
-
     # Determine categorical and continuous columns
-    categorical_cols = [col for col in features if
+    categorical_cols = [col for col in features if all_df[col].dtype.name == "category"]
     continuous_cols = [col for col in features if col not in categorical_cols]
-
     print(f"Categorical columns: {categorical_cols}")
     print(f"Continuous columns: {continuous_cols}")
 
-    #
-
-        target=[target],
-        continuous_cols=continuous_cols,
-        categorical_cols=categorical_cols,
-    )
-
-    trainer_config = TrainerConfig(
-        auto_lr_find=True,
-        batch_size=min(1024, len(df_train) // 4),
-        max_epochs=100,
-        early_stopping="valid_loss",
-        early_stopping_patience=15,
-        checkpoints="valid_loss",
-        accelerator="auto",
-        progress_bar="none",
-        gradient_clip_val=1.0,
-    )
+    # Cast continuous columns to float
+    all_df[continuous_cols] = all_df[continuous_cols].astype("float64")
 
-
-
-    # Choose model configuration based on model type
+    # Choose the 'task' based on model type and set up the label encoder if needed
     if model_type == "classifier":
         task = "classification"
-        # Encode the target column
+        # Encode the target column on full dataset for consistent encoding
         label_encoder = LabelEncoder()
-
-
+        all_df[target] = label_encoder.fit_transform(all_df[target])
+        num_classes = len(label_encoder.classes_)
     else:
         task = "regression"
         label_encoder = None
+        num_classes = None
+
+    # Use any hyperparameters to set up both the trainer and model configurations
+    print(f"Hyperparameters: {hyperparameters}")
+    n_folds = hyperparameters.get("n_folds", 5)  # Number of CV folds (default: 5)
+
+    # =========================================================================
+    # UNIFIED TRAINING: Works for n_folds=1 (single model) or n_folds>1 (K-fold CV)
+    # =========================================================================
+    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
+
+    # Create fold splits
+    if n_folds == 1:
+        # Single fold: use train/val split from "training" column or random split
+        if "training" in all_df.columns:
+            print("Found training column, splitting data based on training column")
+            train_idx = np.where(all_df["training"])[0]
+            val_idx = np.where(~all_df["training"])[0]
+        else:
+            print("WARNING: No training column found, splitting data with random 80/20 split")
+            indices = np.arange(len(all_df))
+            train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+        folds = [(train_idx, val_idx)]
+    else:
+        # K-Fold CV
+        if model_type == "classifier":
+            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
+            split_target = all_df[target]
+        else:
+            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
+            split_target = None
+        folds = list(kfold.split(all_df, split_target))
+
+    # Initialize storage for out-of-fold predictions
+    oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
+    if model_type == "classifier" and num_classes and num_classes > 1:
+        oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
+    else:
+        oof_proba = None
 
-
-    model_config = CategoryEmbeddingModelConfig(
-        task=task,
-        layers="1024-512-512",
-        activation="ReLU",
-        learning_rate=1e-3,
-        dropout=0.1,
-        use_batch_norm=True,
-        initialization="kaiming",
-    )
+    ensemble_models = []
 
-    #
-
-
-
-
-        trainer_config=trainer_config,
+    # Set up PyTorch Tabular data configuration (shared across folds)
+    data_config = DataConfig(
+        target=[target],
+        continuous_cols=continuous_cols,
+        categorical_cols=categorical_cols,
     )
 
-    #
-
+    # Model config defaults
+    model_defaults = {
+        "layers": "256-128-64",
+        "activation": "LeakyReLU",
+        "learning_rate": 1e-3,
+        "dropout": 0.1,
+        "use_batch_norm": True,
+        "initialization": "kaiming",
+    }
+    # Override defaults with model_config if present
+    model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
+    for key, value in model_overrides.items():
+        print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
+    model_params = {**model_defaults, **model_overrides}
+
+    model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
+    optimizer_config = OptimizerConfig()
 
-
-
-
+    for fold_idx, (train_idx, val_idx) in enumerate(folds):
+        print(f"\n{'='*50}")
+        print(f"Training Fold {fold_idx + 1}/{len(folds)}")
+        print(f"{'='*50}")
+
+        # Split data for this fold
+        df_train = all_df.iloc[train_idx].reset_index(drop=True)
+        df_val = all_df.iloc[val_idx].reset_index(drop=True)
+
+        print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
+
+        # Set up PyTorch Tabular trainer configuration (per-fold for batch_size)
+        # Calculate batch size that avoids single-sample last batch (batch norm requires >1)
+        batch_size = min(128, max(32, len(df_train) // 16))
+        if len(df_train) % batch_size == 1:
+            batch_size += 1  # Adjust to avoid last batch of size 1
+        trainer_defaults = {
+            "auto_lr_find": False,
+            "batch_size": batch_size,
+            "max_epochs": 200,
+            "min_epochs": 10,
+            "early_stopping": "valid_loss",
+            "early_stopping_patience": 20,
+            "checkpoints": "valid_loss",
+            "accelerator": "auto",
+            "progress_bar": "none",
+            "gradient_clip_val": 1.0,
+            "seed": 42 + fold_idx,
+        }
+
+        # Override defaults with training_config if present
+        training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
+        if fold_idx == 0:  # Only print overrides once
+            for key, value in training_overrides.items():
+                print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
+        trainer_params = {**trainer_defaults, **training_overrides}
+        trainer_config = TrainerConfig(**trainer_params)
+
+        # Create and train the TabularModel for this fold
+        tabular_model = TabularModel(
+            data_config=data_config,
+            model_config=model_config,
+            optimizer_config=optimizer_config,
+            trainer_config=trainer_config,
+        )
+        tabular_model.fit(train=df_train, validation=df_val)
+        ensemble_models.append(tabular_model)
+
+        # Make out-of-fold predictions
+        result = tabular_model.predict(df_val, include_input_features=False)
+        fold_preds = result[f"{target}_prediction"].values
+
+        # Store out-of-fold predictions
+        if model_type == "classifier":
+            oof_predictions[val_idx] = fold_preds.astype(int)
+            prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
+            if prob_cols and oof_proba is not None:
+                oof_proba[val_idx] = result[prob_cols].values
+        else:
+            oof_predictions[val_idx] = fold_preds.flatten()
 
-
-
-
-
+        print(f"Fold {fold_idx + 1} complete!")
+
+    print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
+
+    # Use out-of-fold predictions for metrics
+    # For n_folds=1, we only have predictions for val_idx, so filter to those rows
+    if n_folds == 1:
+        val_mask = ~np.isnan(oof_predictions)
+        preds = oof_predictions[val_mask]
+        df_val = all_df[val_mask].copy()
+        if oof_proba is not None:
+            oof_proba = oof_proba[val_mask]
     else:
-
-
+        preds = oof_predictions
+        df_val = all_df.copy()
+
+    # Compute prediction_std by running all ensemble models on validation data
+    # For n_folds=1, std will be 0 (only one model). For n_folds>1, std shows ensemble disagreement.
+    preds_std = None
+    if model_type in ["regressor", "uq_regressor"] and len(ensemble_models) > 0:
+        print("Computing prediction_std from ensemble predictions on validation data...")
+        all_ensemble_preds_for_std = []
+        for ens_model in ensemble_models:
+            result = ens_model.predict(df_val[features], include_input_features=False)
+            ens_preds = result[f"{target}_prediction"].values.flatten()
+            all_ensemble_preds_for_std.append(ens_preds)
+
+        ensemble_preds_stacked = np.stack(all_ensemble_preds_for_std, axis=0)
+        preds_std = np.std(ensemble_preds_stacked, axis=0)
+        print(f"Ensemble prediction_std - mean: {np.mean(preds_std):.4f}, max: {np.max(preds_std):.4f}")
 
     if model_type == "classifier":
         # Get probabilities for classification
-
-
-
-        probs = result[prob_cols].values
-        df_val["pred_proba"] = [p.tolist() for p in probs]
-
-        # Expand the pred_proba column into separate columns for each class
-        print(df_val.columns)
+        if oof_proba is not None:
+            df_val = df_val.copy()
+            df_val["pred_proba"] = [p.tolist() for p in oof_proba]
             df_val = expand_proba_column(df_val, label_encoder.classes_)
-        print(df_val.columns)
 
         # Decode the target and prediction labels
         y_validate = label_encoder.inverse_transform(df_val[target])
-
+        preds_decoded = label_encoder.inverse_transform(preds.astype(int))
     else:
         y_validate = df_val[target].values
+        preds_decoded = preds
+
+    # Save predictions to S3
+    df_val = df_val.copy()
+    df_val["prediction"] = preds_decoded
+
+    # Build output columns - include id_column if it exists
+    output_columns = []
+    if id_column in df_val.columns:
+        output_columns.append(id_column)
+    output_columns += [target, "prediction"]
+
+    # Add prediction_std for regression models (always present, 0 for single model)
+    if model_type in ["regressor", "uq_regressor"]:
+        if preds_std is not None:
+            df_val["prediction_std"] = preds_std
+        else:
+            df_val["prediction_std"] = 0.0
+        output_columns.append("prediction_std")
+        print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
 
-
-    df_val["prediction"] = preds
-    output_columns = [target, "prediction"]
-    output_columns += [col for col in df_val.columns if col.endswith("_probability")]
+    output_columns += [col for col in df_val.columns if col.endswith("_proba")]
     wr.s3.to_csv(
         df_val[output_columns],
         path=f"{model_metrics_s3_path}/validation_predictions.csv",
@@ -506,7 +693,7 @@ if __name__ == "__main__":
         label_names = label_encoder.classes_
 
         # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(y_validate,
+        scores = precision_recall_fscore_support(y_validate, preds_decoded, average=None, labels=label_names)
 
         # Put the scores into a dataframe
         score_df = pd.DataFrame(
@@ -514,20 +701,20 @@ if __name__ == "__main__":
                 target: label_names,
                 "precision": scores[0],
                 "recall": scores[1],
-                "
+                "f1": scores[2],
                 "support": scores[3],
             }
         )
 
-        #
-        metrics = ["precision", "recall", "
+        # Output metrics per class
+        metrics = ["precision", "recall", "f1", "support"]
        for t in label_names:
            for m in metrics:
                value = score_df.loc[score_df[target] == t, m].iloc[0]
                print(f"Metrics:{t}:{m} {value}")
 
        # Compute and output the confusion matrix
-        conf_mtx = confusion_matrix(y_validate,
+        conf_mtx = confusion_matrix(y_validate, preds_decoded, labels=label_names)
        for i, row_name in enumerate(label_names):
            for j, col_name in enumerate(label_names):
                value = conf_mtx[i, j]
@@ -535,23 +722,38 @@ if __name__ == "__main__":
 
     else:
         # Calculate various model performance metrics (regression)
-        rmse = root_mean_squared_error(y_validate,
-        mae = mean_absolute_error(y_validate,
-
-
-
-
-        print(f"
-
-
-
+        rmse = root_mean_squared_error(y_validate, preds_decoded)
+        mae = mean_absolute_error(y_validate, preds_decoded)
+        medae = median_absolute_error(y_validate, preds_decoded)
+        r2 = r2_score(y_validate, preds_decoded)
+        spearman_corr = spearmanr(y_validate, preds_decoded).correlation
+        support = len(df_val)
+        print(f"rmse: {rmse:.3f}")
+        print(f"mae: {mae:.3f}")
+        print(f"medae: {medae:.3f}")
+        print(f"r2: {r2:.3f}")
+        print(f"spearmanr: {spearman_corr:.3f}")
+        print(f"support: {support}")
+
+    # Save ensemble models
+    for model_idx, ens_model in enumerate(ensemble_models):
+        model_path = os.path.join(args.model_dir, f"tabular_model_{model_idx}")
+        ens_model.save_model(model_path)
+        print(f"Saved model {model_idx + 1} to {model_path}")
+
+    # Save ensemble metadata
+    n_ensemble = len(ensemble_models)
+    ensemble_metadata = {"n_ensemble": n_ensemble, "n_folds": n_folds}
+    joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
+    print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")
+
     if label_encoder:
         joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
 
     # Save the features (this will validate input during predictions)
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(orig_features, fp)
+        json.dump(orig_features, fp)
 
     # Save the category mappings
     with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
-        json.dump(category_mappings, fp)
+        json.dump(category_mappings, fp)