workbench 0.8.202-py3-none-any.whl → 0.8.220-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
- workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
- workbench/algorithms/dataframe/fingerprint_proximity.py +421 -85
- workbench/algorithms/dataframe/projection_2d.py +44 -21
- workbench/algorithms/dataframe/proximity.py +78 -150
- workbench/algorithms/graph/light/proximity_graph.py +5 -5
- workbench/algorithms/models/cleanlab_model.py +382 -0
- workbench/algorithms/models/noise_model.py +388 -0
- workbench/algorithms/sql/outliers.py +3 -3
- workbench/api/__init__.py +3 -0
- workbench/api/df_store.py +17 -108
- workbench/api/endpoint.py +13 -11
- workbench/api/feature_set.py +111 -8
- workbench/api/meta_model.py +289 -0
- workbench/api/model.py +45 -12
- workbench/api/parameter_store.py +3 -52
- workbench/cached/cached_model.py +4 -4
- workbench/core/artifacts/artifact.py +5 -5
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +228 -237
- workbench/core/artifacts/feature_set_core.py +185 -230
- workbench/core/artifacts/model_core.py +34 -26
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/pipelines/pipeline_executor.py +1 -1
- workbench/core/transforms/features_to_model/features_to_model.py +22 -10
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +41 -10
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
- workbench/model_script_utils/model_script_utils.py +339 -0
- workbench/model_script_utils/pytorch_utils.py +405 -0
- workbench/model_script_utils/uq_harness.py +278 -0
- workbench/model_scripts/chemprop/chemprop.template +428 -631
- workbench/model_scripts/chemprop/generated_model_script.py +432 -635
- workbench/model_scripts/chemprop/model_script_utils.py +339 -0
- workbench/model_scripts/chemprop/requirements.txt +2 -10
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +87 -46
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
- workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
- workbench/model_scripts/meta_model/generated_model_script.py +209 -0
- workbench/model_scripts/meta_model/meta_model.template +209 -0
- workbench/model_scripts/pytorch_model/generated_model_script.py +374 -613
- workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
- workbench/model_scripts/pytorch_model/pytorch.template +370 -609
- workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
- workbench/model_scripts/pytorch_model/requirements.txt +1 -1
- workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
- workbench/model_scripts/script_generation.py +6 -5
- workbench/model_scripts/uq_models/generated_model_script.py +65 -422
- workbench/model_scripts/xgb_model/generated_model_script.py +372 -395
- workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
- workbench/model_scripts/xgb_model/uq_harness.py +278 -0
- workbench/model_scripts/xgb_model/xgb_model.template +366 -396
- workbench/repl/workbench_shell.py +0 -5
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +2 -2
- workbench/scripts/meta_model_sim.py +35 -0
- workbench/scripts/training_test.py +85 -0
- workbench/utils/chem_utils/fingerprints.py +87 -46
- workbench/utils/chem_utils/projections.py +16 -6
- workbench/utils/chemprop_utils.py +36 -655
- workbench/utils/meta_model_simulator.py +499 -0
- workbench/utils/metrics_utils.py +256 -0
- workbench/utils/model_utils.py +192 -54
- workbench/utils/pytorch_utils.py +33 -472
- workbench/utils/shap_utils.py +1 -55
- workbench/utils/xgboost_local_crossfold.py +267 -0
- workbench/utils/xgboost_model_utils.py +49 -356
- workbench/web_interface/components/model_plot.py +7 -1
- workbench/web_interface/components/plugins/model_details.py +30 -68
- workbench/web_interface/components/plugins/scatter_plot.py +4 -8
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/METADATA +6 -5
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/RECORD +76 -60
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/entry_points.txt +2 -0
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -296
- workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
- workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
- workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
- workbench/model_scripts/uq_models/mapie.template +0 -605
- workbench/model_scripts/uq_models/requirements.txt +0 -1
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
```diff
@@ -1,468 +1,445 @@
-#
-
-
-
-
-#
-
-
-
-
-
-
-)
+# XGBoost Model Template for Workbench
+#
+# This template handles both classification and regression models with:
+# - K-fold cross-validation ensemble training (or single train/val split)
+# - Out-of-fold predictions for validation metrics
+# - Uncertainty quantification for regression models
+# - Sample weights support
+# - Categorical feature handling
+# - Compressed feature decompression
+#
+# NOTE: Imports are structured to minimize serverless endpoint startup time.
+# Heavy imports (sklearn, awswrangler) are deferred to training time.
 
-# Classification Encoder
-from sklearn.preprocessing import LabelEncoder
-
-# Scikit Learn Imports
-from sklearn.model_selection import train_test_split
-
-from io import StringIO
 import json
-import argparse
-import joblib
 import os
-import pandas as pd
-from typing import List, Tuple
-
-# Template Parameters
-TEMPLATE_PARAMS = {
-    "model_type": "regressor",
-    "target": "class_number_of_rings",
-    "features": ['length', 'diameter', 'height', 'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight', 'sex'],
-    "compressed_features": [],
-    "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/abalone-regression/training",
-    "train_all_data": False,
-    "hyperparameters": {},
-}
-
-
-# Function to check if dataframe is empty
-def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
-    """
-    Check if the provided dataframe is empty and raise an exception if it is.
-
-    Args:
-        df (pd.DataFrame): DataFrame to check
-        df_name (str): Name of the DataFrame
-    """
-    if df.empty:
-        msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
-        print(msg)
-        raise ValueError(msg)
 
+import joblib
+import numpy as np
+import pandas as pd
+import xgboost as xgb
 
-
-
-
+from model_script_utils import (
+    convert_categorical_types,
+    decompress_features,
+    expand_proba_column,
+    input_fn,
+    match_features_case_insensitive,
+    output_fn,
+)
+from uq_harness import (
+    compute_confidence,
+    load_uq_models,
+    predict_intervals,
+)
 
-
-
-
+# =============================================================================
+# Default Hyperparameters
+# =============================================================================
+DEFAULT_HYPERPARAMETERS = {
+    # Training parameters
+    "n_folds": 5,  # Number of CV folds (1 = single train/val split)
+    # Core tree parameters
+    "n_estimators": 300,
+    "max_depth": 7,
+    "learning_rate": 0.05,
+    # Sampling parameters (less aggressive - ensemble provides regularization)
+    "subsample": 0.8,
+    "colsample_bytree": 0.8,
+    # Regularization (lighter - ensemble averaging reduces overfitting)
+    "min_child_weight": 3,
+    "gamma": 0.1,
+    "reg_alpha": 0.1,
+    "reg_lambda": 1.0,
+    # Random seed
+    "seed": 42,
+}
 
-
-
-    """
+# Workbench-specific parameters (not passed to XGBoost)
+WORKBENCH_PARAMS = {"n_folds"}
 
-
-
-    if proba_column not in df.columns:
-        raise ValueError('DataFrame does not contain a "pred_proba" column')
+# Regression-only parameters (filtered out for classifiers)
+REGRESSION_ONLY_PARAMS = {"objective"}
 
-
-
+# Template parameters (filled in by Workbench)
+TEMPLATE_PARAMS = {
+    "model_type": "uq_regressor",
+    "target": "udm_asy_res_efflux_ratio",
+    "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
+    "id_column": "udm_mol_bat_id",
+    "compressed_features": ['fingerprint'],
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-temporal/training",
+    "hyperparameters": {'n_folds': 1},
+}
 
-    # Expand the proba_column into separate columns for each probability
-    proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
 
-
-
-
+# =============================================================================
+# Model Loading (for SageMaker inference)
+# =============================================================================
+def model_fn(model_dir: str) -> dict:
+    """Load XGBoost ensemble from the specified directory."""
+    # Load ensemble metadata
+    metadata_path = os.path.join(model_dir, "ensemble_metadata.json")
+    if os.path.exists(metadata_path):
+        with open(metadata_path) as f:
+            metadata = json.load(f)
+        n_ensemble = metadata["n_ensemble"]
+    else:
+        n_ensemble = 1  # Legacy single model
 
-    #
-
-
-
+    # Load ensemble models
+    ensemble_models = []
+    for i in range(n_ensemble):
+        model_path = os.path.join(model_dir, f"xgb_model_{i}.joblib")
+        if not os.path.exists(model_path):
+            model_path = os.path.join(model_dir, "xgb_model.joblib")  # Legacy fallback
+        ensemble_models.append(joblib.load(model_path))
 
+    print(f"Loaded {len(ensemble_models)} model(s)")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Load label encoder (classifier only)
+    label_encoder = None
+    encoder_path = os.path.join(model_dir, "label_encoder.joblib")
+    if os.path.exists(encoder_path):
+        label_encoder = joblib.load(encoder_path)
+
+    # Load category mappings
+    category_mappings = {}
+    category_path = os.path.join(model_dir, "category_mappings.json")
+    if os.path.exists(category_path):
+        with open(category_path) as f:
+            category_mappings = json.load(f)
+
+    # Load UQ models (regression only)
+    uq_models, uq_metadata = None, None
+    uq_path = os.path.join(model_dir, "uq_metadata.json")
+    if os.path.exists(uq_path):
+        uq_models, uq_metadata = load_uq_models(model_dir)
+
+    return {
+        "ensemble_models": ensemble_models,
+        "n_ensemble": n_ensemble,
+        "label_encoder": label_encoder,
+        "category_mappings": category_mappings,
+        "uq_models": uq_models,
+        "uq_metadata": uq_metadata,
+    }
+
+
+# =============================================================================
+# Inference (for SageMaker inference)
+# =============================================================================
+def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
+    """Make predictions with XGBoost ensemble."""
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+    with open(os.path.join(model_dir, "feature_columns.json")) as f:
+        features = json.load(f)
+    print(f"Model Features: {features}")
 
-
-
+    # Extract model components
+    ensemble_models = model_dict["ensemble_models"]
+    label_encoder = model_dict.get("label_encoder")
+    category_mappings = model_dict.get("category_mappings", {})
+    uq_models = model_dict.get("uq_models")
+    uq_metadata = model_dict.get("uq_metadata")
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
 
-    #
-
+    # Prepare features
+    matched_df = match_features_case_insensitive(df, features)
+    matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
 
+    if compressed_features:
+        print("Decompressing features for prediction...")
+        matched_df, features = decompress_features(matched_df, features, compressed_features)
 
-
-    """
-    Converts appropriate columns to categorical type with consistent mappings.
+    X = matched_df[features]
 
-
-
-
-        category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
-            training mode. If populated, we're in inference mode.
+    # Collect ensemble predictions
+    all_preds = [m.predict(X) for m in ensemble_models]
+    ensemble_preds = np.stack(all_preds, axis=0)
 
-
-
-
-
-
-    for col in df.select_dtypes(include=["object", "string"]):
-        if col in features and df[col].nunique() < 20:
-            print(f"Training mode: Converting {col} to category")
-            df[col] = df[col].astype("category")
-            category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+    if label_encoder is not None:
+        # Classification: average probabilities, then argmax
+        all_probs = [m.predict_proba(X) for m in ensemble_models]
+        avg_probs = np.mean(np.stack(all_probs, axis=0), axis=0)
+        class_preds = np.argmax(avg_probs, axis=1)
 
-
+        df["prediction"] = label_encoder.inverse_transform(class_preds)
+        df["pred_proba"] = [p.tolist() for p in avg_probs]
+        df = expand_proba_column(df, label_encoder.classes_)
     else:
-
-
-
-            df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
-
-    return df, category_mappings
-
-
-def decompress_features(
-    df: pd.DataFrame, features: List[str], compressed_features: List[str]
-) -> Tuple[pd.DataFrame, List[str]]:
-    """Prepare features for the model by decompressing bitstring features
-
-    Args:
-        df (pd.DataFrame): The features DataFrame
-        features (List[str]): Full list of feature names
-        compressed_features (List[str]): List of feature names to decompress (bitstrings)
-
-    Returns:
-        pd.DataFrame: DataFrame with the decompressed features
-        List[str]: Updated list of feature names after decompression
-
-    Raises:
-        ValueError: If any missing values are found in the specified features
-    """
-
-    # Check for any missing values in the required features
-    missing_counts = df[features].isna().sum()
-    if missing_counts.any():
-        missing_features = missing_counts[missing_counts > 0]
-        print(
-            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
-            "WARNING: You might want to remove/replace all NaN values before processing."
-        )
-
-    # Decompress the specified compressed features
-    decompressed_features = features.copy()
-    for feature in compressed_features:
-        if (feature not in df.columns) or (feature not in features):
-            print(f"Feature '{feature}' not in the features list, skipping decompression.")
-            continue
-
-        # Remove the feature from the list of features to avoid duplication
-        decompressed_features.remove(feature)
-
-        # Handle all compressed features as bitstrings
-        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
-        prefix = feature[:3]
-
-        # Create all new columns at once - avoids fragmentation
-        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
-        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
-
-        # Add to features list
-        decompressed_features.extend(new_col_names)
+        # Regression: average predictions
+        df["prediction"] = np.mean(ensemble_preds, axis=0)
+        df["prediction_std"] = np.std(ensemble_preds, axis=0)
 
-#
-
-
+    # Add UQ intervals if available
+    if uq_models and uq_metadata:
+        df = predict_intervals(df, X, uq_models, uq_metadata)
+        df = compute_confidence(df, uq_metadata["median_interval_width"], "q_10", "q_90")
 
-
+    print(f"Inference complete: {len(df)} predictions, {len(ensemble_models)} ensemble members")
+    return df
 
 
+# =============================================================================
+# Training
+# =============================================================================
 if __name__ == "__main__":
-
+    # -------------------------------------------------------------------------
+    # Training-only imports (deferred to reduce serverless startup time)
+    # -------------------------------------------------------------------------
+    import argparse
+
+    import awswrangler as wr
+    from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+    from sklearn.preprocessing import LabelEncoder
+
+    from model_script_utils import (
+        check_dataframe,
+        compute_classification_metrics,
+        compute_regression_metrics,
+        print_classification_metrics,
+        print_confusion_matrix,
+        print_regression_metrics,
+    )
+    from uq_harness import (
+        save_uq_models,
+        train_uq_models,
+    )
+
+    # -------------------------------------------------------------------------
+    # Setup: Parse arguments and load data
+    # -------------------------------------------------------------------------
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+    parser.add_argument("--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"))
+    args = parser.parse_args()
 
-    #
+    # Extract template parameters
     target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
+    id_column = TEMPLATE_PARAMS["id_column"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-
-    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
-    validation_split = 0.2
-
-    # Script arguments for input/output directories
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
-    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
-    parser.add_argument(
-        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
-    )
-    args = parser.parse_args()
+    hyperparameters = {**DEFAULT_HYPERPARAMETERS, **(TEMPLATE_PARAMS["hyperparameters"] or {})}
 
-    #
-    training_files = [os.path.join(args.train,
+    # Load training data
+    training_files = [os.path.join(args.train, f) for f in os.listdir(args.train) if f.endswith(".csv")]
     print(f"Training Files: {training_files}")
-
-    # Combine files and read them all into a single pandas dataframe
-    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
-
-    # Check if the dataframe is empty
+    all_df = pd.concat([pd.read_csv(f, engine="python") for f in training_files])
     check_dataframe(all_df, "training_df")
 
-    # Features/Target output
     print(f"Target: {target}")
-    print(f"Features: {
+    print(f"Features: {features}")
+    print(f"Hyperparameters: {hyperparameters}")
 
-    #
+    # -------------------------------------------------------------------------
+    # Preprocessing: Categorical features and decompression
+    # -------------------------------------------------------------------------
    all_df, category_mappings = convert_categorical_types(all_df, features)
 
-    # If we have compressed features, decompress them
     if compressed_features:
-        print(f"Decompressing features {compressed_features}
+        print(f"Decompressing features: {compressed_features}")
        all_df, features = decompress_features(all_df, features, compressed_features)
 
-    #
-
-
-
-        df_val = all_df.copy()
-
-    # Does the dataframe have a training column?
-    elif "training" in all_df.columns:
-        print("Found training column, splitting data based on training column")
-        df_train = all_df[all_df["training"]]
-        df_val = all_df[~all_df["training"]]
-    else:
-        # Just do a random training Split
-        print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
-    print(f"FIT/TRAIN: {df_train.shape}")
-    print(f"VALIDATION: {df_val.shape}")
-
-    # Use any hyperparameters to set up both the trainer and model configurations
-    print(f"Hyperparameters: {hyperparameters}")
-
-    # Now spin up our XGB Model
+    # -------------------------------------------------------------------------
+    # Classification setup
+    # -------------------------------------------------------------------------
+    label_encoder = None
     if model_type == "classifier":
-        xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)
-
-        # Encode the target column
         label_encoder = LabelEncoder()
-
-
-
-    else:
-        xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
-        label_encoder = None  # We don't need this for regression
-
-    # Grab our Features, Target and Train the Model
-    y_train = df_train[target]
-    X_train = df_train[features]
-    xgb_model.fit(X_train, y_train)
-
-    # Make Predictions on the Validation Set
-    print(f"Making Predictions on Validation Set...")
-    y_validate = df_val[target]
-    X_validate = df_val[features]
-    preds = xgb_model.predict(X_validate)
-    if model_type == "classifier":
-        # Also get the probabilities for each class
-        print("Processing Probabilities...")
-        probs = xgb_model.predict_proba(X_validate)
-        df_val["pred_proba"] = [p.tolist() for p in probs]
-
-        # Expand the pred_proba column into separate columns for each class
-        print(df_val.columns)
-        df_val = expand_proba_column(df_val, label_encoder.classes_)
-        print(df_val.columns)
-
-        # Decode the target and prediction labels
-        y_validate = label_encoder.inverse_transform(y_validate)
-        preds = label_encoder.inverse_transform(preds)
-
-    # Save predictions to S3 (just the target, prediction, and '_proba' columns)
-    df_val["prediction"] = preds
-    output_columns = [target, "prediction"]
-    output_columns += [col for col in df_val.columns if col.endswith("_proba")]
-    wr.s3.to_csv(
-        df_val[output_columns],
-        path=f"{model_metrics_s3_path}/validation_predictions.csv",
-        index=False,
-    )
+        all_df[target] = label_encoder.fit_transform(all_df[target])
+        print(f"Class labels: {label_encoder.classes_.tolist()}")
 
-    #
-
-
-
+    # -------------------------------------------------------------------------
+    # Cross-validation setup
+    # -------------------------------------------------------------------------
+    n_folds = hyperparameters["n_folds"]
+    xgb_params = {k: v for k, v in hyperparameters.items() if k not in WORKBENCH_PARAMS}
 
-
-
-
-    # Put the scores into a dataframe
-    score_df = pd.DataFrame(
-        {
-            target: label_names,
-            "precision": scores[0],
-            "recall": scores[1],
-            "f1": scores[2],
-            "support": scores[3],
-        }
-    )
-
-    # We need to get creative with the Classification Metrics
-    metrics = ["precision", "recall", "f1", "support"]
-    for t in label_names:
-        for m in metrics:
-            value = score_df.loc[score_df[target] == t, m].iloc[0]
-            print(f"Metrics:{t}:{m} {value}")
-
-    # Compute and output the confusion matrix
-    conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
-    for i, row_name in enumerate(label_names):
-        for j, col_name in enumerate(label_names):
-            value = conf_mtx[i, j]
-            print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
+    # Map 'seed' to 'random_state' for XGBoost
+    if "seed" in xgb_params:
+        xgb_params["random_state"] = xgb_params.pop("seed")
 
+    # Handle objective: filter regression-only params for classifiers, set default for regressors
+    if model_type == "classifier":
+        xgb_params = {k: v for k, v in xgb_params.items() if k not in REGRESSION_ONLY_PARAMS}
     else:
-        #
-
-        mae = mean_absolute_error(y_validate, preds)
-        r2 = r2_score(y_validate, preds)
-        print(f"RMSE: {rmse:.3f}")
-        print(f"MAE: {mae:.3f}")
-        print(f"R2: {r2:.3f}")
-        print(f"NumRows: {len(df_val)}")
-
-    # Now save the model to the standard place/name
-    joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
-
-    # Save the label encoder if we have one
-    if label_encoder:
-        joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
-
-    # Save the features (this will validate input during predictions)
-    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(orig_features, fp)  # We save the original features, not the decompressed ones
-
-    # Save the category mappings
-    with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
-        json.dump(category_mappings, fp)
+        # Default to MAE (reg:absoluteerror) for regression if not specified
+        xgb_params.setdefault("objective", "reg:absoluteerror")
 
+    print(f"XGBoost params: {xgb_params}")
 
-
-
-
-
-
-
-
-
-
-
+    if n_folds == 1:
+        # Single train/val split
+        if "training" in all_df.columns:
+            print("Using 'training' column for train/val split")
+            train_idx = np.where(all_df["training"])[0]
+            val_idx = np.where(~all_df["training"])[0]
+        else:
+            print("WARNING: No 'training' column found, using random 80/20 split")
+            indices = np.arange(len(all_df))
+            train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+        folds = [(train_idx, val_idx)]
+    else:
+        # K-fold cross-validation
+        if model_type == "classifier":
+            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
+            folds = list(kfold.split(all_df, all_df[target]))
+        else:
+            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
+            folds = list(kfold.split(all_df))
 
-
-    if isinstance(input_data, bytes):
-        input_data = input_data.decode("utf-8")
+    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold ensemble'}...")
 
-
-
-
-
+    # -------------------------------------------------------------------------
+    # Training loop
+    # -------------------------------------------------------------------------
+    # Initialize out-of-fold storage
+    oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
+    if model_type == "classifier":
+        num_classes = len(label_encoder.classes_)
+        oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
     else:
-
-
+        oof_proba = None
+
+    # Check for sample weights
+    has_sample_weights = "sample_weight" in all_df.columns
+    if has_sample_weights:
+        sw = all_df["sample_weight"]
+        print(f"Using sample weights: min={sw.min():.2f}, max={sw.max():.2f}, mean={sw.mean():.2f}")
+
+    # Train ensemble
+    ensemble_models = []
+    for fold_idx, (train_idx, val_idx) in enumerate(folds):
+        print(f"\n{'='*50}")
+        print(f"Fold {fold_idx + 1}/{len(folds)} - Train: {len(train_idx)}, Val: {len(val_idx)}")
+        print(f"{'='*50}")
+
+        # Prepare fold data
+        X_train = all_df.iloc[train_idx][features]
+        y_train = all_df.iloc[train_idx][target]
+        X_val = all_df.iloc[val_idx][features]
+        sample_weights = all_df.iloc[train_idx]["sample_weight"] if has_sample_weights else None
+
+        # Create model with fold-specific random state for diversity
+        fold_params = {**xgb_params, "random_state": xgb_params.get("random_state", 42) + fold_idx}
+        if model_type == "classifier":
+            model = xgb.XGBClassifier(enable_categorical=True, **fold_params)
+        else:
+            model = xgb.XGBRegressor(enable_categorical=True, **fold_params)
+
+        # Train
+        model.fit(X_train, y_train, sample_weight=sample_weights)
+        ensemble_models.append(model)
+
+        # Out-of-fold predictions
+        oof_predictions[val_idx] = model.predict(X_val)
+        if model_type == "classifier":
+            oof_proba[val_idx] = model.predict_proba(X_val)
+
+    print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
+
+    # -------------------------------------------------------------------------
+    # Prepare validation results
+    # -------------------------------------------------------------------------
+    if n_folds == 1:
+        # Single fold: only validation rows
+        val_mask = ~np.isnan(oof_predictions)
+        df_val = all_df[val_mask].copy()
+        predictions = oof_predictions[val_mask]
+        if oof_proba is not None:
+            oof_proba = oof_proba[val_mask]
+    else:
+        # K-fold: all rows have out-of-fold predictions
+        df_val = all_df.copy()
+        predictions = oof_predictions
 
-
-
-
-
-
-
-
+    # Decode labels for classification
+    if model_type == "classifier":
+        df_val[target] = label_encoder.inverse_transform(df_val[target].astype(int))
+        df_val["prediction"] = label_encoder.inverse_transform(predictions.astype(int))
+        if oof_proba is not None:
+            df_val["pred_proba"] = [p.tolist() for p in oof_proba]
+            df_val = expand_proba_column(df_val, label_encoder.classes_)
     else:
-
+        df_val["prediction"] = predictions
 
+    # -------------------------------------------------------------------------
+    # Compute and print metrics
+    # -------------------------------------------------------------------------
+    y_true = df_val[target].values
+    y_pred = df_val["prediction"].values
 
-
-
+    if model_type == "classifier":
+        label_names = label_encoder.classes_
+        score_df = compute_classification_metrics(y_true, y_pred, label_names, target)
+        print_classification_metrics(score_df, target, label_names)
+        print_confusion_matrix(y_true, y_pred, label_names)
+    else:
+        metrics = compute_regression_metrics(y_true, y_pred)
+        print_regression_metrics(metrics)
+
+        # Compute ensemble prediction_std
+        if n_folds > 1:
+            all_preds = np.stack([m.predict(all_df[features]) for m in ensemble_models])
+            df_val["prediction_std"] = np.std(all_preds, axis=0)
+            print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
+        else:
+            df_val["prediction_std"] = 0.0
+
+        # Train UQ models for uncertainty quantification
+        print("\n" + "=" * 50)
+        print("Training UQ Models")
+        print("=" * 50)
+        uq_models, uq_metadata = train_uq_models(
+            all_df[features], all_df[target], df_val[features], y_true
+        )
+        df_val = predict_intervals(df_val, df_val[features], uq_models, uq_metadata)
+        df_val = compute_confidence(df_val, uq_metadata["median_interval_width"])
 
-
-
-
+    # -------------------------------------------------------------------------
+    # Save validation predictions to S3
+    # -------------------------------------------------------------------------
+    output_columns = []
+    if id_column in df_val.columns:
+        output_columns.append(id_column)
+    output_columns += [target, "prediction"]
 
-
-
-
-    compressed_features = TEMPLATE_PARAMS["compressed_features"]
+    if model_type != "classifier":
+        output_columns.append("prediction_std")
+        output_columns += [c for c in df_val.columns if c.startswith("q_") or c == "confidence"]
 
-
-    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-    with open(os.path.join(model_dir, "feature_columns.json")) as fp:
-        features = json.load(fp)
-    print(f"Model Features: {features}")
+    output_columns += [c for c in df_val.columns if c.endswith("_proba")]
 
-
-    with open(os.path.join(model_dir, "category_mappings.json")) as fp:
-        category_mappings = json.load(fp)
+    wr.s3.to_csv(df_val[output_columns], f"{model_metrics_s3_path}/validation_predictions.csv", index=False)
 
-    #
-
-
-
+    # -------------------------------------------------------------------------
+    # Save model artifacts
+    # -------------------------------------------------------------------------
+    for idx, m in enumerate(ensemble_models):
+        joblib.dump(m, os.path.join(args.model_dir, f"xgb_model_{idx}.joblib"))
+    print(f"Saved {len(ensemble_models)} model(s)")
 
-
-
-    # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
-    matched_df = match_features_case_insensitive(df, features)
+    with open(os.path.join(args.model_dir, "ensemble_metadata.json"), "w") as f:
+        json.dump({"n_ensemble": len(ensemble_models), "n_folds": n_folds}, f)
 
-
-
+    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as f:
+        json.dump(orig_features, f)
 
-
-
-        print("Decompressing features for prediction...")
-        matched_df, features = decompress_features(matched_df, features, compressed_features)
+    with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as f:
+        json.dump(category_mappings, f)
 
-
-
-    predictions = model.predict(X)
+    with open(os.path.join(args.model_dir, "hyperparameters.json"), "w") as f:
+        json.dump(hyperparameters, f, indent=2)
 
-    # If we have a label encoder, decode the predictions
     if label_encoder:
-
-
-        # Set the predictions on the DataFrame
-        df["prediction"] = predictions
-
-        # Does our model have a 'predict_proba' method? If so we will call it and add the results to the DataFrame
-        if getattr(model, "predict_proba", None):
-            probs = model.predict_proba(matched_df[features])
-            df["pred_proba"] = [p.tolist() for p in probs]
+        joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
 
-
-
+    if model_type != "classifier":
+        save_uq_models(uq_models, uq_metadata, args.model_dir)
 
-
-    return df
+    print(f"\nModel training complete! Artifacts saved to {args.model_dir}")
```