workbench 0.8.162__py3-none-any.whl → 0.8.220__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/__init__.py +1 -2
- workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
- workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
- workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
- workbench/algorithms/dataframe/projection_2d.py +44 -21
- workbench/algorithms/dataframe/proximity.py +259 -305
- workbench/algorithms/graph/light/proximity_graph.py +14 -12
- workbench/algorithms/models/cleanlab_model.py +382 -0
- workbench/algorithms/models/noise_model.py +388 -0
- workbench/algorithms/sql/outliers.py +3 -3
- workbench/api/__init__.py +5 -1
- workbench/api/compound.py +1 -1
- workbench/api/df_store.py +17 -108
- workbench/api/endpoint.py +18 -5
- workbench/api/feature_set.py +121 -15
- workbench/api/meta.py +5 -2
- workbench/api/meta_model.py +289 -0
- workbench/api/model.py +55 -21
- workbench/api/monitor.py +1 -16
- workbench/api/parameter_store.py +3 -52
- workbench/cached/cached_model.py +4 -4
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +16 -8
- workbench/core/artifacts/data_capture_core.py +355 -0
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +382 -253
- workbench/core/artifacts/feature_set_core.py +249 -45
- workbench/core/artifacts/model_core.py +135 -80
- workbench/core/artifacts/monitor_core.py +33 -248
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
- workbench/core/cloud_platform/aws/aws_meta.py +12 -5
- workbench/core/cloud_platform/aws/aws_session.py +4 -4
- workbench/core/pipelines/pipeline_executor.py +1 -1
- workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
- workbench/core/transforms/features_to_model/features_to_model.py +62 -40
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +76 -15
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
- workbench/core/views/training_view.py +113 -42
- workbench/core/views/view.py +53 -3
- workbench/core/views/view_utils.py +4 -4
- workbench/model_script_utils/model_script_utils.py +339 -0
- workbench/model_script_utils/pytorch_utils.py +405 -0
- workbench/model_script_utils/uq_harness.py +278 -0
- workbench/model_scripts/chemprop/chemprop.template +649 -0
- workbench/model_scripts/chemprop/generated_model_script.py +649 -0
- workbench/model_scripts/chemprop/model_script_utils.py +339 -0
- workbench/model_scripts/chemprop/requirements.txt +3 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
- workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
- workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
- workbench/model_scripts/meta_model/generated_model_script.py +209 -0
- workbench/model_scripts/meta_model/meta_model.template +209 -0
- workbench/model_scripts/pytorch_model/generated_model_script.py +444 -500
- workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
- workbench/model_scripts/pytorch_model/pytorch.template +440 -496
- workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
- workbench/model_scripts/pytorch_model/requirements.txt +1 -1
- workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
- workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +20 -11
- workbench/model_scripts/uq_models/generated_model_script.py +248 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +372 -404
- workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
- workbench/model_scripts/xgb_model/uq_harness.py +278 -0
- workbench/model_scripts/xgb_model/xgb_model.template +369 -401
- workbench/repl/workbench_shell.py +28 -19
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/lambda_test.py +73 -0
- workbench/scripts/meta_model_sim.py +35 -0
- workbench/scripts/ml_pipeline_batch.py +137 -0
- workbench/scripts/ml_pipeline_sqs.py +186 -0
- workbench/scripts/monitor_cloud_watch.py +20 -100
- workbench/scripts/training_test.py +85 -0
- workbench/utils/aws_utils.py +4 -3
- workbench/utils/chem_utils/__init__.py +0 -0
- workbench/utils/chem_utils/fingerprints.py +175 -0
- workbench/utils/chem_utils/misc.py +194 -0
- workbench/utils/chem_utils/mol_descriptors.py +483 -0
- workbench/utils/chem_utils/mol_standardize.py +450 -0
- workbench/utils/chem_utils/mol_tagging.py +348 -0
- workbench/utils/chem_utils/projections.py +219 -0
- workbench/utils/chem_utils/salts.py +256 -0
- workbench/utils/chem_utils/sdf.py +292 -0
- workbench/utils/chem_utils/toxicity.py +250 -0
- workbench/utils/chem_utils/vis.py +253 -0
- workbench/utils/chemprop_utils.py +141 -0
- workbench/utils/cloudwatch_handler.py +1 -1
- workbench/utils/cloudwatch_utils.py +137 -0
- workbench/utils/config_manager.py +3 -7
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/meta_model_simulator.py +499 -0
- workbench/utils/metrics_utils.py +256 -0
- workbench/utils/model_utils.py +278 -79
- workbench/utils/monitor_utils.py +44 -62
- workbench/utils/pandas_utils.py +3 -3
- workbench/utils/pytorch_utils.py +87 -0
- workbench/utils/shap_utils.py +11 -57
- workbench/utils/workbench_logging.py +0 -3
- workbench/utils/workbench_sqs.py +1 -1
- workbench/utils/xgboost_local_crossfold.py +267 -0
- workbench/utils/xgboost_model_utils.py +127 -219
- workbench/web_interface/components/model_plot.py +14 -2
- workbench/web_interface/components/plugin_unit_test.py +5 -2
- workbench/web_interface/components/plugins/dashboard_status.py +3 -1
- workbench/web_interface/components/plugins/generated_compounds.py +1 -1
- workbench/web_interface/components/plugins/model_details.py +38 -74
- workbench/web_interface/components/plugins/scatter_plot.py +6 -10
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/METADATA +31 -9
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/RECORD +128 -96
- workbench-0.8.220.dist-info/entry_points.txt +11 -0
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +1 -1
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
- workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
- workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
- workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -273
- workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/utils/chem_utils.py +0 -1556
- workbench/utils/execution_environment.py +0 -211
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- workbench-0.8.162.dist-info/entry_points.txt +0 -5
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
workbench/model_scripts/xgb_model/generated_model_script.py
@@ -1,477 +1,445 @@
-#
-
-
+# XGBoost Model Template for Workbench
+#
+# This template handles both classification and regression models with:
+# - K-fold cross-validation ensemble training (or single train/val split)
+# - Out-of-fold predictions for validation metrics
+# - Uncertainty quantification for regression models
+# - Sample weights support
+# - Categorical feature handling
+# - Compressed feature decompression
+#
+# NOTE: Imports are structured to minimize serverless endpoint startup time.
+# Heavy imports (sklearn, awswrangler) are deferred to training time.
+
+import json
+import os
+
+import joblib
 import numpy as np
+import pandas as pd
+import xgboost as xgb

-
-
-
-
-
-
-
+from model_script_utils import (
+    convert_categorical_types,
+    decompress_features,
+    expand_proba_column,
+    input_fn,
+    match_features_case_insensitive,
+    output_fn,
+)
+from uq_harness import (
+    compute_confidence,
+    load_uq_models,
+    predict_intervals,
 )

-#
-
+# =============================================================================
+# Default Hyperparameters
+# =============================================================================
+DEFAULT_HYPERPARAMETERS = {
+    # Training parameters
+    "n_folds": 5,  # Number of CV folds (1 = single train/val split)
+    # Core tree parameters
+    "n_estimators": 300,
+    "max_depth": 7,
+    "learning_rate": 0.05,
+    # Sampling parameters (less aggressive - ensemble provides regularization)
+    "subsample": 0.8,
+    "colsample_bytree": 0.8,
+    # Regularization (lighter - ensemble averaging reduces overfitting)
+    "min_child_weight": 3,
+    "gamma": 0.1,
+    "reg_alpha": 0.1,
+    "reg_lambda": 1.0,
+    # Random seed
+    "seed": 42,
+}

-#
-
+# Workbench-specific parameters (not passed to XGBoost)
+WORKBENCH_PARAMS = {"n_folds"}

-
-
-import argparse
-import joblib
-import os
-import pandas as pd
-from typing import List, Tuple
+# Regression-only parameters (filtered out for classifiers)
+REGRESSION_ONLY_PARAMS = {"objective"}

-# Template
+# Template parameters (filled in by Workbench)
 TEMPLATE_PARAMS = {
-    "model_type": "
-    "
-    "features": ['molwt', '
+    "model_type": "uq_regressor",
+    "target": "udm_asy_res_efflux_ratio",
+    "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
+    "id_column": "udm_mol_bat_id",
     "compressed_features": ['fingerprint'],
-    "model_metrics_s3_path": "s3://
-    "
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-temporal/training",
+    "hyperparameters": {'n_folds': 1},
 }

-# Function to check if dataframe is empty
-def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
-    """
-    Check if the provided dataframe is empty and raise an exception if it is.
-
-    Args:
-        df (pd.DataFrame): DataFrame to check
-        df_name (str): Name of the DataFrame
-    """
-    if df.empty:
-        msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
-        print(msg)
-        raise ValueError(msg)

-
-
-
-
-
-
-
-
-
-
-
-    """
-
-    # Sanity check
-    proba_column = "pred_proba"
-    if proba_column not in df.columns:
-        raise ValueError('DataFrame does not contain a "pred_proba" column')
-
-    # Construct new column names with '_proba' suffix
-    proba_splits = [f"{label}_proba" for label in class_labels]
-
-    # Expand the proba_column into separate columns for each probability
-    proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
-
-    # Drop any proba columns and reset the index in prep for the concat
-    df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
-    df = df.reset_index(drop=True)
-
-    # Concatenate the new columns with the original DataFrame
-    df = pd.concat([df, proba_df], axis=1)
-    print(df)
-    return df
-
-
-def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
-    """
-    Matches and renames DataFrame columns to match model feature names (case-insensitive).
-    Prioritizes exact matches, then case-insensitive matches.
-
-    Raises ValueError if any model features cannot be matched.
-    """
-    df_columns_lower = {col.lower(): col for col in df.columns}
-    rename_dict = {}
-    missing = []
-
-    for feature in model_features:
-        if feature in df.columns:
-            continue  # Exact match
-        elif feature.lower() in df_columns_lower:
-            rename_dict[df_columns_lower[feature.lower()]] = feature
-        else:
-            missing.append(feature)
-
-    if missing:
-        raise ValueError(f"Features not found: {missing}")
-
-    return df.rename(columns=rename_dict)
-
-
-def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
-    """
-    Converts appropriate columns to categorical type with consistent mappings.
-
-    Args:
-        df (pd.DataFrame): The DataFrame to process.
-        features (list): List of feature names to consider for conversion.
-        category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
-            training mode. If populated, we're in inference mode.
-
-    Returns:
-        tuple: (processed DataFrame, category mappings dictionary)
-    """
-    # Training mode
-    if category_mappings == {}:
-        for col in df.select_dtypes(include=["object", "string"]):
-            if col in features and df[col].nunique() < 20:
-                print(f"Training mode: Converting {col} to category")
-                df[col] = df[col].astype("category")
-                category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
-
-    # Inference mode
+# =============================================================================
+# Model Loading (for SageMaker inference)
+# =============================================================================
+def model_fn(model_dir: str) -> dict:
+    """Load XGBoost ensemble from the specified directory."""
+    # Load ensemble metadata
+    metadata_path = os.path.join(model_dir, "ensemble_metadata.json")
+    if os.path.exists(metadata_path):
+        with open(metadata_path) as f:
+            metadata = json.load(f)
+        n_ensemble = metadata["n_ensemble"]
     else:
-
-        if col in df.columns:
-            print(f"Inference mode: Applying categorical mapping for {col}")
-            df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
-
-    return df, category_mappings
-
+        n_ensemble = 1  # Legacy single model

-
-
+    # Load ensemble models
+    ensemble_models = []
+    for i in range(n_ensemble):
+        model_path = os.path.join(model_dir, f"xgb_model_{i}.joblib")
+        if not os.path.exists(model_path):
+            model_path = os.path.join(model_dir, "xgb_model.joblib")  # Legacy fallback
+        ensemble_models.append(joblib.load(model_path))

-
-        df (pd.DataFrame): The features DataFrame
-        features (List[str]): Full list of feature names
-        compressed_features (List[str]): List of feature names to decompress (bitstrings)
+    print(f"Loaded {len(ensemble_models)} model(s)")

-
-
-
+    # Load label encoder (classifier only)
+    label_encoder = None
+    encoder_path = os.path.join(model_dir, "label_encoder.joblib")
+    if os.path.exists(encoder_path):
+        label_encoder = joblib.load(encoder_path)
+
+    # Load category mappings
+    category_mappings = {}
+    category_path = os.path.join(model_dir, "category_mappings.json")
+    if os.path.exists(category_path):
+        with open(category_path) as f:
+            category_mappings = json.load(f)
+
+    # Load UQ models (regression only)
+    uq_models, uq_metadata = None, None
+    uq_path = os.path.join(model_dir, "uq_metadata.json")
+    if os.path.exists(uq_path):
+        uq_models, uq_metadata = load_uq_models(model_dir)
+
+    return {
+        "ensemble_models": ensemble_models,
+        "n_ensemble": n_ensemble,
+        "label_encoder": label_encoder,
+        "category_mappings": category_mappings,
+        "uq_models": uq_models,
+        "uq_metadata": uq_metadata,
+    }
+
+
+# =============================================================================
+# Inference (for SageMaker inference)
+# =============================================================================
+def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
+    """Make predictions with XGBoost ensemble."""
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+    with open(os.path.join(model_dir, "feature_columns.json")) as f:
+        features = json.load(f)
+    print(f"Model Features: {features}")

-
-
-    ""
+    # Extract model components
+    ensemble_models = model_dict["ensemble_models"]
+    label_encoder = model_dict.get("label_encoder")
+    category_mappings = model_dict.get("category_mappings", {})
+    uq_models = model_dict.get("uq_models")
+    uq_metadata = model_dict.get("uq_metadata")
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]

-    #
-
-
-        missing_features = missing_counts[missing_counts > 0]
-        print(
-            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
-            "WARNING: You might want to remove/replace all NaN values before processing."
-        )
+    # Prepare features
+    matched_df = match_features_case_insensitive(df, features)
+    matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)

-
-
-
-        if (feature not in df.columns) or (feature not in features):
-            print(f"Feature '{feature}' not in the features list, skipping decompression.")
-            continue
+    if compressed_features:
+        print("Decompressing features for prediction...")
+        matched_df, features = decompress_features(matched_df, features, compressed_features)

-
-        decompressed_features.remove(feature)
+    X = matched_df[features]

-
-
-
+    # Collect ensemble predictions
+    all_preds = [m.predict(X) for m in ensemble_models]
+    ensemble_preds = np.stack(all_preds, axis=0)

-
-
-
+    if label_encoder is not None:
+        # Classification: average probabilities, then argmax
+        all_probs = [m.predict_proba(X) for m in ensemble_models]
+        avg_probs = np.mean(np.stack(all_probs, axis=0), axis=0)
+        class_preds = np.argmax(avg_probs, axis=1)

-
-
+        df["prediction"] = label_encoder.inverse_transform(class_preds)
+        df["pred_proba"] = [p.tolist() for p in avg_probs]
+        df = expand_proba_column(df, label_encoder.classes_)
+    else:
+        # Regression: average predictions
+        df["prediction"] = np.mean(ensemble_preds, axis=0)
+        df["prediction_std"] = np.std(ensemble_preds, axis=0)

-    #
-
-
+    # Add UQ intervals if available
+    if uq_models and uq_metadata:
+        df = predict_intervals(df, X, uq_models, uq_metadata)
+        df = compute_confidence(df, uq_metadata["median_interval_width"], "q_10", "q_90")

-
+    print(f"Inference complete: {len(df)} predictions, {len(ensemble_models)} ensemble members")
+    return df


+# =============================================================================
+# Training
+# =============================================================================
 if __name__ == "__main__":
-
+    # -------------------------------------------------------------------------
+    # Training-only imports (deferred to reduce serverless startup time)
+    # -------------------------------------------------------------------------
+    import argparse
+
+    import awswrangler as wr
+    from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+    from sklearn.preprocessing import LabelEncoder
+
+    from model_script_utils import (
+        check_dataframe,
+        compute_classification_metrics,
+        compute_regression_metrics,
+        print_classification_metrics,
+        print_confusion_matrix,
+        print_regression_metrics,
+    )
+    from uq_harness import (
+        save_uq_models,
+        train_uq_models,
+    )

-    #
-
+    # -------------------------------------------------------------------------
+    # Setup: Parse arguments and load data
+    # -------------------------------------------------------------------------
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+    parser.add_argument("--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"))
+    args = parser.parse_args()
+
+    # Extract template parameters
+    target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
+    id_column = TEMPLATE_PARAMS["id_column"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-
-    validation_split = 0.2
-
-    # Script arguments for input/output directories
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
-    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
-    parser.add_argument(
-        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
-    )
-    args = parser.parse_args()
+    hyperparameters = {**DEFAULT_HYPERPARAMETERS, **(TEMPLATE_PARAMS["hyperparameters"] or {})}

-    #
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    # Load training data
+    training_files = [os.path.join(args.train, f) for f in os.listdir(args.train) if f.endswith(".csv")]
     print(f"Training Files: {training_files}")
-
-    # Combine files and read them all into a single pandas dataframe
-    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
-
-    # Check if the dataframe is empty
+    all_df = pd.concat([pd.read_csv(f, engine="python") for f in training_files])
     check_dataframe(all_df, "training_df")

-    # Features/Target output
     print(f"Target: {target}")
-    print(f"Features: {
+    print(f"Features: {features}")
+    print(f"Hyperparameters: {hyperparameters}")

-    #
+    # -------------------------------------------------------------------------
+    # Preprocessing: Categorical features and decompression
+    # -------------------------------------------------------------------------
     all_df, category_mappings = convert_categorical_types(all_df, features)

-    # If we have compressed features, decompress them
     if compressed_features:
-        print(f"Decompressing features {compressed_features}
+        print(f"Decompressing features: {compressed_features}")
         all_df, features = decompress_features(all_df, features, compressed_features)

-    #
-
-
-
-        df_val = all_df.copy()
-
-    # Does the dataframe have a training column?
-    elif "training" in all_df.columns:
-        print("Found training column, splitting data based on training column")
-        df_train = all_df[all_df["training"]]
-        df_val = all_df[~all_df["training"]]
-    else:
-        # Just do a random training Split
-        print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
-    print(f"FIT/TRAIN: {df_train.shape}")
-    print(f"VALIDATION: {df_val.shape}")
-
-    # Now spin up our XGB Model
+    # -------------------------------------------------------------------------
+    # Classification setup
+    # -------------------------------------------------------------------------
+    label_encoder = None
     if model_type == "classifier":
-        xgb_model = xgb.XGBClassifier(enable_categorical=True)
-
-        # Encode the target column
         label_encoder = LabelEncoder()
-
-
+        all_df[target] = label_encoder.fit_transform(all_df[target])
+        print(f"Class labels: {label_encoder.classes_.tolist()}")

-
-
-
-
-
-        y_train = df_train[target]
-        X_train= df_train[features]
-        xgb_model.fit(X_train, y_train)
-
-    # Make Predictions on the Validation Set
-    print(f"Making Predictions on Validation Set...")
-    y_validate = df_val[target]
-    X_validate = df_val[features]
-    preds = xgb_model.predict(X_validate)
-    if model_type == "classifier":
-        # Also get the probabilities for each class
-        print("Processing Probabilities...")
-        probs = xgb_model.predict_proba(X_validate)
-        df_val["pred_proba"] = [p.tolist() for p in probs]
-
-        # Expand the pred_proba column into separate columns for each class
-        print(df_val.columns)
-        df_val = expand_proba_column(df_val, label_encoder.classes_)
-        print(df_val.columns)
-
-        # Decode the target and prediction labels
-        y_validate = label_encoder.inverse_transform(y_validate)
-        preds = label_encoder.inverse_transform(preds)
-
-        # Save predictions to S3 (just the target, prediction, and '_proba' columns)
-        df_val["prediction"] = preds
-        output_columns = [target, "prediction"]
-        output_columns += [col for col in df_val.columns if col.endswith("_proba")]
-        wr.s3.to_csv(
-            df_val[output_columns],
-            path=f"{model_metrics_s3_path}/validation_predictions.csv",
-            index=False,
-        )
+    # -------------------------------------------------------------------------
+    # Cross-validation setup
+    # -------------------------------------------------------------------------
+    n_folds = hyperparameters["n_folds"]
+    xgb_params = {k: v for k, v in hyperparameters.items() if k not in WORKBENCH_PARAMS}

-    #
-    if
-
-        label_names = label_encoder.classes_
-
-        # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(
-            y_validate, preds, average=None, labels=label_names
-        )
-
-        # Put the scores into a dataframe
-        score_df = pd.DataFrame(
-            {
-                target: label_names,
-                "precision": scores[0],
-                "recall": scores[1],
-                "fscore": scores[2],
-                "support": scores[3],
-            }
-        )
-
-        # We need to get creative with the Classification Metrics
-        metrics = ["precision", "recall", "fscore", "support"]
-        for t in label_names:
-            for m in metrics:
-                value = score_df.loc[score_df[target] == t, m].iloc[0]
-                print(f"Metrics:{t}:{m} {value}")
-
-        # Compute and output the confusion matrix
-        conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
-        for i, row_name in enumerate(label_names):
-            for j, col_name in enumerate(label_names):
-                value = conf_mtx[i, j]
-                print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
+    # Map 'seed' to 'random_state' for XGBoost
+    if "seed" in xgb_params:
+        xgb_params["random_state"] = xgb_params.pop("seed")

+    # Handle objective: filter regression-only params for classifiers, set default for regressors
+    if model_type == "classifier":
+        xgb_params = {k: v for k, v in xgb_params.items() if k not in REGRESSION_ONLY_PARAMS}
     else:
-        #
-
-        mae = mean_absolute_error(y_validate, preds)
-        r2 = r2_score(y_validate, preds)
-        print(f"RMSE: {rmse:.3f}")
-        print(f"MAE: {mae:.3f}")
-        print(f"R2: {r2:.3f}")
-        print(f"NumRows: {len(df_val)}")
-
-    # Now save the model to the standard place/name
-    xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
-    if label_encoder:
-        joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
-
-    # Save the features (this will validate input during predictions)
-    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(orig_features, fp)  # We save the original features, not the decompressed ones
-
-    # Save the category mappings
-    with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
-        json.dump(category_mappings, fp)
-
-
-def model_fn(model_dir):
-    """Deserialize and return fitted XGBoost model"""
-
-    model_path = os.path.join(model_dir, "xgb_model.json")
-
-    with open(model_path, "r") as f:
-        model_json = json.load(f)
-
-    sklearn_data = model_json['learner']['attributes']['scikit_learn']
-    model_type = json.loads(sklearn_data)['_estimator_type']
-
-    model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
-    model = model_class(enable_categorical=True)
-    model.load_model(model_path)
+        # Default to MAE (reg:absoluteerror) for regression if not specified
+        xgb_params.setdefault("objective", "reg:absoluteerror")

-
+    print(f"XGBoost params: {xgb_params}")

+    if n_folds == 1:
+        # Single train/val split
+        if "training" in all_df.columns:
+            print("Using 'training' column for train/val split")
+            train_idx = np.where(all_df["training"])[0]
+            val_idx = np.where(~all_df["training"])[0]
+        else:
+            print("WARNING: No 'training' column found, using random 80/20 split")
+            indices = np.arange(len(all_df))
+            train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+        folds = [(train_idx, val_idx)]
+    else:
+        # K-fold cross-validation
+        if model_type == "classifier":
+            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
+            folds = list(kfold.split(all_df, all_df[target]))
+        else:
+            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
+            folds = list(kfold.split(all_df))

-
-    """Parse input data and return a DataFrame."""
-    if not input_data:
-        raise ValueError("Empty input data is not supported!")
-
-    # Decode bytes to string if necessary
-    if isinstance(input_data, bytes):
-        input_data = input_data.decode("utf-8")
+    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold ensemble'}...")

-
-
-
-
+    # -------------------------------------------------------------------------
+    # Training loop
+    # -------------------------------------------------------------------------
+    # Initialize out-of-fold storage
+    oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
+    if model_type == "classifier":
+        num_classes = len(label_encoder.classes_)
+        oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
     else:
-
-
+        oof_proba = None
+
+    # Check for sample weights
+    has_sample_weights = "sample_weight" in all_df.columns
+    if has_sample_weights:
+        sw = all_df["sample_weight"]
+        print(f"Using sample weights: min={sw.min():.2f}, max={sw.max():.2f}, mean={sw.mean():.2f}")
+
+    # Train ensemble
+    ensemble_models = []
+    for fold_idx, (train_idx, val_idx) in enumerate(folds):
+        print(f"\n{'='*50}")
+        print(f"Fold {fold_idx + 1}/{len(folds)} - Train: {len(train_idx)}, Val: {len(val_idx)}")
+        print(f"{'='*50}")
+
+        # Prepare fold data
+        X_train = all_df.iloc[train_idx][features]
+        y_train = all_df.iloc[train_idx][target]
+        X_val = all_df.iloc[val_idx][features]
+        sample_weights = all_df.iloc[train_idx]["sample_weight"] if has_sample_weights else None
+
+        # Create model with fold-specific random state for diversity
+        fold_params = {**xgb_params, "random_state": xgb_params.get("random_state", 42) + fold_idx}
+        if model_type == "classifier":
+            model = xgb.XGBClassifier(enable_categorical=True, **fold_params)
+        else:
+            model = xgb.XGBRegressor(enable_categorical=True, **fold_params)
+
+        # Train
+        model.fit(X_train, y_train, sample_weight=sample_weights)
+        ensemble_models.append(model)
+
+        # Out-of-fold predictions
+        oof_predictions[val_idx] = model.predict(X_val)
+        if model_type == "classifier":
+            oof_proba[val_idx] = model.predict_proba(X_val)
+
+    print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
+
+    # -------------------------------------------------------------------------
+    # Prepare validation results
+    # -------------------------------------------------------------------------
+    if n_folds == 1:
+        # Single fold: only validation rows
+        val_mask = ~np.isnan(oof_predictions)
+        df_val = all_df[val_mask].copy()
+        predictions = oof_predictions[val_mask]
+        if oof_proba is not None:
+            oof_proba = oof_proba[val_mask]
+    else:
+        # K-fold: all rows have out-of-fold predictions
+        df_val = all_df.copy()
+        predictions = oof_predictions

-
-
-
-
-
-
-
+    # Decode labels for classification
+    if model_type == "classifier":
+        df_val[target] = label_encoder.inverse_transform(df_val[target].astype(int))
+        df_val["prediction"] = label_encoder.inverse_transform(predictions.astype(int))
+        if oof_proba is not None:
+            df_val["pred_proba"] = [p.tolist() for p in oof_proba]
+            df_val = expand_proba_column(df_val, label_encoder.classes_)
     else:
-
+        df_val["prediction"] = predictions

+    # -------------------------------------------------------------------------
+    # Compute and print metrics
+    # -------------------------------------------------------------------------
+    y_true = df_val[target].values
+    y_pred = df_val["prediction"].values

-
-
+    if model_type == "classifier":
+        label_names = label_encoder.classes_
+        score_df = compute_classification_metrics(y_true, y_pred, label_names, target)
+        print_classification_metrics(score_df, target, label_names)
+        print_confusion_matrix(y_true, y_pred, label_names)
+    else:
+        metrics = compute_regression_metrics(y_true, y_pred)
+        print_regression_metrics(metrics)
+
+        # Compute ensemble prediction_std
+        if n_folds > 1:
+            all_preds = np.stack([m.predict(all_df[features]) for m in ensemble_models])
+            df_val["prediction_std"] = np.std(all_preds, axis=0)
+            print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
+        else:
+            df_val["prediction_std"] = 0.0
+
+        # Train UQ models for uncertainty quantification
+        print("\n" + "=" * 50)
+        print("Training UQ Models")
+        print("=" * 50)
+        uq_models, uq_metadata = train_uq_models(
+            all_df[features], all_df[target], df_val[features], y_true
+        )
+        df_val = predict_intervals(df_val, df_val[features], uq_models, uq_metadata)
+        df_val = compute_confidence(df_val, uq_metadata["median_interval_width"])

-
-
-
+    # -------------------------------------------------------------------------
+    # Save validation predictions to S3
+    # -------------------------------------------------------------------------
+    output_columns = []
+    if id_column in df_val.columns:
+        output_columns.append(id_column)
+    output_columns += [target, "prediction"]

-
-
-
-    compressed_features = TEMPLATE_PARAMS["compressed_features"]
+    if model_type != "classifier":
+        output_columns.append("prediction_std")
+        output_columns += [c for c in df_val.columns if c.startswith("q_") or c == "confidence"]

-
-    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-    with open(os.path.join(model_dir, "feature_columns.json")) as fp:
-        features = json.load(fp)
-    print(f"Model Features: {features}")
+    output_columns += [c for c in df_val.columns if c.endswith("_proba")]

-
-    with open(os.path.join(model_dir, "category_mappings.json")) as fp:
-        category_mappings = json.load(fp)
+    wr.s3.to_csv(df_val[output_columns], f"{model_metrics_s3_path}/validation_predictions.csv", index=False)

-    #
-
-
-
+    # -------------------------------------------------------------------------
+    # Save model artifacts
+    # -------------------------------------------------------------------------
+    for idx, m in enumerate(ensemble_models):
+        joblib.dump(m, os.path.join(args.model_dir, f"xgb_model_{idx}.joblib"))
+    print(f"Saved {len(ensemble_models)} model(s)")

-
-
-    # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
-    matched_df = match_features_case_insensitive(df, features)
+    with open(os.path.join(args.model_dir, "ensemble_metadata.json"), "w") as f:
+        json.dump({"n_ensemble": len(ensemble_models), "n_folds": n_folds}, f)

-
-
+    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as f:
+        json.dump(orig_features, f)

-
-
-        print("Decompressing features for prediction...")
-        matched_df, features = decompress_features(matched_df, features, compressed_features)
+    with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as f:
+        json.dump(category_mappings, f)

-
-
-    predictions = model.predict(X)
+    with open(os.path.join(args.model_dir, "hyperparameters.json"), "w") as f:
+        json.dump(hyperparameters, f, indent=2)

-    # If we have a label encoder, decode the predictions
     if label_encoder:
-
-
-    # Set the predictions on the DataFrame
-    df["prediction"] = predictions
-
-    # Does our model have a 'predict_proba' method? If so we will call it and add the results to the DataFrame
-    if getattr(model, "predict_proba", None):
-        probs = model.predict_proba(matched_df[features])
-        df["pred_proba"] = [p.tolist() for p in probs]
+        joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))

-
-
+    if model_type != "classifier":
+        save_uq_models(uq_models, uq_metadata, args.model_dir)

-
-    return df
+    print(f"\nModel training complete! Artifacts saved to {args.model_dir}")