workbench 0.8.171__py3-none-any.whl → 0.8.172__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of workbench has been flagged as possibly problematic.
- workbench/api/feature_set.py +4 -4
- workbench/core/artifacts/artifact.py +11 -3
- workbench/core/artifacts/model_core.py +37 -14
- workbench/core/cloud_platform/aws/aws_account_clamp.py +4 -1
- workbench/core/transforms/features_to_model/features_to_model.py +4 -4
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +210 -91
- workbench/model_scripts/custom_models/uq_models/mapie.template +502 -0
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +8 -5
- workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
- workbench/model_scripts/script_generation.py +5 -0
- workbench/utils/model_utils.py +1 -1
- workbench/utils/workbench_sqs.py +1 -1
- workbench/utils/xgboost_model_utils.py +1 -0
- {workbench-0.8.171.dist-info → workbench-0.8.172.dist-info}/METADATA +1 -1
- {workbench-0.8.171.dist-info → workbench-0.8.172.dist-info}/RECORD +19 -19
- workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
- {workbench-0.8.171.dist-info → workbench-0.8.172.dist-info}/WHEEL +0 -0
- {workbench-0.8.171.dist-info → workbench-0.8.172.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.171.dist-info → workbench-0.8.172.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.171.dist-info → workbench-0.8.172.dist-info}/top_level.txt +0 -0
workbench/api/feature_set.py
CHANGED
@@ -87,8 +87,8 @@ class FeatureSet(FeatureSetCore):
         model_import_str: str = None,
         custom_script: Union[str, Path] = None,
         custom_args: dict = None,
-        training_image: str = "xgb_training",
-        inference_image: str = "xgb_inference",
+        training_image: str = "training",
+        inference_image: str = "inference",
         inference_arch: str = "x86_64",
         **kwargs,
     ) -> Union[Model, None]:
@@ -105,8 +105,8 @@ class FeatureSet(FeatureSetCore):
             model_class (str, optional): Model class to use (e.g. "KMeans", "PyTorch", default: None)
             model_import_str (str, optional): The import for the model (e.g. "from sklearn.cluster import KMeans")
             custom_script (str, optional): The custom script to use for the model (default: None)
-            training_image (str, optional): The training image to use (default: "xgb_training")
-            inference_image (str, optional): The inference image to use (default: "inference_image")
+            training_image (str, optional): The training image to use (default: "training")
+            inference_image (str, optional): The inference image to use (default: "inference")
             inference_arch (str, optional): The architecture to use for inference (default: "x86_64")
             kwargs (dict, optional): Additional keyword arguments to pass to the model

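Note on usage: with these defaults, model creation no longer references the XGBoost-specific images by name. A minimal sketch, assuming the changed signature belongs to FeatureSet.to_model and a FeatureSet named "my_features" already exists (argument names are indicative, not copied from this diff):

    from workbench.api import FeatureSet

    fs = FeatureSet("my_features")
    # training_image / inference_image now default to "training" / "inference",
    # which ModelImages (below) resolves to the py312-general-ml-* ECR images
    model = fs.to_model(name="my-model", target_column="my_target")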
workbench/core/artifacts/artifact.py
CHANGED

@@ -236,6 +236,12 @@ class Artifact(ABC):
         This functionality will work for FeatureSets, Models, and Endpoints
         but not for DataSources. The DataSource class overrides this method.
         """
+
+        # Check for ReadOnly Role
+        if self.aws_account_clamp.read_only_role:
+            self.log.info("Cannot add metadata with a ReadOnly Role...")
+            return
+
         # Sanity check
         aws_arn = self.arn()
         if aws_arn is None:
@@ -444,10 +450,12 @@ class Artifact(ABC):

 if __name__ == "__main__":
     """Exercise the Artifact Class"""
-    from workbench.api …
-…
+    from workbench.api import DataSource, FeatureSet, Endpoint
+
+    # Grab an Endpoint (which is a subclass of Artifact)
+    end = Endpoint("wine-classification")

-    # …
+    # Grab a DataSource (which is a subclass of Artifact)
     data_source = DataSource("test_data")

     # Just some random tests
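Note: this guard works together with the read_only_role flag added in aws_account_clamp.py below. The clamp sets the flag once at session startup, and every Artifact subclass (FeatureSet, Model, Endpoint) then turns metadata writes into logged no-ops instead of AWS AccessDenied errors. The two cooperating pieces, quoted from this release's hunks:

    # aws_account_clamp.py: detect the role once at startup
    self.read_only_role = "readonly" in role_info["AssumedRoleArn"].lower()

    # artifact.py: every metadata upsert checks it
    if self.aws_account_clamp.read_only_role:
        self.log.info("Cannot add metadata with a ReadOnly Role...")
        return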
workbench/core/artifacts/model_core.py
CHANGED

@@ -37,16 +37,45 @@ class ModelType(Enum):
     UNKNOWN = "unknown"


+# Deprecated Images
+"""
+# US East 1 images
+"py312-general-ml-training"
+("us-east-1", "training", "0.1", "x86_64"): (
+    "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1"
+),
+("us-east-1", "inference", "0.1", "x86_64"): (
+    "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
+),
+
+# US West 2 images
+("us-west-2", "training", "0.1", "x86_64"): (
+    "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1"
+),
+("us-west-2", "inference", "0.1", "x86_64"): (
+    "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
+),
+
+# ARM64 images
+("us-east-1", "inference", "0.1", "arm64"): (
+    "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1-arm64"
+),
+("us-west-2", "inference", "0.1", "arm64"): (
+    "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1-arm64"
+),
+"""
+
+
 class ModelImages:
     """Class for retrieving workbench inference images"""

     image_uris = {
         # US East 1 images
-        ("us-east-1", "xgb_training", "0.1", "x86_64"): (
-            "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1"
+        ("us-east-1", "training", "0.1", "x86_64"): (
+            "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-general-ml-training:0.1"
         ),
-        ("us-east-1", "xgb_inference", "0.1", "x86_64"): (
-            "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
+        ("us-east-1", "inference", "0.1", "x86_64"): (
+            "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-general-ml-inference:0.1"
         ),
         ("us-east-1", "pytorch_training", "0.1", "x86_64"): (
             "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-pytorch-training:0.1"
@@ -55,11 +84,11 @@ class ModelImages:
             "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-pytorch-inference:0.1"
         ),
         # US West 2 images
-        ("us-west-2", "xgb_training", "0.1", "x86_64"): (
-            "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1"
+        ("us-west-2", "training", "0.1", "x86_64"): (
+            "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-general-ml-training:0.1"
         ),
-        ("us-west-2", "xgb_inference", "0.1", "x86_64"): (
-            "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
+        ("us-west-2", "inference", "0.1", "x86_64"): (
+            "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-general-ml-inference:0.1"
         ),
         ("us-west-2", "pytorch_training", "0.1", "x86_64"): (
             "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-pytorch-training:0.1"
@@ -68,12 +97,6 @@ class ModelImages:
             "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-pytorch-inference:0.1"
         ),
         # ARM64 images
-        ("us-east-1", "xgb_inference", "0.1", "arm64"): (
-            "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1-arm64"
-        ),
-        ("us-west-2", "xgb_inference", "0.1", "arm64"): (
-            "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1-arm64"
-        ),
         # Meta Endpoint inference images
         ("us-east-1", "meta-endpoint", "0.1", "x86_64"): (
             "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-meta-endpoint:0.1"
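Note: image lookups are keyed by (region, image name, version, architecture); this release renames the XGBoost-specific "xgb_training"/"xgb_inference" keys to generic "training"/"inference" and points them at the py312-general-ml-* images, keeping the old sklearn-xgb URIs only in the deprecation comment. A hedged sketch of how such a registry is typically consumed (get_image_uri is a hypothetical accessor, not confirmed by this diff):

    class ModelImages:
        image_uris = {
            ("us-east-1", "training", "0.1", "x86_64"): (
                "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-general-ml-training:0.1"
            ),
            # ... remaining (region, image, version, arch) entries
        }

        @classmethod
        def get_image_uri(cls, region: str, image: str, version: str = "0.1", arch: str = "x86_64") -> str:
            # A KeyError here means no image is published for this combination
            return cls.image_uris[(region, image, version, arch)]

    uri = ModelImages.get_image_uri("us-east-1", "inference")  # hypothetical call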
workbench/core/cloud_platform/aws/aws_account_clamp.py
CHANGED

@@ -54,7 +54,10 @@ class AWSAccountClamp:

         # Check our Assume Role
         self.log.info("Checking Workbench Assumed Role...")
-        self.aws_session.assumed_role_info()
+        role_info = self.aws_session.assumed_role_info()
+
+        # Check if the Role is a 'ReadOnly' role
+        self.read_only_role = "readonly" in role_info["AssumedRoleArn"].lower()

         # Check our Workbench API Key and Load the License
         self.log.info("Checking Workbench API License...")
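Note: the detection is a plain substring test on the STS assumed-role ARN, so any assumed role whose name contains "readonly" (case-insensitive) is treated as read-only. Illustration with a made-up ARN in the standard arn:aws:sts::&lt;account&gt;:assumed-role/&lt;role-name&gt;/&lt;session&gt; format:

    role_info = {"AssumedRoleArn": "arn:aws:sts::123456789012:assumed-role/Workbench-ReadOnly/session-1"}
    read_only = "readonly" in role_info["AssumedRoleArn"].lower()
    print(read_only)  # True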
workbench/core/transforms/features_to_model/features_to_model.py
CHANGED

@@ -37,8 +37,8 @@ class FeaturesToModel(Transform):
         model_import_str=None,
         custom_script=None,
         custom_args=None,
-        training_image="xgb_training",
-        inference_image="xgb_inference",
+        training_image="training",
+        inference_image="inference",
         inference_arch="x86_64",
     ):
         """FeaturesToModel Initialization
@@ -50,8 +50,8 @@ class FeaturesToModel(Transform):
             model_import_str (str, optional): The import string for the model (default None)
             custom_script (str, optional): Custom script to use for the model (default None)
             custom_args (dict, optional): Custom arguments to pass to custom model scripts (default None)
-            training_image (str, optional): Training image (default "xgb_training")
-            inference_image (str, optional): Inference image (default "xgb_inference")
+            training_image (str, optional): Training image (default "training")
+            inference_image (str, optional): Inference image (default "inference")
             inference_arch (str, optional): Inference architecture (default "x86_64")
         """

workbench/model_scripts/custom_models/uq_models/generated_model_script.py
CHANGED

@@ -1,6 +1,7 @@
-# Model: …
-from …
-from …
+# Model: XGBoost for point predictions + LightGBM with MAPIE for conformalized intervals
+from mapie.regression import ConformalizedQuantileRegressor
+from lightgbm import LGBMRegressor
+from xgboost import XGBRegressor
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
@@ -19,19 +20,12 @@ import numpy as np
 import pandas as pd
 from typing import List, Tuple

-# Local Imports
-from proximity import Proximity
-
-
-
 # Template Placeholders
 TEMPLATE_PARAMS = {
-    "id_column": …
-    "target": …
-    "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v'],
+    "target": "udm_asy_res_value",
+    "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v', 'chiral_centers', 'r_cnt', 's_cnt', 'db_stereo', 'e_cnt', 'z_cnt', 'chiral_fp', 'db_fp'],
     "compressed_features": [],
-    "train_all_data": …
-    "track_columns": ['udm_asy_res_intrinsic_clearance_ul_per_min_per_mg_protein']
+    "train_all_data": True
 }

@@ -106,8 +100,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
     return df, category_mappings


-def decompress_features(…
-…
+def decompress_features(
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the model by decompressing bitstring features

     Args:
         df (pd.DataFrame): The features DataFrame
@@ -132,7 +128,7 @@ def decompress_features(df: pd.DataFrame, features: List[str], compressed_featur
     )

     # Decompress the specified compressed features
-    decompressed_features = features
+    decompressed_features = features.copy()
     for feature in compressed_features:
         if (feature not in df.columns) or (feature not in features):
             print(f"Feature '{feature}' not in the features list, skipping decompression.")
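Note: the .copy() is a genuine bug fix, not a style change. Python assignment aliases lists, so the old code appended decompressed column names into the caller's features list as a side effect. A quick demonstration in plain Python (names are illustrative):

    features = ["mollogp", "chiral_fp"]
    decompressed = features             # old code: alias of the same list object
    decompressed.append("chiral_fp_0")
    print(features)                     # ['mollogp', 'chiral_fp', 'chiral_fp_0'] - caller mutated

    decompressed = features.copy()      # new code: independent list, caller unaffected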
@@ -161,13 +157,11 @@

 if __name__ == "__main__":
     # Template Parameters
-    id_column = TEMPLATE_PARAMS["id_column"]
     target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
-    track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
     validation_split = 0.2

     # Script arguments for input/output directories
@@ -225,71 +219,175 @@
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")

-    # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
-    xgb_model = XGBRegressor()
-    ngb_model = NGBRegressor()
-
     # Prepare features and targets for training
     X_train = df_train[features]
     X_validate = df_val[features]
     y_train = df_train[target]
     y_validate = df_val[target]

-    # Train …
+    # Train XGBoost for point predictions
+    print("\nTraining XGBoost for point predictions...")
+    xgb_model = XGBRegressor(
+        n_estimators=1000,
+        max_depth=6,
+        learning_rate=0.01,
+        subsample=0.8,
+        colsample_bytree=0.8,
+        random_state=42,
+        verbosity=0
+    )
     xgb_model.fit(X_train, y_train)
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-    print(f"…
-…
-…
+
+    # Evaluate XGBoost performance
+    y_pred_xgb = xgb_model.predict(X_validate)
+    xgb_rmse = root_mean_squared_error(y_validate, y_pred_xgb)
+    xgb_mae = mean_absolute_error(y_validate, y_pred_xgb)
+    xgb_r2 = r2_score(y_validate, y_pred_xgb)
+
+    print(f"\nXGBoost Point Prediction Performance:")
+    print(f"RMSE: {xgb_rmse:.3f}")
+    print(f"MAE: {xgb_mae:.3f}")
+    print(f"R2: {xgb_r2:.3f}")
+
+    # Define confidence levels we want to model
+    confidence_levels = [0.50, 0.80, 0.90, 0.95]  # 50%, 80%, 90%, 95% confidence intervals
+
+    # Store MAPIE models for each confidence level
+    mapie_models = {}
+
+    # Train models for each confidence level
+    for confidence_level in confidence_levels:
+        alpha = 1 - confidence_level
+        lower_q = alpha / 2
+        upper_q = 1 - alpha / 2
+
+        print(f"\nTraining quantile models for {confidence_level * 100:.0f}% confidence interval...")
+        print(f"  Quantiles: {lower_q:.3f}, {upper_q:.3f}, 0.500")
+
+        # Train three models for this confidence level
+        quantile_estimators = []
+        for q in [lower_q, upper_q, 0.5]:
+            print(f"  Training model for quantile {q:.3f}...")
+            est = LGBMRegressor(
+                objective="quantile",
+                alpha=q,
+                n_estimators=1000,
+                max_depth=6,
+                learning_rate=0.01,
+                num_leaves=31,
+                min_child_samples=20,
+                subsample=0.8,
+                colsample_bytree=0.8,
+                random_state=42,
+                verbose=-1,
+                force_col_wise=True
+            )
+            est.fit(X_train, y_train)
+            quantile_estimators.append(est)
+
+        # Create MAPIE CQR model for this confidence level
+        print(f"  Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
+        mapie_model = ConformalizedQuantileRegressor(
+            quantile_estimators,
+            confidence_level=confidence_level,
+            prefit=True
+        )
+
+        # Conformalize the model
+        print(f"  Conformalizing with validation data...")
+        mapie_model.conformalize(X_validate, y_validate)
+
+        # Store the model
+        mapie_models[f"mapie_{confidence_level:.2f}"] = mapie_model
+
+        # Validate coverage for this confidence level
+        y_pred, y_pis = mapie_model.predict_interval(X_validate)
+        coverage = np.mean((y_validate >= y_pis[:, 0, 0]) & (y_validate <= y_pis[:, 1, 0]))
+        print(f"  Coverage: Target={confidence_level * 100:.0f}%, Empirical={coverage * 100:.1f}%")
+
+    print(f"\nOverall Model Performance Summary:")
+    print(f"XGBoost RMSE: {xgb_rmse:.3f}")
+    print(f"XGBoost MAE: {xgb_mae:.3f}")
+    print(f"XGBoost R2: {xgb_r2:.3f}")
     print(f"NumRows: {len(df_val)}")

+    # Analyze interval widths across confidence levels
+    print(f"\nInterval Width Analysis:")
+    for conf_level in confidence_levels:
+        model = mapie_models[f"mapie_{conf_level:.2f}"]
+        _, y_pis = model.predict_interval(X_validate)
+        widths = y_pis[:, 1, 0] - y_pis[:, 0, 0]
+        print(f"  {conf_level * 100:.0f}% CI: Mean width={np.mean(widths):.3f}, Std={np.std(widths):.3f}")
+
     # Save the trained XGBoost model
     xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))

-    # Save …
-…
+    # Save all MAPIE models
+    for model_name, model in mapie_models.items():
+        joblib.dump(model, os.path.join(args.model_dir, f"{model_name}.joblib"))

-    # Save the …
+    # Save the feature list
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(…
-…
-    # …
-…
+        json.dump(features, fp)
+
+    # Save category mappings if any
+    if category_mappings:
+        with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
+            json.dump(category_mappings, fp)
+
+    # Save model configuration
+    model_config = {
+        "model_type": "XGBoost_MAPIE_CQR_LightGBM",
+        "confidence_levels": confidence_levels,
+        "n_features": len(features),
+        "target": target,
+        "validation_metrics": {
+            "xgb_rmse": float(xgb_rmse),
+            "xgb_mae": float(xgb_mae),
+            "xgb_r2": float(xgb_r2),
+            "n_validation": len(df_val)
+        }
+    }
+    with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
+        json.dump(model_config, fp, indent=2)

-…
-model…
+    print(f"\nModel training complete!")
+    print(f"Saved 1 XGBoost model and {len(mapie_models)} MAPIE models to {args.model_dir}")


 #
 # Inference Section
 #
 def model_fn(model_dir) -> dict:
-    """Load …
+    """Load XGBoost and all MAPIE models from the specified directory."""
+
+    # Load model configuration to know which models to load
+    with open(os.path.join(model_dir, "model_config.json")) as fp:
+        config = json.load(fp)

     # Load XGBoost regressor
     xgb_path = os.path.join(model_dir, "xgb_model.json")
     xgb_model = XGBRegressor(enable_categorical=True)
     xgb_model.load_model(xgb_path)

-    # Load …
-…
+    # Load all MAPIE models
+    mapie_models = {}
+    for conf_level in config["confidence_levels"]:
+        model_name = f"mapie_{conf_level:.2f}"
+        mapie_models[model_name] = joblib.load(os.path.join(model_dir, f"{model_name}.joblib"))

-    # …
-…
+    # Load category mappings if they exist
+    category_mappings = {}
+    category_path = os.path.join(model_dir, "category_mappings.json")
+    if os.path.exists(category_path):
+        with open(category_path) as fp:
+            category_mappings = json.load(fp)

     return {
-        "…
-        "…
-        "…
+        "xgb_model": xgb_model,
+        "mapie_models": mapie_models,
+        "confidence_levels": config["confidence_levels"],
+        "category_mappings": category_mappings
     }

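Note on the conformal mechanics: for each confidence level c the loop fits three LightGBM quantile models at q = (1-c)/2, 1-(1-c)/2, and 0.50, then MAPIE's conformalize() adjusts the raw quantile band on held-out data so empirical coverage tracks c (conformalized quantile regression, CQR). A self-contained sketch of the same pattern on synthetic data, using the MAPIE v1-style calls that appear in this diff:

    import numpy as np
    from lightgbm import LGBMRegressor
    from mapie.regression import ConformalizedQuantileRegressor

    rng = np.random.default_rng(42)
    X = rng.uniform(-3, 3, size=(500, 1))
    y = np.sin(X[:, 0]) + rng.normal(scale=0.3, size=500)

    confidence_level = 0.90
    alpha = 1 - confidence_level  # 0.10 -> lower/upper quantiles 0.05 and 0.95
    estimators = [
        LGBMRegressor(objective="quantile", alpha=q, verbose=-1).fit(X[:300], y[:300])
        for q in (alpha / 2, 1 - alpha / 2, 0.5)
    ]
    cqr = ConformalizedQuantileRegressor(estimators, confidence_level=confidence_level, prefit=True)
    cqr.conformalize(X[300:], y[300:])           # calibration on held-out rows
    y_pred, y_pis = cqr.predict_interval(X[:5])  # y_pis shape: (n_samples, 2, 1)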
@@ -305,7 +403,7 @@ def input_fn(input_data, content_type):
     if "text/csv" in content_type:
         return pd.read_csv(StringIO(input_data))
     elif "application/json" in content_type:
-        return pd.DataFrame(json.loads(input_data))
+        return pd.DataFrame(json.loads(input_data))
     else:
         raise ValueError(f"{content_type} not supported!")

@@ -313,23 +411,26 @@ def input_fn(input_data, content_type):
 def output_fn(output_df, accept_type):
     """Supports both CSV and JSON output formats."""
     if "text/csv" in accept_type:
-…
+        # Convert categorical columns to string to avoid fillna issues
+        for col in output_df.select_dtypes(include=['category']).columns:
+            output_df[col] = output_df[col].astype(str)
+        csv_output = output_df.fillna("N/A").to_csv(index=False)
         return csv_output, "text/csv"
     elif "application/json" in accept_type:
-        return output_df.to_json(orient="records"), "application/json"
+        return output_df.to_json(orient="records"), "application/json"
     else:
         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")


 def predict_fn(df, models) -> pd.DataFrame:
-    """Make …
+    """Make predictions using XGBoost for point estimates and MAPIE for conformalized intervals

     Args:
         df (pd.DataFrame): The input DataFrame
-        models (dict): …
+        models (dict): Dictionary containing XGBoost and MAPIE models

     Returns:
-        pd.DataFrame: …
+        pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
     """

     # Grab our feature columns (from training)
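Note on the new categorical cast in output_fn: fillna("N/A") on a category-dtype column raises a TypeError unless "N/A" is already a registered category, so the script converts those columns to plain strings first. Minimal reproduction in plain pandas (not workbench code):

    import pandas as pd

    s = pd.Series(["a", None], dtype="category")
    # s.fillna("N/A")                           # raises TypeError: not a valid category
    s.cat.add_categories("N/A").fillna("N/A")   # alternative fix: register the category
    s.astype(str)                               # the diff's fix: drop the category dtype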
@@ -340,44 +441,62 @@ def predict_fn(df, models) -> pd.DataFrame:
     # Match features in a case-insensitive manner
     matched_df = match_features_case_insensitive(df, model_features)

-    # …
-…
-…
-…
-…
-…
-    dist_params = y_dists.params
-
-    # Extract mean and std from distribution parameters
-    df["prediction_uq"] = dist_params['loc']  # mean
-    df["prediction_std"] = dist_params['scale']  # standard deviation
-
-    # Add 95% prediction intervals using ppf (percent point function)
-    # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
-    # so we need to adjust the bounds to include the point prediction
-    df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
-    df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
-
-    # Add 90% prediction intervals
-    df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
-    df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
-
-    # Add 80% prediction intervals
-    df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
-    df["q_90"] = y_dists.ppf(0.90)  # 90th percentile
+    # Apply categorical mappings if they exist
+    if models.get("category_mappings"):
+        matched_df, _ = convert_categorical_types(
+            matched_df,
+            model_features,
+            models["category_mappings"]
+        )

-    # …
-…
-…
+    # Get features for prediction
+    X = matched_df[model_features]
+
+    # Get XGBoost point predictions
+    df["prediction"] = models["xgb_model"].predict(X)
+
+    # Get predictions from each MAPIE model for conformalized intervals
+    for conf_level in models["confidence_levels"]:
+        model_name = f"mapie_{conf_level:.2f}"
+        model = models["mapie_models"][model_name]
+
+        # Get conformalized predictions
+        y_pred, y_pis = model.predict_interval(X)
+
+        # Map confidence levels to quantile names
+        if conf_level == 0.50:  # 50% CI
+            df["q_25"] = y_pis[:, 0, 0]
+            df["q_75"] = y_pis[:, 1, 0]
+        elif conf_level == 0.80:  # 80% CI
+            df["q_10"] = y_pis[:, 0, 0]
+            df["q_90"] = y_pis[:, 1, 0]
+        elif conf_level == 0.90:  # 90% CI
+            df["q_05"] = y_pis[:, 0, 0]
+            df["q_95"] = y_pis[:, 1, 0]
+        elif conf_level == 0.95:  # 95% CI
+            df["q_025"] = y_pis[:, 0, 0]
+            df["q_975"] = y_pis[:, 1, 0]
+
+    # Add median (q_50) from XGBoost prediction
+    df["q_50"] = df["prediction"]
+
+    # Calculate uncertainty metrics based on 95% interval
+    interval_width = df["q_975"] - df["q_025"]
+    df["prediction_std"] = interval_width / 3.92

     # Reorder the quantile columns for easier reading
     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
     other_cols = [col for col in df.columns if col not in quantile_cols]
     df = df[other_cols + quantile_cols]

-    # …
-…
+    # Uncertainty score
+    df["uncertainty_score"] = interval_width / (np.abs(df["prediction"]) + 1e-6)
+
+    # Confidence bands
+    df["confidence_band"] = pd.cut(
+        df["uncertainty_score"],
+        bins=[0, 0.5, 1.0, 2.0, np.inf],
+        labels=["high", "medium", "low", "very_low"]
+    )

-    # Return the modified DataFrame
     return df