workbench-0.8.162-py3-none-any.whl → workbench-0.8.220-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of workbench might be problematic.
- workbench/algorithms/dataframe/__init__.py +1 -2
- workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
- workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
- workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
- workbench/algorithms/dataframe/projection_2d.py +44 -21
- workbench/algorithms/dataframe/proximity.py +259 -305
- workbench/algorithms/graph/light/proximity_graph.py +14 -12
- workbench/algorithms/models/cleanlab_model.py +382 -0
- workbench/algorithms/models/noise_model.py +388 -0
- workbench/algorithms/sql/outliers.py +3 -3
- workbench/api/__init__.py +5 -1
- workbench/api/compound.py +1 -1
- workbench/api/df_store.py +17 -108
- workbench/api/endpoint.py +18 -5
- workbench/api/feature_set.py +121 -15
- workbench/api/meta.py +5 -2
- workbench/api/meta_model.py +289 -0
- workbench/api/model.py +55 -21
- workbench/api/monitor.py +1 -16
- workbench/api/parameter_store.py +3 -52
- workbench/cached/cached_model.py +4 -4
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +16 -8
- workbench/core/artifacts/data_capture_core.py +355 -0
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +382 -253
- workbench/core/artifacts/feature_set_core.py +249 -45
- workbench/core/artifacts/model_core.py +135 -80
- workbench/core/artifacts/monitor_core.py +33 -248
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
- workbench/core/cloud_platform/aws/aws_meta.py +12 -5
- workbench/core/cloud_platform/aws/aws_session.py +4 -4
- workbench/core/pipelines/pipeline_executor.py +1 -1
- workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
- workbench/core/transforms/features_to_model/features_to_model.py +62 -40
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +76 -15
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
- workbench/core/views/training_view.py +113 -42
- workbench/core/views/view.py +53 -3
- workbench/core/views/view_utils.py +4 -4
- workbench/model_script_utils/model_script_utils.py +339 -0
- workbench/model_script_utils/pytorch_utils.py +405 -0
- workbench/model_script_utils/uq_harness.py +278 -0
- workbench/model_scripts/chemprop/chemprop.template +649 -0
- workbench/model_scripts/chemprop/generated_model_script.py +649 -0
- workbench/model_scripts/chemprop/model_script_utils.py +339 -0
- workbench/model_scripts/chemprop/requirements.txt +3 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
- workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
- workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
- workbench/model_scripts/meta_model/generated_model_script.py +209 -0
- workbench/model_scripts/meta_model/meta_model.template +209 -0
- workbench/model_scripts/pytorch_model/generated_model_script.py +444 -500
- workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
- workbench/model_scripts/pytorch_model/pytorch.template +440 -496
- workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
- workbench/model_scripts/pytorch_model/requirements.txt +1 -1
- workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
- workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +20 -11
- workbench/model_scripts/uq_models/generated_model_script.py +248 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +372 -404
- workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
- workbench/model_scripts/xgb_model/uq_harness.py +278 -0
- workbench/model_scripts/xgb_model/xgb_model.template +369 -401
- workbench/repl/workbench_shell.py +28 -19
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/lambda_test.py +73 -0
- workbench/scripts/meta_model_sim.py +35 -0
- workbench/scripts/ml_pipeline_batch.py +137 -0
- workbench/scripts/ml_pipeline_sqs.py +186 -0
- workbench/scripts/monitor_cloud_watch.py +20 -100
- workbench/scripts/training_test.py +85 -0
- workbench/utils/aws_utils.py +4 -3
- workbench/utils/chem_utils/__init__.py +0 -0
- workbench/utils/chem_utils/fingerprints.py +175 -0
- workbench/utils/chem_utils/misc.py +194 -0
- workbench/utils/chem_utils/mol_descriptors.py +483 -0
- workbench/utils/chem_utils/mol_standardize.py +450 -0
- workbench/utils/chem_utils/mol_tagging.py +348 -0
- workbench/utils/chem_utils/projections.py +219 -0
- workbench/utils/chem_utils/salts.py +256 -0
- workbench/utils/chem_utils/sdf.py +292 -0
- workbench/utils/chem_utils/toxicity.py +250 -0
- workbench/utils/chem_utils/vis.py +253 -0
- workbench/utils/chemprop_utils.py +141 -0
- workbench/utils/cloudwatch_handler.py +1 -1
- workbench/utils/cloudwatch_utils.py +137 -0
- workbench/utils/config_manager.py +3 -7
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/meta_model_simulator.py +499 -0
- workbench/utils/metrics_utils.py +256 -0
- workbench/utils/model_utils.py +278 -79
- workbench/utils/monitor_utils.py +44 -62
- workbench/utils/pandas_utils.py +3 -3
- workbench/utils/pytorch_utils.py +87 -0
- workbench/utils/shap_utils.py +11 -57
- workbench/utils/workbench_logging.py +0 -3
- workbench/utils/workbench_sqs.py +1 -1
- workbench/utils/xgboost_local_crossfold.py +267 -0
- workbench/utils/xgboost_model_utils.py +127 -219
- workbench/web_interface/components/model_plot.py +14 -2
- workbench/web_interface/components/plugin_unit_test.py +5 -2
- workbench/web_interface/components/plugins/dashboard_status.py +3 -1
- workbench/web_interface/components/plugins/generated_compounds.py +1 -1
- workbench/web_interface/components/plugins/model_details.py +38 -74
- workbench/web_interface/components/plugins/scatter_plot.py +6 -10
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/METADATA +31 -9
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/RECORD +128 -96
- workbench-0.8.220.dist-info/entry_points.txt +11 -0
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +1 -1
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
- workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
- workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
- workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -273
- workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/utils/chem_utils.py +0 -1556
- workbench/utils/execution_environment.py +0 -211
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- workbench-0.8.162.dist-info/entry_points.txt +0 -5
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
workbench/core/pipelines/pipeline_executor.py
@@ -123,7 +123,7 @@ class PipelineExecutor:
         if "model" in workbench_objects and (not subset or "endpoint" in subset):
             workbench_objects["model"].to_endpoint(**kwargs)
             endpoint = Endpoint(kwargs["name"])
-            endpoint.auto_inference(
+            endpoint.auto_inference()

         # Found something weird
         else:
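For orientation, a minimal sketch of the flow this hunk exercises, using the public API (the model and endpoint names are hypothetical, and to_endpoint()'s keyword arguments may differ by version):

# Hedged sketch: deploy a model, then run auto inference with no arguments,
# mirroring the updated PipelineExecutor behavior.
from workbench.api import Model, Endpoint

model = Model("abalone-regression")               # hypothetical existing model
model.to_endpoint(name="abalone-regression-end")  # deploy, as the executor does via **kwargs
endpoint = Endpoint("abalone-regression-end")
endpoint.auto_inference()                         # capture arguments are no longer passed here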
workbench/core/transforms/data_to_features/light/molecular_descriptors.py
@@ -1,7 +1,7 @@
 """MolecularDescriptors: Compute a Feature Set based on RDKit Descriptors

-Note: An alternative to using this class is to use the `
-df_features =
+Note: An alternative to using this class is to use the `compute_descriptors` function directly.
+df_features = compute_descriptors(df)
 to_features = PandasToFeatures("my_feature_set")
 to_features.set_input(df_features, id_column="id")
 to_features.set_output_tags(["blah", "whatever"])
@@ -10,7 +10,7 @@ Note: An alternative to using this class is to use the `compute_molecular_descri

 # Local Imports
 from workbench.core.transforms.data_to_features.light.data_to_features_light import DataToFeaturesLight
-from workbench.utils.chem_utils import
+from workbench.utils.chem_utils.mol_descriptors import compute_descriptors


 class MolecularDescriptors(DataToFeaturesLight):
@@ -39,7 +39,7 @@ class MolecularDescriptors(DataToFeaturesLight):
         """Compute a Feature Set based on RDKit Descriptors"""

         # Compute/add all the Molecular Descriptors
-        self.output_df =
+        self.output_df = compute_descriptors(self.input_df)


 if __name__ == "__main__":
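The updated docstring spells out the class-free alternative; here is that flow as a runnable sketch (the toy input rows and the assumption that compute_descriptors() takes a DataFrame with a smiles column are mine):

# Hedged sketch of the docstring's alternative flow: compute RDKit descriptors
# directly, then publish the result as a FeatureSet.
import pandas as pd
from workbench.utils.chem_utils.mol_descriptors import compute_descriptors
from workbench.core.transforms.pandas_transforms.pandas_to_features import PandasToFeatures

df = pd.DataFrame({"id": [1, 2], "smiles": ["CCO", "c1ccccc1"]})  # toy input (assumption)
df_features = compute_descriptors(df)  # assumed to append descriptor columns

to_features = PandasToFeatures("my_feature_set")
to_features.set_input(df_features, id_column="id")
to_features.set_output_tags(["blah", "whatever"])
to_features.transform()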
workbench/core/transforms/features_to_model/features_to_model.py
@@ -1,6 +1,7 @@
 """FeaturesToModel: Train/Create a Model from a Feature Set"""

 from pathlib import Path
+from typing import Union
 from sagemaker.estimator import Estimator
 import awswrangler as wr
 from datetime import datetime, timezone
@@ -9,7 +10,7 @@ import time
 # Local Imports
 from workbench.core.transforms.transform import Transform, TransformInput, TransformOutput
 from workbench.core.artifacts.feature_set_core import FeatureSetCore
-from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelImages
+from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelFramework, ModelImages
 from workbench.core.artifacts.artifact import Artifact
 from workbench.model_scripts.script_generation import generate_model_script, fill_template
 from workbench.utils.model_utils import supported_instance_types
@@ -33,12 +34,13 @@ class FeaturesToModel(Transform):
         feature_name: str,
         model_name: str,
         model_type: ModelType,
+        model_framework=ModelFramework.XGBOOST,
         model_class=None,
         model_import_str=None,
         custom_script=None,
         custom_args=None,
-        training_image="
-        inference_image="
+        training_image="training",
+        inference_image="inference",
         inference_arch="x86_64",
     ):
         """FeaturesToModel Initialization
@@ -46,12 +48,13 @@ class FeaturesToModel(Transform):
             feature_name (str): Name of the FeatureSet to use as input
             model_name (str): Name of the Model to create as output
             model_type (ModelType): ModelType.REGRESSOR or ModelType.CLASSIFIER, etc.
+            model_framework (ModelFramework, optional): The model framework (default ModelFramework.XGBOOST)
             model_class (str, optional): The scikit model (e.g. KNeighborsRegressor) (default None)
             model_import_str (str, optional): The import string for the model (default None)
             custom_script (str, optional): Custom script to use for the model (default None)
             custom_args (dict, optional): Custom arguments to pass to custom model scripts (default None)
-            training_image (str, optional): Training image (default "
-            inference_image (str, optional): Inference image (default "
+            training_image (str, optional): Training image (default "training")
+            inference_image (str, optional): Inference image (default "inference")
             inference_arch (str, optional): Inference architecture (default "x86_64")
         """

@@ -65,6 +68,7 @@ class FeaturesToModel(Transform):
         self.input_type = TransformInput.FEATURE_SET
         self.output_type = TransformOutput.MODEL
         self.model_type = model_type
+        self.model_framework = model_framework
         self.model_class = model_class
         self.model_import_str = model_import_str
         self.custom_script = str(custom_script) if custom_script else None
@@ -80,12 +84,17 @@ class FeaturesToModel(Transform):
         self.inference_arch = inference_arch

     def transform_impl(
-        self,
+        self,
+        target_column: Union[str, list[str]],
+        description: str = None,
+        feature_list: list = None,
+        train_all_data=False,
+        **kwargs,
     ):
         """Generic Features to Model: Note you should create a new class and inherit from
         this one to include specific logic for your Feature Set/Model
         Args:
-            target_column (str): Column name of the target variable
+            target_column (str or list[str]): Column name(s) of the target variable(s)
             description (str): Description of the model (optional)
             feature_list (list[str]): A list of columns for the features (default None, will try to guess)
             train_all_data (bool): Train on ALL (100%) of the data (default False)
@@ -102,9 +111,11 @@ class FeaturesToModel(Transform):
         s3_training_path = feature_set.create_s3_training_data()
         self.log.info(f"Created new training data {s3_training_path}...")

-        # Report the target column
+        # Report the target column(s)
         self.target_column = target_column
-
+        # Normalize target_column to a list for internal use
+        target_list = [target_column] if isinstance(target_column, str) else (target_column or [])
+        self.log.info(f"Target column(s): {self.target_column}")

         # Did they specify a feature list?
         if feature_list:
@@ -131,7 +142,7 @@ class FeaturesToModel(Transform):
             "is_deleted",
             "event_time",
             "training",
-        ] +
+        ] + target_list
         feature_list = [c for c in all_columns if c not in filter_list]

         # AWS Feature Store has 3 user column types (String, Integral, Fractional)
@@ -154,11 +165,14 @@ class FeaturesToModel(Transform):
         self.log.important(f"Feature List for Modeling: {self.model_feature_list}")

         # Set up our parameters for the model script
+        # ChemProp expects target_column as a list; other templates expect a string
+        target_for_template = target_list if self.model_framework == ModelFramework.CHEMPROP else self.target_column
         template_params = {
             "model_imports": self.model_import_str,
             "model_type": self.model_type,
+            "model_framework": self.model_framework,
             "model_class": self.model_class,
-            "target_column":
+            "target_column": target_for_template,
             "feature_list": self.model_feature_list,
             "compressed_features": feature_set.get_compressed_features(),
             "model_metrics_s3_path": self.model_training_root,
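The normalization added above is worth seeing in isolation: a plain string, a list, and None all collapse to a list (the values here are illustrative):

# A string target becomes a single-element list
target_column = "solubility"
target_list = [target_column] if isinstance(target_column, str) else (target_column or [])
assert target_list == ["solubility"]

# A list passes through unchanged; None would become an empty list
target_column = ["logp", "solubility"]
target_list = [target_column] if isinstance(target_column, str) else (target_column or [])
assert target_list == ["logp", "solubility"]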
@@ -184,23 +198,27 @@ class FeaturesToModel(Transform):
         # Generate our model script
         script_path = generate_model_script(template_params)

-        # Metric Definitions for Regression
+        # Metric Definitions for Regression (matches model script output format)
         if self.model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]:
             metric_definitions = [
-                {"Name": "
-                {"Name": "
-                {"Name": "
-                {"Name": "
+                {"Name": "rmse", "Regex": r"rmse: ([0-9.]+)"},
+                {"Name": "mae", "Regex": r"mae: ([0-9.]+)"},
+                {"Name": "medae", "Regex": r"medae: ([0-9.]+)"},
+                {"Name": "r2", "Regex": r"r2: ([0-9.-]+)"},
+                {"Name": "spearmanr", "Regex": r"spearmanr: ([0-9.-]+)"},
+                {"Name": "support", "Regex": r"support: ([0-9]+)"},
             ]

         # Metric Definitions for Classification
         elif self.model_type == ModelType.CLASSIFIER:
             # We need to get creative with the Classification Metrics
+            # Note: Classification only supports single target
+            class_target = target_list[0] if target_list else self.target_column

             # Grab all the target column class values (class labels)
             table = feature_set.data_source.table
-            self.class_labels = feature_set.query(f'select DISTINCT {
-
+            self.class_labels = feature_set.query(f'select DISTINCT {class_target} FROM "{table}"')[
+                class_target
             ].to_list()

             # Sanity check on the targets
@@ -210,7 +228,7 @@ class FeaturesToModel(Transform):
                 raise ValueError(msg)

             # Dynamically create the metric definitions
-            metrics = ["precision", "recall", "
+            metrics = ["precision", "recall", "f1", "support"]
             metric_definitions = []
             for t in self.class_labels:
                 for m in metrics:
@@ -233,13 +251,21 @@ class FeaturesToModel(Transform):
         source_dir = str(Path(script_path).parent)

         # Create a Sagemaker Model with our script
-        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, self.training_image
+        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, self.training_image)
+
+        # Use GPU instance for ChemProp/PyTorch, CPU for others
+        if self.model_framework in [ModelFramework.CHEMPROP, ModelFramework.PYTORCH]:
+            train_instance_type = "ml.g6.xlarge"  # NVIDIA L4 GPU, ~$0.80/hr
+            self.log.important(f"Using GPU instance {train_instance_type} for {self.model_framework.value}")
+        else:
+            train_instance_type = "ml.m5.xlarge"
+
         self.estimator = Estimator(
             entry_point=entry_point,
             source_dir=source_dir,
             role=self.workbench_role_arn,
             instance_count=1,
-            instance_type=
+            instance_type=train_instance_type,
             sagemaker_session=self.sm_session,
             image_uri=image,
             metric_definitions=metric_definitions,
@@ -264,13 +290,20 @@ class FeaturesToModel(Transform):
         self.log.important(f"Creating new model {self.output_name}...")
         self.create_and_register_model(**kwargs)

+        # Make a copy of the training view, to lock-in the training data used for this model
+        model_training_view_name = f"{self.output_name.replace('-', '_')}_training"
+        self.log.important(f"Creating Model Training View: {model_training_view_name}...")
+        feature_set.view("training").copy(f"{model_training_view_name}")
+
     def post_transform(self, **kwargs):
         """Post-Transform: Calling onboard() on the Model"""
         self.log.info("Post-Transform: Calling onboard() on the Model...")
         time.sleep(3)  # Give AWS time to complete Model register

-        # Store the model
-        output_model = ModelCore(self.output_name
+        # Store the model metadata information
+        output_model = ModelCore(self.output_name)
+        output_model._set_model_type(self.model_type)
+        output_model._set_model_framework(self.model_framework)
         output_model.upsert_workbench_meta({"workbench_model_features": self.model_feature_list})
         output_model.upsert_workbench_meta({"workbench_model_target": self.target_column})

@@ -301,7 +334,7 @@ class FeaturesToModel(Transform):

         # Register our model
         image = ModelImages.get_image_uri(
-            self.sm_session.boto_region_name, self.inference_image,
+            self.sm_session.boto_region_name, self.inference_image, architecture=self.inference_arch
         )
         self.log.important(f"Registering model {self.output_name} with Inference Image {image}...")
         model = self.estimator.create_model(role=self.workbench_role_arn)
@@ -325,12 +358,11 @@ if __name__ == "__main__":

     # Regression Model
     input_name = "abalone_features"
-    output_name = "
+    output_name = "abalone-regression"
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.REGRESSOR)
     to_model.set_output_tags(["test"])
     to_model.transform(target_column="class_number_of_rings", description="Test Abalone Regression")

-    """
     # Classification Model
     input_name = "wine_features"
     output_name = "wine-classification"
@@ -340,10 +372,10 @@ if __name__ == "__main__":

     # Quantile Regression Model (Abalone)
     input_name = "abalone_features"
-    output_name = "abalone-
+    output_name = "abalone-regression-uq"
     to_model = FeaturesToModel(input_name, output_name, ModelType.UQ_REGRESSOR)
-    to_model.set_output_tags(["abalone", "
-    to_model.transform(target_column="class_number_of_rings", description="Abalone
+    to_model.set_output_tags(["abalone", "uq"])
+    to_model.transform(target_column="class_number_of_rings", description="Abalone UQ Regression")

     # Scikit-Learn Kmeans Clustering Model
     input_name = "wine_features"
@@ -397,7 +429,7 @@ if __name__ == "__main__":
     scripts_root = Path(__file__).resolve().parents[3] / "model_scripts"
     my_script = scripts_root / "custom_models" / "chem_info" / "molecular_descriptors.py"
     input_name = "aqsol_features"
-    output_name = "smiles-to-taut-md-stereo
+    output_name = "test-smiles-to-taut-md-stereo"
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
     to_model.set_output_tags(["smiles", "molecular descriptors"])
     to_model.transform(target_column=None, feature_list=["smiles"], description="Smiles to Molecular Descriptors")
@@ -410,13 +442,3 @@ if __name__ == "__main__":
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
     to_model.set_output_tags(["smiles", "morgan fingerprints"])
     to_model.transform(target_column=None, feature_list=["smiles"], description="Smiles to Morgan Fingerprints")
-
-    # Tautomerization Model
-    scripts_root = Path(__file__).resolve().parents[3] / "model_scripts"
-    my_script = scripts_root / "custom_models" / "chem_info" / "tautomerize.py"
-    input_name = "aqsol_features"
-    output_name = "tautomerize-v0"
-    to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
-    to_model.set_output_tags(["smiles", "tautomerization"])
-    to_model.transform(target_column=None, feature_list=["smiles"], description="Tautomerize Smiles")
-    """
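Putting the new pieces together, a hedged sketch of training a ChemProp model with the new model_framework argument (the feature set, model name, and target column are hypothetical; the GPU instance selection shown in the hunk above happens automatically):

from workbench.core.transforms.features_to_model.features_to_model import FeaturesToModel
from workbench.core.artifacts.model_core import ModelType, ModelFramework

to_model = FeaturesToModel(
    "aqsol_features",                         # hypothetical input FeatureSet
    "aqsol-chemprop-reg",                     # hypothetical output model name
    model_type=ModelType.REGRESSOR,
    model_framework=ModelFramework.CHEMPROP,  # selects the ml.g6.xlarge GPU training instance
)
to_model.set_output_tags(["aqsol", "chemprop"])
# ChemProp templates receive target_column as a list; a plain string is also accepted
to_model.transform(target_column=["solubility"], description="AqSol ChemProp Regression")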
workbench/core/transforms/model_to_endpoint/model_to_endpoint.py
@@ -1,10 +1,12 @@
 """ModelToEndpoint: Deploy an Endpoint for a Model"""

 import time
+from botocore.exceptions import ClientError
 from sagemaker import ModelPackage
 from sagemaker.serializers import CSVSerializer
 from sagemaker.deserializers import CSVDeserializer
 from sagemaker.serverless import ServerlessInferenceConfig
+from sagemaker.model_monitor import DataCaptureConfig

 # Local Imports
 from workbench.core.transforms.transform import Transform, TransformInput, TransformOutput
@@ -51,27 +53,38 @@ class ModelToEndpoint(Transform):
         EndpointCore.managed_delete(self.output_name)

         # Get the Model Package ARN for our input model
-
-        model_package_arn = input_model.model_package_arn()
+        workbench_model = ModelCore(self.input_name)

         # Deploy the model
-        self._deploy_model(
+        self._deploy_model(workbench_model, **kwargs)

         # Add this endpoint to the set of registered endpoints for the model
-
+        workbench_model.register_endpoint(self.output_name)

         # This ensures that the endpoint is ready for use
         time.sleep(5)  # We wait for AWS Lag
         end = EndpointCore(self.output_name)
         self.log.important(f"Endpoint {end.name} is ready for use")

-    def _deploy_model(
+    def _deploy_model(
+        self,
+        workbench_model: ModelCore,
+        mem_size: int = 2048,
+        max_concurrency: int = 5,
+        data_capture: bool = False,
+        capture_percentage: int = 100,
+    ):
         """Internal Method: Deploy the Model

         Args:
-
+            workbench_model(ModelCore): The Workbench ModelCore object to deploy
+            mem_size(int): Memory size for serverless deployment
+            max_concurrency(int): Max concurrency for serverless deployment
+            data_capture(bool): Enable data capture during deployment
+            capture_percentage(int): Percentage of data to capture. Defaults to 100.
         """
         # Grab the specified Model Package
+        model_package_arn = workbench_model.model_package_arn()
         model_package = ModelPackage(
             role=self.workbench_role_arn,
             model_package_arn=model_package_arn,
@@ -90,22 +103,70 @@ class ModelToEndpoint(Transform):
         # Is this a serverless deployment?
         serverless_config = None
         if self.serverless:
+            # For PyTorch or ChemProp we need at least 4GB of memory
+            from workbench.api import ModelFramework
+
+            self.log.info(f"Model Framework: {workbench_model.model_framework}")
+            if workbench_model.model_framework in [ModelFramework.PYTORCH, ModelFramework.CHEMPROP]:
+                if mem_size < 4096:
+                    self.log.important(
+                        f"{workbench_model.model_framework} needs at least 4GB of memory (setting to 4GB)"
+                    )
+                    mem_size = 4096
             serverless_config = ServerlessInferenceConfig(
                 memory_size_in_mb=mem_size,
                 max_concurrency=max_concurrency,
             )
+            self.log.important(f"Serverless Config: Memory={mem_size}MB, MaxConcurrency={max_concurrency}")
+
+        # Configure data capture if requested (and not serverless)
+        data_capture_config = None
+        if data_capture and not self.serverless:
+            # Set up the S3 path for data capture
+            base_endpoint_path = f"{workbench_model.endpoints_s3_path}/{self.output_name}"
+            data_capture_path = f"{base_endpoint_path}/data_capture"
+            self.log.important(f"Configuring Data Capture --> {data_capture_path}")
+            data_capture_config = DataCaptureConfig(
+                enable_capture=True,
+                sampling_percentage=capture_percentage,
+                destination_s3_uri=data_capture_path,
+            )
+        elif data_capture and self.serverless:
+            self.log.warning(
+                "Data capture is not supported for serverless endpoints. Skipping data capture configuration."
+            )

         # Deploy the Endpoint
         self.log.important(f"Deploying the Endpoint {self.output_name}...")
-
-
-
-
-
-
-
-
-
+        try:
+            model_package.deploy(
+                initial_instance_count=1,
+                instance_type=self.instance_type,
+                serverless_inference_config=serverless_config,
+                endpoint_name=self.output_name,
+                serializer=CSVSerializer(),
+                deserializer=CSVDeserializer(),
+                data_capture_config=data_capture_config,
+                tags=aws_tags,
+            )
+        except ClientError as e:
+            # Check if this is the "endpoint config already exists" error
+            if "Cannot create already existing endpoint configuration" in str(e):
+                self.log.warning("Endpoint config already exists, deleting and retrying...")
+                self.sm_client.delete_endpoint_config(EndpointConfigName=self.output_name)
+                # Retry the deploy
+                model_package.deploy(
+                    initial_instance_count=1,
+                    instance_type=self.instance_type,
+                    serverless_inference_config=serverless_config,
+                    endpoint_name=self.output_name,
+                    serializer=CSVSerializer(),
+                    deserializer=CSVDeserializer(),
+                    data_capture_config=data_capture_config,
+                    tags=aws_tags,
+                )
+            else:
+                raise

     def post_transform(self, **kwargs):
         """Post-Transform: Calling onboard() for the Endpoint"""
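A hedged sketch of a realtime (non-serverless) deployment with the new data-capture options; the keywords flow from transform() through transform_impl() into _deploy_model() per the hunk above (the model and endpoint names are hypothetical, and the serverless constructor flag is inferred from self.serverless):

from workbench.core.transforms.model_to_endpoint.model_to_endpoint import ModelToEndpoint

to_endpoint = ModelToEndpoint("aqsol-chemprop-reg", "aqsol-chemprop-end", serverless=False)
to_endpoint.set_output_tags(["aqsol", "chemprop"])
to_endpoint.transform(data_capture=True, capture_percentage=50)  # capture half the traffic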
workbench/core/transforms/pandas_transforms/pandas_to_features.py
@@ -68,6 +68,15 @@ class PandasToFeatures(Transform):
         self.output_df = input_df.copy()
         self.one_hot_columns = one_hot_columns or []

+        # Warn about known AWS Iceberg bug with event_time_column
+        if event_time_column is not None:
+            self.log.warning(
+                f"event_time_column='{event_time_column}' specified. Note: AWS has a known bug with "
+                "Iceberg FeatureGroups where varying event times across multiple days can cause "
+                "duplicate rows in the offline store. Setting event_time_column=None."
+            )
+            self.event_time_column = None
+
         # Now Prepare the DataFrame for its journey into an AWS FeatureGroup
         self.prep_dataframe()

@@ -327,9 +336,36 @@ class PandasToFeatures(Transform):
         self.delete_existing()
         self.output_feature_group = self.create_feature_group()

+    def mac_spawn_hack(self):
+        """Workaround for macOS Tahoe fork/spawn issue with SageMaker FeatureStore ingest.
+
+        See: https://github.com/aws/sagemaker-python-sdk/issues/5312
+        macOS Tahoe 26+ has issues with forked processes creating boto3 sessions.
+        This forces spawn mode on macOS to avoid the hang.
+        """
+        import platform
+
+        if platform.system() == "Darwin":  # macOS
+            self.log.warning("macOS detected, forcing 'spawn' mode for multiprocessing (Tahoe hang workaround)")
+            import multiprocessing
+
+            try:
+                import multiprocess
+
+                multiprocess.set_start_method("spawn", force=True)
+            except (RuntimeError, ImportError):
+                pass  # Already set or multiprocess not available
+            try:
+                multiprocessing.set_start_method("spawn", force=True)
+            except RuntimeError:
+                pass  # Already set
+
     def transform_impl(self):
         """Transform Implementation: Ingest the data into the Feature Group"""

+        # Workaround for macOS Tahoe hang issue
+        self.mac_spawn_hack()
+
         # Now we actually push the data into the Feature Group (called ingestion)
         self.log.important(f"Ingesting rows into Feature Group {self.output_name}...")
         ingest_manager = self.output_feature_group.ingest(self.output_df, max_workers=8, max_processes=4, wait=False)
@@ -373,7 +409,7 @@ class PandasToFeatures(Transform):

         # Set Hold Out Ids (if we got them during creation)
         if self.incoming_hold_out_ids:
-            self.output_feature_set.set_training_holdouts(self.
+            self.output_feature_set.set_training_holdouts(self.incoming_hold_out_ids)

     def ensure_feature_group_created(self, feature_group):
         status = feature_group.describe().get("FeatureGroupStatus")
@@ -435,7 +471,7 @@ if __name__ == "__main__":

     # Create my DF to Feature Set Transform (with one-hot encoding)
     df_to_features = PandasToFeatures("test_features")
-    df_to_features.set_input(data_df, id_column="id", one_hot_columns=["food"])
+    df_to_features.set_input(data_df, id_column="id", event_time_column="date", one_hot_columns=["food"])
     df_to_features.set_output_tags(["test", "small"])
     df_to_features.transform()
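For reference, the macOS workaround in mac_spawn_hack() reduces to forcing the multiprocessing start method before ingestion; a standalone sketch (the optional multiprocess package is only patched when installed):

import platform
import multiprocessing

if platform.system() == "Darwin":  # macOS only
    try:
        import multiprocess  # optional package used on the ingest path

        multiprocess.set_start_method("spawn", force=True)
    except (ImportError, RuntimeError):
        pass  # not installed, or start method already set
    try:
        multiprocessing.set_start_method("spawn", force=True)
    except RuntimeError:
        pass  # start method already set for this interpreter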