workbench-0.8.198-py3-none-any.whl → workbench-0.8.203-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/proximity.py +11 -4
- workbench/api/__init__.py +2 -1
- workbench/api/df_store.py +17 -108
- workbench/api/feature_set.py +48 -11
- workbench/api/model.py +1 -1
- workbench/api/parameter_store.py +3 -52
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +5 -5
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +261 -78
- workbench/core/artifacts/feature_set_core.py +69 -1
- workbench/core/artifacts/model_core.py +48 -14
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/transforms/features_to_model/features_to_model.py +50 -33
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
- workbench/core/views/view.py +2 -2
- workbench/model_scripts/chemprop/chemprop.template +933 -0
- workbench/model_scripts/chemprop/generated_model_script.py +933 -0
- workbench/model_scripts/chemprop/requirements.txt +11 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
- workbench/model_scripts/custom_models/proximity/proximity.py +11 -4
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +11 -5
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +11 -5
- workbench/model_scripts/custom_models/uq_models/ngboost.template +11 -5
- workbench/model_scripts/custom_models/uq_models/proximity.py +11 -4
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +11 -5
- workbench/model_scripts/pytorch_model/generated_model_script.py +365 -173
- workbench/model_scripts/pytorch_model/pytorch.template +362 -170
- workbench/model_scripts/scikit_learn/generated_model_script.py +302 -0
- workbench/model_scripts/script_generation.py +10 -7
- workbench/model_scripts/uq_models/generated_model_script.py +43 -27
- workbench/model_scripts/uq_models/mapie.template +40 -24
- workbench/model_scripts/xgb_model/generated_model_script.py +36 -7
- workbench/model_scripts/xgb_model/xgb_model.template +36 -7
- workbench/repl/workbench_shell.py +14 -5
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/{lambda_launcher.py → lambda_test.py} +10 -0
- workbench/utils/chemprop_utils.py +761 -0
- workbench/utils/pytorch_utils.py +527 -0
- workbench/utils/xgboost_model_utils.py +10 -5
- workbench/web_interface/components/model_plot.py +7 -1
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/METADATA +3 -3
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/RECORD +49 -43
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/entry_points.txt +2 -1
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
- workbench/model_scripts/__pycache__/script_generation.cpython-312.pyc +0 -0
- workbench/model_scripts/__pycache__/script_generation.cpython-313.pyc +0 -0
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/WHEEL +0 -0
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/top_level.txt +0 -0
workbench/core/artifacts/model_core.py
CHANGED

@@ -30,11 +30,23 @@ class ModelType(Enum):
     CLASSIFIER = "classifier"
     REGRESSOR = "regressor"
     CLUSTERER = "clusterer"
-    TRANSFORMER = "transformer"
     PROXIMITY = "proximity"
     PROJECTION = "projection"
     UQ_REGRESSOR = "uq_regressor"
     ENSEMBLE_REGRESSOR = "ensemble_regressor"
+    TRANSFORMER = "transformer"
+    UNKNOWN = "unknown"
+
+
+class ModelFramework(Enum):
+    """Enumerated Types for Workbench Model Frameworks"""
+
+    SKLEARN = "sklearn"
+    XGBOOST = "xgboost"
+    LIGHTGBM = "lightgbm"
+    PYTORCH_TABULAR = "pytorch_tabular"
+    CHEMPROP = "chemprop"
+    TRANSFORMER = "transformer"
     UNKNOWN = "unknown"


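The new ModelFramework enum round-trips through its string values, which is what the metadata getter added later in this diff relies on; a minimal sketch in plain Python, assuming only the enum above:

    fw = ModelFramework.CHEMPROP
    assert fw.value == "chemprop"
    assert ModelFramework("chemprop") is fw

    # Any unrecognized value raises ValueError, which callers map to UNKNOWN
    try:
        ModelFramework("mystery")
    except ValueError:
        fw = ModelFramework.UNKNOWN
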
@@ -87,11 +99,10 @@ class ModelCore(Artifact):
     ```
     """

-    def __init__(self, model_name: str, model_type: ModelType = None, **kwargs):
+    def __init__(self, model_name: str, **kwargs):
         """ModelCore Initialization
         Args:
             model_name (str): Name of Model in Workbench.
-            model_type (ModelType, optional): Set this for newly created Models. Defaults to None.
             **kwargs: Additional keyword arguments
         """

@@ -125,10 +136,8 @@ class ModelCore(Artifact):
             self.latest_model = self.model_meta["ModelPackageList"][0]
             self.description = self.latest_model.get("ModelPackageDescription", "-")
             self.training_job_name = self._extract_training_job_name()
-
-
-            else:
-                self.model_type = self._get_model_type()
+            self.model_type = self._get_model_type()
+            self.model_framework = self._get_model_framework()
         except (IndexError, KeyError):
             self.log.critical(f"Model {self.model_name} appears to be malformed. Delete and recreate it!")
             return
@@ -254,21 +263,25 @@ class ModelCore(Artifact):
         else:
             self.log.important(f"No inference data found for {self.model_name}!")

-    def get_inference_metrics(self, capture_name: str = "
+    def get_inference_metrics(self, capture_name: str = "any") -> Union[pd.DataFrame, None]:
         """Retrieve the inference performance metrics for this model

         Args:
-            capture_name (str, optional): Specific capture_name
+            capture_name (str, optional): Specific capture_name (default: "any")
         Returns:
             pd.DataFrame: DataFrame of the Model Metrics

         Note:
-            If a capture_name isn't specified this will try to
+            If a capture_name isn't specified this will try to the 'first' available metrics
         """
         # Try to get the auto_capture 'training_holdout' or the training
-        if capture_name == "
-
-
+        if capture_name == "any":
+            metric_list = self.list_inference_runs()
+            if metric_list:
+                return self.get_inference_metrics(metric_list[0])
+            else:
+                self.log.warning(f"No performance metrics found for {self.model_name}!")
+                return None

         # Grab the metrics captured during model training (could return None)
         if capture_name == "model_training":
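With the new "any" default, the method resolves to the first available inference run through a single recursive call; a usage sketch, assuming an existing model (the model name here is illustrative):

    model = ModelCore("abalone-regression")
    runs = model.list_inference_runs()       # e.g. ["auto_inference", "model_training"]
    metrics = model.get_inference_metrics()  # equivalent to get_inference_metrics(runs[0]) when runs exist
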
@@ -860,7 +873,7 @@ class ModelCore(Artifact):
             return self.df_store.get(f"/workbench/models/{self.name}/shap_data")
         else:
             # Loop over the SHAP data and return a dict of DataFrames
-            shap_dfs = self.df_store.
+            shap_dfs = self.df_store.list(f"/workbench/models/{self.name}/shap_data")
             shap_data = {}
             for df_location in shap_dfs:
                 key = df_location.split("/")[-1]
@@ -972,6 +985,27 @@ class ModelCore(Artifact):
             self.log.warning(f"Could not determine model type for {self.model_name}!")
             return ModelType.UNKNOWN

+    def _set_model_framework(self, model_framework: ModelFramework):
+        """Internal: Set the Model Framework for this Model"""
+        self.model_framework = model_framework
+        self.upsert_workbench_meta({"workbench_model_framework": self.model_framework.value})
+        self.remove_health_tag("model_framework_unknown")
+
+    def _get_model_framework(self) -> ModelFramework:
+        """Internal: Query the Workbench Metadata to get the model framework
+        Returns:
+            ModelFramework: The ModelFramework of this Model
+        Notes:
+            This is an internal method that should not be called directly
+            Use the model_framework attribute instead
+        """
+        model_framework = self.workbench_meta().get("workbench_model_framework")
+        try:
+            return ModelFramework(model_framework)
+        except ValueError:
+            self.log.warning(f"Could not determine model framework for {self.model_name}!")
+            return ModelFramework.UNKNOWN
+
     def _load_training_metrics(self):
         """Internal: Retrieve the training metrics and Confusion Matrix for this model
         and load the data into the Workbench Metadata
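A standalone check of the getter's fallback: ModelFramework(None) and any unknown string both raise ValueError, so missing or stale metadata degrades to UNKNOWN:

    for value in ["xgboost", "chemprop", None, "mystery"]:
        try:
            fw = ModelFramework(value)
        except ValueError:
            fw = ModelFramework.UNKNOWN
        print(value, "->", fw)
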
workbench/core/artifacts/parameter_store_core.py
ADDED

@@ -0,0 +1,98 @@
+"""ParameterStoreCore: Manages Workbench parameters in a Cloud Based Parameter Store."""
+
+import logging
+
+# Workbench Imports
+from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
+
+# Workbench Bridges Import
+from workbench_bridges.api import ParameterStore as BridgesParameterStore
+
+
+class ParameterStoreCore(BridgesParameterStore):
+    """ParameterStoreCore: Manages Workbench parameters in a Cloud Based Parameter Store.
+
+    Common Usage:
+        ```python
+        params = ParameterStoreCore()
+
+        # List Parameters
+        params.list()
+
+        ['/workbench/abalone_info',
+         '/workbench/my_data',
+         '/workbench/test',
+         '/workbench/pipelines/my_pipeline']
+
+        # Add Key
+        params.upsert("key", "value")
+        value = params.get("key")
+
+        # Add any data (lists, dictionaries, etc..)
+        my_data = {"key": "value", "number": 4.2, "list": [1,2,3]}
+        params.upsert("my_data", my_data)
+
+        # Retrieve data
+        return_value = params.get("my_data")
+        pprint(return_value)
+
+        {'key': 'value', 'list': [1, 2, 3], 'number': 4.2}
+
+        # Delete parameters
+        param_store.delete("my_data")
+        ```
+    """
+
+    def __init__(self):
+        """ParameterStoreCore Init Method"""
+        session = AWSAccountClamp().boto3_session
+
+        # Initialize parent with workbench config
+        super().__init__(boto3_session=session)
+        self.log = logging.getLogger("workbench")
+
+
+if __name__ == "__main__":
+    """Exercise the ParameterStoreCore Class"""
+
+    # Create a ParameterStoreCore manager
+    param_store = ParameterStoreCore()
+
+    # List the parameters
+    print("Listing Parameters...")
+    print(param_store.list())
+
+    # Add a new parameter
+    param_store.upsert("/workbench/test", "value")
+
+    # Get the parameter
+    print(f"Getting parameter 'test': {param_store.get('/workbench/test')}")
+
+    # Add a dictionary as a parameter
+    sample_dict = {"key": "str_value", "awesome_value": 4.2}
+    param_store.upsert("/workbench/my_data", sample_dict)
+
+    # Retrieve the parameter as a dictionary
+    retrieved_value = param_store.get("/workbench/my_data")
+    print("Retrieved value:", retrieved_value)
+
+    # List the parameters
+    print("Listing Parameters...")
+    print(param_store.list())
+
+    # List the parameters with a prefix
+    print("Listing Parameters with prefix '/workbench':")
+    print(param_store.list("/workbench"))
+
+    # Delete the parameters
+    param_store.delete("/workbench/test")
+    param_store.delete("/workbench/my_data")
+
+    # Out of scope tests
+    param_store.upsert("test", "value")
+    param_store.delete("test")
+
+    # Recursive delete test
+    param_store.upsert("/workbench/test/test1", "value1")
+    param_store.upsert("/workbench/test/test2", "value2")
+    param_store.delete_recursive("workbench/test/")
workbench/core/transforms/features_to_model/features_to_model.py
CHANGED

@@ -1,6 +1,7 @@
 """FeaturesToModel: Train/Create a Model from a Feature Set"""

 from pathlib import Path
+from typing import Union
 from sagemaker.estimator import Estimator
 import awswrangler as wr
 from datetime import datetime, timezone
@@ -9,7 +10,7 @@ import time
 # Local Imports
 from workbench.core.transforms.transform import Transform, TransformInput, TransformOutput
 from workbench.core.artifacts.feature_set_core import FeatureSetCore
-from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelImages
+from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelFramework, ModelImages
 from workbench.core.artifacts.artifact import Artifact
 from workbench.model_scripts.script_generation import generate_model_script, fill_template
 from workbench.utils.model_utils import supported_instance_types
@@ -33,6 +34,7 @@ class FeaturesToModel(Transform):
         feature_name: str,
         model_name: str,
         model_type: ModelType,
+        model_framework=ModelFramework.XGBOOST,
         model_class=None,
         model_import_str=None,
         custom_script=None,
@@ -46,6 +48,7 @@ class FeaturesToModel(Transform):
             feature_name (str): Name of the FeatureSet to use as input
             model_name (str): Name of the Model to create as output
             model_type (ModelType): ModelType.REGRESSOR or ModelType.CLASSIFIER, etc.
+            model_framework (ModelFramework, optional): The model framework (default ModelFramework.XGBOOST)
             model_class (str, optional): The scikit model (e.g. KNeighborsRegressor) (default None)
             model_import_str (str, optional): The import string for the model (default None)
             custom_script (str, optional): Custom script to use for the model (default None)
@@ -65,6 +68,7 @@ class FeaturesToModel(Transform):
         self.input_type = TransformInput.FEATURE_SET
         self.output_type = TransformOutput.MODEL
         self.model_type = model_type
+        self.model_framework = model_framework
         self.model_class = model_class
         self.model_import_str = model_import_str
         self.custom_script = str(custom_script) if custom_script else None
@@ -80,12 +84,17 @@ class FeaturesToModel(Transform):
         self.inference_arch = inference_arch

     def transform_impl(
-        self, target_column: str, description: str = None, feature_list: list = None, train_all_data=False
+        self,
+        target_column: Union[str, list[str]],
+        description: str = None,
+        feature_list: list = None,
+        train_all_data=False,
+        **kwargs,
     ):
         """Generic Features to Model: Note you should create a new class and inherit from
         this one to include specific logic for your Feature Set/Model
         Args:
-            target_column (str): Column name of the target variable
+            target_column (str or list[str]): Column name(s) of the target variable(s)
             description (str): Description of the model (optional)
             feature_list (list[str]): A list of columns for the features (default None, will try to guess)
             train_all_data (bool): Train on ALL (100%) of the data (default False)
@@ -102,9 +111,11 @@ class FeaturesToModel(Transform):
         s3_training_path = feature_set.create_s3_training_data()
         self.log.info(f"Created new training data {s3_training_path}...")

-        # Report the target column
+        # Report the target column(s)
         self.target_column = target_column
-
+        # Normalize target_column to a list for internal use
+        target_list = [target_column] if isinstance(target_column, str) else (target_column or [])
+        self.log.info(f"Target column(s): {self.target_column}")

         # Did they specify a feature list?
         if feature_list:
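The normalization line above accepts a string, a list, or None (the column names here are illustrative):

    for tc in ["solubility", ["logS", "logP"], None]:
        target_list = [tc] if isinstance(tc, str) else (tc or [])
        print(tc, "->", target_list)
    # "solubility" -> ["solubility"]; ["logS", "logP"] -> ["logS", "logP"]; None -> []
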
@@ -131,7 +142,7 @@ class FeaturesToModel(Transform):
             "is_deleted",
             "event_time",
             "training",
-        ] +
+        ] + target_list
         feature_list = [c for c in all_columns if c not in filter_list]

         # AWS Feature Store has 3 user column types (String, Integral, Fractional)
@@ -154,11 +165,14 @@ class FeaturesToModel(Transform):
         self.log.important(f"Feature List for Modeling: {self.model_feature_list}")

         # Set up our parameters for the model script
+        # ChemProp expects target_column as a list; other templates expect a string
+        target_for_template = target_list if self.model_framework == ModelFramework.CHEMPROP else self.target_column
         template_params = {
             "model_imports": self.model_import_str,
             "model_type": self.model_type,
+            "model_framework": self.model_framework,
             "model_class": self.model_class,
-            "target_column": self.target_column,
+            "target_column": target_for_template,
             "feature_list": self.model_feature_list,
             "compressed_features": feature_set.get_compressed_features(),
             "model_metrics_s3_path": self.model_training_root,
@@ -184,23 +198,27 @@ class FeaturesToModel(Transform):
         # Generate our model script
         script_path = generate_model_script(template_params)

-        # Metric Definitions for Regression
+        # Metric Definitions for Regression (matches model script output format)
         if self.model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]:
             metric_definitions = [
-                {"Name": "
-                {"Name": "
-                {"Name": "
-                {"Name": "
+                {"Name": "rmse", "Regex": r"rmse: ([0-9.]+)"},
+                {"Name": "mae", "Regex": r"mae: ([0-9.]+)"},
+                {"Name": "medae", "Regex": r"medae: ([0-9.]+)"},
+                {"Name": "r2", "Regex": r"r2: ([0-9.-]+)"},
+                {"Name": "spearmanr", "Regex": r"spearmanr: ([0-9.-]+)"},
+                {"Name": "support", "Regex": r"support: ([0-9]+)"},
             ]

         # Metric Definitions for Classification
         elif self.model_type == ModelType.CLASSIFIER:
             # We need to get creative with the Classification Metrics
+            # Note: Classification only supports single target
+            class_target = target_list[0] if target_list else self.target_column

             # Grab all the target column class values (class labels)
             table = feature_set.data_source.table
-            self.class_labels = feature_set.query(f'select DISTINCT {self.target_column} FROM "{table}"')[
-                self.target_column
+            self.class_labels = feature_set.query(f'select DISTINCT {class_target} FROM "{table}"')[
+                class_target
             ].to_list()

             # Sanity check on the targets
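SageMaker applies these Regex patterns to the training job's log stream to surface metrics; a quick local sanity check (the sample log line is illustrative of the "name: value" format the script emits):

    import re

    line = "rmse: 2.134 mae: 1.654 r2: -0.412"
    for name, pattern in [("rmse", r"rmse: ([0-9.]+)"), ("mae", r"mae: ([0-9.]+)"), ("r2", r"r2: ([0-9.-]+)")]:
        match = re.search(pattern, line)
        print(name, "->", match.group(1) if match else None)
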
@@ -234,12 +252,20 @@ class FeaturesToModel(Transform):

         # Create a Sagemaker Model with our script
         image = ModelImages.get_image_uri(self.sm_session.boto_region_name, self.training_image)
+
+        # Use GPU instance for ChemProp/PyTorch, CPU for others
+        if self.model_framework in [ModelFramework.CHEMPROP, ModelFramework.PYTORCH_TABULAR]:
+            train_instance_type = "ml.g6.xlarge"  # NVIDIA L4 GPU, ~$0.80/hr
+            self.log.important(f"Using GPU instance {train_instance_type} for {self.model_framework.value}")
+        else:
+            train_instance_type = "ml.m5.xlarge"
+
         self.estimator = Estimator(
             entry_point=entry_point,
             source_dir=source_dir,
             role=self.workbench_role_arn,
             instance_count=1,
-            instance_type=
+            instance_type=train_instance_type,
             sagemaker_session=self.sm_session,
             image_uri=image,
             metric_definitions=metric_definitions,
@@ -274,8 +300,10 @@ class FeaturesToModel(Transform):
         self.log.info("Post-Transform: Calling onboard() on the Model...")
         time.sleep(3)  # Give AWS time to complete Model register

-        # Store the model
-        output_model = ModelCore(self.output_name, model_type=self.model_type)
+        # Store the model metadata information
+        output_model = ModelCore(self.output_name)
+        output_model._set_model_type(self.model_type)
+        output_model._set_model_framework(self.model_framework)
         output_model.upsert_workbench_meta({"workbench_model_features": self.model_feature_list})
         output_model.upsert_workbench_meta({"workbench_model_target": self.target_column})

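Putting the new plumbing together, a hypothetical end-to-end call (the FeatureSet, model, and target names are illustrative):

    to_model = FeaturesToModel(
        "aqsol_features",
        "aqsol-chemprop-regression",
        model_type=ModelType.REGRESSOR,
        model_framework=ModelFramework.CHEMPROP,
    )
    to_model.set_output_tags(["smiles", "chemprop"])
    # ChemProp receives its target(s) as a list (see target_for_template above)
    to_model.transform(target_column=["solubility"], description="AqSol ChemProp Regression")
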
@@ -330,12 +358,11 @@ if __name__ == "__main__":

     # Regression Model
     input_name = "abalone_features"
-    output_name = "
+    output_name = "abalone-regression"
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.REGRESSOR)
     to_model.set_output_tags(["test"])
     to_model.transform(target_column="class_number_of_rings", description="Test Abalone Regression")

-    """
     # Classification Model
     input_name = "wine_features"
     output_name = "wine-classification"
@@ -345,10 +372,10 @@ if __name__ == "__main__":

     # Quantile Regression Model (Abalone)
     input_name = "abalone_features"
-    output_name = "abalone-
+    output_name = "abalone-regression-uq"
     to_model = FeaturesToModel(input_name, output_name, ModelType.UQ_REGRESSOR)
-    to_model.set_output_tags(["abalone", "
-    to_model.transform(target_column="class_number_of_rings", description="Abalone
+    to_model.set_output_tags(["abalone", "uq"])
+    to_model.transform(target_column="class_number_of_rings", description="Abalone UQ Regression")

     # Scikit-Learn Kmeans Clustering Model
     input_name = "wine_features"
@@ -402,7 +429,7 @@ if __name__ == "__main__":
     scripts_root = Path(__file__).resolve().parents[3] / "model_scripts"
     my_script = scripts_root / "custom_models" / "chem_info" / "molecular_descriptors.py"
     input_name = "aqsol_features"
-    output_name = "smiles-to-taut-md-stereo
+    output_name = "test-smiles-to-taut-md-stereo"
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
     to_model.set_output_tags(["smiles", "molecular descriptors"])
     to_model.transform(target_column=None, feature_list=["smiles"], description="Smiles to Molecular Descriptors")
@@ -415,13 +442,3 @@ if __name__ == "__main__":
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
     to_model.set_output_tags(["smiles", "morgan fingerprints"])
     to_model.transform(target_column=None, feature_list=["smiles"], description="Smiles to Morgan Fingerprints")
-
-    # Tautomerization Model
-    scripts_root = Path(__file__).resolve().parents[3] / "model_scripts"
-    my_script = scripts_root / "custom_models" / "chem_info" / "tautomerize.py"
-    input_name = "aqsol_features"
-    output_name = "tautomerize-v0"
-    to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
-    to_model.set_output_tags(["smiles", "tautomerization"])
-    to_model.transform(target_column=None, feature_list=["smiles"], description="Tautomerize Smiles")
-    """
workbench/core/transforms/pandas_transforms/pandas_to_features.py
CHANGED

@@ -68,6 +68,15 @@ class PandasToFeatures(Transform):
         self.output_df = input_df.copy()
         self.one_hot_columns = one_hot_columns or []

+        # Warn about known AWS Iceberg bug with event_time_column
+        if event_time_column is not None:
+            self.log.warning(
+                f"event_time_column='{event_time_column}' specified. Note: AWS has a known bug with "
+                "Iceberg FeatureGroups where varying event times across multiple days can cause "
+                "duplicate rows in the offline store. Setting event_time_column=None."
+            )
+            self.event_time_column = None
+
         # Now Prepare the DataFrame for its journey into an AWS FeatureGroup
         self.prep_dataframe()

@@ -400,7 +409,7 @@ class PandasToFeatures(Transform):

         # Set Hold Out Ids (if we got them during creation)
         if self.incoming_hold_out_ids:
-            self.output_feature_set.set_training_holdouts(self.
+            self.output_feature_set.set_training_holdouts(self.incoming_hold_out_ids)

     def ensure_feature_group_created(self, feature_group):
         status = feature_group.describe().get("FeatureGroupStatus")
@@ -462,7 +471,7 @@ if __name__ == "__main__":

     # Create my DF to Feature Set Transform (with one-hot encoding)
     df_to_features = PandasToFeatures("test_features")
-    df_to_features.set_input(data_df, id_column="id", one_hot_columns=["food"])
+    df_to_features.set_input(data_df, id_column="id", event_time_column="date", one_hot_columns=["food"])
     df_to_features.set_output_tags(["test", "small"])
     df_to_features.transform()

workbench/core/views/view.py
CHANGED
@@ -91,11 +91,11 @@ class View:
             self.table, self.data_source.database, self.data_source.boto3_session
         )

-    def pull_dataframe(self, limit: int =
+    def pull_dataframe(self, limit: int = 100000) -> Union[pd.DataFrame, None]:
         """Pull a DataFrame based on the view type

         Args:
-            limit (int): The maximum number of rows to pull (default:
+            limit (int): The maximum number of rows to pull (default: 100000)

         Returns:
             Union[pd.DataFrame, None]: The DataFrame for the view or None if it doesn't exist