workbench 0.8.162__py3-none-any.whl → 0.8.220__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (147)
  1. workbench/algorithms/dataframe/__init__.py +1 -2
  2. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  3. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  4. workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
  5. workbench/algorithms/dataframe/projection_2d.py +44 -21
  6. workbench/algorithms/dataframe/proximity.py +259 -305
  7. workbench/algorithms/graph/light/proximity_graph.py +14 -12
  8. workbench/algorithms/models/cleanlab_model.py +382 -0
  9. workbench/algorithms/models/noise_model.py +388 -0
  10. workbench/algorithms/sql/outliers.py +3 -3
  11. workbench/api/__init__.py +5 -1
  12. workbench/api/compound.py +1 -1
  13. workbench/api/df_store.py +17 -108
  14. workbench/api/endpoint.py +18 -5
  15. workbench/api/feature_set.py +121 -15
  16. workbench/api/meta.py +5 -2
  17. workbench/api/meta_model.py +289 -0
  18. workbench/api/model.py +55 -21
  19. workbench/api/monitor.py +1 -16
  20. workbench/api/parameter_store.py +3 -52
  21. workbench/cached/cached_model.py +4 -4
  22. workbench/core/artifacts/__init__.py +11 -2
  23. workbench/core/artifacts/artifact.py +16 -8
  24. workbench/core/artifacts/data_capture_core.py +355 -0
  25. workbench/core/artifacts/df_store_core.py +114 -0
  26. workbench/core/artifacts/endpoint_core.py +382 -253
  27. workbench/core/artifacts/feature_set_core.py +249 -45
  28. workbench/core/artifacts/model_core.py +135 -80
  29. workbench/core/artifacts/monitor_core.py +33 -248
  30. workbench/core/artifacts/parameter_store_core.py +98 -0
  31. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  32. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  33. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  34. workbench/core/pipelines/pipeline_executor.py +1 -1
  35. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  36. workbench/core/transforms/features_to_model/features_to_model.py +62 -40
  37. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +76 -15
  38. workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
  39. workbench/core/views/training_view.py +113 -42
  40. workbench/core/views/view.py +53 -3
  41. workbench/core/views/view_utils.py +4 -4
  42. workbench/model_script_utils/model_script_utils.py +339 -0
  43. workbench/model_script_utils/pytorch_utils.py +405 -0
  44. workbench/model_script_utils/uq_harness.py +278 -0
  45. workbench/model_scripts/chemprop/chemprop.template +649 -0
  46. workbench/model_scripts/chemprop/generated_model_script.py +649 -0
  47. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  48. workbench/model_scripts/chemprop/requirements.txt +3 -0
  49. workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
  50. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  51. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  52. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  53. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  54. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  55. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
  56. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  57. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  58. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  59. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  60. workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
  61. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  62. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  63. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  64. workbench/model_scripts/meta_model/meta_model.template +209 -0
  65. workbench/model_scripts/pytorch_model/generated_model_script.py +444 -500
  66. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  67. workbench/model_scripts/pytorch_model/pytorch.template +440 -496
  68. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  69. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  70. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  71. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  72. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  73. workbench/model_scripts/script_generation.py +20 -11
  74. workbench/model_scripts/uq_models/generated_model_script.py +248 -0
  75. workbench/model_scripts/xgb_model/generated_model_script.py +372 -404
  76. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  77. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  78. workbench/model_scripts/xgb_model/xgb_model.template +369 -401
  79. workbench/repl/workbench_shell.py +28 -19
  80. workbench/resources/open_source_api.key +1 -1
  81. workbench/scripts/endpoint_test.py +162 -0
  82. workbench/scripts/lambda_test.py +73 -0
  83. workbench/scripts/meta_model_sim.py +35 -0
  84. workbench/scripts/ml_pipeline_batch.py +137 -0
  85. workbench/scripts/ml_pipeline_sqs.py +186 -0
  86. workbench/scripts/monitor_cloud_watch.py +20 -100
  87. workbench/scripts/training_test.py +85 -0
  88. workbench/utils/aws_utils.py +4 -3
  89. workbench/utils/chem_utils/__init__.py +0 -0
  90. workbench/utils/chem_utils/fingerprints.py +175 -0
  91. workbench/utils/chem_utils/misc.py +194 -0
  92. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  93. workbench/utils/chem_utils/mol_standardize.py +450 -0
  94. workbench/utils/chem_utils/mol_tagging.py +348 -0
  95. workbench/utils/chem_utils/projections.py +219 -0
  96. workbench/utils/chem_utils/salts.py +256 -0
  97. workbench/utils/chem_utils/sdf.py +292 -0
  98. workbench/utils/chem_utils/toxicity.py +250 -0
  99. workbench/utils/chem_utils/vis.py +253 -0
  100. workbench/utils/chemprop_utils.py +141 -0
  101. workbench/utils/cloudwatch_handler.py +1 -1
  102. workbench/utils/cloudwatch_utils.py +137 -0
  103. workbench/utils/config_manager.py +3 -7
  104. workbench/utils/endpoint_utils.py +5 -7
  105. workbench/utils/license_manager.py +2 -6
  106. workbench/utils/meta_model_simulator.py +499 -0
  107. workbench/utils/metrics_utils.py +256 -0
  108. workbench/utils/model_utils.py +278 -79
  109. workbench/utils/monitor_utils.py +44 -62
  110. workbench/utils/pandas_utils.py +3 -3
  111. workbench/utils/pytorch_utils.py +87 -0
  112. workbench/utils/shap_utils.py +11 -57
  113. workbench/utils/workbench_logging.py +0 -3
  114. workbench/utils/workbench_sqs.py +1 -1
  115. workbench/utils/xgboost_local_crossfold.py +267 -0
  116. workbench/utils/xgboost_model_utils.py +127 -219
  117. workbench/web_interface/components/model_plot.py +14 -2
  118. workbench/web_interface/components/plugin_unit_test.py +5 -2
  119. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  120. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  121. workbench/web_interface/components/plugins/model_details.py +38 -74
  122. workbench/web_interface/components/plugins/scatter_plot.py +6 -10
  123. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/METADATA +31 -9
  124. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/RECORD +128 -96
  125. workbench-0.8.220.dist-info/entry_points.txt +11 -0
  126. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +1 -1
  127. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  128. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
  129. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  130. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  131. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  132. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  133. workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
  134. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  135. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  136. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -273
  137. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
  138. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  139. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  140. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  141. workbench/utils/chem_utils.py +0 -1556
  142. workbench/utils/execution_environment.py +0 -211
  143. workbench/utils/fast_inference.py +0 -167
  144. workbench/utils/resource_utils.py +0 -39
  145. workbench-0.8.162.dist-info/entry_points.txt +0 -5
  146. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
  147. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
workbench/core/pipelines/pipeline_executor.py
@@ -123,7 +123,7 @@ class PipelineExecutor:
         if "model" in workbench_objects and (not subset or "endpoint" in subset):
             workbench_objects["model"].to_endpoint(**kwargs)
             endpoint = Endpoint(kwargs["name"])
-            endpoint.auto_inference(capture=True)
+            endpoint.auto_inference()

         # Found something weird
         else:
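The removed `capture=True` flag reflects a broader change in this release: data capture is now configured when the endpoint is deployed (see the ModelToEndpoint changes further down), not when inference is run. A minimal sketch of the adjusted call site, with an illustrative endpoint name:

    from workbench.api import Endpoint

    endpoint = Endpoint("abalone-regression-end")  # illustrative, assumes the endpoint exists
    endpoint.auto_inference()  # capture kwarg removed; capture is set at deploy time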
workbench/core/transforms/data_to_features/light/molecular_descriptors.py
@@ -1,7 +1,7 @@
 """MolecularDescriptors: Compute a Feature Set based on RDKit Descriptors

-Note: An alternative to using this class is to use the `compute_molecular_descriptors` function directly.
-    df_features = compute_molecular_descriptors(df)
+Note: An alternative to using this class is to use the `compute_descriptors` function directly.
+    df_features = compute_descriptors(df)
     to_features = PandasToFeatures("my_feature_set")
     to_features.set_input(df_features, id_column="id")
     to_features.set_output_tags(["blah", "whatever"])
@@ -10,7 +10,7 @@ Note: An alternative to using this class is to use the `compute_molecular_descri

 # Local Imports
 from workbench.core.transforms.data_to_features.light.data_to_features_light import DataToFeaturesLight
-from workbench.utils.chem_utils import compute_molecular_descriptors
+from workbench.utils.chem_utils.mol_descriptors import compute_descriptors


 class MolecularDescriptors(DataToFeaturesLight):
@@ -39,7 +39,7 @@ class MolecularDescriptors(DataToFeaturesLight):
         """Compute a Feature Set based on RDKit Descriptors"""

         # Compute/add all the Molecular Descriptors
-        self.output_df = compute_molecular_descriptors(self.input_df)
+        self.output_df = compute_descriptors(self.input_df)


 if __name__ == "__main__":
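The docstring's alternative path now routes through the split-out `chem_utils` package. A minimal sketch of that path, assuming an input DataFrame with `id` and `smiles` columns as in the docstring example:

    import pandas as pd
    from workbench.utils.chem_utils.mol_descriptors import compute_descriptors
    from workbench.core.transforms.pandas_transforms.pandas_to_features import PandasToFeatures

    df = pd.DataFrame({"id": [1, 2], "smiles": ["CCO", "c1ccccc1"]})  # illustrative molecules
    df_features = compute_descriptors(df)  # adds the RDKit descriptor columns

    to_features = PandasToFeatures("my_feature_set")
    to_features.set_input(df_features, id_column="id")
    to_features.set_output_tags(["rdkit", "descriptors"])
    to_features.transform()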
workbench/core/transforms/features_to_model/features_to_model.py
@@ -1,6 +1,7 @@
 """FeaturesToModel: Train/Create a Model from a Feature Set"""

 from pathlib import Path
+from typing import Union
 from sagemaker.estimator import Estimator
 import awswrangler as wr
 from datetime import datetime, timezone
@@ -9,7 +10,7 @@ import time
 # Local Imports
 from workbench.core.transforms.transform import Transform, TransformInput, TransformOutput
 from workbench.core.artifacts.feature_set_core import FeatureSetCore
-from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelImages
+from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelFramework, ModelImages
 from workbench.core.artifacts.artifact import Artifact
 from workbench.model_scripts.script_generation import generate_model_script, fill_template
 from workbench.utils.model_utils import supported_instance_types
@@ -33,12 +34,13 @@ class FeaturesToModel(Transform):
         feature_name: str,
         model_name: str,
         model_type: ModelType,
+        model_framework=ModelFramework.XGBOOST,
         model_class=None,
         model_import_str=None,
         custom_script=None,
         custom_args=None,
-        training_image="xgb_training",
-        inference_image="xgb_inference",
+        training_image="training",
+        inference_image="inference",
         inference_arch="x86_64",
     ):
         """FeaturesToModel Initialization
@@ -46,12 +48,13 @@
             feature_name (str): Name of the FeatureSet to use as input
             model_name (str): Name of the Model to create as output
             model_type (ModelType): ModelType.REGRESSOR or ModelType.CLASSIFIER, etc.
+            model_framework (ModelFramework, optional): The model framework (default ModelFramework.XGBOOST)
             model_class (str, optional): The scikit model (e.g. KNeighborsRegressor) (default None)
             model_import_str (str, optional): The import string for the model (default None)
             custom_script (str, optional): Custom script to use for the model (default None)
             custom_args (dict, optional): Custom arguments to pass to custom model scripts (default None)
-            training_image (str, optional): Training image (default "xgb_training")
-            inference_image (str, optional): Inference image (default "xgb_inference")
+            training_image (str, optional): Training image (default "training")
+            inference_image (str, optional): Inference image (default "inference")
             inference_arch (str, optional): Inference architecture (default "x86_64")
         """

@@ -65,6 +68,7 @@
         self.input_type = TransformInput.FEATURE_SET
         self.output_type = TransformOutput.MODEL
         self.model_type = model_type
+        self.model_framework = model_framework
         self.model_class = model_class
         self.model_import_str = model_import_str
         self.custom_script = str(custom_script) if custom_script else None
@@ -80,12 +84,17 @@
         self.inference_arch = inference_arch

     def transform_impl(
-        self, target_column: str, description: str = None, feature_list: list = None, train_all_data=False, **kwargs
+        self,
+        target_column: Union[str, list[str]],
+        description: str = None,
+        feature_list: list = None,
+        train_all_data=False,
+        **kwargs,
     ):
         """Generic Features to Model: Note you should create a new class and inherit from
         this one to include specific logic for your Feature Set/Model
         Args:
-            target_column (str): Column name of the target variable
+            target_column (str or list[str]): Column name(s) of the target variable(s)
             description (str): Description of the model (optional)
             feature_list (list[str]): A list of columns for the features (default None, will try to guess)
             train_all_data (bool): Train on ALL (100%) of the data (default False)
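Together, the new `model_framework` parameter and the `Union[str, list[str]]` target type let the same constructor drive XGBoost, PyTorch, and ChemProp training. A hedged sketch (FeatureSet, model name, and target are illustrative):

    from workbench.core.artifacts.model_core import ModelType, ModelFramework
    from workbench.core.transforms.features_to_model.features_to_model import FeaturesToModel

    to_model = FeaturesToModel(
        "aqsol_features",  # existing FeatureSet (illustrative)
        "aqsol-chemprop",  # hypothetical model name
        model_type=ModelType.REGRESSOR,
        model_framework=ModelFramework.CHEMPROP,
    )
    to_model.set_output_tags(["chemprop", "test"])
    to_model.transform(target_column=["solubility"], description="ChemProp regression sketch")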
@@ -102,9 +111,11 @@
         s3_training_path = feature_set.create_s3_training_data()
         self.log.info(f"Created new training data {s3_training_path}...")

-        # Report the target column
+        # Report the target column(s)
         self.target_column = target_column
-        self.log.info(f"Target column: {self.target_column}")
+        # Normalize target_column to a list for internal use
+        target_list = [target_column] if isinstance(target_column, str) else (target_column or [])
+        self.log.info(f"Target column(s): {self.target_column}")

         # Did they specify a feature list?
         if feature_list:
@@ -131,7 +142,7 @@
                 "is_deleted",
                 "event_time",
                 "training",
-            ] + [self.target_column]
+            ] + target_list
             feature_list = [c for c in all_columns if c not in filter_list]

         # AWS Feature Store has 3 user column types (String, Integral, Fractional)
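The normalization expression above maps every accepted input shape onto a list, which is what the `filter_list` concatenation relies on:

    def normalize(target_column):
        # Same expression as in transform_impl() above
        return [target_column] if isinstance(target_column, str) else (target_column or [])

    assert normalize("solubility") == ["solubility"]        # single target
    assert normalize(["logS", "logD"]) == ["logS", "logD"]  # multi-target
    assert normalize(None) == []                            # no target (e.g. transformer models)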
@@ -154,11 +165,14 @@
         self.log.important(f"Feature List for Modeling: {self.model_feature_list}")

         # Set up our parameters for the model script
+        # ChemProp expects target_column as a list; other templates expect a string
+        target_for_template = target_list if self.model_framework == ModelFramework.CHEMPROP else self.target_column
         template_params = {
             "model_imports": self.model_import_str,
             "model_type": self.model_type,
+            "model_framework": self.model_framework,
             "model_class": self.model_class,
-            "target_column": self.target_column,
+            "target_column": target_for_template,
             "feature_list": self.model_feature_list,
             "compressed_features": feature_set.get_compressed_features(),
             "model_metrics_s3_path": self.model_training_root,
@@ -184,23 +198,27 @@
         # Generate our model script
         script_path = generate_model_script(template_params)

-        # Metric Definitions for Regression
+        # Metric Definitions for Regression (matches model script output format)
         if self.model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]:
             metric_definitions = [
-                {"Name": "RMSE", "Regex": "RMSE: ([0-9.]+)"},
-                {"Name": "MAE", "Regex": "MAE: ([0-9.]+)"},
-                {"Name": "R2", "Regex": "R2: ([0-9.]+)"},
-                {"Name": "NumRows", "Regex": "NumRows: ([0-9]+)"},
+                {"Name": "rmse", "Regex": r"rmse: ([0-9.]+)"},
+                {"Name": "mae", "Regex": r"mae: ([0-9.]+)"},
+                {"Name": "medae", "Regex": r"medae: ([0-9.]+)"},
+                {"Name": "r2", "Regex": r"r2: ([0-9.-]+)"},
+                {"Name": "spearmanr", "Regex": r"spearmanr: ([0-9.-]+)"},
+                {"Name": "support", "Regex": r"support: ([0-9]+)"},
             ]

         # Metric Definitions for Classification
         elif self.model_type == ModelType.CLASSIFIER:
             # We need to get creative with the Classification Metrics
+            # Note: Classification only supports single target
+            class_target = target_list[0] if target_list else self.target_column

             # Grab all the target column class values (class labels)
             table = feature_set.data_source.table
-            self.class_labels = feature_set.query(f'select DISTINCT {self.target_column} FROM "{table}"')[
-                self.target_column
+            self.class_labels = feature_set.query(f'select DISTINCT {class_target} FROM "{table}"')[
+                class_target
             ].to_list()

             # Sanity check on the targets
@@ -210,7 +228,7 @@
                 raise ValueError(msg)

             # Dynamically create the metric definitions
-            metrics = ["precision", "recall", "fscore"]
+            metrics = ["precision", "recall", "f1", "support"]
             metric_definitions = []
             for t in self.class_labels:
                 for m in metrics:
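These `metric_definitions` are regexes that SageMaker applies to the training job's log stream, so they must match the lines the model script actually prints. Widening the character class to `[0-9.-]` is what lets negative `r2`/`spearmanr` values through:

    import re

    line = "r2: -0.1234"  # illustrative training-log line
    assert re.search(r"r2: ([0-9.]+)", line) is None  # old pattern misses negative values
    assert re.search(r"r2: ([0-9.-]+)", line).group(1) == "-0.1234"  # new pattern captures them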
@@ -233,13 +251,21 @@
         source_dir = str(Path(script_path).parent)

         # Create a Sagemaker Model with our script
-        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, self.training_image, "0.1")
+        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, self.training_image)
+
+        # Use GPU instance for ChemProp/PyTorch, CPU for others
+        if self.model_framework in [ModelFramework.CHEMPROP, ModelFramework.PYTORCH]:
+            train_instance_type = "ml.g6.xlarge"  # NVIDIA L4 GPU, ~$0.80/hr
+            self.log.important(f"Using GPU instance {train_instance_type} for {self.model_framework.value}")
+        else:
+            train_instance_type = "ml.m5.xlarge"
+
         self.estimator = Estimator(
             entry_point=entry_point,
             source_dir=source_dir,
             role=self.workbench_role_arn,
             instance_count=1,
-            instance_type="ml.m5.xlarge",
+            instance_type=train_instance_type,
             sagemaker_session=self.sm_session,
             image_uri=image,
             metric_definitions=metric_definitions,
@@ -264,13 +290,20 @@
         self.log.important(f"Creating new model {self.output_name}...")
         self.create_and_register_model(**kwargs)

+        # Make a copy of the training view, to lock-in the training data used for this model
+        model_training_view_name = f"{self.output_name.replace('-', '_')}_training"
+        self.log.important(f"Creating Model Training View: {model_training_view_name}...")
+        feature_set.view("training").copy(f"{model_training_view_name}")
+
     def post_transform(self, **kwargs):
         """Post-Transform: Calling onboard() on the Model"""
         self.log.info("Post-Transform: Calling onboard() on the Model...")
         time.sleep(3)  # Give AWS time to complete Model register

-        # Store the model feature_list and target_column in the workbench_meta
-        output_model = ModelCore(self.output_name, model_type=self.model_type)
+        # Store the model metadata information
+        output_model = ModelCore(self.output_name)
+        output_model._set_model_type(self.model_type)
+        output_model._set_model_framework(self.model_framework)
         output_model.upsert_workbench_meta({"workbench_model_features": self.model_feature_list})
         output_model.upsert_workbench_meta({"workbench_model_target": self.target_column})

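The lock-in step copies the FeatureSet's `training` view under a per-model name, so later changes to the FeatureSet's training view can't silently alter what this model was trained on. The view name follows the f-string above:

    # Illustrative: a model named "aqsol-chemprop" gets its own frozen training view
    model_name = "aqsol-chemprop"
    view_name = f"{model_name.replace('-', '_')}_training"
    assert view_name == "aqsol_chemprop_training"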
@@ -301,7 +334,7 @@

         # Register our model
         image = ModelImages.get_image_uri(
-            self.sm_session.boto_region_name, self.inference_image, "0.1", self.inference_arch
+            self.sm_session.boto_region_name, self.inference_image, architecture=self.inference_arch
         )
         self.log.important(f"Registering model {self.output_name} with Inference Image {image}...")
         model = self.estimator.create_model(role=self.workbench_role_arn)
@@ -325,12 +358,11 @@ if __name__ == "__main__":

     # Regression Model
     input_name = "abalone_features"
-    output_name = "test-abalone-regression"
+    output_name = "abalone-regression"
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.REGRESSOR)
     to_model.set_output_tags(["test"])
     to_model.transform(target_column="class_number_of_rings", description="Test Abalone Regression")

-    """
     # Classification Model
     input_name = "wine_features"
     output_name = "wine-classification"
@@ -340,10 +372,10 @@

     # Quantile Regression Model (Abalone)
     input_name = "abalone_features"
-    output_name = "abalone-quantile-reg"
+    output_name = "abalone-regression-uq"
     to_model = FeaturesToModel(input_name, output_name, ModelType.UQ_REGRESSOR)
-    to_model.set_output_tags(["abalone", "quantiles"])
-    to_model.transform(target_column="class_number_of_rings", description="Abalone Quantile Regression")
+    to_model.set_output_tags(["abalone", "uq"])
+    to_model.transform(target_column="class_number_of_rings", description="Abalone UQ Regression")

     # Scikit-Learn Kmeans Clustering Model
     input_name = "wine_features"
@@ -397,7 +429,7 @@
     scripts_root = Path(__file__).resolve().parents[3] / "model_scripts"
     my_script = scripts_root / "custom_models" / "chem_info" / "molecular_descriptors.py"
     input_name = "aqsol_features"
-    output_name = "smiles-to-taut-md-stereo-v0"
+    output_name = "test-smiles-to-taut-md-stereo"
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
     to_model.set_output_tags(["smiles", "molecular descriptors"])
     to_model.transform(target_column=None, feature_list=["smiles"], description="Smiles to Molecular Descriptors")
@@ -410,13 +442,3 @@
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
     to_model.set_output_tags(["smiles", "morgan fingerprints"])
     to_model.transform(target_column=None, feature_list=["smiles"], description="Smiles to Morgan Fingerprints")
-
-    # Tautomerization Model
-    scripts_root = Path(__file__).resolve().parents[3] / "model_scripts"
-    my_script = scripts_root / "custom_models" / "chem_info" / "tautomerize.py"
-    input_name = "aqsol_features"
-    output_name = "tautomerize-v0"
-    to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
-    to_model.set_output_tags(["smiles", "tautomerization"])
-    to_model.transform(target_column=None, feature_list=["smiles"], description="Tautomerize Smiles")
-    """
workbench/core/transforms/model_to_endpoint/model_to_endpoint.py
@@ -1,10 +1,12 @@
 """ModelToEndpoint: Deploy an Endpoint for a Model"""

 import time
+from botocore.exceptions import ClientError
 from sagemaker import ModelPackage
 from sagemaker.serializers import CSVSerializer
 from sagemaker.deserializers import CSVDeserializer
 from sagemaker.serverless import ServerlessInferenceConfig
+from sagemaker.model_monitor import DataCaptureConfig

 # Local Imports
 from workbench.core.transforms.transform import Transform, TransformInput, TransformOutput
@@ -51,27 +53,38 @@ class ModelToEndpoint(Transform):
         EndpointCore.managed_delete(self.output_name)

         # Get the Model Package ARN for our input model
-        input_model = ModelCore(self.input_name)
-        model_package_arn = input_model.model_package_arn()
+        workbench_model = ModelCore(self.input_name)

         # Deploy the model
-        self._deploy_model(model_package_arn, **kwargs)
+        self._deploy_model(workbench_model, **kwargs)

         # Add this endpoint to the set of registered endpoints for the model
-        input_model.register_endpoint(self.output_name)
+        workbench_model.register_endpoint(self.output_name)

         # This ensures that the endpoint is ready for use
         time.sleep(5)  # We wait for AWS Lag
         end = EndpointCore(self.output_name)
         self.log.important(f"Endpoint {end.name} is ready for use")

-    def _deploy_model(self, model_package_arn: str, mem_size: int = 2048, max_concurrency: int = 5):
+    def _deploy_model(
+        self,
+        workbench_model: ModelCore,
+        mem_size: int = 2048,
+        max_concurrency: int = 5,
+        data_capture: bool = False,
+        capture_percentage: int = 100,
+    ):
         """Internal Method: Deploy the Model

         Args:
-            model_package_arn(str): The Model Package ARN used to deploy the Endpoint
+            workbench_model(ModelCore): The Workbench ModelCore object to deploy
+            mem_size(int): Memory size for serverless deployment
+            max_concurrency(int): Max concurrency for serverless deployment
+            data_capture(bool): Enable data capture during deployment
+            capture_percentage(int): Percentage of data to capture. Defaults to 100.
         """
         # Grab the specified Model Package
+        model_package_arn = workbench_model.model_package_arn()
         model_package = ModelPackage(
             role=self.workbench_role_arn,
             model_package_arn=model_package_arn,
@@ -90,22 +103,70 @@
         # Is this a serverless deployment?
         serverless_config = None
         if self.serverless:
+            # For PyTorch or ChemProp we need at least 4GB of memory
+            from workbench.api import ModelFramework
+
+            self.log.info(f"Model Framework: {workbench_model.model_framework}")
+            if workbench_model.model_framework in [ModelFramework.PYTORCH, ModelFramework.CHEMPROP]:
+                if mem_size < 4096:
+                    self.log.important(
+                        f"{workbench_model.model_framework} needs at least 4GB of memory (setting to 4GB)"
+                    )
+                    mem_size = 4096
             serverless_config = ServerlessInferenceConfig(
                 memory_size_in_mb=mem_size,
                 max_concurrency=max_concurrency,
             )
+            self.log.important(f"Serverless Config: Memory={mem_size}MB, MaxConcurrency={max_concurrency}")
+
+        # Configure data capture if requested (and not serverless)
+        data_capture_config = None
+        if data_capture and not self.serverless:
+            # Set up the S3 path for data capture
+            base_endpoint_path = f"{workbench_model.endpoints_s3_path}/{self.output_name}"
+            data_capture_path = f"{base_endpoint_path}/data_capture"
+            self.log.important(f"Configuring Data Capture --> {data_capture_path}")
+            data_capture_config = DataCaptureConfig(
+                enable_capture=True,
+                sampling_percentage=capture_percentage,
+                destination_s3_uri=data_capture_path,
+            )
+        elif data_capture and self.serverless:
+            self.log.warning(
+                "Data capture is not supported for serverless endpoints. Skipping data capture configuration."
+            )

         # Deploy the Endpoint
         self.log.important(f"Deploying the Endpoint {self.output_name}...")
-        model_package.deploy(
-            initial_instance_count=1,
-            instance_type=self.instance_type,
-            serverless_inference_config=serverless_config,
-            endpoint_name=self.output_name,
-            serializer=CSVSerializer(),
-            deserializer=CSVDeserializer(),
-            tags=aws_tags,
-        )
+        try:
+            model_package.deploy(
+                initial_instance_count=1,
+                instance_type=self.instance_type,
+                serverless_inference_config=serverless_config,
+                endpoint_name=self.output_name,
+                serializer=CSVSerializer(),
+                deserializer=CSVDeserializer(),
+                data_capture_config=data_capture_config,
+                tags=aws_tags,
+            )
+        except ClientError as e:
+            # Check if this is the "endpoint config already exists" error
+            if "Cannot create already existing endpoint configuration" in str(e):
+                self.log.warning("Endpoint config already exists, deleting and retrying...")
+                self.sm_client.delete_endpoint_config(EndpointConfigName=self.output_name)
+                # Retry the deploy
+                model_package.deploy(
+                    initial_instance_count=1,
+                    instance_type=self.instance_type,
+                    serverless_inference_config=serverless_config,
+                    endpoint_name=self.output_name,
+                    serializer=CSVSerializer(),
+                    deserializer=CSVDeserializer(),
+                    data_capture_config=data_capture_config,
+                    tags=aws_tags,
+                )
+            else:
+                raise

     def post_transform(self, **kwargs):
         """Post-Transform: Calling onboard() for the Endpoint"""
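Since `transform()` forwards its kwargs into `_deploy_model()`, data capture can be requested at deploy time, and it only takes effect on realtime (non-serverless) endpoints. A hedged sketch, assuming the constructor still accepts a `serverless` flag as in earlier releases:

    from workbench.core.transforms.model_to_endpoint.model_to_endpoint import ModelToEndpoint

    to_endpoint = ModelToEndpoint("abalone-regression", "abalone-regression-end", serverless=False)
    to_endpoint.transform(data_capture=True, capture_percentage=50)  # kwargs reach _deploy_model()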
workbench/core/transforms/pandas_transforms/pandas_to_features.py
@@ -68,6 +68,15 @@ class PandasToFeatures(Transform):
         self.output_df = input_df.copy()
         self.one_hot_columns = one_hot_columns or []

+        # Warn about known AWS Iceberg bug with event_time_column
+        if event_time_column is not None:
+            self.log.warning(
+                f"event_time_column='{event_time_column}' specified. Note: AWS has a known bug with "
+                "Iceberg FeatureGroups where varying event times across multiple days can cause "
+                "duplicate rows in the offline store. Setting event_time_column=None."
+            )
+            self.event_time_column = None
+
         # Now Prepare the DataFrame for its journey into an AWS FeatureGroup
         self.prep_dataframe()

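With this guard in place, an `event_time_column` passed to `set_input()` is warned about and then dropped before ingestion. A minimal sketch (DataFrame contents illustrative):

    import pandas as pd
    from workbench.core.transforms.pandas_transforms.pandas_to_features import PandasToFeatures

    df = pd.DataFrame({"id": [1, 2], "date": ["2024-01-01", "2024-01-02"], "value": [0.1, 0.2]})
    to_features = PandasToFeatures("test_features")
    to_features.set_input(df, id_column="id", event_time_column="date")  # warns, then ignored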
@@ -327,9 +336,36 @@
         self.delete_existing()
         self.output_feature_group = self.create_feature_group()

+    def mac_spawn_hack(self):
+        """Workaround for macOS Tahoe fork/spawn issue with SageMaker FeatureStore ingest.
+
+        See: https://github.com/aws/sagemaker-python-sdk/issues/5312
+        macOS Tahoe 26+ has issues with forked processes creating boto3 sessions.
+        This forces spawn mode on macOS to avoid the hang.
+        """
+        import platform
+
+        if platform.system() == "Darwin":  # macOS
+            self.log.warning("macOS detected, forcing 'spawn' mode for multiprocessing (Tahoe hang workaround)")
+            import multiprocessing
+
+            try:
+                import multiprocess
+
+                multiprocess.set_start_method("spawn", force=True)
+            except (RuntimeError, ImportError):
+                pass  # Already set or multiprocess not available
+            try:
+                multiprocessing.set_start_method("spawn", force=True)
+            except RuntimeError:
+                pass  # Already set
+
     def transform_impl(self):
         """Transform Implementation: Ingest the data into the Feature Group"""

+        # Workaround for macOS Tahoe hang issue
+        self.mac_spawn_hack()
+
         # Now we actually push the data into the Feature Group (called ingestion)
         self.log.important(f"Ingesting rows into Feature Group {self.output_name}...")
         ingest_manager = self.output_feature_group.ingest(self.output_df, max_workers=8, max_processes=4, wait=False)
@@ -373,7 +409,7 @@

         # Set Hold Out Ids (if we got them during creation)
         if self.incoming_hold_out_ids:
-            self.output_feature_set.set_training_holdouts(self.id_column, self.incoming_hold_out_ids)
+            self.output_feature_set.set_training_holdouts(self.incoming_hold_out_ids)

     def ensure_feature_group_created(self, feature_group):
         status = feature_group.describe().get("FeatureGroupStatus")
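`set_training_holdouts()` drops its `id_column` argument since the FeatureSet already knows its id column. A hedged sketch at the public API level, assuming `workbench.api.FeatureSet` mirrors the core signature:

    from workbench.api import FeatureSet

    fs = FeatureSet("test_features")     # illustrative FeatureSet
    fs.set_training_holdouts([1, 2, 3])  # hypothetical holdout ids; no id_column needed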
@@ -435,7 +471,7 @@

     # Create my DF to Feature Set Transform (with one-hot encoding)
     df_to_features = PandasToFeatures("test_features")
-    df_to_features.set_input(data_df, id_column="id", one_hot_columns=["food"])
+    df_to_features.set_input(data_df, id_column="id", event_time_column="date", one_hot_columns=["food"])
     df_to_features.set_output_tags(["test", "small"])
     df_to_features.transform()