workbench 0.8.162__py3-none-any.whl → 0.8.220__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (147)
  1. workbench/algorithms/dataframe/__init__.py +1 -2
  2. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  3. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  4. workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
  5. workbench/algorithms/dataframe/projection_2d.py +44 -21
  6. workbench/algorithms/dataframe/proximity.py +259 -305
  7. workbench/algorithms/graph/light/proximity_graph.py +14 -12
  8. workbench/algorithms/models/cleanlab_model.py +382 -0
  9. workbench/algorithms/models/noise_model.py +388 -0
  10. workbench/algorithms/sql/outliers.py +3 -3
  11. workbench/api/__init__.py +5 -1
  12. workbench/api/compound.py +1 -1
  13. workbench/api/df_store.py +17 -108
  14. workbench/api/endpoint.py +18 -5
  15. workbench/api/feature_set.py +121 -15
  16. workbench/api/meta.py +5 -2
  17. workbench/api/meta_model.py +289 -0
  18. workbench/api/model.py +55 -21
  19. workbench/api/monitor.py +1 -16
  20. workbench/api/parameter_store.py +3 -52
  21. workbench/cached/cached_model.py +4 -4
  22. workbench/core/artifacts/__init__.py +11 -2
  23. workbench/core/artifacts/artifact.py +16 -8
  24. workbench/core/artifacts/data_capture_core.py +355 -0
  25. workbench/core/artifacts/df_store_core.py +114 -0
  26. workbench/core/artifacts/endpoint_core.py +382 -253
  27. workbench/core/artifacts/feature_set_core.py +249 -45
  28. workbench/core/artifacts/model_core.py +135 -80
  29. workbench/core/artifacts/monitor_core.py +33 -248
  30. workbench/core/artifacts/parameter_store_core.py +98 -0
  31. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  32. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  33. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  34. workbench/core/pipelines/pipeline_executor.py +1 -1
  35. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  36. workbench/core/transforms/features_to_model/features_to_model.py +62 -40
  37. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +76 -15
  38. workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
  39. workbench/core/views/training_view.py +113 -42
  40. workbench/core/views/view.py +53 -3
  41. workbench/core/views/view_utils.py +4 -4
  42. workbench/model_script_utils/model_script_utils.py +339 -0
  43. workbench/model_script_utils/pytorch_utils.py +405 -0
  44. workbench/model_script_utils/uq_harness.py +278 -0
  45. workbench/model_scripts/chemprop/chemprop.template +649 -0
  46. workbench/model_scripts/chemprop/generated_model_script.py +649 -0
  47. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  48. workbench/model_scripts/chemprop/requirements.txt +3 -0
  49. workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
  50. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  51. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  52. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  53. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  54. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  55. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
  56. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  57. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  58. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  59. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  60. workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
  61. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  62. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  63. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  64. workbench/model_scripts/meta_model/meta_model.template +209 -0
  65. workbench/model_scripts/pytorch_model/generated_model_script.py +444 -500
  66. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  67. workbench/model_scripts/pytorch_model/pytorch.template +440 -496
  68. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  69. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  70. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  71. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  72. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  73. workbench/model_scripts/script_generation.py +20 -11
  74. workbench/model_scripts/uq_models/generated_model_script.py +248 -0
  75. workbench/model_scripts/xgb_model/generated_model_script.py +372 -404
  76. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  77. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  78. workbench/model_scripts/xgb_model/xgb_model.template +369 -401
  79. workbench/repl/workbench_shell.py +28 -19
  80. workbench/resources/open_source_api.key +1 -1
  81. workbench/scripts/endpoint_test.py +162 -0
  82. workbench/scripts/lambda_test.py +73 -0
  83. workbench/scripts/meta_model_sim.py +35 -0
  84. workbench/scripts/ml_pipeline_batch.py +137 -0
  85. workbench/scripts/ml_pipeline_sqs.py +186 -0
  86. workbench/scripts/monitor_cloud_watch.py +20 -100
  87. workbench/scripts/training_test.py +85 -0
  88. workbench/utils/aws_utils.py +4 -3
  89. workbench/utils/chem_utils/__init__.py +0 -0
  90. workbench/utils/chem_utils/fingerprints.py +175 -0
  91. workbench/utils/chem_utils/misc.py +194 -0
  92. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  93. workbench/utils/chem_utils/mol_standardize.py +450 -0
  94. workbench/utils/chem_utils/mol_tagging.py +348 -0
  95. workbench/utils/chem_utils/projections.py +219 -0
  96. workbench/utils/chem_utils/salts.py +256 -0
  97. workbench/utils/chem_utils/sdf.py +292 -0
  98. workbench/utils/chem_utils/toxicity.py +250 -0
  99. workbench/utils/chem_utils/vis.py +253 -0
  100. workbench/utils/chemprop_utils.py +141 -0
  101. workbench/utils/cloudwatch_handler.py +1 -1
  102. workbench/utils/cloudwatch_utils.py +137 -0
  103. workbench/utils/config_manager.py +3 -7
  104. workbench/utils/endpoint_utils.py +5 -7
  105. workbench/utils/license_manager.py +2 -6
  106. workbench/utils/meta_model_simulator.py +499 -0
  107. workbench/utils/metrics_utils.py +256 -0
  108. workbench/utils/model_utils.py +278 -79
  109. workbench/utils/monitor_utils.py +44 -62
  110. workbench/utils/pandas_utils.py +3 -3
  111. workbench/utils/pytorch_utils.py +87 -0
  112. workbench/utils/shap_utils.py +11 -57
  113. workbench/utils/workbench_logging.py +0 -3
  114. workbench/utils/workbench_sqs.py +1 -1
  115. workbench/utils/xgboost_local_crossfold.py +267 -0
  116. workbench/utils/xgboost_model_utils.py +127 -219
  117. workbench/web_interface/components/model_plot.py +14 -2
  118. workbench/web_interface/components/plugin_unit_test.py +5 -2
  119. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  120. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  121. workbench/web_interface/components/plugins/model_details.py +38 -74
  122. workbench/web_interface/components/plugins/scatter_plot.py +6 -10
  123. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/METADATA +31 -9
  124. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/RECORD +128 -96
  125. workbench-0.8.220.dist-info/entry_points.txt +11 -0
  126. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +1 -1
  127. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  128. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
  129. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  130. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  131. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  132. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  133. workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
  134. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  135. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  136. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -273
  137. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
  138. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  139. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  140. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  141. workbench/utils/chem_utils.py +0 -1556
  142. workbench/utils/execution_environment.py +0 -211
  143. workbench/utils/fast_inference.py +0 -167
  144. workbench/utils/resource_utils.py +0 -39
  145. workbench-0.8.162.dist-info/entry_points.txt +0 -5
  146. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
  147. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
workbench/core/pipelines/pipeline_executor.py
@@ -123,7 +123,7 @@ class PipelineExecutor:
         if "model" in workbench_objects and (not subset or "endpoint" in subset):
             workbench_objects["model"].to_endpoint(**kwargs)
             endpoint = Endpoint(kwargs["name"])
-            endpoint.auto_inference(capture=True)
+            endpoint.auto_inference()

         # Found something weird
         else:
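The removed `capture=True` flag reflects a broader change in this release: data capture is now configured when the endpoint is deployed (see the ModelToEndpoint changes further down), not when inference is run. A minimal sketch of the adjusted call site, with an illustrative endpoint name:

    from workbench.api import Endpoint

    endpoint = Endpoint("abalone-regression-end")  # illustrative, assumes the endpoint exists
    endpoint.auto_inference()  # capture kwarg removed; capture is set at deploy time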
workbench/core/transforms/data_to_features/light/molecular_descriptors.py
@@ -1,7 +1,7 @@
 """MolecularDescriptors: Compute a Feature Set based on RDKit Descriptors

-Note: An alternative to using this class is to use the `compute_molecular_descriptors` function directly.
-    df_features = compute_molecular_descriptors(df)
+Note: An alternative to using this class is to use the `compute_descriptors` function directly.
+    df_features = compute_descriptors(df)
     to_features = PandasToFeatures("my_feature_set")
     to_features.set_input(df_features, id_column="id")
     to_features.set_output_tags(["blah", "whatever"])
@@ -10,7 +10,7 @@ Note: An alternative to using this class is to use the `compute_molecular_descri

 # Local Imports
 from workbench.core.transforms.data_to_features.light.data_to_features_light import DataToFeaturesLight
-from workbench.utils.chem_utils import compute_molecular_descriptors
+from workbench.utils.chem_utils.mol_descriptors import compute_descriptors


 class MolecularDescriptors(DataToFeaturesLight):
@@ -39,7 +39,7 @@ class MolecularDescriptors(DataToFeaturesLight):
         """Compute a Feature Set based on RDKit Descriptors"""

         # Compute/add all the Molecular Descriptors
-        self.output_df = compute_molecular_descriptors(self.input_df)
+        self.output_df = compute_descriptors(self.input_df)


 if __name__ == "__main__":
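The docstring's alternative path now routes through the split-out `chem_utils` package. A minimal sketch of that path, assuming an input DataFrame with `id` and `smiles` columns as in the docstring example:

    import pandas as pd
    from workbench.utils.chem_utils.mol_descriptors import compute_descriptors
    from workbench.core.transforms.pandas_transforms.pandas_to_features import PandasToFeatures

    df = pd.DataFrame({"id": [1, 2], "smiles": ["CCO", "c1ccccc1"]})  # illustrative molecules
    df_features = compute_descriptors(df)  # adds the RDKit descriptor columns

    to_features = PandasToFeatures("my_feature_set")
    to_features.set_input(df_features, id_column="id")
    to_features.set_output_tags(["rdkit", "descriptors"])
    to_features.transform()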
workbench/core/transforms/features_to_model/features_to_model.py
@@ -1,6 +1,7 @@
 """FeaturesToModel: Train/Create a Model from a Feature Set"""

 from pathlib import Path
+from typing import Union
 from sagemaker.estimator import Estimator
 import awswrangler as wr
 from datetime import datetime, timezone
@@ -9,7 +10,7 @@ import time
 # Local Imports
 from workbench.core.transforms.transform import Transform, TransformInput, TransformOutput
 from workbench.core.artifacts.feature_set_core import FeatureSetCore
-from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelImages
+from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelFramework, ModelImages
 from workbench.core.artifacts.artifact import Artifact
 from workbench.model_scripts.script_generation import generate_model_script, fill_template
 from workbench.utils.model_utils import supported_instance_types
@@ -33,12 +34,13 @@ class FeaturesToModel(Transform):
         feature_name: str,
         model_name: str,
         model_type: ModelType,
+        model_framework=ModelFramework.XGBOOST,
         model_class=None,
         model_import_str=None,
         custom_script=None,
         custom_args=None,
-        training_image="xgb_training",
-        inference_image="xgb_inference",
+        training_image="training",
+        inference_image="inference",
         inference_arch="x86_64",
     ):
         """FeaturesToModel Initialization
@@ -46,12 +48,13 @@
             feature_name (str): Name of the FeatureSet to use as input
             model_name (str): Name of the Model to create as output
             model_type (ModelType): ModelType.REGRESSOR or ModelType.CLASSIFIER, etc.
+            model_framework (ModelFramework, optional): The model framework (default ModelFramework.XGBOOST)
             model_class (str, optional): The scikit model (e.g. KNeighborsRegressor) (default None)
             model_import_str (str, optional): The import string for the model (default None)
             custom_script (str, optional): Custom script to use for the model (default None)
             custom_args (dict, optional): Custom arguments to pass to custom model scripts (default None)
-            training_image (str, optional): Training image (default "xgb_training")
-            inference_image (str, optional): Inference image (default "xgb_inference")
+            training_image (str, optional): Training image (default "training")
+            inference_image (str, optional): Inference image (default "inference")
             inference_arch (str, optional): Inference architecture (default "x86_64")
         """

@@ -65,6 +68,7 @@
         self.input_type = TransformInput.FEATURE_SET
         self.output_type = TransformOutput.MODEL
         self.model_type = model_type
+        self.model_framework = model_framework
         self.model_class = model_class
         self.model_import_str = model_import_str
         self.custom_script = str(custom_script) if custom_script else None
@@ -80,12 +84,17 @@
         self.inference_arch = inference_arch

     def transform_impl(
-        self, target_column: str, description: str = None, feature_list: list = None, train_all_data=False, **kwargs
+        self,
+        target_column: Union[str, list[str]],
+        description: str = None,
+        feature_list: list = None,
+        train_all_data=False,
+        **kwargs,
     ):
         """Generic Features to Model: Note you should create a new class and inherit from
         this one to include specific logic for your Feature Set/Model
         Args:
-            target_column (str): Column name of the target variable
+            target_column (str or list[str]): Column name(s) of the target variable(s)
             description (str): Description of the model (optional)
             feature_list (list[str]): A list of columns for the features (default None, will try to guess)
             train_all_data (bool): Train on ALL (100%) of the data (default False)
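Together, the new `model_framework` parameter and the `Union[str, list[str]]` target type let the same constructor drive XGBoost, PyTorch, and ChemProp training. A hedged sketch (FeatureSet, model name, and target are illustrative):

    from workbench.core.artifacts.model_core import ModelType, ModelFramework
    from workbench.core.transforms.features_to_model.features_to_model import FeaturesToModel

    to_model = FeaturesToModel(
        "aqsol_features",  # existing FeatureSet (illustrative)
        "aqsol-chemprop",  # hypothetical model name
        model_type=ModelType.REGRESSOR,
        model_framework=ModelFramework.CHEMPROP,
    )
    to_model.set_output_tags(["chemprop", "test"])
    to_model.transform(target_column=["solubility"], description="ChemProp regression sketch")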
@@ -102,9 +111,11 @@
         s3_training_path = feature_set.create_s3_training_data()
         self.log.info(f"Created new training data {s3_training_path}...")

-        # Report the target column
+        # Report the target column(s)
         self.target_column = target_column
-        self.log.info(f"Target column: {self.target_column}")
+        # Normalize target_column to a list for internal use
+        target_list = [target_column] if isinstance(target_column, str) else (target_column or [])
+        self.log.info(f"Target column(s): {self.target_column}")

         # Did they specify a feature list?
         if feature_list:
@@ -131,7 +142,7 @@
                 "is_deleted",
                 "event_time",
                 "training",
-            ] + [self.target_column]
+            ] + target_list
             feature_list = [c for c in all_columns if c not in filter_list]

         # AWS Feature Store has 3 user column types (String, Integral, Fractional)
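The normalization expression above maps every accepted input shape onto a list, which is what the `filter_list` concatenation relies on:

    def normalize(target_column):
        # Same expression as in transform_impl() above
        return [target_column] if isinstance(target_column, str) else (target_column or [])

    assert normalize("solubility") == ["solubility"]        # single target
    assert normalize(["logS", "logD"]) == ["logS", "logD"]  # multi-target
    assert normalize(None) == []                            # no target (e.g. transformer models)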
@@ -154,11 +165,14 @@
         self.log.important(f"Feature List for Modeling: {self.model_feature_list}")

         # Set up our parameters for the model script
+        # ChemProp expects target_column as a list; other templates expect a string
+        target_for_template = target_list if self.model_framework == ModelFramework.CHEMPROP else self.target_column
         template_params = {
             "model_imports": self.model_import_str,
             "model_type": self.model_type,
+            "model_framework": self.model_framework,
             "model_class": self.model_class,
-            "target_column": self.target_column,
+            "target_column": target_for_template,
             "feature_list": self.model_feature_list,
             "compressed_features": feature_set.get_compressed_features(),
             "model_metrics_s3_path": self.model_training_root,
@@ -184,23 +198,27 @@
         # Generate our model script
         script_path = generate_model_script(template_params)

-        # Metric Definitions for Regression
+        # Metric Definitions for Regression (matches model script output format)
         if self.model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]:
             metric_definitions = [
-                {"Name": "RMSE", "Regex": "RMSE: ([0-9.]+)"},
-                {"Name": "MAE", "Regex": "MAE: ([0-9.]+)"},
-                {"Name": "R2", "Regex": "R2: ([0-9.]+)"},
-                {"Name": "NumRows", "Regex": "NumRows: ([0-9]+)"},
+                {"Name": "rmse", "Regex": r"rmse: ([0-9.]+)"},
+                {"Name": "mae", "Regex": r"mae: ([0-9.]+)"},
+                {"Name": "medae", "Regex": r"medae: ([0-9.]+)"},
+                {"Name": "r2", "Regex": r"r2: ([0-9.-]+)"},
+                {"Name": "spearmanr", "Regex": r"spearmanr: ([0-9.-]+)"},
+                {"Name": "support", "Regex": r"support: ([0-9]+)"},
             ]

         # Metric Definitions for Classification
         elif self.model_type == ModelType.CLASSIFIER:
             # We need to get creative with the Classification Metrics
+            # Note: Classification only supports single target
+            class_target = target_list[0] if target_list else self.target_column

             # Grab all the target column class values (class labels)
             table = feature_set.data_source.table
-            self.class_labels = feature_set.query(f'select DISTINCT {self.target_column} FROM "{table}"')[
-                self.target_column
+            self.class_labels = feature_set.query(f'select DISTINCT {class_target} FROM "{table}"')[
+                class_target
             ].to_list()

             # Sanity check on the targets
@@ -210,7 +228,7 @@
                 raise ValueError(msg)

             # Dynamically create the metric definitions
-            metrics = ["precision", "recall", "fscore"]
+            metrics = ["precision", "recall", "f1", "support"]
             metric_definitions = []
             for t in self.class_labels:
                 for m in metrics:
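These `metric_definitions` are regexes that SageMaker applies to the training job's log stream, so they must match the lines the model script actually prints. Widening the character class to `[0-9.-]` is what lets negative `r2`/`spearmanr` values through:

    import re

    line = "r2: -0.1234"  # illustrative training-log line
    assert re.search(r"r2: ([0-9.]+)", line) is None  # old pattern misses negative values
    assert re.search(r"r2: ([0-9.-]+)", line).group(1) == "-0.1234"  # new pattern captures them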
@@ -233,13 +251,21 @@
         source_dir = str(Path(script_path).parent)

         # Create a Sagemaker Model with our script
-        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, self.training_image, "0.1")
+        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, self.training_image)
+
+        # Use GPU instance for ChemProp/PyTorch, CPU for others
+        if self.model_framework in [ModelFramework.CHEMPROP, ModelFramework.PYTORCH]:
+            train_instance_type = "ml.g6.xlarge"  # NVIDIA L4 GPU, ~$0.80/hr
+            self.log.important(f"Using GPU instance {train_instance_type} for {self.model_framework.value}")
+        else:
+            train_instance_type = "ml.m5.xlarge"
+
         self.estimator = Estimator(
             entry_point=entry_point,
             source_dir=source_dir,
             role=self.workbench_role_arn,
             instance_count=1,
-            instance_type="ml.m5.xlarge",
+            instance_type=train_instance_type,
             sagemaker_session=self.sm_session,
             image_uri=image,
             metric_definitions=metric_definitions,
@@ -264,13 +290,20 @@
         self.log.important(f"Creating new model {self.output_name}...")
         self.create_and_register_model(**kwargs)

+        # Make a copy of the training view, to lock-in the training data used for this model
+        model_training_view_name = f"{self.output_name.replace('-', '_')}_training"
+        self.log.important(f"Creating Model Training View: {model_training_view_name}...")
+        feature_set.view("training").copy(f"{model_training_view_name}")
+
     def post_transform(self, **kwargs):
         """Post-Transform: Calling onboard() on the Model"""
         self.log.info("Post-Transform: Calling onboard() on the Model...")
         time.sleep(3)  # Give AWS time to complete Model register

-        # Store the model feature_list and target_column in the workbench_meta
-        output_model = ModelCore(self.output_name, model_type=self.model_type)
+        # Store the model metadata information
+        output_model = ModelCore(self.output_name)
+        output_model._set_model_type(self.model_type)
+        output_model._set_model_framework(self.model_framework)
         output_model.upsert_workbench_meta({"workbench_model_features": self.model_feature_list})
         output_model.upsert_workbench_meta({"workbench_model_target": self.target_column})

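The lock-in step copies the FeatureSet's `training` view under a per-model name, so later changes to the FeatureSet's training view can't silently alter what this model was trained on. The view name follows the f-string above:

    # Illustrative: a model named "aqsol-chemprop" gets its own frozen training view
    model_name = "aqsol-chemprop"
    view_name = f"{model_name.replace('-', '_')}_training"
    assert view_name == "aqsol_chemprop_training"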
@@ -301,7 +334,7 @@

         # Register our model
         image = ModelImages.get_image_uri(
-            self.sm_session.boto_region_name, self.inference_image, "0.1", self.inference_arch
+            self.sm_session.boto_region_name, self.inference_image, architecture=self.inference_arch
         )
         self.log.important(f"Registering model {self.output_name} with Inference Image {image}...")
         model = self.estimator.create_model(role=self.workbench_role_arn)
@@ -325,12 +358,11 @@ if __name__ == "__main__":

     # Regression Model
     input_name = "abalone_features"
-    output_name = "test-abalone-regression"
+    output_name = "abalone-regression"
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.REGRESSOR)
     to_model.set_output_tags(["test"])
     to_model.transform(target_column="class_number_of_rings", description="Test Abalone Regression")

-    """
     # Classification Model
     input_name = "wine_features"
     output_name = "wine-classification"
@@ -340,10 +372,10 @@

     # Quantile Regression Model (Abalone)
     input_name = "abalone_features"
-    output_name = "abalone-quantile-reg"
+    output_name = "abalone-regression-uq"
     to_model = FeaturesToModel(input_name, output_name, ModelType.UQ_REGRESSOR)
-    to_model.set_output_tags(["abalone", "quantiles"])
-    to_model.transform(target_column="class_number_of_rings", description="Abalone Quantile Regression")
+    to_model.set_output_tags(["abalone", "uq"])
+    to_model.transform(target_column="class_number_of_rings", description="Abalone UQ Regression")

     # Scikit-Learn Kmeans Clustering Model
     input_name = "wine_features"
@@ -397,7 +429,7 @@
     scripts_root = Path(__file__).resolve().parents[3] / "model_scripts"
     my_script = scripts_root / "custom_models" / "chem_info" / "molecular_descriptors.py"
     input_name = "aqsol_features"
-    output_name = "smiles-to-taut-md-stereo-v0"
+    output_name = "test-smiles-to-taut-md-stereo"
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
     to_model.set_output_tags(["smiles", "molecular descriptors"])
     to_model.transform(target_column=None, feature_list=["smiles"], description="Smiles to Molecular Descriptors")
@@ -410,13 +442,3 @@
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
     to_model.set_output_tags(["smiles", "morgan fingerprints"])
     to_model.transform(target_column=None, feature_list=["smiles"], description="Smiles to Morgan Fingerprints")
-
-    # Tautomerization Model
-    scripts_root = Path(__file__).resolve().parents[3] / "model_scripts"
-    my_script = scripts_root / "custom_models" / "chem_info" / "tautomerize.py"
-    input_name = "aqsol_features"
-    output_name = "tautomerize-v0"
-    to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
-    to_model.set_output_tags(["smiles", "tautomerization"])
-    to_model.transform(target_column=None, feature_list=["smiles"], description="Tautomerize Smiles")
-    """
workbench/core/transforms/model_to_endpoint/model_to_endpoint.py
@@ -1,10 +1,12 @@
 """ModelToEndpoint: Deploy an Endpoint for a Model"""

 import time
+from botocore.exceptions import ClientError
 from sagemaker import ModelPackage
 from sagemaker.serializers import CSVSerializer
 from sagemaker.deserializers import CSVDeserializer
 from sagemaker.serverless import ServerlessInferenceConfig
+from sagemaker.model_monitor import DataCaptureConfig

 # Local Imports
 from workbench.core.transforms.transform import Transform, TransformInput, TransformOutput
@@ -51,27 +53,38 @@ class ModelToEndpoint(Transform):
         EndpointCore.managed_delete(self.output_name)

         # Get the Model Package ARN for our input model
-        input_model = ModelCore(self.input_name)
-        model_package_arn = input_model.model_package_arn()
+        workbench_model = ModelCore(self.input_name)

         # Deploy the model
-        self._deploy_model(model_package_arn, **kwargs)
+        self._deploy_model(workbench_model, **kwargs)

         # Add this endpoint to the set of registered endpoints for the model
-        input_model.register_endpoint(self.output_name)
+        workbench_model.register_endpoint(self.output_name)

         # This ensures that the endpoint is ready for use
         time.sleep(5)  # We wait for AWS Lag
         end = EndpointCore(self.output_name)
         self.log.important(f"Endpoint {end.name} is ready for use")

-    def _deploy_model(self, model_package_arn: str, mem_size: int = 2048, max_concurrency: int = 5):
+    def _deploy_model(
+        self,
+        workbench_model: ModelCore,
+        mem_size: int = 2048,
+        max_concurrency: int = 5,
+        data_capture: bool = False,
+        capture_percentage: int = 100,
+    ):
         """Internal Method: Deploy the Model

         Args:
-            model_package_arn(str): The Model Package ARN used to deploy the Endpoint
+            workbench_model(ModelCore): The Workbench ModelCore object to deploy
+            mem_size(int): Memory size for serverless deployment
+            max_concurrency(int): Max concurrency for serverless deployment
+            data_capture(bool): Enable data capture during deployment
+            capture_percentage(int): Percentage of data to capture. Defaults to 100.
         """
         # Grab the specified Model Package
+        model_package_arn = workbench_model.model_package_arn()
         model_package = ModelPackage(
             role=self.workbench_role_arn,
             model_package_arn=model_package_arn,
@@ -90,22 +103,70 @@
         # Is this a serverless deployment?
         serverless_config = None
         if self.serverless:
+            # For PyTorch or ChemProp we need at least 4GB of memory
+            from workbench.api import ModelFramework
+
+            self.log.info(f"Model Framework: {workbench_model.model_framework}")
+            if workbench_model.model_framework in [ModelFramework.PYTORCH, ModelFramework.CHEMPROP]:
+                if mem_size < 4096:
+                    self.log.important(
+                        f"{workbench_model.model_framework} needs at least 4GB of memory (setting to 4GB)"
+                    )
+                    mem_size = 4096
             serverless_config = ServerlessInferenceConfig(
                 memory_size_in_mb=mem_size,
                 max_concurrency=max_concurrency,
             )
+            self.log.important(f"Serverless Config: Memory={mem_size}MB, MaxConcurrency={max_concurrency}")
+
+        # Configure data capture if requested (and not serverless)
+        data_capture_config = None
+        if data_capture and not self.serverless:
+            # Set up the S3 path for data capture
+            base_endpoint_path = f"{workbench_model.endpoints_s3_path}/{self.output_name}"
+            data_capture_path = f"{base_endpoint_path}/data_capture"
+            self.log.important(f"Configuring Data Capture --> {data_capture_path}")
+            data_capture_config = DataCaptureConfig(
+                enable_capture=True,
+                sampling_percentage=capture_percentage,
+                destination_s3_uri=data_capture_path,
+            )
+        elif data_capture and self.serverless:
+            self.log.warning(
+                "Data capture is not supported for serverless endpoints. Skipping data capture configuration."
+            )

         # Deploy the Endpoint
         self.log.important(f"Deploying the Endpoint {self.output_name}...")
-        model_package.deploy(
-            initial_instance_count=1,
-            instance_type=self.instance_type,
-            serverless_inference_config=serverless_config,
-            endpoint_name=self.output_name,
-            serializer=CSVSerializer(),
-            deserializer=CSVDeserializer(),
-            tags=aws_tags,
-        )
+        try:
+            model_package.deploy(
+                initial_instance_count=1,
+                instance_type=self.instance_type,
+                serverless_inference_config=serverless_config,
+                endpoint_name=self.output_name,
+                serializer=CSVSerializer(),
+                deserializer=CSVDeserializer(),
+                data_capture_config=data_capture_config,
+                tags=aws_tags,
+            )
+        except ClientError as e:
+            # Check if this is the "endpoint config already exists" error
+            if "Cannot create already existing endpoint configuration" in str(e):
+                self.log.warning("Endpoint config already exists, deleting and retrying...")
+                self.sm_client.delete_endpoint_config(EndpointConfigName=self.output_name)
+                # Retry the deploy
+                model_package.deploy(
+                    initial_instance_count=1,
+                    instance_type=self.instance_type,
+                    serverless_inference_config=serverless_config,
+                    endpoint_name=self.output_name,
+                    serializer=CSVSerializer(),
+                    deserializer=CSVDeserializer(),
+                    data_capture_config=data_capture_config,
+                    tags=aws_tags,
+                )
+            else:
+                raise

     def post_transform(self, **kwargs):
         """Post-Transform: Calling onboard() for the Endpoint"""
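Since `transform()` forwards its kwargs into `_deploy_model()`, data capture can be requested at deploy time, and it only takes effect on realtime (non-serverless) endpoints. A hedged sketch, assuming the constructor still accepts a `serverless` flag as in earlier releases:

    from workbench.core.transforms.model_to_endpoint.model_to_endpoint import ModelToEndpoint

    to_endpoint = ModelToEndpoint("abalone-regression", "abalone-regression-end", serverless=False)
    to_endpoint.transform(data_capture=True, capture_percentage=50)  # kwargs reach _deploy_model()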
workbench/core/transforms/pandas_transforms/pandas_to_features.py
@@ -68,6 +68,15 @@ class PandasToFeatures(Transform):
         self.output_df = input_df.copy()
         self.one_hot_columns = one_hot_columns or []

+        # Warn about known AWS Iceberg bug with event_time_column
+        if event_time_column is not None:
+            self.log.warning(
+                f"event_time_column='{event_time_column}' specified. Note: AWS has a known bug with "
+                "Iceberg FeatureGroups where varying event times across multiple days can cause "
+                "duplicate rows in the offline store. Setting event_time_column=None."
+            )
+            self.event_time_column = None
+
         # Now Prepare the DataFrame for its journey into an AWS FeatureGroup
         self.prep_dataframe()

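With this guard in place, an `event_time_column` passed to `set_input()` is warned about and then dropped before ingestion. A minimal sketch (DataFrame contents illustrative):

    import pandas as pd
    from workbench.core.transforms.pandas_transforms.pandas_to_features import PandasToFeatures

    df = pd.DataFrame({"id": [1, 2], "date": ["2024-01-01", "2024-01-02"], "value": [0.1, 0.2]})
    to_features = PandasToFeatures("test_features")
    to_features.set_input(df, id_column="id", event_time_column="date")  # warns, then ignored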
@@ -327,9 +336,36 @@
         self.delete_existing()
         self.output_feature_group = self.create_feature_group()

+    def mac_spawn_hack(self):
+        """Workaround for macOS Tahoe fork/spawn issue with SageMaker FeatureStore ingest.
+
+        See: https://github.com/aws/sagemaker-python-sdk/issues/5312
+        macOS Tahoe 26+ has issues with forked processes creating boto3 sessions.
+        This forces spawn mode on macOS to avoid the hang.
+        """
+        import platform
+
+        if platform.system() == "Darwin":  # macOS
+            self.log.warning("macOS detected, forcing 'spawn' mode for multiprocessing (Tahoe hang workaround)")
+            import multiprocessing
+
+            try:
+                import multiprocess
+
+                multiprocess.set_start_method("spawn", force=True)
+            except (RuntimeError, ImportError):
+                pass  # Already set or multiprocess not available
+            try:
+                multiprocessing.set_start_method("spawn", force=True)
+            except RuntimeError:
+                pass  # Already set
+
     def transform_impl(self):
         """Transform Implementation: Ingest the data into the Feature Group"""

+        # Workaround for macOS Tahoe hang issue
+        self.mac_spawn_hack()
+
         # Now we actually push the data into the Feature Group (called ingestion)
         self.log.important(f"Ingesting rows into Feature Group {self.output_name}...")
         ingest_manager = self.output_feature_group.ingest(self.output_df, max_workers=8, max_processes=4, wait=False)
@@ -373,7 +409,7 @@

         # Set Hold Out Ids (if we got them during creation)
         if self.incoming_hold_out_ids:
-            self.output_feature_set.set_training_holdouts(self.id_column, self.incoming_hold_out_ids)
+            self.output_feature_set.set_training_holdouts(self.incoming_hold_out_ids)

     def ensure_feature_group_created(self, feature_group):
         status = feature_group.describe().get("FeatureGroupStatus")
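`set_training_holdouts()` drops its `id_column` argument since the FeatureSet already knows its id column. A hedged sketch at the public API level, assuming `workbench.api.FeatureSet` mirrors the core signature:

    from workbench.api import FeatureSet

    fs = FeatureSet("test_features")     # illustrative FeatureSet
    fs.set_training_holdouts([1, 2, 3])  # hypothetical holdout ids; no id_column needed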
@@ -435,7 +471,7 @@

     # Create my DF to Feature Set Transform (with one-hot encoding)
     df_to_features = PandasToFeatures("test_features")
-    df_to_features.set_input(data_df, id_column="id", one_hot_columns=["food"])
+    df_to_features.set_input(data_df, id_column="id", event_time_column="date", one_hot_columns=["food"])
     df_to_features.set_output_tags(["test", "small"])
     df_to_features.transform()