workbench 0.8.174__py3-none-any.whl → 0.8.227__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/__init__.py +1 -0
- workbench/algorithms/dataframe/__init__.py +1 -2
- workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
- workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
- workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
- workbench/algorithms/dataframe/projection_2d.py +44 -21
- workbench/algorithms/dataframe/proximity.py +259 -305
- workbench/algorithms/graph/light/proximity_graph.py +12 -11
- workbench/algorithms/models/cleanlab_model.py +382 -0
- workbench/algorithms/models/noise_model.py +388 -0
- workbench/algorithms/sql/column_stats.py +0 -1
- workbench/algorithms/sql/correlations.py +0 -1
- workbench/algorithms/sql/descriptive_stats.py +0 -1
- workbench/algorithms/sql/outliers.py +3 -3
- workbench/api/__init__.py +5 -1
- workbench/api/df_store.py +17 -108
- workbench/api/endpoint.py +14 -12
- workbench/api/feature_set.py +117 -11
- workbench/api/meta.py +0 -1
- workbench/api/meta_model.py +289 -0
- workbench/api/model.py +52 -21
- workbench/api/parameter_store.py +3 -52
- workbench/cached/cached_meta.py +0 -1
- workbench/cached/cached_model.py +49 -11
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +7 -7
- workbench/core/artifacts/data_capture_core.py +8 -1
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +323 -205
- workbench/core/artifacts/feature_set_core.py +249 -45
- workbench/core/artifacts/model_core.py +133 -101
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/cloud_platform/aws/aws_account_clamp.py +48 -2
- workbench/core/cloud_platform/cloud_meta.py +0 -1
- workbench/core/pipelines/pipeline_executor.py +1 -1
- workbench/core/transforms/features_to_model/features_to_model.py +60 -44
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +43 -10
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
- workbench/core/views/training_view.py +113 -42
- workbench/core/views/view.py +53 -3
- workbench/core/views/view_utils.py +4 -4
- workbench/model_script_utils/model_script_utils.py +339 -0
- workbench/model_script_utils/pytorch_utils.py +405 -0
- workbench/model_script_utils/uq_harness.py +277 -0
- workbench/model_scripts/chemprop/chemprop.template +774 -0
- workbench/model_scripts/chemprop/generated_model_script.py +774 -0
- workbench/model_scripts/chemprop/model_script_utils.py +339 -0
- workbench/model_scripts/chemprop/requirements.txt +3 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +18 -7
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +80 -58
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -2
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
- workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -16
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
- workbench/model_scripts/meta_model/generated_model_script.py +209 -0
- workbench/model_scripts/meta_model/meta_model.template +209 -0
- workbench/model_scripts/pytorch_model/generated_model_script.py +443 -499
- workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
- workbench/model_scripts/pytorch_model/pytorch.template +440 -496
- workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
- workbench/model_scripts/pytorch_model/requirements.txt +1 -1
- workbench/model_scripts/pytorch_model/uq_harness.py +277 -0
- workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +15 -12
- workbench/model_scripts/uq_models/generated_model_script.py +248 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +371 -403
- workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
- workbench/model_scripts/xgb_model/uq_harness.py +277 -0
- workbench/model_scripts/xgb_model/xgb_model.template +367 -399
- workbench/repl/workbench_shell.py +18 -14
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/lambda_test.py +73 -0
- workbench/scripts/meta_model_sim.py +35 -0
- workbench/scripts/ml_pipeline_sqs.py +122 -6
- workbench/scripts/training_test.py +85 -0
- workbench/themes/dark/custom.css +59 -0
- workbench/themes/dark/plotly.json +5 -5
- workbench/themes/light/custom.css +153 -40
- workbench/themes/light/plotly.json +9 -9
- workbench/themes/midnight_blue/custom.css +59 -0
- workbench/utils/aws_utils.py +0 -1
- workbench/utils/chem_utils/fingerprints.py +87 -46
- workbench/utils/chem_utils/mol_descriptors.py +18 -7
- workbench/utils/chem_utils/mol_standardize.py +80 -58
- workbench/utils/chem_utils/projections.py +16 -6
- workbench/utils/chem_utils/vis.py +25 -27
- workbench/utils/chemprop_utils.py +141 -0
- workbench/utils/config_manager.py +2 -6
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/markdown_utils.py +57 -0
- workbench/utils/meta_model_simulator.py +499 -0
- workbench/utils/metrics_utils.py +256 -0
- workbench/utils/model_utils.py +274 -87
- workbench/utils/pipeline_utils.py +0 -1
- workbench/utils/plot_utils.py +159 -34
- workbench/utils/pytorch_utils.py +87 -0
- workbench/utils/shap_utils.py +11 -57
- workbench/utils/theme_manager.py +95 -30
- workbench/utils/xgboost_local_crossfold.py +267 -0
- workbench/utils/xgboost_model_utils.py +127 -220
- workbench/web_interface/components/experiments/outlier_plot.py +0 -1
- workbench/web_interface/components/model_plot.py +16 -2
- workbench/web_interface/components/plugin_unit_test.py +5 -3
- workbench/web_interface/components/plugins/ag_table.py +2 -4
- workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
- workbench/web_interface/components/plugins/model_details.py +48 -80
- workbench/web_interface/components/plugins/scatter_plot.py +192 -92
- workbench/web_interface/components/settings_menu.py +184 -0
- workbench/web_interface/page_views/main_page.py +0 -1
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/METADATA +31 -17
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/RECORD +125 -111
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/entry_points.txt +4 -0
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/licenses/LICENSE +1 -1
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
- workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie.template +0 -502
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -386
- workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/themes/quartz/base_css.url +0 -1
- workbench/themes/quartz/custom.css +0 -117
- workbench/themes/quartz/plotly.json +0 -642
- workbench/themes/quartz_dark/base_css.url +0 -1
- workbench/themes/quartz_dark/custom.css +0 -131
- workbench/themes/quartz_dark/plotly.json +0 -642
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/WHEEL +0 -0
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/top_level.txt +0 -0
workbench/core/transforms/features_to_model/features_to_model.py

```diff
@@ -1,6 +1,7 @@
 """FeaturesToModel: Train/Create a Model from a Feature Set"""
 
 from pathlib import Path
+from typing import Union
 from sagemaker.estimator import Estimator
 import awswrangler as wr
 from datetime import datetime, timezone
@@ -9,7 +10,7 @@ import time
 # Local Imports
 from workbench.core.transforms.transform import Transform, TransformInput, TransformOutput
 from workbench.core.artifacts.feature_set_core import FeatureSetCore
-from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelImages
+from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelFramework, ModelImages
 from workbench.core.artifacts.artifact import Artifact
 from workbench.model_scripts.script_generation import generate_model_script, fill_template
 from workbench.utils.model_utils import supported_instance_types
@@ -33,6 +34,7 @@ class FeaturesToModel(Transform):
         feature_name: str,
         model_name: str,
         model_type: ModelType,
+        model_framework=ModelFramework.XGBOOST,
         model_class=None,
         model_import_str=None,
         custom_script=None,
@@ -46,6 +48,7 @@ class FeaturesToModel(Transform):
             feature_name (str): Name of the FeatureSet to use as input
             model_name (str): Name of the Model to create as output
             model_type (ModelType): ModelType.REGRESSOR or ModelType.CLASSIFIER, etc.
+            model_framework (ModelFramework, optional): The model framework (default ModelFramework.XGBOOST)
             model_class (str, optional): The scikit model (e.g. KNeighborsRegressor) (default None)
             model_import_str (str, optional): The import string for the model (default None)
             custom_script (str, optional): Custom script to use for the model (default None)
@@ -65,6 +68,7 @@ class FeaturesToModel(Transform):
         self.input_type = TransformInput.FEATURE_SET
         self.output_type = TransformOutput.MODEL
         self.model_type = model_type
+        self.model_framework = model_framework
         self.model_class = model_class
         self.model_import_str = model_import_str
         self.custom_script = str(custom_script) if custom_script else None
@@ -80,12 +84,17 @@ class FeaturesToModel(Transform):
         self.inference_arch = inference_arch
 
     def transform_impl(
-        self,
+        self,
+        target_column: Union[str, list[str]],
+        description: str = None,
+        feature_list: list = None,
+        train_all_data=False,
+        **kwargs,
     ):
         """Generic Features to Model: Note you should create a new class and inherit from
         this one to include specific logic for your Feature Set/Model
         Args:
-            target_column (str): Column name of the target variable
+            target_column (str or list[str]): Column name(s) of the target variable(s)
             description (str): Description of the model (optional)
             feature_list (list[str]): A list of columns for the features (default None, will try to guess)
             train_all_data (bool): Train on ALL (100%) of the data (default False)
@@ -102,9 +111,11 @@ class FeaturesToModel(Transform):
         s3_training_path = feature_set.create_s3_training_data()
         self.log.info(f"Created new training data {s3_training_path}...")
 
-        # Report the target column
+        # Report the target column(s)
         self.target_column = target_column
-
+        # Normalize target_column to a list for internal use
+        target_list = [target_column] if isinstance(target_column, str) else (target_column or [])
+        self.log.info(f"Target column(s): {self.target_column}")
 
         # Did they specify a feature list?
         if feature_list:
@@ -131,7 +142,7 @@ class FeaturesToModel(Transform):
             "is_deleted",
             "event_time",
             "training",
-        ] +
+        ] + target_list
         feature_list = [c for c in all_columns if c not in filter_list]
 
         # AWS Feature Store has 3 user column types (String, Integral, Fractional)
@@ -154,11 +165,14 @@ class FeaturesToModel(Transform):
         self.log.important(f"Feature List for Modeling: {self.model_feature_list}")
 
         # Set up our parameters for the model script
+        # ChemProp expects target_column as a list; other templates expect a string
+        target_for_template = target_list if self.model_framework == ModelFramework.CHEMPROP else self.target_column
         template_params = {
             "model_imports": self.model_import_str,
             "model_type": self.model_type,
+            "model_framework": self.model_framework,
             "model_class": self.model_class,
-            "target_column":
+            "target_column": target_for_template,
             "feature_list": self.model_feature_list,
             "compressed_features": feature_set.get_compressed_features(),
             "model_metrics_s3_path": self.model_training_root,
@@ -184,23 +198,27 @@ class FeaturesToModel(Transform):
         # Generate our model script
         script_path = generate_model_script(template_params)
 
-        # Metric Definitions for Regression
+        # Metric Definitions for Regression (matches model script output format)
         if self.model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]:
             metric_definitions = [
-                {"Name": "
-                {"Name": "
-                {"Name": "
-                {"Name": "
+                {"Name": "rmse", "Regex": r"rmse: ([0-9.]+)"},
+                {"Name": "mae", "Regex": r"mae: ([0-9.]+)"},
+                {"Name": "medae", "Regex": r"medae: ([0-9.]+)"},
+                {"Name": "r2", "Regex": r"r2: ([0-9.-]+)"},
+                {"Name": "spearmanr", "Regex": r"spearmanr: ([0-9.-]+)"},
+                {"Name": "support", "Regex": r"support: ([0-9]+)"},
             ]
 
         # Metric Definitions for Classification
        elif self.model_type == ModelType.CLASSIFIER:
             # We need to get creative with the Classification Metrics
+            # Note: Classification only supports single target
+            class_target = target_list[0] if target_list else self.target_column
 
             # Grab all the target column class values (class labels)
             table = feature_set.data_source.table
-            self.class_labels = feature_set.query(f'select DISTINCT {
-
+            self.class_labels = feature_set.query(f'select DISTINCT {class_target} FROM "{table}"')[
+                class_target
             ].to_list()
 
             # Sanity check on the targets
@@ -209,20 +227,14 @@ class FeaturesToModel(Transform):
                 self.log.critical(msg)
                 raise ValueError(msg)
 
-            # Dynamically create the metric definitions
-            metrics
+            # Dynamically create the metric definitions (per-class precision/recall/f1/support)
+            # Note: Confusion matrix metrics are skipped to stay under SageMaker's 40 metric limit
+            metrics = ["precision", "recall", "f1", "support"]
             metric_definitions = []
             for t in self.class_labels:
                 for m in metrics:
                     metric_definitions.append({"Name": f"Metrics:{t}:{m}", "Regex": f"Metrics:{t}:{m} ([0-9.]+)"})
 
-            # Add the confusion matrix metrics
-            for row in self.class_labels:
-                for col in self.class_labels:
-                    metric_definitions.append(
-                        {"Name": f"ConfusionMatrix:{row}:{col}", "Regex": f"ConfusionMatrix:{row}:{col} ([0-9.]+)"}
-                    )
-
         # If the model type is UNKNOWN, our metric_definitions will be empty
         else:
             self.log.important(f"ModelType is {self.model_type}, skipping metric_definitions...")
```
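SageMaker uses these `metric_definitions` to scrape the training job's CloudWatch logs: each `Regex` is matched against the log stream and the first capture group is recorded as the metric value. A quick sanity check of one definition against the `name: value` format the new model scripts emit (the log line below is illustrative):

```python
import re

# Illustrative training-log line in the "name: value" format the regexes above expect
log_line = "rmse: 0.4242"
metric_def = {"Name": "rmse", "Regex": r"rmse: ([0-9.]+)"}

match = re.search(metric_def["Regex"], log_line)
assert match is not None and match.group(1) == "0.4242"
```

Note that the `r2` and `spearmanr` patterns include `-` in their character classes, since both metrics can legitimately be negative.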
```diff
@@ -233,13 +245,21 @@ class FeaturesToModel(Transform):
         source_dir = str(Path(script_path).parent)
 
         # Create a Sagemaker Model with our script
-        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, self.training_image
+        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, self.training_image)
+
+        # Use GPU instance for ChemProp/PyTorch, CPU for others
+        if self.model_framework in [ModelFramework.CHEMPROP, ModelFramework.PYTORCH]:
+            train_instance_type = "ml.g6.xlarge"  # NVIDIA L4 GPU, ~$0.80/hr
+            self.log.important(f"Using GPU instance {train_instance_type} for {self.model_framework.value}")
+        else:
+            train_instance_type = "ml.m5.xlarge"
+
         self.estimator = Estimator(
             entry_point=entry_point,
             source_dir=source_dir,
             role=self.workbench_role_arn,
             instance_count=1,
-            instance_type=
+            instance_type=train_instance_type,
             sagemaker_session=self.sm_session,
             image_uri=image,
             metric_definitions=metric_definitions,
@@ -264,13 +284,20 @@ class FeaturesToModel(Transform):
         self.log.important(f"Creating new model {self.output_name}...")
         self.create_and_register_model(**kwargs)
 
+        # Make a copy of the training view, to lock-in the training data used for this model
+        model_training_view_name = f"{self.output_name.replace('-', '_')}_training"
+        self.log.important(f"Creating Model Training View: {model_training_view_name}...")
+        feature_set.view("training").copy(f"{model_training_view_name}")
+
     def post_transform(self, **kwargs):
         """Post-Transform: Calling onboard() on the Model"""
         self.log.info("Post-Transform: Calling onboard() on the Model...")
         time.sleep(3)  # Give AWS time to complete Model register
 
-        # Store the model
-        output_model = ModelCore(self.output_name
+        # Store the model metadata information
+        output_model = ModelCore(self.output_name)
+        output_model._set_model_type(self.model_type)
+        output_model._set_model_framework(self.model_framework)
         output_model.upsert_workbench_meta({"workbench_model_features": self.model_feature_list})
         output_model.upsert_workbench_meta({"workbench_model_target": self.target_column})
 
@@ -301,7 +328,7 @@ class FeaturesToModel(Transform):
 
         # Register our model
         image = ModelImages.get_image_uri(
-            self.sm_session.boto_region_name, self.inference_image,
+            self.sm_session.boto_region_name, self.inference_image, architecture=self.inference_arch
         )
         self.log.important(f"Registering model {self.output_name} with Inference Image {image}...")
         model = self.estimator.create_model(role=self.workbench_role_arn)
@@ -325,12 +352,11 @@ if __name__ == "__main__":
 
     # Regression Model
     input_name = "abalone_features"
-    output_name = "
+    output_name = "abalone-regression"
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.REGRESSOR)
     to_model.set_output_tags(["test"])
     to_model.transform(target_column="class_number_of_rings", description="Test Abalone Regression")
 
-    """
     # Classification Model
     input_name = "wine_features"
     output_name = "wine-classification"
@@ -340,10 +366,10 @@ if __name__ == "__main__":
 
     # Quantile Regression Model (Abalone)
     input_name = "abalone_features"
-    output_name = "abalone-
+    output_name = "abalone-regression-uq"
     to_model = FeaturesToModel(input_name, output_name, ModelType.UQ_REGRESSOR)
-    to_model.set_output_tags(["abalone", "
-    to_model.transform(target_column="class_number_of_rings", description="Abalone
+    to_model.set_output_tags(["abalone", "uq"])
+    to_model.transform(target_column="class_number_of_rings", description="Abalone UQ Regression")
 
     # Scikit-Learn Kmeans Clustering Model
     input_name = "wine_features"
@@ -397,7 +423,7 @@ if __name__ == "__main__":
     scripts_root = Path(__file__).resolve().parents[3] / "model_scripts"
     my_script = scripts_root / "custom_models" / "chem_info" / "molecular_descriptors.py"
     input_name = "aqsol_features"
-    output_name = "smiles-to-taut-md-stereo
+    output_name = "test-smiles-to-taut-md-stereo"
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
     to_model.set_output_tags(["smiles", "molecular descriptors"])
     to_model.transform(target_column=None, feature_list=["smiles"], description="Smiles to Molecular Descriptors")
@@ -410,13 +436,3 @@ if __name__ == "__main__":
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
     to_model.set_output_tags(["smiles", "morgan fingerprints"])
     to_model.transform(target_column=None, feature_list=["smiles"], description="Smiles to Morgan Fingerprints")
-
-    # Tautomerization Model
-    scripts_root = Path(__file__).resolve().parents[3] / "model_scripts"
-    my_script = scripts_root / "custom_models" / "chem_info" / "tautomerize.py"
-    input_name = "aqsol_features"
-    output_name = "tautomerize-v0"
-    to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
-    to_model.set_output_tags(["smiles", "tautomerization"])
-    to_model.transform(target_column=None, feature_list=["smiles"], description="Tautomerize Smiles")
-    """
```
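Taken together, these changes add a `model_framework` knob (XGBoost by default), multi-target support, and framework-aware instance selection. A minimal sketch of the new call surface, assuming an existing `aqsol_features` FeatureSet; the model name and target column below are illustrative, and `transform()` forwards its keyword arguments to `transform_impl()` as in the `__main__` examples above:

```python
from workbench.core.transforms.features_to_model.features_to_model import FeaturesToModel
from workbench.core.artifacts.model_core import ModelType, ModelFramework

# ChemProp training runs on the GPU instance (ml.g6.xlarge) and the template
# receives target_column as a list (see target_for_template above)
to_model = FeaturesToModel(
    "aqsol_features",
    "aqsol-chemprop-reg",  # illustrative model name
    model_type=ModelType.REGRESSOR,
    model_framework=ModelFramework.CHEMPROP,
)
to_model.set_output_tags(["aqsol", "chemprop"])
to_model.transform(target_column=["solubility"], description="AqSol ChemProp Regression")
```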
workbench/core/transforms/model_to_endpoint/model_to_endpoint.py

```diff
@@ -1,6 +1,7 @@
 """ModelToEndpoint: Deploy an Endpoint for a Model"""
 
 import time
+from botocore.exceptions import ClientError
 from sagemaker import ModelPackage
 from sagemaker.serializers import CSVSerializer
 from sagemaker.deserializers import CSVDeserializer
@@ -102,10 +103,21 @@ class ModelToEndpoint(Transform):
         # Is this a serverless deployment?
         serverless_config = None
         if self.serverless:
+            # For PyTorch or ChemProp we need at least 4GB of memory
+            from workbench.api import ModelFramework
+
+            self.log.info(f"Model Framework: {workbench_model.model_framework}")
+            if workbench_model.model_framework in [ModelFramework.PYTORCH, ModelFramework.CHEMPROP]:
+                if mem_size < 4096:
+                    self.log.important(
+                        f"{workbench_model.model_framework} needs at least 4GB of memory (setting to 4GB)"
+                    )
+                    mem_size = 4096
             serverless_config = ServerlessInferenceConfig(
                 memory_size_in_mb=mem_size,
                 max_concurrency=max_concurrency,
             )
+            self.log.important(f"Serverless Config: Memory={mem_size}MB, MaxConcurrency={max_concurrency}")
 
         # Configure data capture if requested (and not serverless)
         data_capture_config = None
@@ -126,16 +138,37 @@ class ModelToEndpoint(Transform):
 
         # Deploy the Endpoint
         self.log.important(f"Deploying the Endpoint {self.output_name}...")
-
-
-
-
-
-
-
-
-
-
+        try:
+            model_package.deploy(
+                initial_instance_count=1,
+                instance_type=self.instance_type,
+                serverless_inference_config=serverless_config,
+                endpoint_name=self.output_name,
+                serializer=CSVSerializer(),
+                deserializer=CSVDeserializer(),
+                data_capture_config=data_capture_config,
+                tags=aws_tags,
+                container_startup_health_check_timeout=300,
+            )
+        except ClientError as e:
+            # Check if this is the "endpoint config already exists" error
+            if "Cannot create already existing endpoint configuration" in str(e):
+                self.log.warning("Endpoint config already exists, deleting and retrying...")
+                self.sm_client.delete_endpoint_config(EndpointConfigName=self.output_name)
+                # Retry the deploy
+                model_package.deploy(
+                    initial_instance_count=1,
+                    instance_type=self.instance_type,
+                    serverless_inference_config=serverless_config,
+                    endpoint_name=self.output_name,
+                    serializer=CSVSerializer(),
+                    deserializer=CSVDeserializer(),
+                    data_capture_config=data_capture_config,
+                    tags=aws_tags,
+                    container_startup_health_check_timeout=300,
+                )
+            else:
+                raise
 
     def post_transform(self, **kwargs):
         """Post-Transform: Calling onboard() for the Endpoint"""
```
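The retry path duplicates the full `deploy()` argument list. A sketch of an equivalent, tighter formulation, meant to slot into `transform_impl` above (a hypothetical refactor, not what this release ships): build the kwargs once and reuse them on retry.

```python
from botocore.exceptions import ClientError

# Assemble the deploy arguments once so the first attempt and the retry stay in sync
deploy_kwargs = dict(
    initial_instance_count=1,
    instance_type=self.instance_type,
    serverless_inference_config=serverless_config,
    endpoint_name=self.output_name,
    serializer=CSVSerializer(),
    deserializer=CSVDeserializer(),
    data_capture_config=data_capture_config,
    tags=aws_tags,
    container_startup_health_check_timeout=300,
)
try:
    model_package.deploy(**deploy_kwargs)
except ClientError as e:
    if "Cannot create already existing endpoint configuration" in str(e):
        # A stale config can linger from a previous failed deploy; drop it and retry once
        self.sm_client.delete_endpoint_config(EndpointConfigName=self.output_name)
        model_package.deploy(**deploy_kwargs)
    else:
        raise
```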
workbench/core/transforms/pandas_transforms/pandas_to_features.py

```diff
@@ -68,6 +68,15 @@ class PandasToFeatures(Transform):
         self.output_df = input_df.copy()
         self.one_hot_columns = one_hot_columns or []
 
+        # Warn about known AWS Iceberg bug with event_time_column
+        if event_time_column is not None:
+            self.log.warning(
+                f"event_time_column='{event_time_column}' specified. Note: AWS has a known bug with "
+                "Iceberg FeatureGroups where varying event times across multiple days can cause "
+                "duplicate rows in the offline store. Setting event_time_column=None."
+            )
+            self.event_time_column = None
+
         # Now Prepare the DataFrame for its journey into an AWS FeatureGroup
         self.prep_dataframe()
 
@@ -327,9 +336,36 @@ class PandasToFeatures(Transform):
         self.delete_existing()
         self.output_feature_group = self.create_feature_group()
 
+    def mac_spawn_hack(self):
+        """Workaround for macOS Tahoe fork/spawn issue with SageMaker FeatureStore ingest.
+
+        See: https://github.com/aws/sagemaker-python-sdk/issues/5312
+        macOS Tahoe 26+ has issues with forked processes creating boto3 sessions.
+        This forces spawn mode on macOS to avoid the hang.
+        """
+        import platform
+
+        if platform.system() == "Darwin":  # macOS
+            self.log.warning("macOS detected, forcing 'spawn' mode for multiprocessing (Tahoe hang workaround)")
+            import multiprocessing
+
+            try:
+                import multiprocess
+
+                multiprocess.set_start_method("spawn", force=True)
+            except (RuntimeError, ImportError):
+                pass  # Already set or multiprocess not available
+            try:
+                multiprocessing.set_start_method("spawn", force=True)
+            except RuntimeError:
+                pass  # Already set
+
     def transform_impl(self):
         """Transform Implementation: Ingest the data into the Feature Group"""
 
+        # Workaround for macOS Tahoe hang issue
+        self.mac_spawn_hack()
+
         # Now we actually push the data into the Feature Group (called ingestion)
         self.log.important(f"Ingesting rows into Feature Group {self.output_name}...")
         ingest_manager = self.output_feature_group.ingest(self.output_df, max_workers=8, max_processes=4, wait=False)
@@ -373,7 +409,7 @@ class PandasToFeatures(Transform):
 
         # Set Hold Out Ids (if we got them during creation)
         if self.incoming_hold_out_ids:
-            self.output_feature_set.set_training_holdouts(self.
+            self.output_feature_set.set_training_holdouts(self.incoming_hold_out_ids)
 
     def ensure_feature_group_created(self, feature_group):
         status = feature_group.describe().get("FeatureGroupStatus")
@@ -435,7 +471,7 @@ if __name__ == "__main__":
 
     # Create my DF to Feature Set Transform (with one-hot encoding)
     df_to_features = PandasToFeatures("test_features")
-    df_to_features.set_input(data_df, id_column="id", one_hot_columns=["food"])
+    df_to_features.set_input(data_df, id_column="id", event_time_column="date", one_hot_columns=["food"])
     df_to_features.set_output_tags(["test", "small"])
     df_to_features.transform()
```
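The spawn workaround leans on the `force=True` flag: `set_start_method()` normally raises `RuntimeError` if a start method was already chosen for the interpreter, and `force=True` overrides it. The hack sets the method on both the stdlib `multiprocessing` and the third-party `multiprocess` package (if installed), since either may drive the ingest workers. A standalone illustration of the flag's behavior:

```python
import multiprocessing

# force=True re-sets the start method even if one was already chosen
multiprocessing.set_start_method("spawn", force=True)
print(multiprocessing.get_start_method())  # spawn

try:
    multiprocessing.set_start_method("fork")  # no force: second call fails
except RuntimeError as e:
    print(e)  # context has already been set
```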
workbench/core/views/training_view.py

```diff
@@ -3,14 +3,18 @@
 from typing import Union
 
 # Workbench Imports
-from workbench.api import
+from workbench.api import FeatureSet
 from workbench.core.views.view import View
 from workbench.core.views.create_view import CreateView
 from workbench.core.views.view_utils import get_column_list
 
 
 class TrainingView(CreateView):
-    """TrainingView Class: A View with an additional training column
+    """TrainingView Class: A View with an additional training column (80/20 or holdout ids).
+
+    The TrainingView class creates a SQL view that includes all columns from the source table
+    along with an additional boolean column named "training". This view can also include
+    a SQL filter expression to filter the rows included in the view.
 
     Common Usage:
         ```python
@@ -19,8 +23,9 @@ class TrainingView(CreateView):
         training_view = TrainingView.create(fs)
         df = training_view.pull_dataframe()
 
-        # Create a TrainingView with a specific
-        training_view = TrainingView.create(fs,
+        # Create a TrainingView with a specific filter expression
+        training_view = TrainingView.create(fs, id_column="auto_id", filter_expression="age > 30")
+        df = training_view.pull_dataframe()
 
         # Query the view
         df = training_view.query(f"SELECT * FROM {training_view.table} where training = TRUE")
@@ -31,17 +36,21 @@
     def create(
         cls,
         feature_set: FeatureSet,
-
+        *,  # Enforce keyword arguments after feature_set
         id_column: str = None,
         holdout_ids: Union[list[str], list[int], None] = None,
+        filter_expression: str = None,
+        source_table: str = None,
     ) -> Union[View, None]:
         """Factory method to create and return a TrainingView instance.
 
         Args:
             feature_set (FeatureSet): A FeatureSet object
-            source_table (str, optional): The table/view to create the view from. Defaults to None.
             id_column (str, optional): The name of the id column. Defaults to None.
             holdout_ids (Union[list[str], list[int], None], optional): A list of holdout ids. Defaults to None.
+            filter_expression (str, optional): SQL filter expression (e.g., "age > 25 AND status = 'active'").
+                Defaults to None.
+            source_table (str, optional): The table/view to create the view from. Defaults to None.
 
         Returns:
             Union[View, None]: The created View object (or None if failed to create the view)
@@ -69,28 +78,36 @@ class TrainingView(CreateView):
         else:
             id_column = instance.auto_id_column
 
-        #
-
-
-
+        # Enclose each column name in double quotes
+        sql_columns = ", ".join([f'"{column}"' for column in column_list])
+
+        # Build the training assignment logic
+        if holdout_ids:
+            # Format the list of holdout ids for SQL IN clause
+            if all(isinstance(id, str) for id in holdout_ids):
+                formatted_holdout_ids = ", ".join(f"'{id}'" for id in holdout_ids)
+            else:
+                formatted_holdout_ids = ", ".join(map(str, holdout_ids))
 
-
-
-
+            training_logic = f"""CASE
+                WHEN {id_column} IN ({formatted_holdout_ids}) THEN False
+                ELSE True
+            END AS training"""
         else:
-
+            # Default 80/20 split using modulo
+            training_logic = f"""CASE
+                WHEN MOD(ROW_NUMBER() OVER (ORDER BY {id_column}), 10) < 8 THEN True
+                ELSE False
+            END AS training"""
 
-        #
-
+        # Build WHERE clause if filter_expression is provided
+        where_clause = f"\nWHERE {filter_expression}" if filter_expression else ""
 
         # Construct the CREATE VIEW query
         create_view_query = f"""
         CREATE OR REPLACE VIEW {instance.table} AS
-        SELECT {sql_columns},
-
-            ELSE True
-        END AS training
-        FROM {instance.source_table}
+        SELECT {sql_columns}, {training_logic}
+        FROM {instance.source_table}{where_clause}
         """
 
         # Execute the CREATE VIEW query
```
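One detail worth calling out in the holdout branch: string ids are wrapped in single quotes for the SQL `IN` clause, while numeric ids are interpolated as-is. The same formatting logic, pulled out into a standalone helper purely for illustration:

```python
def format_holdout_ids(holdout_ids):
    """Mirror of the view logic above: quote string ids, pass numeric ids through."""
    if all(isinstance(i, str) for i in holdout_ids):
        return ", ".join(f"'{i}'" for i in holdout_ids)
    return ", ".join(map(str, holdout_ids))

print(format_holdout_ids(["a1", "b2"]))  # 'a1', 'b2'
print(format_holdout_ids([1, 2, 3]))     # 1, 2, 3
```

The ids are interpolated directly into the `CREATE VIEW` statement, so they are treated as trusted input here.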
```diff
@@ -99,35 +116,56 @@ class TrainingView(CreateView):
         # Return the View
         return View(instance.data_source, instance.view_name, auto_create_view=False)
 
-
-    def
-
+    @classmethod
+    def create_with_sql(
+        cls,
+        feature_set: FeatureSet,
+        *,
+        sql_query: str,
+        id_column: str = None,
+    ) -> Union[View, None]:
+        """Factory method to create a TrainingView from a custom SQL query.
+
+        This method takes a complete SQL query and adds the default 80/20 training split.
+        Use this when you need complex queries like UNION ALL for oversampling.
 
         Args:
-
-
+            feature_set (FeatureSet): A FeatureSet object
+            sql_query (str): Complete SELECT query (without the final semicolon)
+            id_column (str, optional): The name of the id column for training split. Defaults to None.
+
+        Returns:
+            Union[View, None]: The created View object (or None if failed)
         """
-
+        # Instantiate the TrainingView
+        instance = cls("training", feature_set)
 
-        #
-
-
+        # Sanity check on the id column
+        if not id_column:
+            instance.log.important("No id column specified, using auto_id_column")
+            if not instance.auto_id_column:
+                instance.log.error("No id column specified and no auto_id_column found, aborting")
+                return None
+            id_column = instance.auto_id_column
 
-        #
-
+        # Default 80/20 split using modulo
+        training_logic = f"""CASE
+            WHEN MOD(ROW_NUMBER() OVER (ORDER BY {id_column}), 10) < 8 THEN True
+            ELSE False
+        END AS training"""
 
-        #
+        # Wrap the custom query and add training column
         create_view_query = f"""
-        CREATE OR REPLACE VIEW
-        SELECT {
-
-            ELSE False -- Assign roughly 20% to validation/test
-        END AS training
-        FROM {self.base_table_name}
+        CREATE OR REPLACE VIEW {instance.table} AS
+        SELECT *, {training_logic}
+        FROM ({sql_query}) AS custom_source
         """
 
         # Execute the CREATE VIEW query
-        data_source.execute_statement(create_view_query)
+        instance.data_source.execute_statement(create_view_query)
+
+        # Return the View
+        return View(instance.data_source, instance.view_name, auto_create_view=False)
 
 
 if __name__ == "__main__":
@@ -135,7 +173,7 @@ if __name__ == "__main__":
     from workbench.api import FeatureSet
 
     # Get the FeatureSet
-    fs = FeatureSet("
+    fs = FeatureSet("abalone_features")
 
     # Delete the existing training view
     training_view = TrainingView.create(fs)
@@ -152,9 +190,42 @@ if __name__ == "__main__":
 
     # Create a TrainingView with holdout ids
     my_holdout_ids = list(range(10))
-    training_view = TrainingView.create(fs, id_column="
+    training_view = TrainingView.create(fs, id_column="auto_id", holdout_ids=my_holdout_ids)
 
     # Pull the training data
     df = training_view.pull_dataframe()
     print(df.head())
     print(df["training"].value_counts())
+    print(f"Shape: {df.shape}")
+    print(f"Diameter min: {df['diameter'].min()}, max: {df['diameter'].max()}")
+
+    # Test the filter expression
+    training_view = TrainingView.create(fs, id_column="auto_id", filter_expression="diameter > 0.5")
+    df = training_view.pull_dataframe()
+    print(df.head())
+    print(f"Shape with filter: {df.shape}")
+    print(f"Diameter min: {df['diameter'].min()}, max: {df['diameter'].max()}")
+
+    # Test create_with_sql with a custom query (UNION ALL for oversampling)
+    print("\n--- Testing create_with_sql with oversampling ---")
+    base_table = fs.table
+    replicate_ids = [0, 1, 2]  # Oversample these IDs
+
+    custom_sql = f"""
+    SELECT * FROM {base_table}
+
+    UNION ALL
+
+    SELECT * FROM {base_table}
+    WHERE auto_id IN ({', '.join(map(str, replicate_ids))})
+    """
+
+    training_view = TrainingView.create_with_sql(fs, sql_query=custom_sql, id_column="auto_id")
+    df = training_view.pull_dataframe()
+    print(f"Shape with custom SQL: {df.shape}")
+    print(df["training"].value_counts())
+
+    # Verify oversampling - check if replicated IDs appear twice
+    for rep_id in replicate_ids:
+        count = len(df[df["auto_id"] == rep_id])
+        print(f"ID {rep_id} appears {count} times")
```