workbench 0.8.161__py3-none-any.whl → 0.8.192__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. workbench/algorithms/dataframe/proximity.py +143 -102
  2. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  3. workbench/api/compound.py +1 -1
  4. workbench/api/endpoint.py +12 -0
  5. workbench/api/feature_set.py +4 -4
  6. workbench/api/meta.py +5 -2
  7. workbench/api/model.py +16 -12
  8. workbench/api/monitor.py +1 -16
  9. workbench/core/artifacts/artifact.py +11 -3
  10. workbench/core/artifacts/data_capture_core.py +355 -0
  11. workbench/core/artifacts/endpoint_core.py +168 -78
  12. workbench/core/artifacts/feature_set_core.py +72 -13
  13. workbench/core/artifacts/model_core.py +50 -15
  14. workbench/core/artifacts/monitor_core.py +33 -248
  15. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  16. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  17. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  18. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  19. workbench/core/transforms/features_to_model/features_to_model.py +9 -4
  20. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  21. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  22. workbench/core/views/training_view.py +49 -53
  23. workbench/core/views/view.py +51 -1
  24. workbench/core/views/view_utils.py +4 -4
  25. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  26. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  27. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  28. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  29. workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
  30. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  31. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  32. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  33. workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
  34. workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
  35. workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
  36. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  37. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  38. workbench/model_scripts/pytorch_model/pytorch.template +19 -20
  39. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  40. workbench/model_scripts/script_generation.py +7 -2
  41. workbench/model_scripts/uq_models/mapie.template +492 -0
  42. workbench/model_scripts/uq_models/requirements.txt +1 -0
  43. workbench/model_scripts/xgb_model/xgb_model.template +31 -40
  44. workbench/repl/workbench_shell.py +11 -6
  45. workbench/scripts/lambda_launcher.py +63 -0
  46. workbench/scripts/ml_pipeline_batch.py +137 -0
  47. workbench/scripts/ml_pipeline_sqs.py +186 -0
  48. workbench/scripts/monitor_cloud_watch.py +20 -100
  49. workbench/utils/aws_utils.py +4 -3
  50. workbench/utils/chem_utils/__init__.py +0 -0
  51. workbench/utils/chem_utils/fingerprints.py +134 -0
  52. workbench/utils/chem_utils/misc.py +194 -0
  53. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  54. workbench/utils/chem_utils/mol_standardize.py +450 -0
  55. workbench/utils/chem_utils/mol_tagging.py +348 -0
  56. workbench/utils/chem_utils/projections.py +209 -0
  57. workbench/utils/chem_utils/salts.py +256 -0
  58. workbench/utils/chem_utils/sdf.py +292 -0
  59. workbench/utils/chem_utils/toxicity.py +250 -0
  60. workbench/utils/chem_utils/vis.py +253 -0
  61. workbench/utils/cloudwatch_handler.py +1 -1
  62. workbench/utils/cloudwatch_utils.py +137 -0
  63. workbench/utils/config_manager.py +3 -7
  64. workbench/utils/endpoint_utils.py +5 -7
  65. workbench/utils/license_manager.py +2 -6
  66. workbench/utils/model_utils.py +76 -30
  67. workbench/utils/monitor_utils.py +44 -62
  68. workbench/utils/pandas_utils.py +3 -3
  69. workbench/utils/shap_utils.py +10 -2
  70. workbench/utils/workbench_logging.py +0 -3
  71. workbench/utils/workbench_sqs.py +1 -1
  72. workbench/utils/xgboost_model_utils.py +283 -145
  73. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  74. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  75. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  76. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/METADATA +4 -4
  77. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/RECORD +81 -76
  78. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/entry_points.txt +3 -0
  79. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  80. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  81. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  82. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  83. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  84. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  85. workbench/model_scripts/pytorch_model/generated_model_script.py +0 -565
  86. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  87. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  88. workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
  89. workbench/model_scripts/xgb_model/generated_model_script.py +0 -477
  90. workbench/utils/chem_utils.py +0 -1556
  91. workbench/utils/execution_environment.py +0 -211
  92. workbench/utils/fast_inference.py +0 -167
  93. workbench/utils/resource_utils.py +0 -39
  94. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/WHEEL +0 -0
  95. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/licenses/LICENSE +0 -0
  96. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/top_level.txt +0 -0
@@ -8,7 +8,7 @@ import pandas as pd
8
8
  import numpy as np
9
9
  from io import StringIO
10
10
  import awswrangler as wr
11
- from typing import Union, Optional
11
+ from typing import Union, Optional, Tuple
12
12
  import hashlib
13
13
 
14
14
  # Model Performance Scores
@@ -32,11 +32,11 @@ from sagemaker import Predictor
32
32
  from workbench.core.artifacts.artifact import Artifact
33
33
  from workbench.core.artifacts import FeatureSetCore, ModelCore, ModelType
34
34
  from workbench.utils.endpoint_metrics import EndpointMetrics
35
- from workbench.utils.fast_inference import fast_inference
36
35
  from workbench.utils.cache import Cache
37
36
  from workbench.utils.s3_utils import compute_s3_object_hash
38
37
  from workbench.utils.model_utils import uq_metrics
39
38
  from workbench.utils.xgboost_model_utils import cross_fold_inference
39
+ from workbench_bridges.endpoints.fast_inference import fast_inference
40
40
 
41
41
 
42
42
  class EndpointCore(Artifact):
@@ -164,11 +164,17 @@ class EndpointCore(Artifact):
164
164
  """
165
165
  return "Serverless" in self.endpoint_meta["InstanceType"]
166
166
 
167
- def add_data_capture(self):
167
+ def data_capture(self):
168
+ """Get the DataCaptureCore class for this endpoint"""
169
+ from workbench.core.artifacts.data_capture_core import DataCaptureCore
170
+
171
+ return DataCaptureCore(self.endpoint_name)
172
+
173
+ def enable_data_capture(self):
168
174
  """Add data capture to the endpoint"""
169
- self.get_monitor().add_data_capture()
175
+ self.data_capture().enable()
170
176
 
171
- def get_monitor(self):
177
+ def monitor(self):
172
178
  """Get the MonitorCore class for this endpoint"""
173
179
  from workbench.core.artifacts.monitor_core import MonitorCore
174
180
 
@@ -350,7 +356,7 @@ class EndpointCore(Artifact):
350
356
  return pd.DataFrame()
351
357
 
352
358
  # Grab the evaluation data from the FeatureSet
353
- table = fs.view("training").table
359
+ table = model.training_view().table
354
360
  eval_df = fs.query(f'SELECT * FROM "{table}" where training = FALSE')
355
361
  capture_name = "auto_inference" if capture else None
356
362
  return self.inference(eval_df, capture_name, id_column=fs.id_column)
@@ -378,16 +384,17 @@ class EndpointCore(Artifact):
378
384
  self.log.important("No model associated with this endpoint, running 'no frills' inference...")
379
385
  return self.fast_inference(eval_df)
380
386
 
387
+ # Grab the model features and target column
388
+ model = ModelCore(self.model_name)
389
+ features = model.features()
390
+ target_column = model.target()
391
+
381
392
  # Run predictions on the evaluation data
382
- prediction_df = self._predict(eval_df, drop_error_rows)
393
+ prediction_df = self._predict(eval_df, features, drop_error_rows)
383
394
  if prediction_df.empty:
384
395
  self.log.warning("No predictions were made. Returning empty DataFrame.")
385
396
  return prediction_df
386
397
 
387
- # Get the target column
388
- model = ModelCore(self.model_name)
389
- target_column = model.target()
390
-
391
398
  # Sanity Check that the target column is present
392
399
  if target_column and (target_column not in prediction_df.columns):
393
400
  self.log.important(f"Target Column {target_column} not found in prediction_df!")
@@ -413,28 +420,95 @@ class EndpointCore(Artifact):
413
420
 
414
421
  # Capture the inference results and metrics
415
422
  if capture_name is not None:
423
+
424
+ # If we don't have an id_column, we'll pull it from the model's FeatureSet
425
+ if id_column is None:
426
+ fs = FeatureSetCore(model.get_input())
427
+ id_column = fs.id_column
416
428
  description = capture_name.replace("_", " ").title()
417
- features = model.features()
418
429
  self._capture_inference_results(
419
430
  capture_name, prediction_df, target_column, model_type, metrics, description, features, id_column
420
431
  )
421
432
 
422
- # Capture CrossFold Inference Results
423
- cross_fold_metrics = cross_fold_inference(model)
424
- if cross_fold_metrics:
425
- # Now put into the Parameter Store Model Inference Namespace
426
- self.param_store.upsert(f"/workbench/models/{model.name}/inference/cross_fold", cross_fold_metrics)
427
-
428
433
  # For UQ Models we also capture the uncertainty metrics
429
434
  if model_type in [ModelType.UQ_REGRESSOR]:
430
435
  metrics = uq_metrics(prediction_df, target_column)
431
-
432
- # Now put into the Parameter Store Model Inference Namespace
433
436
  self.param_store.upsert(f"/workbench/models/{model.name}/inference/{capture_name}", metrics)
434
437
 
435
438
  # Return the prediction DataFrame
436
439
  return prediction_df
437
440
 
441
+ def cross_fold_inference(self, nfolds: int = 5) -> Tuple[dict, pd.DataFrame]:
442
+ """Run cross-fold inference (only works for XGBoost models)
443
+
444
+ Args:
445
+ nfolds (int): Number of folds to use for cross-fold (default: 5)
446
+
447
+ Returns:
448
+ Tuple[dict, pd.DataFrame]: Tuple of (cross_fold_metrics, out_of_fold_df)
449
+ """
450
+
451
+ # Grab our model
452
+ model = ModelCore(self.model_name)
453
+
454
+ # Compute CrossFold Metrics
455
+ cross_fold_metrics, out_of_fold_df = cross_fold_inference(model, nfolds=nfolds)
456
+ if cross_fold_metrics:
457
+ self.param_store.upsert(f"/workbench/models/{model.name}/inference/cross_fold", cross_fold_metrics)
458
+
459
+ # Capture the results
460
+ capture_name = "full_cross_fold"
461
+ description = capture_name.replace("_", " ").title()
462
+ target_column = model.target()
463
+ model_type = model.model_type
464
+
465
+ # Get the id_column from the model's FeatureSet
466
+ fs = FeatureSetCore(model.get_input())
467
+ id_column = fs.id_column
468
+
469
+ # Is this a UQ Model? If so, run full inference and merge the results
470
+ additional_columns = []
471
+ if model_type == ModelType.UQ_REGRESSOR:
472
+ self.log.important("UQ Regressor detected, running full inference to get uncertainty estimates...")
473
+
474
+ # Get the training view dataframe for inference
475
+ training_df = model.training_view().pull_dataframe()
476
+
477
+ # Run inference on the endpoint to get UQ outputs
478
+ uq_df = self.inference(training_df)
479
+
480
+ # Identify UQ-specific columns (quantiles and prediction_std)
481
+ uq_columns = [col for col in uq_df.columns if col.startswith("q_") or col == "prediction_std"]
482
+
483
+ # Merge UQ columns with out-of-fold predictions
484
+ if uq_columns:
485
+ # Keep id_column and UQ columns, drop 'prediction' to avoid conflict when merging
486
+ uq_df = uq_df[[id_column] + uq_columns]
487
+
488
+ # Drop duplicates in uq_df based on id_column
489
+ uq_df = uq_df.drop_duplicates(subset=[id_column])
490
+
491
+ # Merge UQ columns into out_of_fold_df
492
+ out_of_fold_df = pd.merge(out_of_fold_df, uq_df, on=id_column, how="left")
493
+ additional_columns = uq_columns
494
+ self.log.info(f"Added UQ columns: {', '.join(additional_columns)}")
495
+
496
+ # Also compute UQ metrics
497
+ metrics = uq_metrics(out_of_fold_df, target_column)
498
+ self.param_store.upsert(f"/workbench/models/{model.name}/inference/{capture_name}", metrics)
499
+
500
+ self._capture_inference_results(
501
+ capture_name,
502
+ out_of_fold_df,
503
+ target_column,
504
+ model_type,
505
+ pd.DataFrame([cross_fold_metrics["summary_metrics"]]),
506
+ description,
507
+ features=additional_columns,
508
+ id_column=id_column,
509
+ )
510
+ return cross_fold_metrics, out_of_fold_df
511
+
438
512
  def fast_inference(self, eval_df: pd.DataFrame, threads: int = 4) -> pd.DataFrame:
439
513
  """Run inference on the Endpoint using the provided DataFrame
440
514
 
@@ -450,11 +524,12 @@ class EndpointCore(Artifact):
450
524
  """
451
525
  return fast_inference(self.name, eval_df, self.sm_session, threads=threads)
452
526
 
453
- def _predict(self, eval_df: pd.DataFrame, drop_error_rows: bool = False) -> pd.DataFrame:
454
- """Internal: Run prediction on the given observations in the given DataFrame
527
+ def _predict(self, eval_df: pd.DataFrame, features: list[str], drop_error_rows: bool = False) -> pd.DataFrame:
528
+ """Internal: Run prediction on observations in the given DataFrame
455
529
 
456
530
  Args:
457
531
  eval_df (pd.DataFrame): DataFrame to run predictions on (must have superset of features)
532
+ features (list[str]): List of feature column names needed for prediction
458
533
  drop_error_rows (bool): If True, drop rows that had endpoint errors/issues (default=False)
459
534
  Returns:
460
535
  pd.DataFrame: Return the DataFrame with additional columns, prediction and any _proba columns
@@ -465,19 +540,12 @@ class EndpointCore(Artifact):
465
540
  self.log.warning("Evaluation DataFrame has 0 rows. No predictions to run.")
466
541
  return pd.DataFrame(columns=eval_df.columns) # Return empty DataFrame with same structure
467
542
 
468
- # Sanity check: Does the Model have Features?
469
- features = ModelCore(self.model_name).features()
470
- if not features:
471
- self.log.warning("Model does not have features defined, using all columns in the DataFrame")
472
- else:
473
- # Sanity check: Does the DataFrame have the required features?
474
- df_columns_lower = set(col.lower() for col in eval_df.columns)
475
- features_lower = set(feature.lower() for feature in features)
476
-
477
- # Check if the features are a subset of the DataFrame columns (case-insensitive)
478
- if not features_lower.issubset(df_columns_lower):
479
- missing_features = features_lower - df_columns_lower
480
- raise ValueError(f"DataFrame does not contain required features: {missing_features}")
543
+ # Sanity check: Does the DataFrame have the required features?
544
+ df_columns_lower = set(col.lower() for col in eval_df.columns)
545
+ features_lower = set(feature.lower() for feature in features)
546
+ if not features_lower.issubset(df_columns_lower):
547
+ missing_features = features_lower - df_columns_lower
548
+ raise ValueError(f"DataFrame does not contain required features: {missing_features}")
481
549
 
482
550
  # Create our Endpoint Predictor Class
483
551
  predictor = Predictor(
@@ -634,6 +702,10 @@ class EndpointCore(Artifact):
634
702
  @staticmethod
635
703
  def _hash_dataframe(df: pd.DataFrame, hash_length: int = 8):
636
704
  # Internal: Compute a data hash for the dataframe
705
+ if df.empty:
706
+ return "--hash--"
707
+
708
+ # Sort the dataframe by columns to ensure consistent ordering
637
709
  df = df.copy()
638
710
  df = df.sort_values(by=sorted(df.columns.tolist()))
639
711
  row_hashes = pd.util.hash_pandas_object(df, index=False)
@@ -688,8 +760,8 @@ class EndpointCore(Artifact):
688
760
  wr.s3.to_csv(metrics, f"{inference_capture_path}/inference_metrics.csv", index=False)
689
761
 
690
762
  # Grab the target column, prediction column, any _proba columns, and the ID column (if present)
691
- prediction_col = "prediction" if "prediction" in pred_results_df.columns else "predictions"
692
- output_columns = [target_column, prediction_col]
763
+ output_columns = [target_column]
764
+ output_columns += [col for col in pred_results_df.columns if "prediction" in col]
693
765
 
694
766
  # Add any _proba columns to the output columns
695
767
  output_columns += [col for col in pred_results_df.columns if col.endswith("_proba")]
@@ -699,7 +771,7 @@ class EndpointCore(Artifact):
699
771
 
700
772
  # Add the ID column
701
773
  if id_column and id_column in pred_results_df.columns:
702
- output_columns.append(id_column)
774
+ output_columns.insert(0, id_column)
703
775
 
704
776
  # Write the predictions to our S3 Model Inference Folder
705
777
  self.log.info(f"Writing predictions to {inference_capture_path}/inference_predictions.csv")
@@ -713,18 +785,10 @@ class EndpointCore(Artifact):
713
785
  # Note: Unlike other dataframes here, we want to write the index (labels) to the CSV
714
786
  wr.s3.to_csv(conf_mtx, f"{inference_capture_path}/inference_cm.csv", index=True)
715
787
 
716
- # Generate SHAP values for our Prediction Dataframe
717
- # generate_shap_values(self.endpoint_name, model_type.value, pred_results_df, inference_capture_path)
718
-
719
788
  # Now recompute the details for our Model
720
- self.log.important(f"Recomputing Details for {self.model_name} to show latest Inference Results...")
789
+ self.log.important(f"Loading inference metrics for {self.model_name}...")
721
790
  model = ModelCore(self.model_name)
722
791
  model._load_inference_metrics(capture_name)
723
- model.details()
724
-
725
- # Recompute the details so that inference model metrics are updated
726
- self.log.important(f"Recomputing Details for {self.name} to show latest Inference Results...")
727
- self.details()
728
792
 
729
793
  def regression_metrics(self, target_column: str, prediction_df: pd.DataFrame) -> pd.DataFrame:
730
794
  """Compute the performance metrics for this Endpoint
@@ -876,9 +940,11 @@ class EndpointCore(Artifact):
876
940
 
877
941
  def generate_confusion_matrix(self, target_column: str, prediction_df: pd.DataFrame) -> pd.DataFrame:
878
942
  """Compute the confusion matrix for this Endpoint
943
+
879
944
  Args:
880
945
  target_column (str): Name of the target column
881
946
  prediction_df (pd.DataFrame): DataFrame with the prediction results
947
+
882
948
  Returns:
883
949
  pd.DataFrame: DataFrame with the confusion matrix
884
950
  """
@@ -887,25 +953,20 @@ class EndpointCore(Artifact):
887
953
  prediction_col = "prediction" if "prediction" in prediction_df.columns else "predictions"
888
954
  y_pred = prediction_df[prediction_col]
889
955
 
890
- # Check if our model has class labels, if not we'll use the unique labels in the prediction
891
- class_labels = ModelCore(self.model_name).class_labels()
892
- if class_labels is None:
893
- class_labels = sorted(list(set(y_true) | set(y_pred)))
956
+ # Get model class labels
957
+ model_class_labels = ModelCore(self.model_name).class_labels()
894
958
 
895
- # Compute the confusion matrix (sklearn confusion_matrix)
896
- conf_mtx = confusion_matrix(y_true, y_pred, labels=class_labels)
959
+ # Use model labels if available, otherwise infer from data
960
+ if model_class_labels:
961
+ self.log.important("Using model class labels for confusion matrix ordering...")
962
+ labels = model_class_labels
963
+ else:
964
+ labels = sorted(list(set(y_true) | set(y_pred)))
897
965
 
898
- # Create a DataFrame
899
- conf_mtx_df = pd.DataFrame(conf_mtx, index=class_labels, columns=class_labels)
966
+ # Compute confusion matrix and create DataFrame
967
+ conf_mtx = confusion_matrix(y_true, y_pred, labels=labels)
968
+ conf_mtx_df = pd.DataFrame(conf_mtx, index=labels, columns=labels)
900
969
  conf_mtx_df.index.name = "labels"
901
-
902
- # Check if our model has class labels. If so make the index and columns ordered
903
- model_class_labels = ModelCore(self.model_name).class_labels()
904
- if model_class_labels:
905
- self.log.important("Reordering the confusion matrix based on model class labels...")
906
- conf_mtx_df.index = pd.Categorical(conf_mtx_df.index, categories=model_class_labels, ordered=True)
907
- conf_mtx_df.columns = pd.Categorical(conf_mtx_df.columns, categories=model_class_labels, ordered=True)
908
- conf_mtx_df = conf_mtx_df.sort_index().sort_index(axis=1)
909
970
  return conf_mtx_df
910
971
 
911
972
  def endpoint_config_name(self) -> str:
@@ -932,9 +993,9 @@ class EndpointCore(Artifact):
932
993
  self.upsert_workbench_meta({"workbench_input": input})
933
994
 
934
995
  def delete(self):
935
- """ "Delete an existing Endpoint: Underlying Models, Configuration, and Endpoint"""
996
+ """Delete an existing Endpoint: Underlying Models, Configuration, and Endpoint"""
936
997
  if not self.exists():
937
- self.log.warning(f"Trying to delete an Model that doesn't exist: {self.name}")
998
+ self.log.warning(f"Trying to delete an Endpoint that doesn't exist: {self.name}")
938
999
 
939
1000
  # Remove this endpoint from the list of registered endpoints
940
1001
  self.log.info(f"Removing {self.name} from the list of registered endpoints...")
@@ -975,12 +1036,23 @@ class EndpointCore(Artifact):
975
1036
  cls.log.info(f"Deleting Monitoring Schedule {schedule['MonitoringScheduleName']}...")
976
1037
  cls.sm_client.delete_monitoring_schedule(MonitoringScheduleName=schedule["MonitoringScheduleName"])
977
1038
 
978
- # Recursively delete all endpoint S3 artifacts (inference, data capture, monitoring, etc)
1039
+ # Recursively delete all endpoint S3 artifacts (inference, etc)
1040
+ # Note: We do not want to delete the data_capture/ files since these
1041
+ # might be used for collection and data drift analysis
979
1042
  base_endpoint_path = f"{cls.endpoints_s3_path}/{endpoint_name}"
980
- s3_objects = wr.s3.list_objects(base_endpoint_path, boto3_session=cls.boto3_session)
981
- cls.log.info(f"Deleting S3 Objects at {base_endpoint_path}...")
982
- cls.log.info(f"{s3_objects}")
983
- wr.s3.delete_objects(s3_objects, boto3_session=cls.boto3_session)
1043
+ all_s3_objects = wr.s3.list_objects(base_endpoint_path, boto3_session=cls.boto3_session)
1044
+
1045
+ # Filter out objects that contain 'data_capture/' in their path
1046
+ s3_objects_to_delete = [obj for obj in all_s3_objects if "/data_capture/" not in obj]
1047
+ cls.log.info(f"Found {len(all_s3_objects)} total objects at {base_endpoint_path}")
1048
+ cls.log.info(f"Filtering out data_capture files, will delete {len(s3_objects_to_delete)} objects...")
1049
+ cls.log.info(f"Objects to delete: {s3_objects_to_delete}")
1050
+
1051
+ if s3_objects_to_delete:
1052
+ wr.s3.delete_objects(s3_objects_to_delete, boto3_session=cls.boto3_session)
1053
+ cls.log.info(f"Successfully deleted {len(s3_objects_to_delete)} objects")
1054
+ else:
1055
+ cls.log.info("No objects to delete (only data_capture files found)")
984
1056
 
985
1057
  # Delete any dataframes that were stored in the Dataframe Cache
986
1058
  cls.log.info("Deleting Dataframe Cache...")
@@ -1031,7 +1103,7 @@ class EndpointCore(Artifact):
1031
1103
  if __name__ == "__main__":
1032
1104
  """Exercise the Endpoint Class"""
1033
1105
  from workbench.api import FeatureSet
1034
- from workbench.utils.endpoint_utils import fs_evaluation_data
1106
+ from workbench.utils.endpoint_utils import get_evaluation_data
1035
1107
  import random
1036
1108
 
1037
1109
  # Grab an EndpointCore object and pull some information from it
@@ -1039,7 +1111,7 @@ if __name__ == "__main__":
1039
1111
 
1040
1112
  # Test various error conditions (set row 42 length to pd.NA)
1041
1113
  # Note: This test should return ALL rows
1042
- my_eval_df = fs_evaluation_data(my_endpoint)
1114
+ my_eval_df = get_evaluation_data(my_endpoint)
1043
1115
  my_eval_df.at[42, "length"] = pd.NA
1044
1116
  pred_results = my_endpoint.inference(my_eval_df, drop_error_rows=True)
1045
1117
  print(f"Sent rows: {len(my_eval_df)}")
@@ -1047,6 +1119,9 @@ if __name__ == "__main__":
1047
1119
  assert len(pred_results) == len(my_eval_df), "Predictions should match the number of sent rows"
1048
1120
 
1049
1121
  # Now we put in an invalid value
1122
+ print("*" * 80)
1123
+ print("NOW TESTING ERROR CONDITIONS...")
1124
+ print("*" * 80)
1050
1125
  my_eval_df.at[42, "length"] = "invalid_value"
1051
1126
  pred_results = my_endpoint.inference(my_eval_df, drop_error_rows=True)
1052
1127
  print(f"Sent rows: {len(my_eval_df)}")
@@ -1086,13 +1161,20 @@ if __name__ == "__main__":
1086
1161
  df = fs.pull_dataframe()[:100]
1087
1162
  cap_df = df.copy()
1088
1163
  cap_df.columns = [col.upper() for col in cap_df.columns]
1089
- my_endpoint._predict(cap_df)
1164
+ my_endpoint.inference(cap_df)
1090
1165
 
1091
1166
  # Boolean Type Test
1092
1167
  df["bool_column"] = [random.choice([True, False]) for _ in range(len(df))]
1093
- result_df = my_endpoint._predict(df)
1168
+ result_df = my_endpoint.inference(df)
1094
1169
  assert result_df["bool_column"].dtype == bool
1095
1170
 
1171
+ # Missing Feature Test
1172
+ missing_df = df.drop(columns=["length"])
1173
+ try:
1174
+ my_endpoint.inference(missing_df)
1175
+ except ValueError as e:
1176
+ print(f"Expected error for missing feature: {e}")
1177
+
1096
1178
  # Run Auto Inference on the Endpoint (uses the FeatureSet)
1097
1179
  print("Running Auto Inference...")
1098
1180
  my_endpoint.auto_inference()
@@ -1100,13 +1182,20 @@ if __name__ == "__main__":
1100
1182
  # Run Inference where we provide the data
1101
1183
  # Note: This dataframe could be from a FeatureSet or any other source
1102
1184
  print("Running Inference...")
1103
- my_eval_df = fs_evaluation_data(my_endpoint)
1185
+ my_eval_df = get_evaluation_data(my_endpoint)
1104
1186
  pred_results = my_endpoint.inference(my_eval_df)
1105
1187
 
1106
1188
  # Now set capture=True to save inference results and metrics
1107
- my_eval_df = fs_evaluation_data(my_endpoint)
1189
+ my_eval_df = get_evaluation_data(my_endpoint)
1108
1190
  pred_results = my_endpoint.inference(my_eval_df, capture_name="holdout_xyz")
1109
1191
 
1192
+ # Run predictions using the fast_inference method
1193
+ fast_results = my_endpoint.fast_inference(my_eval_df)
1194
+
1195
+ # Test the cross_fold_inference method
1196
+ print("Running Cross-Fold Inference...")
1197
+ metrics, all_results = my_endpoint.cross_fold_inference()
1198
+
1110
1199
  # Run Inference and metrics for a Classification Endpoint
1111
1200
  class_endpoint = EndpointCore("wine-classification")
1112
1201
  auto_predictions = class_endpoint.auto_inference()
@@ -1115,8 +1204,9 @@ if __name__ == "__main__":
1115
1204
  target = "wine_class"
1116
1205
  print(class_endpoint.generate_confusion_matrix(target, auto_predictions))
1117
1206
 
1118
- # Run predictions using the fast_inference method
1119
- fast_results = my_endpoint.fast_inference(my_eval_df)
1207
+ # Test the cross_fold_inference method
1208
+ print("Running Cross-Fold Inference...")
1209
+ metrics, all_results = class_endpoint.cross_fold_inference()
1120
1210
 
1121
1211
  # Test the class method delete (commented out for now)
1122
1212
  # from workbench.api import Model
@@ -17,7 +17,7 @@ from workbench.core.artifacts.artifact import Artifact
17
17
  from workbench.core.artifacts.data_source_factory import DataSourceFactory
18
18
  from workbench.core.artifacts.athena_source import AthenaSource
19
19
 
20
- from typing import TYPE_CHECKING
20
+ from typing import TYPE_CHECKING, Optional, List, Union
21
21
 
22
22
  from workbench.utils.aws_utils import aws_throttle
23
23
 
@@ -194,24 +194,24 @@ class FeatureSetCore(Artifact):
194
194
 
195
195
  return View(self, view_name)
196
196
 
197
- def set_display_columns(self, diplay_columns: list[str]):
197
+ def set_display_columns(self, display_columns: list[str]):
198
198
  """Set the display columns for this Data Source
199
199
 
200
200
  Args:
201
- diplay_columns (list[str]): The display columns for this Data Source
201
+ display_columns (list[str]): The display columns for this Data Source
202
202
  """
203
203
  # Check mismatch of display columns to computation columns
204
204
  c_view = self.view("computation")
205
205
  computation_columns = c_view.columns
206
- mismatch_columns = [col for col in diplay_columns if col not in computation_columns]
206
+ mismatch_columns = [col for col in display_columns if col not in computation_columns]
207
207
  if mismatch_columns:
208
208
  self.log.monitor(f"Display View/Computation mismatch: {mismatch_columns}")
209
209
 
210
- self.log.important(f"Setting Display Columns...{diplay_columns}")
210
+ self.log.important(f"Setting Display Columns...{display_columns}")
211
211
  from workbench.core.views import DisplayView
212
212
 
213
213
  # Create a NEW display view
214
- DisplayView.create(self, source_table=c_view.table, column_list=diplay_columns)
214
+ DisplayView.create(self, source_table=c_view.table, column_list=display_columns)
215
215
 
216
216
  def set_computation_columns(self, computation_columns: list[str], reset_display: bool = True):
217
217
  """Set the computation columns for this FeatureSet
@@ -509,6 +509,48 @@ class FeatureSetCore(Artifact):
509
509
  ].tolist()
510
510
  return hold_out_ids
511
511
 
512
+ def set_training_filter(self, filter_expression: Optional[str] = None):
513
+ """Set a filter expression for the training view for this FeatureSet
514
+
515
+ Args:
516
+ filter_expression (Optional[str]): A SQL filter expression (e.g., "age > 25 AND status = 'active'")
517
+ If None or empty string, will reset to training view with no filter
518
+ (default: None)
519
+ """
520
+ from workbench.core.views import TrainingView
521
+
522
+ # Grab the existing holdout ids
523
+ holdout_ids = self.get_training_holdouts()
524
+
525
+ # Create a NEW training view
526
+ self.log.important(f"Setting Training Filter: {filter_expression}")
527
+ TrainingView.create(
528
+ self, id_column=self.id_column, holdout_ids=holdout_ids, filter_expression=filter_expression
529
+ )
530
+
531
+ def exclude_ids_from_training(self, ids: List[Union[str, int]], column_name: Optional[str] = None):
532
+ """Exclude a list of IDs from the training view
533
+
534
+ Args:
535
+ ids (List[Union[str, int]]): List of IDs to exclude from training
536
+ column_name (Optional[str]): Column name to filter on.
537
+ If None, uses self.id_column (default: None)
538
+ """
539
+ # Use the default id_column if not specified
540
+ column = column_name or self.id_column
541
+
542
+ # Handle empty list case
543
+ if not ids:
544
+ self.log.warning("No IDs provided to exclude")
545
+ return
546
+
547
+ # Build the filter expression with proper SQL quoting
548
+ quoted_ids = ", ".join([repr(id) for id in ids])
549
+ filter_expression = f"{column} NOT IN ({quoted_ids})"
550
+
551
+ # Apply the filter
552
+ self.set_training_filter(filter_expression)
553
+
512
554
  @classmethod
513
555
  def delete_views(cls, table: str, database: str):
514
556
  """Delete any views associated with this FeatureSet
@@ -707,7 +749,7 @@ if __name__ == "__main__":
707
749
 
708
750
  # Test getting the holdout ids
709
751
  print("Getting the hold out ids...")
710
- holdout_ids = my_features.get_training_holdouts("id")
752
+ holdout_ids = my_features.get_training_holdouts()
711
753
  print(f"Holdout IDs: {holdout_ids}")
712
754
 
713
755
  # Get a sample of the data
@@ -729,16 +771,33 @@ if __name__ == "__main__":
729
771
  table = my_features.view("training").table
730
772
  df = my_features.query(f'SELECT id, name FROM "{table}"')
731
773
  my_holdout_ids = [id for id in df["id"] if id < 20]
732
- my_features.set_training_holdouts("id", my_holdout_ids)
733
-
734
- # Test the hold out set functionality with strings
735
- print("Setting hold out ids (strings)...")
736
- my_holdout_ids = [name for name in df["name"] if int(name.split(" ")[1]) > 80]
737
- my_features.set_training_holdouts("name", my_holdout_ids)
774
+ my_features.set_training_holdouts(my_holdout_ids)
738
775
 
739
776
  # Get the training data
740
777
  print("Getting the training data...")
741
778
  training_data = my_features.get_training_data()
779
+ print(f"Training Data: {training_data.shape}")
780
+
781
+ # Test the filter expression functionality
782
+ print("Setting a filter expression...")
783
+ my_features.set_training_filter("id < 50 AND height > 65.0")
784
+ training_data = my_features.get_training_data()
785
+ print(f"Training Data: {training_data.shape}")
786
+ print(training_data)
787
+
788
+ # Remove training filter
789
+ print("Removing the filter expression...")
790
+ my_features.set_training_filter(None)
791
+ training_data = my_features.get_training_data()
792
+ print(f"Training Data: {training_data.shape}")
793
+ print(training_data)
794
+
795
+ # Test excluding ids from training
796
+ print("Excluding ids from training...")
797
+ my_features.exclude_ids_from_training([1, 2, 3, 4, 5])
798
+ training_data = my_features.get_training_data()
799
+ print(f"Training Data: {training_data.shape}")
800
+ print(training_data)
742
801
 
743
802
  # Now delete the AWS artifacts associated with this Feature Set
744
803
  # print("Deleting Workbench Feature Set...")