workbench 0.8.177__py3-none-any.whl → 0.8.179__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of workbench has been flagged as possibly problematic by the registry.
- workbench/api/endpoint.py +3 -2
- workbench/core/artifacts/endpoint_core.py +5 -5
- workbench/core/artifacts/feature_set_core.py +67 -8
- workbench/core/views/training_view.py +38 -48
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +44 -45
- workbench/model_scripts/custom_models/uq_models/mapie.template +42 -43
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +7 -22
- workbench/model_scripts/custom_models/uq_models/ngboost.template +5 -12
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
- workbench/model_scripts/pytorch_model/pytorch.template +9 -18
- workbench/model_scripts/quant_regression/quant_regression.template +5 -10
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/xgb_model/generated_model_script.py +24 -33
- workbench/model_scripts/xgb_model/xgb_model.template +23 -32
- workbench/scripts/ml_pipeline_sqs.py +14 -2
- workbench/utils/model_utils.py +12 -2
- workbench/utils/xgboost_model_utils.py +161 -138
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/METADATA +1 -1
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/RECORD +27 -27
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/WHEEL +0 -0
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/top_level.txt +0 -0
workbench/api/endpoint.py
CHANGED
@@ -4,6 +4,7 @@ Endpoints can be viewed in the AWS Sagemaker interfaces or in the Workbench
 Dashboard UI, which provides additional model details and performance metrics"""
 
 import pandas as pd
+from typing import Tuple
 
 # Workbench Imports
 from workbench.core.artifacts.endpoint_core import EndpointCore
@@ -70,14 +71,14 @@ class Endpoint(EndpointCore):
         """
         return super().fast_inference(eval_df, threads=threads)
 
-    def cross_fold_inference(self, nfolds: int = 5) -> dict:
+    def cross_fold_inference(self, nfolds: int = 5) -> Tuple[dict, pd.DataFrame]:
         """Run cross-fold inference (only works for XGBoost models)
 
         Args:
             nfolds (int): The number of folds to use for cross-validation (default: 5)
 
         Returns:
-            dict: A dictionary with
+            Tuple(dict, pd.DataFrame): A tuple containing a dictionary of metrics and a DataFrame with predictions
         """
         return super().cross_fold_inference(nfolds)
 
workbench/core/artifacts/endpoint_core.py
CHANGED

@@ -8,7 +8,7 @@ import pandas as pd
 import numpy as np
 from io import StringIO
 import awswrangler as wr
-from typing import Union, Optional
+from typing import Union, Optional, Tuple
 import hashlib
 
 # Model Performance Scores
@@ -436,24 +436,24 @@ class EndpointCore(Artifact):
         # Return the prediction DataFrame
         return prediction_df
 
-    def cross_fold_inference(self, nfolds: int = 5) -> dict:
+    def cross_fold_inference(self, nfolds: int = 5) -> Tuple[dict, pd.DataFrame]:
         """Run cross-fold inference (only works for XGBoost models)
 
         Args:
             nfolds (int): Number of folds to use for cross-fold (default: 5)
 
         Returns:
-            dict:
+            Tuple[dict, pd.DataFrame]: Tuple of (cross_fold_metrics, out_of_fold_df)
         """
 
         # Grab our model
         model = ModelCore(self.model_name)
 
         # Compute CrossFold Metrics
-        cross_fold_metrics = cross_fold_inference(model, nfolds=nfolds)
+        cross_fold_metrics, out_of_fold_df = cross_fold_inference(model, nfolds=nfolds)
         if cross_fold_metrics:
             self.param_store.upsert(f"/workbench/models/{model.name}/inference/cross_fold", cross_fold_metrics)
-        return cross_fold_metrics
+        return cross_fold_metrics, out_of_fold_df
 
     def fast_inference(self, eval_df: pd.DataFrame, threads: int = 4) -> pd.DataFrame:
         """Run inference on the Endpoint using the provided DataFrame
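Note on the API change above: cross_fold_inference() now returns a (metrics, out_of_fold_df) tuple instead of a single dict, so existing callers need to unpack two values. A minimal usage sketch, assuming a deployed XGBoost endpoint — the endpoint name here is a placeholder, not taken from this diff:

    from workbench.api import Endpoint

    end = Endpoint("my-xgb-endpoint")  # hypothetical endpoint name
    metrics, oof_df = end.cross_fold_inference(nfolds=5)  # was a single dict in 0.8.177
    print(metrics)        # cross-fold metrics dictionary (also upserted to the parameter store)
    print(oof_df.head())  # out-of-fold predictions DataFrame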
workbench/core/artifacts/feature_set_core.py
CHANGED

@@ -17,7 +17,7 @@ from workbench.core.artifacts.artifact import Artifact
 from workbench.core.artifacts.data_source_factory import DataSourceFactory
 from workbench.core.artifacts.athena_source import AthenaSource
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional, List, Union
 
 from workbench.utils.aws_utils import aws_throttle
 
@@ -509,6 +509,48 @@ class FeatureSetCore(Artifact):
         ].tolist()
         return hold_out_ids
 
+    def set_training_filter(self, filter_expression: Optional[str] = None):
+        """Set a filter expression for the training view for this FeatureSet
+
+        Args:
+            filter_expression (Optional[str]): A SQL filter expression (e.g., "age > 25 AND status = 'active'")
+                If None or empty string, will reset to training view with no filter
+                (default: None)
+        """
+        from workbench.core.views import TrainingView
+
+        # Grab the existing holdout ids
+        holdout_ids = self.get_training_holdouts()
+
+        # Create a NEW training view
+        self.log.important(f"Setting Training Filter: {filter_expression}")
+        TrainingView.create(
+            self, id_column=self.id_column, holdout_ids=holdout_ids, filter_expression=filter_expression
+        )
+
+    def exclude_ids_from_training(self, ids: List[Union[str, int]], column_name: Optional[str] = None):
+        """Exclude a list of IDs from the training view
+
+        Args:
+            ids (List[Union[str, int]]): List of IDs to exclude from training
+            column_name (Optional[str]): Column name to filter on.
+                If None, uses self.id_column (default: None)
+        """
+        # Use the default id_column if not specified
+        column = column_name or self.id_column
+
+        # Handle empty list case
+        if not ids:
+            self.log.warning("No IDs provided to exclude")
+            return
+
+        # Build the filter expression with proper SQL quoting
+        quoted_ids = ", ".join([repr(id) for id in ids])
+        filter_expression = f"{column} NOT IN ({quoted_ids})"
+
+        # Apply the filter
+        self.set_training_filter(filter_expression)
+
     @classmethod
     def delete_views(cls, table: str, database: str):
         """Delete any views associated with this FeatureSet
@@ -707,7 +749,7 @@ if __name__ == "__main__":
 
     # Test getting the holdout ids
     print("Getting the hold out ids...")
-    holdout_ids = my_features.get_training_holdouts(
+    holdout_ids = my_features.get_training_holdouts()
    print(f"Holdout IDs: {holdout_ids}")
 
     # Get a sample of the data
@@ -729,16 +771,33 @@ if __name__ == "__main__":
     table = my_features.view("training").table
     df = my_features.query(f'SELECT id, name FROM "{table}"')
     my_holdout_ids = [id for id in df["id"] if id < 20]
-    my_features.set_training_holdouts(
-
-    # Test the hold out set functionality with strings
-    print("Setting hold out ids (strings)...")
-    my_holdout_ids = [name for name in df["name"] if int(name.split(" ")[1]) > 80]
-    my_features.set_training_holdouts("name", my_holdout_ids)
+    my_features.set_training_holdouts(my_holdout_ids)
 
     # Get the training data
     print("Getting the training data...")
     training_data = my_features.get_training_data()
+    print(f"Training Data: {training_data.shape}")
+
+    # Test the filter expression functionality
+    print("Setting a filter expression...")
+    my_features.set_training_filter("id < 50 AND height > 65.0")
+    training_data = my_features.get_training_data()
+    print(f"Training Data: {training_data.shape}")
+    print(training_data)
+
+    # Remove training filter
+    print("Removing the filter expression...")
+    my_features.set_training_filter(None)
+    training_data = my_features.get_training_data()
+    print(f"Training Data: {training_data.shape}")
+    print(training_data)
+
+    # Test excluding ids from training
+    print("Excluding ids from training...")
+    my_features.exclude_ids_from_training([1, 2, 3, 4, 5])
+    training_data = my_features.get_training_data()
+    print(f"Training Data: {training_data.shape}")
+    print(training_data)
 
     # Now delete the AWS artifacts associated with this Feature Set
     # print("Deleting Workbench Feature Set...")
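One detail worth noting in the new exclude_ids_from_training(): it quotes each ID with repr(), so string IDs come out single-quoted while integers stay bare in the generated NOT IN clause. A standalone sketch of the filter expression it hands to set_training_filter (column and ID values are illustrative, not from the diff):

    ids_int = [1, 2, 3]
    ids_str = ["a-17", "b-42"]
    column = "id"

    # repr(1) -> "1", repr("a-17") -> "'a-17'"
    print(f"{column} NOT IN ({', '.join(repr(i) for i in ids_int)})")  # id NOT IN (1, 2, 3)
    print(f"{column} NOT IN ({', '.join(repr(i) for i in ids_str)})")  # id NOT IN ('a-17', 'b-42')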
workbench/core/views/training_view.py
CHANGED

@@ -3,7 +3,7 @@
 from typing import Union
 
 # Workbench Imports
-from workbench.api import
+from workbench.api import FeatureSet
 from workbench.core.views.view import View
 from workbench.core.views.create_view import CreateView
 from workbench.core.views.view_utils import get_column_list
@@ -34,6 +34,7 @@ class TrainingView(CreateView):
         source_table: str = None,
         id_column: str = None,
         holdout_ids: Union[list[str], list[int], None] = None,
+        filter_expression: str = None,
     ) -> Union[View, None]:
         """Factory method to create and return a TrainingView instance.
 
@@ -42,6 +43,8 @@ class TrainingView(CreateView):
             source_table (str, optional): The table/view to create the view from. Defaults to None.
             id_column (str, optional): The name of the id column. Defaults to None.
             holdout_ids (Union[list[str], list[int], None], optional): A list of holdout ids. Defaults to None.
+            filter_expression (str, optional): SQL filter expression (e.g., "age > 25 AND status = 'active'").
+                Defaults to None.
 
         Returns:
             Union[View, None]: The created View object (or None if failed to create the view)
@@ -69,28 +72,36 @@ class TrainingView(CreateView):
         else:
             id_column = instance.auto_id_column
 
-        #
-
-
-
+        # Enclose each column name in double quotes
+        sql_columns = ", ".join([f'"{column}"' for column in column_list])
+
+        # Build the training assignment logic
+        if holdout_ids:
+            # Format the list of holdout ids for SQL IN clause
+            if all(isinstance(id, str) for id in holdout_ids):
+                formatted_holdout_ids = ", ".join(f"'{id}'" for id in holdout_ids)
+            else:
+                formatted_holdout_ids = ", ".join(map(str, holdout_ids))
 
-
-
-
+            training_logic = f"""CASE
+                WHEN {id_column} IN ({formatted_holdout_ids}) THEN False
+                ELSE True
+            END AS training"""
         else:
-
+            # Default 80/20 split using modulo
+            training_logic = f"""CASE
+                WHEN MOD(ROW_NUMBER() OVER (ORDER BY {id_column}), 10) < 8 THEN True
+                ELSE False
+            END AS training"""
 
-        #
-
+        # Build WHERE clause if filter_expression is provided
+        where_clause = f"\nWHERE {filter_expression}" if filter_expression else ""
 
         # Construct the CREATE VIEW query
         create_view_query = f"""
         CREATE OR REPLACE VIEW {instance.table} AS
-        SELECT {sql_columns},
-
-            ELSE True
-            END AS training
-        FROM {instance.source_table}
+        SELECT {sql_columns}, {training_logic}
+        FROM {instance.source_table}{where_clause}
         """
 
         # Execute the CREATE VIEW query
@@ -99,43 +110,13 @@ class TrainingView(CreateView):
         # Return the View
         return View(instance.data_source, instance.view_name, auto_create_view=False)
 
-    # This is an internal method that's used to create a default training view
-    def _default_training_view(self, data_source: DataSource, id_column: str):
-        """Create a default view in Athena that assigns roughly 80% of the data to training
-
-        Args:
-            data_source (DataSource): The Workbench DataSource object
-            id_column (str): The name of the id column
-        """
-        self.log.important(f"Creating default Training View {self.table}...")
-
-        # Drop any columns generated from AWS
-        aws_cols = ["write_time", "api_invocation_time", "is_deleted", "event_time"]
-        column_list = [col for col in data_source.columns if col not in aws_cols]
-
-        # Enclose each column name in double quotes
-        sql_columns = ", ".join([f'"{column}"' for column in column_list])
-
-        # Construct the CREATE VIEW query with a simple modulo operation for the 80/20 split
-        create_view_query = f"""
-        CREATE OR REPLACE VIEW "{self.table}" AS
-        SELECT {sql_columns}, CASE
-            WHEN MOD(ROW_NUMBER() OVER (ORDER BY {id_column}), 10) < 8 THEN True  -- Assign 80% to training
-            ELSE False  -- Assign roughly 20% to validation/test
-        END AS training
-        FROM {self.base_table_name}
-        """
-
-        # Execute the CREATE VIEW query
-        data_source.execute_statement(create_view_query)
-
 
 if __name__ == "__main__":
     """Exercise the Training View functionality"""
     from workbench.api import FeatureSet
 
     # Get the FeatureSet
-    fs = FeatureSet("
+    fs = FeatureSet("abalone_features")
 
     # Delete the existing training view
     training_view = TrainingView.create(fs)
@@ -152,9 +133,18 @@ if __name__ == "__main__":
 
     # Create a TrainingView with holdout ids
     my_holdout_ids = list(range(10))
-    training_view = TrainingView.create(fs, id_column="
+    training_view = TrainingView.create(fs, id_column="auto_id", holdout_ids=my_holdout_ids)
 
     # Pull the training data
     df = training_view.pull_dataframe()
     print(df.head())
     print(df["training"].value_counts())
+    print(f"Shape: {df.shape}")
+    print(f"Diameter min: {df['diameter'].min()}, max: {df['diameter'].max()}")
+
+    # Test the filter expression
+    training_view = TrainingView.create(fs, id_column="auto_id", filter_expression="diameter > 0.5")
+    df = training_view.pull_dataframe()
+    print(df.head())
+    print(f"Shape with filter: {df.shape}")
+    print(f"Diameter min: {df['diameter'].min()}, max: {df['diameter'].max()}")
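To see how the pieces of the new factory method combine, the sketch below rebuilds the view body the same way the diff shows (CASE-based training flag plus an optional WHERE clause) and prints the resulting statement. The column list, holdouts, filter, and view/table names are illustrative, not taken from the package:

    # Illustrative inputs
    id_column = "id"
    column_list = ["id", "diameter", "height"]
    holdout_ids = [1, 2, 3]
    filter_expression = "diameter > 0.5"

    sql_columns = ", ".join(f'"{c}"' for c in column_list)
    if holdout_ids:
        if all(isinstance(i, str) for i in holdout_ids):
            formatted = ", ".join(f"'{i}'" for i in holdout_ids)
        else:
            formatted = ", ".join(map(str, holdout_ids))
        training_logic = f"CASE WHEN {id_column} IN ({formatted}) THEN False ELSE True END AS training"
    else:
        # 80/20 modulo split when no holdouts are given
        training_logic = f"CASE WHEN MOD(ROW_NUMBER() OVER (ORDER BY {id_column}), 10) < 8 THEN True ELSE False END AS training"

    where_clause = f"\nWHERE {filter_expression}" if filter_expression else ""
    print(f"CREATE OR REPLACE VIEW my_view AS\nSELECT {sql_columns}, {training_logic}\nFROM my_table{where_clause}")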
workbench/model_scripts/custom_models/proximity/feature_space_proximity.template
CHANGED

@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
     "id_column": "{{id_column}}",
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "track_columns": "{{track_columns}}"
+    "track_columns": "{{track_columns}}",
 }
 
 from io import StringIO
@@ -73,10 +73,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
@@ -88,6 +85,7 @@ if __name__ == "__main__":
     # Now serialize the model
     model.serialize(args.model_dir)
 
+
 # Model loading and prediction functions
 def model_fn(model_dir):
 
workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template
CHANGED

@@ -14,7 +14,7 @@ import pandas as pd
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }
 
 
@@ -37,7 +37,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
@@ -81,10 +81,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
@@ -109,8 +106,10 @@ if __name__ == "__main__":
     # Create and train the Regression/Confidence model
     # model = BayesianRidge()
     model = BayesianRidge(
-        alpha_1=1e-6,
-
+        alpha_1=1e-6,
+        alpha_2=1e-6,  # Noise precision
+        lambda_1=1e-6,
+        lambda_2=1e-6,  # Weight precision
         fit_intercept=True,
     )
 
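For context on why BayesianRidge sits in a UQ template: once fitted, scikit-learn's BayesianRidge can return a per-sample predictive standard deviation alongside the mean. A minimal sketch with synthetic data (not taken from the template itself):

    import numpy as np
    from sklearn.linear_model import BayesianRidge

    rng = np.random.default_rng(42)
    X = rng.normal(size=(200, 3))
    y = X @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.3, size=200)

    # Same hyperparameters the template now spells out explicitly
    model = BayesianRidge(alpha_1=1e-6, alpha_2=1e-6, lambda_1=1e-6, lambda_2=1e-6, fit_intercept=True)
    model.fit(X, y)
    mean, std = model.predict(X[:5], return_std=True)  # predictive mean and std for UQ
    print(mean, std)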
workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template
CHANGED

@@ -4,11 +4,7 @@ import awswrangler as wr
 import numpy as np
 
 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
 from sklearn.model_selection import KFold
 from scipy.optimize import minimize
 
@@ -23,7 +19,7 @@ TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
     "train_all_data": "{{train_all_data}}",
-    "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+    "model_metrics_s3_path": "{{model_metrics_s3_path}}",
 }
 
 
@@ -47,7 +43,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
@@ -90,10 +86,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
@@ -172,16 +165,14 @@ if __name__ == "__main__":
     cv_residuals = np.array(cv_residuals)
     cv_uncertainties = np.array(cv_uncertainties)
 
-
     # Optimize calibration parameters: σ_cal = a * σ_uc + b
     def neg_log_likelihood(params):
         a, b = params
         sigma_cal = a * cv_uncertainties + b
         sigma_cal = np.maximum(sigma_cal, 1e-8)  # Prevent division by zero
-        return np.sum(0.5 * np.log(2 * np.pi * sigma_cal
+        return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * (cv_residuals**2) / (sigma_cal**2))
 
-
-    result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method='Nelder-Mead')
+    result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
     cal_a, cal_b = result.x
 
     print(f"Calibration parameters: a={cal_a:.4f}, b={cal_b:.4f}")
@@ -205,7 +196,9 @@ if __name__ == "__main__":
     result_df["prediction"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].mean(axis=1)
 
     # Compute uncalibrated uncertainty
-    result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(
+    result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(
+        axis=1
+    )
 
     # Apply calibration to uncertainty
     result_df["prediction_std"] = cal_a * result_df["prediction_std_uc"] + cal_b
@@ -352,4 +345,4 @@ def predict_fn(df, models) -> pd.DataFrame:
     df = df.reindex(sorted(df.columns), axis=1)
 
     # All done, return the DataFrame
-    return df
+    return df
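The reconstructed negative log-likelihood above fits an affine calibration σ_cal = a·σ_uc + b by minimizing the Gaussian NLL of the cross-validation residuals. A self-contained sketch of that step with synthetic residuals (the data and the printed diagnostic are illustrative, only the objective mirrors the template):

    import numpy as np
    from scipy.optimize import minimize

    # Toy stand-ins for the cross-validation arrays: true error ~2.0, ensemble spread ~0.5
    rng = np.random.default_rng(0)
    cv_residuals = rng.normal(0.0, 2.0, size=500)
    cv_uncertainties = np.full(500, 0.5)

    def neg_log_likelihood(params):
        a, b = params
        sigma_cal = np.maximum(a * cv_uncertainties + b, 1e-8)  # prevent division by zero
        # Gaussian NLL of the residuals under the calibrated sigma
        return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * cv_residuals**2 / sigma_cal**2)

    result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
    cal_a, cal_b = result.x
    print(f"a={cal_a:.3f}, b={cal_b:.3f}, calibrated sigma ~ {cal_a * 0.5 + cal_b:.3f}")  # ~2.0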
workbench/model_scripts/custom_models/uq_models/gaussian_process.template
CHANGED

@@ -9,7 +9,7 @@ from sklearn.model_selection import train_test_split
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }
 
 from io import StringIO
@@ -33,7 +33,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
@@ -46,7 +46,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
             rename_dict[df_columns_lower[feature.lower()]] = feature
         else:
             missing.append(feature)
-
+
     if missing:
         raise ValueError(f"Features not found: {missing}")
 
@@ -76,10 +76,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
    df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
@@ -112,10 +109,7 @@ if __name__ == "__main__":
     )
 
     # Create a Pipeline with StandardScaler
-    model = Pipeline([
-        ("scaler", StandardScaler()),
-        ("model", model)
-    ])
+    model = Pipeline([("scaler", StandardScaler()), ("model", model)])
 
     # Prepare features and targets for training
     X_train = df_train[features]
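The last hunk only collapses the Pipeline construction onto one line, but the pattern is worth spelling out: the StandardScaler step puts all features on a comparable scale before the Gaussian process sees them. A minimal sketch under the assumption that the template's underlying estimator is scikit-learn's GaussianProcessRegressor (the data and model choice here are illustrative, not confirmed by the diff):

    import numpy as np
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.gaussian_process import GaussianProcessRegressor

    # Toy features with wildly different scales
    X = np.random.default_rng(0).normal(size=(100, 3)) * np.array([1.0, 50.0, 0.01])
    y = X[:, 0] + 0.1 * X[:, 1] - 5.0 * X[:, 2]

    # Scaling first keeps the GP kernel length-scales on comparable footing across features
    model = Pipeline([("scaler", StandardScaler()), ("model", GaussianProcessRegressor())])
    model.fit(X, y)
    mean, std = model.predict(X[:5], return_std=True)  # GP supplies a predictive std for UQ
    print(mean, std)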