PyPI - workbench - Versions diffs - 0.8.197__py3-none-any.whl → 0.8.201__py3-none-any.whl - Mend

workbench 0.8.197__py3-none-any.whl → 0.8.201__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

workbench/algorithms/dataframe/proximity.py +19 -12
workbench/api/__init__.py +2 -1
workbench/api/feature_set.py +7 -4
workbench/api/model.py +1 -1
workbench/core/artifacts/__init__.py +11 -2
workbench/core/artifacts/endpoint_core.py +84 -46
workbench/core/artifacts/feature_set_core.py +69 -1
workbench/core/artifacts/model_core.py +37 -7
workbench/core/cloud_platform/aws/aws_parameter_store.py +18 -2
workbench/core/transforms/features_to_model/features_to_model.py +23 -20
workbench/core/views/view.py +2 -2
workbench/model_scripts/chemprop/chemprop.template +931 -0
workbench/model_scripts/chemprop/generated_model_script.py +931 -0
workbench/model_scripts/chemprop/requirements.txt +11 -0
workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
workbench/model_scripts/custom_models/proximity/proximity.py +19 -12
workbench/model_scripts/custom_models/uq_models/proximity.py +19 -12
workbench/model_scripts/pytorch_model/generated_model_script.py +130 -88
workbench/model_scripts/pytorch_model/pytorch.template +128 -86
workbench/model_scripts/scikit_learn/generated_model_script.py +302 -0
workbench/model_scripts/script_generation.py +10 -7
workbench/model_scripts/uq_models/generated_model_script.py +25 -18
workbench/model_scripts/uq_models/mapie.template +23 -16
workbench/model_scripts/xgb_model/generated_model_script.py +6 -6
workbench/model_scripts/xgb_model/xgb_model.template +2 -2
workbench/repl/workbench_shell.py +14 -5
workbench/scripts/endpoint_test.py +162 -0
workbench/scripts/{lambda_launcher.py → lambda_test.py} +10 -0
workbench/utils/chemprop_utils.py +724 -0
workbench/utils/pytorch_utils.py +497 -0
workbench/utils/xgboost_model_utils.py +12 -5
{workbench-0.8.197.dist-info → workbench-0.8.201.dist-info}/METADATA +2 -2
{workbench-0.8.197.dist-info → workbench-0.8.201.dist-info}/RECORD +38 -30
{workbench-0.8.197.dist-info → workbench-0.8.201.dist-info}/entry_points.txt +2 -1
{workbench-0.8.197.dist-info → workbench-0.8.201.dist-info}/WHEEL +0 -0
{workbench-0.8.197.dist-info → workbench-0.8.201.dist-info}/licenses/LICENSE +0 -0
{workbench-0.8.197.dist-info → workbench-0.8.201.dist-info}/top_level.txt +0 -0

workbench/algorithms/dataframe/proximity.py CHANGED Viewed

@@ -68,7 +68,8 @@ class Proximity:
         self,
         top_percent: float = 1.0,
         min_delta: Optional[float] = None,
-        k_neighbors: int = 5,
+        k_neighbors: int = 4,
+        only_coincident: bool = False,
     ) -> pd.DataFrame:
         """
         Find compounds with steep target gradients (data quality issues and activity cliffs).
@@ -80,7 +81,8 @@ class Proximity:
         Args:
             top_percent: Percentage of compounds with steepest gradients to return (e.g., 1.0 = top 1%)
             min_delta: Minimum absolute target difference to consider. If None, defaults to target_range/100
-            k_neighbors: Number of neighbors to use for median calculation (default: 5)
+            k_neighbors: Number of neighbors to use for median calculation (default: 4)
+            only_coincident: If True, only consider compounds that are coincident (default: False)
         Returns:
             DataFrame of compounds with steepest gradients, sorted by gradient (descending)
@@ -99,10 +101,15 @@ class Proximity:
             min_delta = self.target_range / 100.0 if self.target_range > 0 else 0.0
         candidates = candidates[candidates["nn_target_diff"] >= min_delta]
-        # Get top X% by initial gradient
-        percentile = 100 - top_percent
-        threshold = np.percentile(candidates["gradient"], percentile)
-        candidates = candidates[candidates["gradient"] >= threshold].copy()
+        # Filter based on mode
+        if only_coincident:
+            # Only keep coincident points (nn_distance ~= 0)
+            candidates = candidates[candidates["nn_distance"] < epsilon].copy()
+        else:
+            # Get top X% by initial gradient
+            percentile = 100 - top_percent
+            threshold = np.percentile(candidates["gradient"], percentile)
+            candidates = candidates[candidates["gradient"] >= threshold].copy()
         # Phase 2: Verify with k-neighbor median to filter out cases where nearest neighbor is the outlier
         results = []
@@ -113,23 +120,23 @@ class Proximity:
             # Get k nearest neighbors (excluding self)
             nbrs = self.neighbors(cmpd_id, n_neighbors=k_neighbors, include_self=False)
-            # Calculate median target of k nearest neighbors
-            neighbor_median = nbrs.head(k_neighbors)[self.target].median()
+            # Calculate median target of k neighbors, excluding the nearest neighbor (index 0)
+            neighbor_median = nbrs.iloc[1:k_neighbors][self.target].median()
             median_diff = abs(cmpd_target - neighbor_median)
             # Only keep if compound differs from neighborhood median
             # This filters out cases where the nearest neighbor is the outlier
             if median_diff >= min_delta:
-                mean_distance = nbrs.head(k_neighbors)["distance"].mean()
                 results.append(
                     {
                         self.id_column: cmpd_id,
                         self.target: cmpd_target,
+                        "nn_target": row["nn_target"],
+                        "nn_target_diff": row["nn_target_diff"],
+                        "nn_distance": row["nn_distance"],
+                        "gradient": row["gradient"],  # Keep Phase 1 gradient
                         "neighbor_median": neighbor_median,
                         "neighbor_median_diff": median_diff,
-                        "mean_distance": mean_distance,
-                        "gradient": median_diff / (mean_distance + epsilon),
                     }
                 )

workbench/api/__init__.py CHANGED Viewed

@@ -14,7 +14,7 @@ These class provide high-level APIs for the Workbench package, offering easy acc
 from .data_source import DataSource
 from .feature_set import FeatureSet
-from .model import Model, ModelType
+from .model import Model, ModelType, ModelFramework
 from .endpoint import Endpoint
 from .meta import Meta
 from .parameter_store import ParameterStore
@@ -25,6 +25,7 @@ __all__ = [
     "FeatureSet",
     "Model",
     "ModelType",
+    "ModelFramework",
     "Endpoint",
     "Meta",
     "ParameterStore",

workbench/api/feature_set.py CHANGED Viewed

@@ -12,7 +12,7 @@ import pandas as pd
 from workbench.core.artifacts.artifact import Artifact
 from workbench.core.artifacts.feature_set_core import FeatureSetCore
 from workbench.core.transforms.features_to_model.features_to_model import FeaturesToModel
-from workbench.api.model import Model, ModelType
+from workbench.api.model import Model, ModelType, ModelFramework
 class FeatureSet(FeatureSetCore):
@@ -79,6 +79,7 @@ class FeatureSet(FeatureSetCore):
         self,
         name: str,
         model_type: ModelType,
+        model_framework: ModelFramework = ModelFramework.XGBOOST,
         tags: list = None,
         description: str = None,
         feature_list: list = None,
@@ -98,11 +99,12 @@ class FeatureSet(FeatureSetCore):
             name (str): The name of the Model to create
             model_type (ModelType): The type of model to create (See workbench.model.ModelType)
+            model_framework (ModelFramework, optional): The framework to use for the model (default: XGBOOST)
             tags (list, optional): Set the tags for the model.  If not given tags will be generated.
             description (str, optional): Set the description for the model. If not give a description is generated.
             feature_list (list, optional): Set the feature list for the model. If not given a feature list is generated.
             target_column (str, optional): The target column for the model (use None for unsupervised model)
-            model_class (str, optional): Model class to use (e.g. "KMeans", "PyTorch", default: None)
+            model_class (str, optional): Model class to use (e.g. "KMeans", default: None)
             model_import_str (str, optional): The import for the model (e.g. "from sklearn.cluster import KMeans")
             custom_script (str, optional): The custom script to use for the model (default: None)
             training_image (str, optional): The training image to use (default: "training")
@@ -128,8 +130,8 @@ class FeatureSet(FeatureSetCore):
         # Create the Model Tags
         tags = [name] if tags is None else tags
-        # If the model_class is PyTorch, ensure we set the training and inference images
-        if model_class and model_class.lower() == "pytorch":
+        # If the model framework is PyTorch or ChemProp, ensure we set the training and inference images
+        if model_framework in (ModelFramework.PYTORCH_TABULAR, ModelFramework.CHEMPROP):
             training_image = "pytorch_training"
             inference_image = "pytorch_inference"
@@ -138,6 +140,7 @@ class FeatureSet(FeatureSetCore):
             feature_name=self.name,
             model_name=name,
             model_type=model_type,
+            model_framework=model_framework,
             model_class=model_class,
             model_import_str=model_import_str,
             custom_script=custom_script,

workbench/api/model.py CHANGED Viewed

@@ -7,7 +7,7 @@ Dashboard UI, which provides additional model details and performance metrics
 # Workbench Imports
 from workbench.core.artifacts.artifact import Artifact
-from workbench.core.artifacts.model_core import ModelCore, ModelType  # noqa: F401
+from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelFramework  # noqa: F401
 from workbench.core.transforms.model_to_endpoint.model_to_endpoint import ModelToEndpoint
 from workbench.api.endpoint import Endpoint
 from workbench.utils.model_utils import proximity_model_local, uq_model

workbench/core/artifacts/__init__.py CHANGED Viewed

@@ -15,7 +15,16 @@ from .artifact import Artifact
 from .athena_source import AthenaSource
 from .data_source_abstract import DataSourceAbstract
 from .feature_set_core import FeatureSetCore
-from .model_core import ModelCore, ModelType
+from .model_core import ModelCore, ModelType, ModelFramework
 from .endpoint_core import EndpointCore
-__all__ = ["Artifact", "AthenaSource", "DataSourceAbstract", "FeatureSetCore", "ModelCore", "ModelType", "EndpointCore"]
+__all__ = [
+    "Artifact",
+    "AthenaSource",
+    "DataSourceAbstract",
+    "FeatureSetCore",
+    "ModelCore",
+    "ModelType",
+    "ModelFramework",
+    "EndpointCore",
+]

workbench/core/artifacts/endpoint_core.py CHANGED Viewed

@@ -30,12 +30,14 @@ from sagemaker import Predictor
 # Workbench Imports
 from workbench.core.artifacts.artifact import Artifact
-from workbench.core.artifacts import FeatureSetCore, ModelCore, ModelType
+from workbench.core.artifacts import FeatureSetCore, ModelCore, ModelType, ModelFramework
 from workbench.utils.endpoint_metrics import EndpointMetrics
 from workbench.utils.cache import Cache
 from workbench.utils.s3_utils import compute_s3_object_hash
 from workbench.utils.model_utils import uq_metrics
-from workbench.utils.xgboost_model_utils import cross_fold_inference
+from workbench.utils.xgboost_model_utils import cross_fold_inference as xgboost_cross_fold
+from workbench.utils.pytorch_utils import cross_fold_inference as pytorch_cross_fold
+from workbench.utils.chemprop_utils import cross_fold_inference as chemprop_cross_fold
 from workbench_bridges.endpoints.fast_inference import fast_inference
@@ -399,41 +401,40 @@ class EndpointCore(Artifact):
         if target_column and (target_column not in prediction_df.columns):
             self.log.important(f"Target Column {target_column} not found in prediction_df!")
             self.log.important("In order to compute metrics, the target column must be present!")
-            return prediction_df
+            metrics = pd.DataFrame()
         # Compute the standard performance metrics for this model
-        model_type = model.model_type
-        if model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]:
-            prediction_df = self.residuals(target_column, prediction_df)
-            metrics = self.regression_metrics(target_column, prediction_df)
-        elif model_type == ModelType.CLASSIFIER:
-            metrics = self.classification_metrics(target_column, prediction_df)
         else:
-            # For other model types, we don't compute metrics
-            self.log.info(f"Model Type: {model_type} doesn't have metrics...")
-            metrics = pd.DataFrame()
+            if model.model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]:
+                prediction_df = self.residuals(target_column, prediction_df)
+                metrics = self.regression_metrics(target_column, prediction_df)
+            elif model.model_type == ModelType.CLASSIFIER:
+                metrics = self.classification_metrics(target_column, prediction_df)
+            else:
+                # For other model types, we don't compute metrics
+                self.log.info(f"Model Type: {model.model_type} doesn't have metrics...")
+                metrics = pd.DataFrame()
         # Print out the metrics
-        if not metrics.empty:
-            print(f"Performance Metrics for {self.model_name} on {self.name}")
-            print(metrics.head())
-            # Capture the inference results and metrics
-            if capture_name is not None:
-                # If we don't have an id_column, we'll pull it from the model's FeatureSet
-                if id_column is None:
-                    fs = FeatureSetCore(model.get_input())
-                    id_column = fs.id_column
-                description = capture_name.replace("_", " ").title()
-                self._capture_inference_results(
-                    capture_name, prediction_df, target_column, model_type, metrics, description, features, id_column
-                )
-                # For UQ Models we also capture the uncertainty metrics
-                if model_type in [ModelType.UQ_REGRESSOR]:
-                    metrics = uq_metrics(prediction_df, target_column)
-                    self.param_store.upsert(f"/workbench/models/{model.name}/inference/{capture_name}", metrics)
+        print(f"Performance Metrics for {self.model_name} on {self.name}")
+        print(metrics.head())
+        # Capture the inference results and metrics
+        if capture_name is not None:
+            # If we don't have an id_column, we'll pull it from the model's FeatureSet
+            if id_column is None:
+                fs = FeatureSetCore(model.get_input())
+                id_column = fs.id_column
+            description = capture_name.replace("_", " ").title()
+            self._capture_inference_results(
+                capture_name, prediction_df, target_column, model.model_type, metrics, description, features, id_column
+            )
+            # For UQ Models we also capture the uncertainty metrics
+            if model.model_type in [ModelType.UQ_REGRESSOR]:
+                metrics = uq_metrics(prediction_df, target_column)
+                self.param_store.upsert(f"/workbench/models/{model.name}/inference/{capture_name}", metrics)
         # Return the prediction DataFrame
         return prediction_df
@@ -452,7 +453,15 @@ class EndpointCore(Artifact):
         model = ModelCore(self.model_name)
         # Compute CrossFold (Metrics and Prediction Dataframe)
-        cross_fold_metrics, out_of_fold_df = cross_fold_inference(model, nfolds=nfolds)
+        if model.model_framework in [ModelFramework.UNKNOWN, ModelFramework.XGBOOST]:
+            cross_fold_metrics, out_of_fold_df = xgboost_cross_fold(model, nfolds=nfolds)
+        elif model.model_framework == ModelFramework.PYTORCH_TABULAR:
+            cross_fold_metrics, out_of_fold_df = pytorch_cross_fold(model, nfolds=nfolds)
+        elif model.model_framework == ModelFramework.CHEMPROP:
+            cross_fold_metrics, out_of_fold_df = chemprop_cross_fold(model, nfolds=nfolds)
+        else:
+            self.log.error(f"Cross-Fold Inference not supported for Model Framework: {model.model_framework}.")
+            return pd.DataFrame()
         # If the metrics dataframe isn't empty save to the param store
         if not cross_fold_metrics.empty:
@@ -460,6 +469,11 @@ class EndpointCore(Artifact):
             metrics = cross_fold_metrics.to_dict(orient="records")
             self.param_store.upsert(f"/workbench/models/{model.name}/inference/cross_fold", metrics)
+        # If the out_of_fold_df is empty return it
+        if out_of_fold_df.empty:
+            self.log.warning("No out-of-fold predictions were made. Returning empty DataFrame.")
+            return out_of_fold_df
         # Capture the results
         capture_name = "full_cross_fold"
         description = capture_name.replace("_", " ").title()
@@ -765,20 +779,18 @@ class EndpointCore(Artifact):
         self.log.info(f"Writing metrics to {inference_capture_path}/inference_metrics.csv")
         wr.s3.to_csv(metrics, f"{inference_capture_path}/inference_metrics.csv", index=False)
-        # Grab the target column, prediction column, any _proba columns, and the ID column (if present)
-        output_columns = [target_column]
-        output_columns += [col for col in pred_results_df.columns if "prediction" in col]
+        # Grab the ID column and target column if they are present
+        output_columns = []
+        if id_column and id_column in pred_results_df.columns:
+            output_columns.append(id_column)
+        if target_column in pred_results_df.columns:
+            output_columns.append(target_column)
-        # Add any _proba columns to the output columns
+        # Grab the prediction column, any _proba columns, and UQ columns
+        output_columns += [col for col in pred_results_df.columns if "prediction" in col]
         output_columns += [col for col in pred_results_df.columns if col.endswith("_proba")]
-        # Add any Uncertainty Quantile columns to the output columns
         output_columns += [col for col in pred_results_df.columns if col.startswith("q_") or col == "confidence"]
-        # Add the ID column
-        if id_column and id_column in pred_results_df.columns:
-            output_columns.insert(0, id_column)
         # Write the predictions to our S3 Model Inference Folder
         self.log.info(f"Writing predictions to {inference_capture_path}/inference_predictions.csv")
         subset_df = pred_results_df[output_columns]
@@ -810,10 +822,23 @@ class EndpointCore(Artifact):
             self.log.warning("No predictions were made. Returning empty DataFrame.")
             return pd.DataFrame()
+        # Check for NaN values in target or prediction columns
+        prediction_col = "prediction" if "prediction" in prediction_df.columns else "predictions"
+        if prediction_df[target_column].isnull().any() or prediction_df[prediction_col].isnull().any():
+            # Compute the number of NaN values in each column
+            num_nan_target = prediction_df[target_column].isnull().sum()
+            num_nan_prediction = prediction_df[prediction_col].isnull().sum()
+            self.log.warning(
+                f"NaNs Found: {target_column} {num_nan_target} and {prediction_col}: {num_nan_prediction}."
+            )
+            self.log.warning(
+                "NaN values found in target or prediction columns. Dropping NaN rows for metric computation."
+            )
+            prediction_df = prediction_df.dropna(subset=[target_column, prediction_col])
         # Compute the metrics
         try:
             y_true = prediction_df[target_column]
-            prediction_col = "prediction" if "prediction" in prediction_df.columns else "predictions"
             y_pred = prediction_df[prediction_col]
             mae = mean_absolute_error(y_true, y_pred)
@@ -891,6 +916,14 @@ class EndpointCore(Artifact):
         Returns:
             pd.DataFrame: DataFrame with the performance metrics
         """
+        # Drop rows with NaN predictions (can't compute metrics on missing predictions)
+        prediction_col = "prediction" if "prediction" in prediction_df.columns else "predictions"
+        nan_mask = prediction_df[prediction_col].isna()
+        if nan_mask.any():
+            n_nan = nan_mask.sum()
+            self.log.warning(f"Dropping {n_nan} rows with NaN predictions for metrics calculation")
+            prediction_df = prediction_df[~nan_mask].copy()
         # Get the class labels from the model
         class_labels = ModelCore(self.model_name).class_labels()
         if class_labels is None:
@@ -903,7 +936,6 @@ class EndpointCore(Artifact):
             self.validate_proba_columns(prediction_df, class_labels)
         # Calculate precision, recall, f1, and support, handling zero division
-        prediction_col = "prediction" if "prediction" in prediction_df.columns else "predictions"
         scores = precision_recall_fscore_support(
             prediction_df[target_column],
             prediction_df[prediction_col],
@@ -954,9 +986,15 @@ class EndpointCore(Artifact):
         Returns:
             pd.DataFrame: DataFrame with the confusion matrix
         """
+        # Drop rows with NaN predictions (can't include in confusion matrix)
+        prediction_col = "prediction" if "prediction" in prediction_df.columns else "predictions"
+        nan_mask = prediction_df[prediction_col].isna()
+        if nan_mask.any():
+            n_nan = nan_mask.sum()
+            self.log.warning(f"Dropping {n_nan} rows with NaN predictions for confusion matrix")
+            prediction_df = prediction_df[~nan_mask].copy()
         y_true = prediction_df[target_column]
-        prediction_col = "prediction" if "prediction" in prediction_df.columns else "predictions"
         y_pred = prediction_df[prediction_col]
         # Get model class labels

workbench/core/artifacts/feature_set_core.py CHANGED Viewed

@@ -16,8 +16,9 @@ from sagemaker.feature_store.feature_store import FeatureStore
 from workbench.core.artifacts.artifact import Artifact
 from workbench.core.artifacts.data_source_factory import DataSourceFactory
 from workbench.core.artifacts.athena_source import AthenaSource
+from workbench.utils.deprecated_utils import deprecated
-from typing import TYPE_CHECKING, Optional, List, Union
+from typing import TYPE_CHECKING, Optional, List, Dict, Union
 from workbench.utils.aws_utils import aws_throttle
@@ -509,6 +510,71 @@ class FeatureSetCore(Artifact):
         ].tolist()
         return hold_out_ids
+    def set_sample_weights(
+        self,
+        weight_dict: Dict[Union[str, int], float],
+        default_weight: float = 1.0,
+        exclude_zero_weights: bool = True,
+    ):
+        """Configure training view with sample weights for each ID.
+        Args:
+            weight_dict: Mapping of ID to sample weight
+                - weight > 1.0: oversample/emphasize
+                - weight = 1.0: normal (default)
+                - 0 < weight < 1.0: downweight/de-emphasize
+                - weight = 0.0: exclude from training
+            default_weight: Weight for IDs not in weight_dict (default: 1.0)
+            exclude_zero_weights: If True, filter out rows with sample_weight=0 (default: True)
+        Example:
+            weights = {
+                'compound_42': 3.0,  # oversample 3x
+                'compound_99': 0.1,  # noisy, downweight
+                'compound_123': 0.0, # exclude from training
+            }
+            model.set_sample_weights(weights)  # zeros automatically excluded
+            model.set_sample_weights(weights, exclude_zero_weights=False)  # keep zeros
+        """
+        from workbench.core.views import TrainingView
+        if not weight_dict:
+            self.log.important("Empty weight_dict, creating standard training view")
+            TrainingView.create(self, id_column=self.id_column)
+            return
+        self.log.important(f"Setting sample weights for {len(weight_dict)} IDs")
+        # Helper to format IDs for SQL
+        def format_id(id_val):
+            return repr(id_val)
+        # Build CASE statement for sample_weight
+        case_conditions = [
+            f"WHEN {self.id_column} = {format_id(id_val)} THEN {weight}" for id_val, weight in weight_dict.items()
+        ]
+        case_statement = "\n        ".join(case_conditions)
+        # Build inner query with sample weights
+        inner_sql = f"""SELECT
+            *,
+            CASE
+                {case_statement}
+                ELSE {default_weight}
+            END AS sample_weight
+        FROM {self.table}"""
+        # Optionally filter out zero weights
+        if exclude_zero_weights:
+            zero_count = sum(1 for weight in weight_dict.values() if weight == 0.0)
+            custom_sql = f"SELECT * FROM ({inner_sql}) WHERE sample_weight > 0"
+            self.log.important(f"Filtering out {zero_count} rows with sample_weight = 0")
+        else:
+            custom_sql = inner_sql
+        TrainingView.create_with_sql(self, sql_query=custom_sql, id_column=self.id_column)
+    @deprecated(version=0.9)
     def set_training_filter(self, filter_expression: Optional[str] = None):
         """Set a filter expression for the training view for this FeatureSet
@@ -528,6 +594,7 @@ class FeatureSetCore(Artifact):
             self, id_column=self.id_column, holdout_ids=holdout_ids, filter_expression=filter_expression
         )
+    @deprecated(version="0.9")
     def exclude_ids_from_training(self, ids: List[Union[str, int]], column_name: Optional[str] = None):
         """Exclude a list of IDs from the training view
@@ -551,6 +618,7 @@ class FeatureSetCore(Artifact):
         # Apply the filter
         self.set_training_filter(filter_expression)
+    @deprecated(version="0.9")
     def set_training_sampling(
         self,
         exclude_ids: Optional[List[Union[str, int]]] = None,

workbench/core/artifacts/model_core.py CHANGED Viewed

@@ -30,11 +30,23 @@ class ModelType(Enum):
     CLASSIFIER = "classifier"
     REGRESSOR = "regressor"
     CLUSTERER = "clusterer"
-    TRANSFORMER = "transformer"
     PROXIMITY = "proximity"
     PROJECTION = "projection"
     UQ_REGRESSOR = "uq_regressor"
     ENSEMBLE_REGRESSOR = "ensemble_regressor"
+    TRANSFORMER = "transformer"
+    UNKNOWN = "unknown"
+class ModelFramework(Enum):
+    """Enumerated Types for Workbench Model Frameworks"""
+    SKLEARN = "sklearn"
+    XGBOOST = "xgboost"
+    LIGHTGBM = "lightgbm"
+    PYTORCH_TABULAR = "pytorch_tabular"
+    CHEMPROP = "chemprop"
+    TRANSFORMER = "transformer"
     UNKNOWN = "unknown"
@@ -87,11 +99,10 @@ class ModelCore(Artifact):
         ```
     """
-    def __init__(self, model_name: str, model_type: ModelType = None, **kwargs):
+    def __init__(self, model_name: str, **kwargs):
         """ModelCore Initialization
         Args:
             model_name (str): Name of Model in Workbench.
-            model_type (ModelType, optional): Set this for newly created Models. Defaults to None.
             **kwargs: Additional keyword arguments
         """
@@ -125,10 +136,8 @@ class ModelCore(Artifact):
                 self.latest_model = self.model_meta["ModelPackageList"][0]
                 self.description = self.latest_model.get("ModelPackageDescription", "-")
                 self.training_job_name = self._extract_training_job_name()
-                if model_type:
-                    self._set_model_type(model_type)
-                else:
-                    self.model_type = self._get_model_type()
+                self.model_type = self._get_model_type()
+                self.model_framework = self._get_model_framework()
             except (IndexError, KeyError):
                 self.log.critical(f"Model {self.model_name} appears to be malformed. Delete and recreate it!")
                 return
@@ -972,6 +981,27 @@ class ModelCore(Artifact):
             self.log.warning(f"Could not determine model type for {self.model_name}!")
             return ModelType.UNKNOWN
+    def _set_model_framework(self, model_framework: ModelFramework):
+        """Internal: Set the Model Framework for this Model"""
+        self.model_framework = model_framework
+        self.upsert_workbench_meta({"workbench_model_framework": self.model_framework.value})
+        self.remove_health_tag("model_framework_unknown")
+    def _get_model_framework(self) -> ModelFramework:
+        """Internal: Query the Workbench Metadata to get the model framework
+        Returns:
+            ModelFramework: The ModelFramework of this Model
+        Notes:
+            This is an internal method that should not be called directly
+            Use the model_framework attribute instead
+        """
+        model_framework = self.workbench_meta().get("workbench_model_framework")
+        try:
+            return ModelFramework(model_framework)
+        except ValueError:
+            self.log.warning(f"Could not determine model framework for {self.model_name}!")
+            return ModelFramework.UNKNOWN
     def _load_training_metrics(self):
         """Internal: Retrieve the training metrics and Confusion Matrix for this model
                      and load the data into the Workbench Metadata

workbench/core/cloud_platform/aws/aws_parameter_store.py CHANGED Viewed

@@ -4,6 +4,7 @@ from typing import Union
 import logging
 import json
 import zlib
+import time
 import base64
 from botocore.exceptions import ClientError
@@ -77,7 +78,7 @@ class AWSParameterStore:
             all_parameters = []
             # Make the initial call to describe parameters
-            response = self.ssm_client.describe_parameters(**params)
+            response = self._call_with_retry(self.ssm_client.describe_parameters, **params)
             # Aggregate the names from the initial response
             all_parameters.extend(param["Name"] for param in response["Parameters"])
@@ -86,7 +87,7 @@ class AWSParameterStore:
             while "NextToken" in response:
                 # Update the parameters with the NextToken for subsequent calls
                 params["NextToken"] = response["NextToken"]
-                response = self.ssm_client.describe_parameters(**params)
+                response = self._call_with_retry(self.ssm_client.describe_parameters, **params)
                 # Aggregate the names from the subsequent responses
                 all_parameters.extend(param["Name"] for param in response["Parameters"])
@@ -183,6 +184,21 @@ class AWSParameterStore:
             self.log.critical(f"Failed to add/update parameter '{name}': {e}")
             raise
+    def _call_with_retry(self, func, **kwargs):
+        """Call AWS API with exponential backoff on throttling."""
+        max_retries = 5
+        base_delay = 1
+        for attempt in range(max_retries):
+            try:
+                return func(**kwargs)
+            except ClientError as e:
+                if e.response["Error"]["Code"] == "ThrottlingException" and attempt < max_retries - 1:
+                    delay = base_delay * (2**attempt)
+                    self.log.warning(f"Throttled, retrying in {delay}s...")
+                    time.sleep(delay)
+                else:
+                    raise
     @staticmethod
     def _compress_value(value) -> str:
         """Compress a value with precision reduction."""

workbench 0.8.197__py3-none-any.whl → 0.8.201__py3-none-any.whl

workbench 0.8.197__py3-none-any.whl → 0.8.201__py3-none-any.whl