PyPI - workbench - Versions diffs - 0.8.162__py3-none-any.whl → 0.8.202__py3-none-any.whl - Mend

workbench 0.8.162-py3-none-any.whl → 0.8.202-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of workbench might be problematic. Click here for more details.

Files changed (113)

workbench/algorithms/dataframe/__init__.py +1 -2
workbench/algorithms/dataframe/fingerprint_proximity.py +2 -2
workbench/algorithms/dataframe/proximity.py +261 -235
workbench/algorithms/graph/light/proximity_graph.py +10 -8
workbench/api/__init__.py +2 -1
workbench/api/compound.py +1 -1
workbench/api/endpoint.py +11 -0
workbench/api/feature_set.py +11 -8
workbench/api/meta.py +5 -2
workbench/api/model.py +16 -15
workbench/api/monitor.py +1 -16
workbench/core/artifacts/__init__.py +11 -2
workbench/core/artifacts/artifact.py +11 -3
workbench/core/artifacts/data_capture_core.py +355 -0
workbench/core/artifacts/endpoint_core.py +256 -118
workbench/core/artifacts/feature_set_core.py +265 -16
workbench/core/artifacts/model_core.py +107 -60
workbench/core/artifacts/monitor_core.py +33 -248
workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
workbench/core/cloud_platform/aws/aws_meta.py +12 -5
workbench/core/cloud_platform/aws/aws_parameter_store.py +18 -2
workbench/core/cloud_platform/aws/aws_session.py +4 -4
workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
workbench/core/transforms/features_to_model/features_to_model.py +42 -32
workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
workbench/core/views/training_view.py +113 -42
workbench/core/views/view.py +53 -3
workbench/core/views/view_utils.py +4 -4
workbench/model_scripts/chemprop/chemprop.template +852 -0
workbench/model_scripts/chemprop/generated_model_script.py +852 -0
workbench/model_scripts/chemprop/requirements.txt +11 -0
workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
workbench/model_scripts/custom_models/proximity/proximity.py +261 -235
workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
workbench/model_scripts/custom_models/uq_models/meta_uq.template +166 -62
workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
workbench/model_scripts/custom_models/uq_models/proximity.py +261 -235
workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
workbench/model_scripts/pytorch_model/generated_model_script.py +373 -190
workbench/model_scripts/pytorch_model/pytorch.template +370 -187
workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
workbench/model_scripts/script_generation.py +17 -9
workbench/model_scripts/uq_models/generated_model_script.py +605 -0
workbench/model_scripts/uq_models/mapie.template +605 -0
workbench/model_scripts/uq_models/requirements.txt +1 -0
workbench/model_scripts/xgb_model/generated_model_script.py +37 -46
workbench/model_scripts/xgb_model/xgb_model.template +44 -46
workbench/repl/workbench_shell.py +28 -14
workbench/scripts/endpoint_test.py +162 -0
workbench/scripts/lambda_test.py +73 -0
workbench/scripts/ml_pipeline_batch.py +137 -0
workbench/scripts/ml_pipeline_sqs.py +186 -0
workbench/scripts/monitor_cloud_watch.py +20 -100
workbench/utils/aws_utils.py +4 -3
workbench/utils/chem_utils/__init__.py +0 -0
workbench/utils/chem_utils/fingerprints.py +134 -0
workbench/utils/chem_utils/misc.py +194 -0
workbench/utils/chem_utils/mol_descriptors.py +483 -0
workbench/utils/chem_utils/mol_standardize.py +450 -0
workbench/utils/chem_utils/mol_tagging.py +348 -0
workbench/utils/chem_utils/projections.py +209 -0
workbench/utils/chem_utils/salts.py +256 -0
workbench/utils/chem_utils/sdf.py +292 -0
workbench/utils/chem_utils/toxicity.py +250 -0
workbench/utils/chem_utils/vis.py +253 -0
workbench/utils/chemprop_utils.py +760 -0
workbench/utils/cloudwatch_handler.py +1 -1
workbench/utils/cloudwatch_utils.py +137 -0
workbench/utils/config_manager.py +3 -7
workbench/utils/endpoint_utils.py +5 -7
workbench/utils/license_manager.py +2 -6
workbench/utils/model_utils.py +95 -34
workbench/utils/monitor_utils.py +44 -62
workbench/utils/pandas_utils.py +3 -3
workbench/utils/pytorch_utils.py +526 -0
workbench/utils/shap_utils.py +10 -2
workbench/utils/workbench_logging.py +0 -3
workbench/utils/workbench_sqs.py +1 -1
workbench/utils/xgboost_model_utils.py +371 -156
workbench/web_interface/components/model_plot.py +7 -1
workbench/web_interface/components/plugin_unit_test.py +5 -2
workbench/web_interface/components/plugins/dashboard_status.py +3 -1
workbench/web_interface/components/plugins/generated_compounds.py +1 -1
workbench/web_interface/components/plugins/model_details.py +9 -7
workbench/web_interface/components/plugins/scatter_plot.py +3 -3
{workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/METADATA +27 -6
{workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/RECORD +101 -85
{workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/entry_points.txt +4 -0
{workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/licenses/LICENSE +1 -1
workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
workbench/model_scripts/quant_regression/quant_regression.template +0 -279
workbench/model_scripts/quant_regression/requirements.txt +0 -1
workbench/utils/chem_utils.py +0 -1556
workbench/utils/execution_environment.py +0 -211
workbench/utils/fast_inference.py +0 -167
workbench/utils/resource_utils.py +0 -39
{workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/WHEEL +0 -0
{workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/top_level.txt +0 -0

workbench/core/artifacts/monitor_core.py CHANGED Viewed

@@ -2,12 +2,10 @@
 import logging
 import json
-from typing import Union, Tuple
+from typing import Union
 import pandas as pd
-from sagemaker import Predictor
 from sagemaker.model_monitor import (
     CronExpressionGenerator,
-    DataCaptureConfig,
     DefaultModelMonitor,
     DatasetFormat,
 )
@@ -15,29 +13,32 @@ import awswrangler as wr
 # Workbench Imports
 from workbench.core.artifacts.endpoint_core import EndpointCore
+from workbench.core.artifacts.data_capture_core import DataCaptureCore
 from workbench.api import Model, FeatureSet
 from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
 from workbench.utils.s3_utils import read_content_from_s3, upload_content_to_s3
 from workbench.utils.datetime_utils import datetime_string
 from workbench.utils.monitor_utils import (
-    process_data_capture,
     get_monitor_json_data,
     parse_monitoring_results,
     preprocessing_script,
 )
-# Note: This resource might come in handy when doing code refactoring
+# Note: These resources might come in handy when doing code refactoring
 # https://github.com/aws-samples/amazon-sagemaker-from-idea-to-production/blob/master/06-monitoring.ipynb
 # https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor-pre-and-post-processing.html
 # https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker_model_monitor/introduction/SageMaker-ModelMonitoring.ipynb
 class MonitorCore:
+    """Manages monitoring, baselines, and monitoring schedules for SageMaker endpoints"""
     def __init__(self, endpoint_name, instance_type="ml.m5.large"):
         """MonitorCore Class
         Args:
             endpoint_name (str): Name of the endpoint to set up monitoring for
-            instance_type (str): Instance type to use for monitoring. Defaults to "ml.t3.medium".
+            instance_type (str): Instance type to use for monitoring. Defaults to "ml.m5.large".
         """
         self.log = logging.getLogger("workbench")
         self.endpoint_name = endpoint_name
@@ -46,7 +47,6 @@ class MonitorCore:
         # Initialize Class Attributes
         self.sagemaker_session = self.endpoint.sm_session
         self.sagemaker_client = self.endpoint.sm_client
-        self.data_capture_path = self.endpoint.endpoint_data_capture_path
         self.monitoring_path = self.endpoint.endpoint_monitoring_path
         self.monitoring_schedule_name = f"{self.endpoint_name}-monitoring-schedule"
         self.baseline_dir = f"{self.monitoring_path}/baseline"
@@ -57,6 +57,10 @@ class MonitorCore:
         self.workbench_role_arn = AWSAccountClamp().aws_session.get_workbench_execution_role_arn()
         self.instance_type = instance_type
+        # Create DataCaptureCore instance for composition
+        self.data_capture = DataCaptureCore(endpoint_name)
+        self.data_capture_path = self.data_capture.data_capture_path
         # Check if a monitoring schedule already exists for this endpoint
         existing_schedule = self.monitoring_schedule_exists()
@@ -74,23 +78,20 @@ class MonitorCore:
             self.log.info(f"Initialized new model monitor for {self.endpoint_name}")
     def summary(self) -> dict:
-        """Return the summary of information about the endpoint monitor
+        """Return the summary of monitoring configuration
         Returns:
-            dict: Summary of information about the endpoint monitor
+            dict: Summary of monitoring status
         """
         if self.endpoint.is_serverless():
             return {
                 "endpoint_type": "serverless",
-                "data_capture": "not supported",
                 "baseline": "not supported",
                 "monitoring_schedule": "not supported",
             }
         else:
             summary = {
                 "endpoint_type": "realtime",
-                "data_capture": self.data_capture_enabled(),
-                "capture_percent": self.data_capture_percent(),
                 "baseline": self.baseline_exists(),
                 "monitoring_schedule": self.monitoring_schedule_exists(),
                 "preprocessing": self.preprocessing_exists(),
@@ -103,22 +104,15 @@ class MonitorCore:
         Returns:
             dict: The monitoring details for the endpoint
         """
-        # Get the actual data capture path
-        actual_capture_path = self.data_capture_config()["DestinationS3Uri"]
-        if actual_capture_path != self.data_capture_path:
-            self.log.warning(
-                f"Data capture path mismatch: Expected {self.data_capture_path}, "
-                f"but found {actual_capture_path}. Using the actual path."
-            )
-            self.data_capture_path = actual_capture_path
         result = self.summary()
         info = {
-            "data_capture_path": self.data_capture_path if self.data_capture_enabled() else None,
-            "preprocessing_script_file": self.preprocessing_script_file if self.preprocessing_exists() else None,
             "monitoring_schedule_status": "Not Scheduled",
         }
         result.update(info)
+        if self.preprocessing_exists():
+            result["preprocessing_script_file"] = self.preprocessing_script_file
         if self.baseline_exists():
             result.update(
                 {
@@ -144,7 +138,6 @@ class MonitorCore:
             last_run = schedule_details.get("LastMonitoringExecutionSummary", {})
             if last_run:
                 # If no inference was run since the last monitoring schedule, the
                 # status will be "Failed" with reason "Job inputs had no data",
                 # so we check for that and set the status to "No New Data"
@@ -162,187 +155,22 @@ class MonitorCore:
         return result
-    def enable_data_capture(self, capture_percentage=100, force=False):
-        """
-        Enable data capture for the SageMaker endpoint.
+    def enable_data_capture(self, capture_percentage=100):
+        """Enable data capture for the endpoint
         Args:
-            capture_percentage (int): Percentage of data to capture. Defaults to 100.
-            force (bool): If True, force reconfiguration even if data capture is already enabled.
+            capture_percentage (int): Percentage of requests to capture (0-100, default 100)
         """
-        # Early returns for cases where we can't/don't need to add data capture
         if self.endpoint.is_serverless():
             self.log.warning("Data capture is not supported for serverless endpoints.")
             return
-        if self.data_capture_enabled() and not force:
-            self.log.important(f"Data capture already configured for {self.endpoint_name}.")
-            return
-        # Get the current endpoint configuration name for later deletion
-        current_endpoint_config_name = self.endpoint.endpoint_config_name()
-        # Log the data capture operation
-        self.log.important(f"Enabling Data Capture for {self.endpoint_name} --> {self.data_capture_path}")
-        self.log.important("This normally redeploys the endpoint...")
-        # Create and apply the data capture configuration
-        data_capture_config = DataCaptureConfig(
-            enable_capture=True,  # Required parameter
-            sampling_percentage=capture_percentage,
-            destination_s3_uri=self.data_capture_path,
-        )
-        # Update endpoint with the new capture configuration
-        Predictor(self.endpoint_name, sagemaker_session=self.sagemaker_session).update_data_capture_config(
-            data_capture_config=data_capture_config
-        )
-        # Clean up old endpoint configuration
-        self.sagemaker_client.delete_endpoint_config(EndpointConfigName=current_endpoint_config_name)
-    def data_capture_config(self):
-        """
-        Returns the complete data capture configuration from the endpoint config.
-        Returns:
-            dict: Complete DataCaptureConfig from AWS, or None if not configured
-        """
-        config_name = self.endpoint.endpoint_config_name()
-        response = self.sagemaker_client.describe_endpoint_config(EndpointConfigName=config_name)
-        data_capture_config = response.get("DataCaptureConfig")
-        if not data_capture_config:
-            self.log.error(f"No data capture configuration found for endpoint config {config_name}")
-            return None
-        return data_capture_config
-    def disable_data_capture(self):
-        """
-        Disable data capture for the SageMaker endpoint.
-        """
-        # Early return if data capture isn't configured
-        if not self.data_capture_enabled():
-            self.log.important(f"Data capture is not currently enabled for {self.endpoint_name}.")
+        if self.data_capture.is_enabled():
+            self.log.info(f"Data capture is already enabled for {self.endpoint_name}.")
             return
-        # Get the current endpoint configuration name for later deletion
-        current_endpoint_config_name = self.endpoint.endpoint_config_name()
-        # Log the operation
-        self.log.important(f"Disabling Data Capture for {self.endpoint_name}")
-        self.log.important("This normally redeploys the endpoint...")
-        # Create a configuration with capture disabled
-        data_capture_config = DataCaptureConfig(enable_capture=False, destination_s3_uri=self.data_capture_path)
-        # Update endpoint with the new configuration
-        Predictor(self.endpoint_name, sagemaker_session=self.sagemaker_session).update_data_capture_config(
-            data_capture_config=data_capture_config
-        )
-        # Clean up old endpoint configuration
-        self.sagemaker_client.delete_endpoint_config(EndpointConfigName=current_endpoint_config_name)
-    def data_capture_enabled(self):
-        """
-        Check if data capture is already configured on the endpoint.
-        Args:
-            capture_percentage (int): Expected data capture percentage.
-        Returns:
-            bool: True if data capture is already configured, False otherwise.
-        """
-        try:
-            endpoint_config_name = self.endpoint.endpoint_config_name()
-            endpoint_config = self.sagemaker_client.describe_endpoint_config(EndpointConfigName=endpoint_config_name)
-            data_capture_config = endpoint_config.get("DataCaptureConfig", {})
-            # Check if data capture is enabled and the percentage matches
-            is_enabled = data_capture_config.get("EnableCapture", False)
-            return is_enabled
-        except Exception as e:
-            self.log.error(f"Error checking data capture configuration: {e}")
-            return False
-    def data_capture_percent(self):
-        """
-        Get the data capture percentage from the endpoint configuration.
-        Returns:
-            int: Data capture percentage if enabled, None otherwise.
-        """
-        try:
-            endpoint_config_name = self.endpoint.endpoint_config_name()
-            endpoint_config = self.sagemaker_client.describe_endpoint_config(EndpointConfigName=endpoint_config_name)
-            data_capture_config = endpoint_config.get("DataCaptureConfig", {})
-            # Check if data capture is enabled and return the percentage
-            if data_capture_config.get("EnableCapture", False):
-                return data_capture_config.get("InitialSamplingPercentage", 0)
-            else:
-                return None
-        except Exception as e:
-            self.log.error(f"Error checking data capture percentage: {e}")
-            return None
-    def get_captured_data(self, max_files=None, add_timestamp=True) -> Tuple[pd.DataFrame, pd.DataFrame]:
-        """
-        Read and process captured data from S3.
-        Args:
-            max_files (int, optional): Maximum number of files to process.
-                                       Defaults to None to process all files.
-            add_timestamp (bool, optional): Whether to add a timestamp column to the DataFrame.
-        Returns:
-            Tuple[pd.DataFrame, pd.DataFrame]: Processed input and output DataFrames.
-        """
-        # List files in the specified S3 path
-        files = wr.s3.list_objects(self.data_capture_path)
-        if not files:
-            self.log.warning(f"No data capture files found in {self.data_capture_path}.")
-            return pd.DataFrame(), pd.DataFrame()
-        self.log.info(f"Found {len(files)} files in {self.data_capture_path}.")
-        # Sort files by timestamp (assuming the naming convention includes timestamp)
-        files.sort()
-        # Select files to process
-        if max_files is None:
-            files_to_process = files
-            self.log.info(f"Processing all {len(files)} files.")
-        else:
-            files_to_process = files[-max_files:] if files else []
-            self.log.info(f"Processing the {len(files_to_process)} most recent file(s).")
-        # Process each file
-        all_input_dfs = []
-        all_output_dfs = []
-        for file_path in files_to_process:
-            self.log.info(f"Processing {file_path}...")
-            try:
-                # Read the JSON lines file
-                df = wr.s3.read_json(path=file_path, lines=True)
-                if not df.empty:
-                    input_df, output_df = process_data_capture(df)
-                    # Generate a timestamp column if requested
-                    if add_timestamp:
-                        # Get file metadata to extract last modified time
-                        file_metadata = wr.s3.describe_objects(path=file_path)
-                        timestamp = file_metadata[file_path]["LastModified"]
-                        output_df["timestamp"] = timestamp
-                    # Append the processed DataFrames to the lists
-                    all_input_dfs.append(input_df)
-                    all_output_dfs.append(output_df)
-            except Exception as e:
-                self.log.warning(f"Error processing file {file_path}: {e}")
-        # Combine all DataFrames
-        if not all_input_dfs or not all_output_dfs:
-            self.log.warning("No valid data was processed from the captured files.")
-            return pd.DataFrame(), pd.DataFrame()
-        return pd.concat(all_input_dfs, ignore_index=True), pd.concat(all_output_dfs, ignore_index=True)
+        self.data_capture.enable(capture_percentage=capture_percentage)
+        self.log.important(f"Enabled data capture for {self.endpoint_name} at {self.data_capture_path}")
     def baseline_exists(self) -> bool:
         """
@@ -533,6 +361,11 @@ class MonitorCore:
             self.log.warning("If you want to create another one, delete existing schedule first.")
             return
+        # Check if data capture is enabled, if not enable it
+        if not self.data_capture.is_enabled():
+            self.log.warning("Data capture is not enabled for this endpoint. Enabling it now...")
+            self.enable_data_capture(capture_percentage=100)
         # Set up a NEW monitoring schedule
         schedule_args = {
             "monitor_schedule_name": self.monitoring_schedule_name,
@@ -577,33 +410,6 @@ class MonitorCore:
         self.model_monitor.delete_monitoring_schedule()
         self.log.important(f"Deleted monitoring schedule for {self.endpoint_name}.")
-    # Put this functionality into this class
-    """
-    executions = my_monitor.list_executions()
-    latest_execution = executions[-1]
-    latest_execution.describe()['ProcessingJobStatus']
-    latest_execution.describe()['ExitMessage']
-    Here are the possible terminal states and what each of them means:
-    - Completed - This means the monitoring execution completed and no issues were found in the violations report.
-    - CompletedWithViolations - This means the execution completed, but constraint violations were detected.
-    - Failed - The monitoring execution failed, maybe due to client error
-                (perhaps incorrect role premissions) or infrastructure issues. Further
-                examination of the FailureReason and ExitMessage is necessary to identify what exactly happened.
-    - Stopped - job exceeded the max runtime or was manually stopped.
-    You can also get the S3 URI for the output with latest_execution.output.destination and analyze the results.
-    Visualize results
-    You can use the monitor object to gather reports for visualization:
-    suggested_constraints = my_monitor.suggested_constraints()
-    baseline_statistics = my_monitor.baseline_statistics()
-    latest_monitoring_violations = my_monitor.latest_monitoring_constraint_violations()
-    latest_monitoring_statistics = my_monitor.latest_monitoring_statistics()
-    """
     def get_monitoring_results(self, max_results=10) -> pd.DataFrame:
         """Get the results of monitoring executions
@@ -758,7 +564,7 @@ class MonitorCore:
         Returns:
             str: String representation of this MonitorCore object
         """
-        summary_dict = {}  # Disabling for now self.summary()
+        summary_dict = self.summary()
         summary_items = [f"  {repr(key)}: {repr(value)}" for key, value in summary_dict.items()]
         summary_str = f"{self.__class__.__name__}: {self.endpoint_name}\n" + ",\n".join(summary_items)
         return summary_str
@@ -775,7 +581,6 @@ if __name__ == "__main__":
     # Create the Class and test it out
     endpoint_name = "abalone-regression-rt"
-    endpoint_name = "logd-dev-reg-rt"
     my_endpoint = EndpointCore(endpoint_name)
     if not my_endpoint.exists():
         print(f"Endpoint {endpoint_name} does not exist.")
@@ -788,11 +593,10 @@ if __name__ == "__main__":
     # Check the details of the monitoring class
     pprint(mm.details())
-    # Enable data capture on the endpoint
-    mm.enable_data_capture()
+    # Enable data capture (if not already enabled)
+    mm.enable_data_capture(capture_percentage=100)
     # Create a baseline for monitoring
-    # mm.create_baseline(recreate=True)
     mm.create_baseline()
     # Check the monitoring outputs
@@ -804,30 +608,11 @@ if __name__ == "__main__":
     pprint(mm.get_constraints())
     print("\nStatistics...")
-    print(mm.get_statistics())
+    print(str(mm.get_statistics())[:1000])  # Print only first 1000 characters
     # Set up the monitoring schedule (if it doesn't already exist)
     mm.create_monitoring_schedule()
-    #
-    # Test the data capture by running some predictions
-    #
-    # Make predictions on the Endpoint using the FeatureSet evaluation data
-    # pred_df = my_endpoint.auto_inference()
-    # print(pred_df.head())
-    # Check that data capture is working
-    input_df, output_df = mm.get_captured_data()
-    if input_df.empty or output_df.empty:
-        print("No data capture files found, for a new endpoint it may take a few minutes to start capturing data")
-    else:
-        print("Found data capture files")
-        print("Input")
-        print(input_df.head())
-        print("Output")
-        print(output_df.head())
     # Test update_constraints (commented out for now)
     # print("\nTesting constraint updates...")
     # custom_constraints = {"sex": {"allowed_values": ["M", "F", "I"]}, "length": {"min": 0.0, "max": 1.0}}
@@ -846,7 +631,7 @@ if __name__ == "__main__":
     print("\nTesting execution details retrieval...")
     if not results_df.empty:
         latest_execution_arn = results_df.iloc[0]["processing_job_arn"]
-        execution_details = mm.get_execution_details(latest_execution_arn)
+        execution_details = mm.get_execution_details(latest_execution_arn) if latest_execution_arn else None
         if execution_details:
             print(f"Execution details for {latest_execution_arn}:")
             pprint(execution_details)

workbench/core/cloud_platform/aws/aws_account_clamp.py CHANGED Viewed

@@ -54,7 +54,11 @@ class AWSAccountClamp:
         # Check our Assume Role
         self.log.info("Checking Workbench Assumed Role...")
-        self.aws_session.assumed_role_info()
+        role_info = self.aws_session.assumed_role_info()
+        self.log.info(f"Assumed Role: {role_info}")
+        # Check if we have tag write permissions (if we don't, we are read-only)
+        self.read_only = not self.check_tag_permissions()
         # Check our Workbench API Key and Load the License
         self.log.info("Checking Workbench API License...")
@@ -138,6 +142,45 @@ class AWSAccountClamp:
         """
         return self.boto3_session.client("sagemaker")
+    def check_tag_permissions(self):
+        """Check if current role has permission to add tags to SageMaker endpoints.
+        Returns:
+            bool: True if AddTags is allowed, False otherwise
+        """
+        try:
+            sagemaker = self.boto3_session.client("sagemaker")
+            # Use a non-existent endpoint name
+            fake_endpoint = "workbench-permission-check-dummy-endpoint"
+            # Try to add tags to the non-existent endpoint
+            sagemaker.add_tags(
+                ResourceArn=f"arn:aws:sagemaker:{self.region}:{self.account_id}:endpoint/{fake_endpoint}",
+                Tags=[{"Key": "PermissionCheck", "Value": "Test"}],
+            )
+            # If we get here, we have permission (but endpoint doesn't exist)
+            return True
+        except ClientError as e:
+            error_code = e.response["Error"]["Code"]
+            # AccessDeniedException = no permission
+            if error_code == "AccessDeniedException":
+                self.log.debug("No AddTags permission (AccessDeniedException)")
+                return False
+            # ResourceNotFound = we have permission, but endpoint doesn't exist
+            elif error_code in ["ResourceNotFound", "ValidationException"]:
+                self.log.debug("AddTags permission verified (resource not found)")
+                return True
+            # Unexpected error, assume no permission for safety
+            else:
+                self.log.debug(f"Unexpected error checking permissions: {error_code}")
+                return False
 if __name__ == "__main__":
     """Exercise the AWS Account Clamp Class"""
@@ -162,3 +205,9 @@ if __name__ == "__main__":
     print("\n\n*** AWS Sagemaker Session/Client Check ***")
     sm_client = aws_account_clamp.sagemaker_client()
     print(sm_client.list_feature_groups()["FeatureGroupSummaries"])
+    print("\n\n*** AWS Tag Permission Check ***")
+    if aws_account_clamp.check_tag_permissions():
+        print("Tag Permission Check Success...")
+    else:
+        print("Tag Permission Check Failed...")

workbench/core/cloud_platform/aws/aws_meta.py CHANGED Viewed

@@ -196,7 +196,9 @@ class AWSMeta:
         # Return the summary as a DataFrame
         df = pd.DataFrame(data_summary).convert_dtypes()
-        return df.sort_values(by="Created", ascending=False)
+        if not df.empty:
+            df.sort_values(by="Created", ascending=False, inplace=True)
+        return df
     def models(self, details: bool = False) -> pd.DataFrame:
         """Get a summary of the Models in AWS.
@@ -256,7 +258,9 @@ class AWSMeta:
         # Return the summary as a DataFrame
         df = pd.DataFrame(model_summary).convert_dtypes()
-        return df.sort_values(by="Created", ascending=False)
+        if not df.empty:
+            df.sort_values(by="Created", ascending=False, inplace=True)
+        return df
     def endpoints(self, details: bool = False) -> pd.DataFrame:
         """Get a summary of the Endpoints in AWS.
@@ -308,7 +312,7 @@ class AWSMeta:
                     "Status": endpoint_details.get("EndpointStatus", "-"),
                     "Config": endpoint_details.get("EndpointConfigName", "-"),
                     "Variant": endpoint_details["config"]["variant"],
-                    "Capture": str(endpoint_details.get("DataCaptureConfig", {}).get("EnableCapture", "False")),
+                    "Capture": str(endpoint_details.get("DataCaptureConfig", {}).get("EnableCapture", "-")),
                     "Samp(%)": str(endpoint_details.get("DataCaptureConfig", {}).get("CurrentSamplingPercentage", "-")),
                     "Tags": aws_tags.get("workbench_tags", "-"),
                     "Monitored": endpoint_details["monitored"],
@@ -317,7 +321,9 @@ class AWSMeta:
         # Return the summary as a DataFrame
         df = pd.DataFrame(data_summary).convert_dtypes()
-        return df.sort_values(by="Created", ascending=False)
+        if not df.empty:
+            df.sort_values(by="Created", ascending=False, inplace=True)
+        return df
     def _endpoint_config_info(self, endpoint_config_name: str) -> dict:
         """Internal: Get the Endpoint Configuration information for the given endpoint config name.
@@ -657,7 +663,8 @@ class AWSMeta:
         df = pd.DataFrame(data_summary).convert_dtypes()
         # Sort by the Modified column
-        df = df.sort_values(by="Modified", ascending=False)
+        if not df.empty:
+            df = df.sort_values(by="Modified", ascending=False)
         return df
     def _aws_pipelines(self) -> pd.DataFrame:

workbench/core/cloud_platform/aws/aws_parameter_store.py CHANGED Viewed

@@ -4,6 +4,7 @@ from typing import Union
 import logging
 import json
 import zlib
+import time
 import base64
 from botocore.exceptions import ClientError
@@ -77,7 +78,7 @@ class AWSParameterStore:
             all_parameters = []
             # Make the initial call to describe parameters
-            response = self.ssm_client.describe_parameters(**params)
+            response = self._call_with_retry(self.ssm_client.describe_parameters, **params)
             # Aggregate the names from the initial response
             all_parameters.extend(param["Name"] for param in response["Parameters"])
@@ -86,7 +87,7 @@ class AWSParameterStore:
             while "NextToken" in response:
                 # Update the parameters with the NextToken for subsequent calls
                 params["NextToken"] = response["NextToken"]
-                response = self.ssm_client.describe_parameters(**params)
+                response = self._call_with_retry(self.ssm_client.describe_parameters, **params)
                 # Aggregate the names from the subsequent responses
                 all_parameters.extend(param["Name"] for param in response["Parameters"])
@@ -183,6 +184,21 @@ class AWSParameterStore:
             self.log.critical(f"Failed to add/update parameter '{name}': {e}")
             raise
+    def _call_with_retry(self, func, **kwargs):
+        """Call AWS API with exponential backoff on throttling."""
+        max_retries = 5
+        base_delay = 1
+        for attempt in range(max_retries):
+            try:
+                return func(**kwargs)
+            except ClientError as e:
+                if e.response["Error"]["Code"] == "ThrottlingException" and attempt < max_retries - 1:
+                    delay = base_delay * (2**attempt)
+                    self.log.warning(f"Throttled, retrying in {delay}s...")
+                    time.sleep(delay)
+                else:
+                    raise
     @staticmethod
     def _compress_value(value) -> str:
         """Compress a value with precision reduction."""

workbench/core/cloud_platform/aws/aws_session.py CHANGED Viewed

@@ -10,7 +10,7 @@ import logging
 # Workbench Imports
 from workbench.utils.config_manager import ConfigManager
-from workbench.utils.execution_environment import running_on_lambda, running_on_glue
+from workbench_bridges.utils.execution_environment import running_as_service
 # Attempt to import IPython-related utilities
 try:
@@ -66,10 +66,10 @@ class AWSSession:
         return self._cached_boto3_session
     def _create_boto3_session(self):
-        """Internal: Get the AWS Boto3 Session, defaulting to the Workbench Role if possible."""
+        """Internal: Get the AWS Boto3 Session, assuming the Workbench Role if necessary."""
-        # Check the execution environment and determine if we need to assume the Workbench Role
-        if running_on_lambda() or running_on_glue() or self.is_workbench_role():
+        # Check if we're running as a service or already using the Workbench Role
+        if running_as_service() or self.is_workbench_role():
             self.log.important("Using the default Boto3 session...")
             return boto3.Session(region_name=self.region)

workbench/core/transforms/data_to_features/light/molecular_descriptors.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """MolecularDescriptors: Compute a Feature Set based on RDKit Descriptors
-Note: An alternative to using this class is to use the `compute_molecular_descriptors` function directly.
-      df_features = compute_molecular_descriptors(df)
+Note: An alternative to using this class is to use the `compute_descriptors` function directly.
+      df_features = compute_descriptors(df)
       to_features = PandasToFeatures("my_feature_set")
          to_features.set_input(df_features, id_column="id")
          to_features.set_output_tags(["blah", "whatever"])
@@ -10,7 +10,7 @@ Note: An alternative to using this class is to use the `compute_molecular_descri
 # Local Imports
 from workbench.core.transforms.data_to_features.light.data_to_features_light import DataToFeaturesLight
-from workbench.utils.chem_utils import compute_molecular_descriptors
+from workbench.utils.chem_utils.mol_descriptors import compute_descriptors
 class MolecularDescriptors(DataToFeaturesLight):
@@ -39,7 +39,7 @@ class MolecularDescriptors(DataToFeaturesLight):
         """Compute a Feature Set based on RDKit Descriptors"""
         # Compute/add all the Molecular Descriptors
-        self.output_df = compute_molecular_descriptors(self.input_df)
+        self.output_df = compute_descriptors(self.input_df)
 if __name__ == "__main__":

workbench 0.8.162__py3-none-any.whl → 0.8.202__py3-none-any.whl

Potentially problematic release.

workbench 0.8.162__py3-none-any.whl → 0.8.202__py3-none-any.whl