PyPI - workbench - Versions diffs - 0.8.168__py3-none-any.whl → 0.8.192__py3-none-any.whl - Mend

workbench 0.8.168__py3-none-any.whl → 0.8.192__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (88) hide show

workbench/algorithms/dataframe/proximity.py +143 -102
workbench/algorithms/graph/light/proximity_graph.py +2 -1
workbench/api/compound.py +1 -1
workbench/api/endpoint.py +3 -2
workbench/api/feature_set.py +4 -4
workbench/api/model.py +16 -12
workbench/api/monitor.py +1 -16
workbench/core/artifacts/artifact.py +11 -3
workbench/core/artifacts/data_capture_core.py +355 -0
workbench/core/artifacts/endpoint_core.py +113 -27
workbench/core/artifacts/feature_set_core.py +72 -13
workbench/core/artifacts/model_core.py +50 -15
workbench/core/artifacts/monitor_core.py +33 -249
workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
workbench/core/cloud_platform/aws/aws_meta.py +11 -4
workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
workbench/core/transforms/features_to_model/features_to_model.py +9 -4
workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
workbench/core/views/training_view.py +49 -53
workbench/core/views/view.py +51 -1
workbench/core/views/view_utils.py +4 -4
workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
workbench/model_scripts/pytorch_model/pytorch.template +9 -18
workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
workbench/model_scripts/script_generation.py +7 -2
workbench/model_scripts/uq_models/mapie.template +492 -0
workbench/model_scripts/uq_models/requirements.txt +1 -0
workbench/model_scripts/xgb_model/xgb_model.template +31 -40
workbench/repl/workbench_shell.py +4 -4
workbench/scripts/lambda_launcher.py +63 -0
workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +49 -51
workbench/scripts/ml_pipeline_sqs.py +186 -0
workbench/utils/chem_utils/__init__.py +0 -0
workbench/utils/chem_utils/fingerprints.py +134 -0
workbench/utils/chem_utils/misc.py +194 -0
workbench/utils/chem_utils/mol_descriptors.py +483 -0
workbench/utils/chem_utils/mol_standardize.py +450 -0
workbench/utils/chem_utils/mol_tagging.py +348 -0
workbench/utils/chem_utils/projections.py +209 -0
workbench/utils/chem_utils/salts.py +256 -0
workbench/utils/chem_utils/sdf.py +292 -0
workbench/utils/chem_utils/toxicity.py +250 -0
workbench/utils/chem_utils/vis.py +253 -0
workbench/utils/config_manager.py +2 -6
workbench/utils/endpoint_utils.py +5 -7
workbench/utils/license_manager.py +2 -6
workbench/utils/model_utils.py +76 -30
workbench/utils/monitor_utils.py +44 -62
workbench/utils/pandas_utils.py +3 -3
workbench/utils/shap_utils.py +10 -2
workbench/utils/workbench_sqs.py +1 -1
workbench/utils/xgboost_model_utils.py +283 -145
workbench/web_interface/components/plugins/dashboard_status.py +3 -1
workbench/web_interface/components/plugins/generated_compounds.py +1 -1
workbench/web_interface/components/plugins/scatter_plot.py +3 -3
{workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/METADATA +2 -1
{workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/RECORD +74 -70
{workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/entry_points.txt +3 -1
workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
workbench/model_scripts/pytorch_model/generated_model_script.py +0 -576
workbench/model_scripts/quant_regression/quant_regression.template +0 -279
workbench/model_scripts/quant_regression/requirements.txt +0 -1
workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
workbench/model_scripts/xgb_model/generated_model_script.py +0 -477
workbench/utils/chem_utils.py +0 -1556
workbench/utils/fast_inference.py +0 -167
workbench/utils/resource_utils.py +0 -39
{workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/WHEEL +0 -0
{workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/licenses/LICENSE +0 -0
{workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/top_level.txt +0 -0

workbench/core/transforms/model_to_endpoint/model_to_endpoint.py CHANGED Viewed

@@ -5,6 +5,7 @@ from sagemaker import ModelPackage
 from sagemaker.serializers import CSVSerializer
 from sagemaker.deserializers import CSVDeserializer
 from sagemaker.serverless import ServerlessInferenceConfig
+from sagemaker.model_monitor import DataCaptureConfig
 # Local Imports
 from workbench.core.transforms.transform import Transform, TransformInput, TransformOutput
@@ -51,27 +52,38 @@ class ModelToEndpoint(Transform):
         EndpointCore.managed_delete(self.output_name)
         # Get the Model Package ARN for our input model
-        input_model = ModelCore(self.input_name)
-        model_package_arn = input_model.model_package_arn()
+        workbench_model = ModelCore(self.input_name)
         # Deploy the model
-        self._deploy_model(model_package_arn, **kwargs)
+        self._deploy_model(workbench_model, **kwargs)
         # Add this endpoint to the set of registered endpoints for the model
-        input_model.register_endpoint(self.output_name)
+        workbench_model.register_endpoint(self.output_name)
         # This ensures that the endpoint is ready for use
         time.sleep(5)  # We wait for AWS Lag
         end = EndpointCore(self.output_name)
         self.log.important(f"Endpoint {end.name} is ready for use")
-    def _deploy_model(self, model_package_arn: str, mem_size: int = 2048, max_concurrency: int = 5):
+    def _deploy_model(
+        self,
+        workbench_model: ModelCore,
+        mem_size: int = 2048,
+        max_concurrency: int = 5,
+        data_capture: bool = False,
+        capture_percentage: int = 100,
+    ):
         """Internal Method: Deploy the Model
         Args:
-            model_package_arn(str): The Model Package ARN used to deploy the Endpoint
+            workbench_model(ModelCore): The Workbench ModelCore object to deploy
+            mem_size(int): Memory size for serverless deployment
+            max_concurrency(int): Max concurrency for serverless deployment
+            data_capture(bool): Enable data capture during deployment
+            capture_percentage(int): Percentage of data to capture. Defaults to 100.
         """
         # Grab the specified Model Package
+        model_package_arn = workbench_model.model_package_arn()
         model_package = ModelPackage(
             role=self.workbench_role_arn,
             model_package_arn=model_package_arn,
@@ -95,6 +107,23 @@ class ModelToEndpoint(Transform):
                 max_concurrency=max_concurrency,
             )
+        # Configure data capture if requested (and not serverless)
+        data_capture_config = None
+        if data_capture and not self.serverless:
+            # Set up the S3 path for data capture
+            base_endpoint_path = f"{workbench_model.endpoints_s3_path}/{self.output_name}"
+            data_capture_path = f"{base_endpoint_path}/data_capture"
+            self.log.important(f"Configuring Data Capture --> {data_capture_path}")
+            data_capture_config = DataCaptureConfig(
+                enable_capture=True,
+                sampling_percentage=capture_percentage,
+                destination_s3_uri=data_capture_path,
+            )
+        elif data_capture and self.serverless:
+            self.log.warning(
+                "Data capture is not supported for serverless endpoints. Skipping data capture configuration."
+            )
         # Deploy the Endpoint
         self.log.important(f"Deploying the Endpoint {self.output_name}...")
         model_package.deploy(
@@ -104,6 +133,7 @@ class ModelToEndpoint(Transform):
             endpoint_name=self.output_name,
             serializer=CSVSerializer(),
             deserializer=CSVDeserializer(),
+            data_capture_config=data_capture_config,
             tags=aws_tags,
         )

workbench/core/transforms/pandas_transforms/pandas_to_features.py CHANGED Viewed

@@ -327,9 +327,36 @@ class PandasToFeatures(Transform):
         self.delete_existing()
         self.output_feature_group = self.create_feature_group()
+    def mac_spawn_hack(self):
+        """Workaround for macOS Tahoe fork/spawn issue with SageMaker FeatureStore ingest.
+        See: https://github.com/aws/sagemaker-python-sdk/issues/5312
+        macOS Tahoe 26+ has issues with forked processes creating boto3 sessions.
+        This forces spawn mode on macOS to avoid the hang.
+        """
+        import platform
+        if platform.system() == "Darwin":  # macOS
+            self.log.warning("macOS detected, forcing 'spawn' mode for multiprocessing (Tahoe hang workaround)")
+            import multiprocessing
+            try:
+                import multiprocess
+                multiprocess.set_start_method("spawn", force=True)
+            except (RuntimeError, ImportError):
+                pass  # Already set or multiprocess not available
+            try:
+                multiprocessing.set_start_method("spawn", force=True)
+            except RuntimeError:
+                pass  # Already set
     def transform_impl(self):
         """Transform Implementation: Ingest the data into the Feature Group"""
+        # Workaround for macOS Tahoe hang issue
+        self.mac_spawn_hack()
         # Now we actually push the data into the Feature Group (called ingestion)
         self.log.important(f"Ingesting rows into Feature Group {self.output_name}...")
         ingest_manager = self.output_feature_group.ingest(self.output_df, max_workers=8, max_processes=4, wait=False)

workbench/core/views/training_view.py CHANGED Viewed

@@ -3,14 +3,18 @@
 from typing import Union
 # Workbench Imports
-from workbench.api import DataSource, FeatureSet
+from workbench.api import FeatureSet
 from workbench.core.views.view import View
 from workbench.core.views.create_view import CreateView
 from workbench.core.views.view_utils import get_column_list
 class TrainingView(CreateView):
-    """TrainingView Class: A View with an additional training column that marks holdout ids
+    """TrainingView Class: A View with an additional training column (80/20 or holdout ids).
+    The TrainingView class creates a SQL view that includes all columns from the source table
+    along with an additional boolean column named "training". This view can also include
+    a SQL filter expression to filter the rows included in the view.
     Common Usage:
         ```python
@@ -19,8 +23,9 @@ class TrainingView(CreateView):
         training_view = TrainingView.create(fs)
         df = training_view.pull_dataframe()
-        # Create a TrainingView with a specific set of columns
-        training_view = TrainingView.create(fs, column_list=["my_col1", "my_col2"])
+        # Create a TrainingView with a specific filter expression
+        training_view = TrainingView.create(fs, id_column="auto_id", filter_expression="age > 30")
+        df = training_view.pull_dataframe()
         # Query the view
         df = training_view.query(f"SELECT * FROM {training_view.table} where training = TRUE")
@@ -31,17 +36,21 @@ class TrainingView(CreateView):
     def create(
         cls,
         feature_set: FeatureSet,
-        source_table: str = None,
+        *,  # Enforce keyword arguments after feature_set
         id_column: str = None,
         holdout_ids: Union[list[str], list[int], None] = None,
+        filter_expression: str = None,
+        source_table: str = None,
     ) -> Union[View, None]:
         """Factory method to create and return a TrainingView instance.
         Args:
             feature_set (FeatureSet): A FeatureSet object
-            source_table (str, optional): The table/view to create the view from. Defaults to None.
             id_column (str, optional): The name of the id column. Defaults to None.
             holdout_ids (Union[list[str], list[int], None], optional): A list of holdout ids. Defaults to None.
+            filter_expression (str, optional): SQL filter expression (e.g., "age > 25 AND status = 'active'").
+                                               Defaults to None.
+            source_table (str, optional): The table/view to create the view from. Defaults to None.
         Returns:
             Union[View, None]: The created View object (or None if failed to create the view)
@@ -69,28 +78,36 @@ class TrainingView(CreateView):
                 else:
                     id_column = instance.auto_id_column
-        # If we don't have holdout ids, create a default training view
-        if not holdout_ids:
-            instance._default_training_view(instance.data_source, id_column)
-            return View(instance.data_source, instance.view_name, auto_create_view=False)
+        # Enclose each column name in double quotes
+        sql_columns = ", ".join([f'"{column}"' for column in column_list])
-        # Format the list of holdout ids for SQL IN clause
-        if holdout_ids and all(isinstance(id, str) for id in holdout_ids):
-            formatted_holdout_ids = ", ".join(f"'{id}'" for id in holdout_ids)
+        # Build the training assignment logic
+        if holdout_ids:
+            # Format the list of holdout ids for SQL IN clause
+            if all(isinstance(id, str) for id in holdout_ids):
+                formatted_holdout_ids = ", ".join(f"'{id}'" for id in holdout_ids)
+            else:
+                formatted_holdout_ids = ", ".join(map(str, holdout_ids))
+            training_logic = f"""CASE
+                WHEN {id_column} IN ({formatted_holdout_ids}) THEN False
+                ELSE True
+            END AS training"""
         else:
-            formatted_holdout_ids = ", ".join(map(str, holdout_ids))
+            # Default 80/20 split using modulo
+            training_logic = f"""CASE
+                WHEN MOD(ROW_NUMBER() OVER (ORDER BY {id_column}), 10) < 8 THEN True
+                ELSE False
+            END AS training"""
-        # Enclose each column name in double quotes
-        sql_columns = ", ".join([f'"{column}"' for column in column_list])
+        # Build WHERE clause if filter_expression is provided
+        where_clause = f"\nWHERE {filter_expression}" if filter_expression else ""
         # Construct the CREATE VIEW query
         create_view_query = f"""
         CREATE OR REPLACE VIEW {instance.table} AS
-        SELECT {sql_columns}, CASE
-            WHEN {id_column} IN ({formatted_holdout_ids}) THEN False
-            ELSE True
-        END AS training
-        FROM {instance.source_table}
+        SELECT {sql_columns}, {training_logic}
+        FROM {instance.source_table}{where_clause}
         """
         # Execute the CREATE VIEW query
@@ -99,43 +116,13 @@ class TrainingView(CreateView):
         # Return the View
         return View(instance.data_source, instance.view_name, auto_create_view=False)
-    # This is an internal method that's used to create a default training view
-    def _default_training_view(self, data_source: DataSource, id_column: str):
-        """Create a default view in Athena that assigns roughly 80% of the data to training
-        Args:
-            data_source (DataSource): The Workbench DataSource object
-            id_column (str): The name of the id column
-        """
-        self.log.important(f"Creating default Training View {self.table}...")
-        # Drop any columns generated from AWS
-        aws_cols = ["write_time", "api_invocation_time", "is_deleted", "event_time"]
-        column_list = [col for col in data_source.columns if col not in aws_cols]
-        # Enclose each column name in double quotes
-        sql_columns = ", ".join([f'"{column}"' for column in column_list])
-        # Construct the CREATE VIEW query with a simple modulo operation for the 80/20 split
-        create_view_query = f"""
-        CREATE OR REPLACE VIEW "{self.table}" AS
-        SELECT {sql_columns}, CASE
-            WHEN MOD(ROW_NUMBER() OVER (ORDER BY {id_column}), 10) < 8 THEN True  -- Assign 80% to training
-            ELSE False  -- Assign roughly 20% to validation/test
-        END AS training
-        FROM {self.base_table_name}
-        """
-        # Execute the CREATE VIEW query
-        data_source.execute_statement(create_view_query)
 if __name__ == "__main__":
     """Exercise the Training View functionality"""
     from workbench.api import FeatureSet
     # Get the FeatureSet
-    fs = FeatureSet("test_features")
+    fs = FeatureSet("abalone_features")
     # Delete the existing training view
     training_view = TrainingView.create(fs)
@@ -152,9 +139,18 @@ if __name__ == "__main__":
     # Create a TrainingView with holdout ids
     my_holdout_ids = list(range(10))
-    training_view = TrainingView.create(fs, id_column="id", holdout_ids=my_holdout_ids)
+    training_view = TrainingView.create(fs, id_column="auto_id", holdout_ids=my_holdout_ids)
     # Pull the training data
     df = training_view.pull_dataframe()
     print(df.head())
     print(df["training"].value_counts())
+    print(f"Shape: {df.shape}")
+    print(f"Diameter min: {df['diameter'].min()}, max: {df['diameter'].max()}")
+    # Test the filter expression
+    training_view = TrainingView.create(fs, id_column="auto_id", filter_expression="diameter > 0.5")
+    df = training_view.pull_dataframe()
+    print(df.head())
+    print(f"Shape with filter: {df.shape}")
+    print(f"Diameter min: {df['diameter'].min()}, max: {df['diameter'].max()}")

workbench/core/views/view.py CHANGED Viewed

@@ -196,12 +196,52 @@ class View:
         # The BaseView always exists
         if self.view_name == "base":
-            return True
+            return
         # Check the database directly
         if not self._check_database():
             self._auto_create_view()
+    def copy(self, dest_view_name: str) -> "View":
+        """Copy this view to a new view with a different name
+        Args:
+            dest_view_name (str): The destination view name (e.g. "training_v1")
+        Returns:
+            View: A new View object for the destination view
+        """
+        # Can't copy the base view
+        if self.view_name == "base":
+            self.log.error("Cannot copy the base view")
+            return None
+        # Get the view definition
+        get_view_query = f"""
+        SELECT view_definition
+        FROM information_schema.views
+        WHERE table_schema = '{self.database}'
+        AND table_name = '{self.table}'
+        """
+        df = self.data_source.query(get_view_query)
+        if df.empty:
+            self.log.error(f"View {self.table} not found")
+            return None
+        view_definition = df.iloc[0]["view_definition"]
+        # Create the new view with the destination name
+        dest_table = f"{self.base_table_name}___{dest_view_name}"
+        create_view_query = f'CREATE OR REPLACE VIEW "{dest_table}" AS {view_definition}'
+        self.log.important(f"Copying view {self.table} to {dest_table}...")
+        self.data_source.execute_statement(create_view_query)
+        # Return a new View object for the destination
+        artifact = FeatureSet(self.artifact_name) if self.is_feature_set else DataSource(self.artifact_name)
+        return View(artifact, dest_view_name, auto_create_view=False)
     def _check_database(self) -> bool:
         """Internal: Check if the view exists in the database
@@ -324,3 +364,13 @@ if __name__ == "__main__":
     # Test supplemental data tables deletion
     view = View(fs, "test_view")
     view.delete()
+    # Test copying a view
+    fs = FeatureSet("test_features")
+    display_view = View(fs, "display")
+    copied_view = display_view.copy("display_copy")
+    print(copied_view)
+    print(copied_view.pull_dataframe().head())
+    # Clean up copied view
+    copied_view.delete()

workbench/core/views/view_utils.py CHANGED Viewed

@@ -296,15 +296,15 @@ if __name__ == "__main__":
     print("View Details on the FeatureSet Table...")
     print(view_details(my_data_source.table, my_data_source.database, my_data_source.boto3_session))
-    print("View Details on the Training View...")
-    training_view = fs.view("training")
+    print("View Details on the Display View...")
+    training_view = fs.view("display")
     print(view_details(training_view.table, training_view.database, my_data_source.boto3_session))
     # Test get_column_list
     print(get_column_list(my_data_source))
-    # Test get_column_list (with training view)
-    training_table = fs.view("training").table
+    # Test get_column_list (with display view)
+    training_table = fs.view("display").table
     print(get_column_list(my_data_source, training_table))
     # Test list_views

workbench 0.8.168__py3-none-any.whl → 0.8.192__py3-none-any.whl

workbench 0.8.168__py3-none-any.whl → 0.8.192__py3-none-any.whl