PyPI - workbench - Versions diffs - 0.8.162__py3-none-any.whl → 0.8.202__py3-none-any.whl - Mend

workbench-0.8.162-py3-none-any.whl → workbench-0.8.202-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of workbench might be problematic. Click here for more details.

Files changed (113)

workbench/algorithms/dataframe/__init__.py +1 -2
workbench/algorithms/dataframe/fingerprint_proximity.py +2 -2
workbench/algorithms/dataframe/proximity.py +261 -235
workbench/algorithms/graph/light/proximity_graph.py +10 -8
workbench/api/__init__.py +2 -1
workbench/api/compound.py +1 -1
workbench/api/endpoint.py +11 -0
workbench/api/feature_set.py +11 -8
workbench/api/meta.py +5 -2
workbench/api/model.py +16 -15
workbench/api/monitor.py +1 -16
workbench/core/artifacts/__init__.py +11 -2
workbench/core/artifacts/artifact.py +11 -3
workbench/core/artifacts/data_capture_core.py +355 -0
workbench/core/artifacts/endpoint_core.py +256 -118
workbench/core/artifacts/feature_set_core.py +265 -16
workbench/core/artifacts/model_core.py +107 -60
workbench/core/artifacts/monitor_core.py +33 -248
workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
workbench/core/cloud_platform/aws/aws_meta.py +12 -5
workbench/core/cloud_platform/aws/aws_parameter_store.py +18 -2
workbench/core/cloud_platform/aws/aws_session.py +4 -4
workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
workbench/core/transforms/features_to_model/features_to_model.py +42 -32
workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
workbench/core/views/training_view.py +113 -42
workbench/core/views/view.py +53 -3
workbench/core/views/view_utils.py +4 -4
workbench/model_scripts/chemprop/chemprop.template +852 -0
workbench/model_scripts/chemprop/generated_model_script.py +852 -0
workbench/model_scripts/chemprop/requirements.txt +11 -0
workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
workbench/model_scripts/custom_models/proximity/proximity.py +261 -235
workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
workbench/model_scripts/custom_models/uq_models/meta_uq.template +166 -62
workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
workbench/model_scripts/custom_models/uq_models/proximity.py +261 -235
workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
workbench/model_scripts/pytorch_model/generated_model_script.py +373 -190
workbench/model_scripts/pytorch_model/pytorch.template +370 -187
workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
workbench/model_scripts/script_generation.py +17 -9
workbench/model_scripts/uq_models/generated_model_script.py +605 -0
workbench/model_scripts/uq_models/mapie.template +605 -0
workbench/model_scripts/uq_models/requirements.txt +1 -0
workbench/model_scripts/xgb_model/generated_model_script.py +37 -46
workbench/model_scripts/xgb_model/xgb_model.template +44 -46
workbench/repl/workbench_shell.py +28 -14
workbench/scripts/endpoint_test.py +162 -0
workbench/scripts/lambda_test.py +73 -0
workbench/scripts/ml_pipeline_batch.py +137 -0
workbench/scripts/ml_pipeline_sqs.py +186 -0
workbench/scripts/monitor_cloud_watch.py +20 -100
workbench/utils/aws_utils.py +4 -3
workbench/utils/chem_utils/__init__.py +0 -0
workbench/utils/chem_utils/fingerprints.py +134 -0
workbench/utils/chem_utils/misc.py +194 -0
workbench/utils/chem_utils/mol_descriptors.py +483 -0
workbench/utils/chem_utils/mol_standardize.py +450 -0
workbench/utils/chem_utils/mol_tagging.py +348 -0
workbench/utils/chem_utils/projections.py +209 -0
workbench/utils/chem_utils/salts.py +256 -0
workbench/utils/chem_utils/sdf.py +292 -0
workbench/utils/chem_utils/toxicity.py +250 -0
workbench/utils/chem_utils/vis.py +253 -0
workbench/utils/chemprop_utils.py +760 -0
workbench/utils/cloudwatch_handler.py +1 -1
workbench/utils/cloudwatch_utils.py +137 -0
workbench/utils/config_manager.py +3 -7
workbench/utils/endpoint_utils.py +5 -7
workbench/utils/license_manager.py +2 -6
workbench/utils/model_utils.py +95 -34
workbench/utils/monitor_utils.py +44 -62
workbench/utils/pandas_utils.py +3 -3
workbench/utils/pytorch_utils.py +526 -0
workbench/utils/shap_utils.py +10 -2
workbench/utils/workbench_logging.py +0 -3
workbench/utils/workbench_sqs.py +1 -1
workbench/utils/xgboost_model_utils.py +371 -156
workbench/web_interface/components/model_plot.py +7 -1
workbench/web_interface/components/plugin_unit_test.py +5 -2
workbench/web_interface/components/plugins/dashboard_status.py +3 -1
workbench/web_interface/components/plugins/generated_compounds.py +1 -1
workbench/web_interface/components/plugins/model_details.py +9 -7
workbench/web_interface/components/plugins/scatter_plot.py +3 -3
{workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/METADATA +27 -6
{workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/RECORD +101 -85
{workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/entry_points.txt +4 -0
{workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/licenses/LICENSE +1 -1
workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
workbench/model_scripts/quant_regression/quant_regression.template +0 -279
workbench/model_scripts/quant_regression/requirements.txt +0 -1
workbench/utils/chem_utils.py +0 -1556
workbench/utils/execution_environment.py +0 -211
workbench/utils/fast_inference.py +0 -167
workbench/utils/resource_utils.py +0 -39
{workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/WHEEL +0 -0
{workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/top_level.txt +0 -0

workbench/core/transforms/features_to_model/features_to_model.py CHANGED Viewed

@@ -9,7 +9,7 @@ import time
 # Local Imports
 from workbench.core.transforms.transform import Transform, TransformInput, TransformOutput
 from workbench.core.artifacts.feature_set_core import FeatureSetCore
-from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelImages
+from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelFramework, ModelImages
 from workbench.core.artifacts.artifact import Artifact
 from workbench.model_scripts.script_generation import generate_model_script, fill_template
 from workbench.utils.model_utils import supported_instance_types
@@ -33,12 +33,13 @@ class FeaturesToModel(Transform):
         feature_name: str,
         model_name: str,
         model_type: ModelType,
+        model_framework=ModelFramework.XGBOOST,
         model_class=None,
         model_import_str=None,
         custom_script=None,
         custom_args=None,
-        training_image="xgb_training",
-        inference_image="xgb_inference",
+        training_image="training",
+        inference_image="inference",
         inference_arch="x86_64",
     ):
         """FeaturesToModel Initialization
@@ -46,12 +47,13 @@ class FeaturesToModel(Transform):
             feature_name (str): Name of the FeatureSet to use as input
             model_name (str): Name of the Model to create as output
             model_type (ModelType): ModelType.REGRESSOR or ModelType.CLASSIFIER, etc.
+            model_framework (ModelFramework, optional): The model framework (default ModelFramework.XGBOOST)
             model_class (str, optional): The scikit model (e.g. KNeighborsRegressor) (default None)
             model_import_str (str, optional): The import string for the model (default None)
             custom_script (str, optional): Custom script to use for the model (default None)
             custom_args (dict, optional): Custom arguments to pass to custom model scripts (default None)
-            training_image (str, optional): Training image (default "xgb_training")
-            inference_image (str, optional): Inference image (default "xgb_inference")
+            training_image (str, optional): Training image (default "training")
+            inference_image (str, optional): Inference image (default "inference")
             inference_arch (str, optional): Inference architecture (default "x86_64")
         """
@@ -65,6 +67,7 @@ class FeaturesToModel(Transform):
         self.input_type = TransformInput.FEATURE_SET
         self.output_type = TransformOutput.MODEL
         self.model_type = model_type
+        self.model_framework = model_framework
         self.model_class = model_class
         self.model_import_str = model_import_str
         self.custom_script = str(custom_script) if custom_script else None
@@ -157,6 +160,7 @@ class FeaturesToModel(Transform):
         template_params = {
             "model_imports": self.model_import_str,
             "model_type": self.model_type,
+            "model_framework": self.model_framework,
             "model_class": self.model_class,
             "target_column": self.target_column,
             "feature_list": self.model_feature_list,
@@ -184,13 +188,15 @@ class FeaturesToModel(Transform):
             # Generate our model script
             script_path = generate_model_script(template_params)
-        # Metric Definitions for Regression
+        # Metric Definitions for Regression (matches model script output format)
         if self.model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]:
             metric_definitions = [
-                {"Name": "RMSE", "Regex": "RMSE: ([0-9.]+)"},
-                {"Name": "MAE", "Regex": "MAE: ([0-9.]+)"},
-                {"Name": "R2", "Regex": "R2: ([0-9.]+)"},
-                {"Name": "NumRows", "Regex": "NumRows: ([0-9]+)"},
+                {"Name": "rmse", "Regex": r"rmse: ([0-9.]+)"},
+                {"Name": "mae", "Regex": r"mae: ([0-9.]+)"},
+                {"Name": "medae", "Regex": r"medae: ([0-9.]+)"},
+                {"Name": "r2", "Regex": r"r2: ([0-9.-]+)"},
+                {"Name": "spearmanr", "Regex": r"spearmanr: ([0-9.-]+)"},
+                {"Name": "support", "Regex": r"support: ([0-9]+)"},
             ]
         # Metric Definitions for Classification
@@ -210,7 +216,7 @@ class FeaturesToModel(Transform):
                 raise ValueError(msg)
             # Dynamically create the metric definitions
-            metrics = ["precision", "recall", "fscore"]
+            metrics = ["precision", "recall", "f1"]
             metric_definitions = []
             for t in self.class_labels:
                 for m in metrics:
@@ -233,13 +239,21 @@ class FeaturesToModel(Transform):
         source_dir = str(Path(script_path).parent)
         # Create a Sagemaker Model with our script
-        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, self.training_image, "0.1")
+        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, self.training_image)
+        # Use GPU instance for ChemProp/PyTorch, CPU for others
+        if self.model_framework in [ModelFramework.CHEMPROP, ModelFramework.PYTORCH_TABULAR]:
+            train_instance_type = "ml.g6.xlarge"  # NVIDIA L4 GPU, ~$0.80/hr
+            self.log.important(f"Using GPU instance {train_instance_type} for {self.model_framework.value}")
+        else:
+            train_instance_type = "ml.m5.xlarge"
         self.estimator = Estimator(
             entry_point=entry_point,
             source_dir=source_dir,
             role=self.workbench_role_arn,
             instance_count=1,
-            instance_type="ml.m5.xlarge",
+            instance_type=train_instance_type,
             sagemaker_session=self.sm_session,
             image_uri=image,
             metric_definitions=metric_definitions,
@@ -264,13 +278,20 @@ class FeaturesToModel(Transform):
         self.log.important(f"Creating new model {self.output_name}...")
         self.create_and_register_model(**kwargs)
+        # Make a copy of the training view, to lock-in the training data used for this model
+        model_training_view_name = f"{self.output_name.replace('-', '_')}_training"
+        self.log.important(f"Creating Model Training View: {model_training_view_name}...")
+        feature_set.view("training").copy(f"{model_training_view_name}")
     def post_transform(self, **kwargs):
         """Post-Transform: Calling onboard() on the Model"""
         self.log.info("Post-Transform: Calling onboard() on the Model...")
         time.sleep(3)  # Give AWS time to complete Model register
-        # Store the model feature_list and target_column in the workbench_meta
-        output_model = ModelCore(self.output_name, model_type=self.model_type)
+        # Store the model metadata information
+        output_model = ModelCore(self.output_name)
+        output_model._set_model_type(self.model_type)
+        output_model._set_model_framework(self.model_framework)
         output_model.upsert_workbench_meta({"workbench_model_features": self.model_feature_list})
         output_model.upsert_workbench_meta({"workbench_model_target": self.target_column})
@@ -301,7 +322,7 @@ class FeaturesToModel(Transform):
         # Register our model
         image = ModelImages.get_image_uri(
-            self.sm_session.boto_region_name, self.inference_image, "0.1", self.inference_arch
+            self.sm_session.boto_region_name, self.inference_image, architecture=self.inference_arch
         )
         self.log.important(f"Registering model {self.output_name} with Inference Image {image}...")
         model = self.estimator.create_model(role=self.workbench_role_arn)
@@ -325,12 +346,11 @@ if __name__ == "__main__":
     # Regression Model
     input_name = "abalone_features"
-    output_name = "test-abalone-regression"
+    output_name = "abalone-regression"
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.REGRESSOR)
     to_model.set_output_tags(["test"])
     to_model.transform(target_column="class_number_of_rings", description="Test Abalone Regression")
-    """
     # Classification Model
     input_name = "wine_features"
     output_name = "wine-classification"
@@ -340,10 +360,10 @@ if __name__ == "__main__":
     # Quantile Regression Model (Abalone)
     input_name = "abalone_features"
-    output_name = "abalone-quantile-reg"
+    output_name = "abalone-regression-uq"
     to_model = FeaturesToModel(input_name, output_name, ModelType.UQ_REGRESSOR)
-    to_model.set_output_tags(["abalone", "quantiles"])
-    to_model.transform(target_column="class_number_of_rings", description="Abalone Quantile Regression")
+    to_model.set_output_tags(["abalone", "uq"])
+    to_model.transform(target_column="class_number_of_rings", description="Abalone UQ Regression")
     # Scikit-Learn Kmeans Clustering Model
     input_name = "wine_features"
@@ -397,7 +417,7 @@ if __name__ == "__main__":
     scripts_root = Path(__file__).resolve().parents[3] / "model_scripts"
     my_script = scripts_root / "custom_models" / "chem_info" / "molecular_descriptors.py"
     input_name = "aqsol_features"
-    output_name = "smiles-to-taut-md-stereo-v0"
+    output_name = "test-smiles-to-taut-md-stereo"
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
     to_model.set_output_tags(["smiles", "molecular descriptors"])
     to_model.transform(target_column=None, feature_list=["smiles"], description="Smiles to Molecular Descriptors")
@@ -410,13 +430,3 @@ if __name__ == "__main__":
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
     to_model.set_output_tags(["smiles", "morgan fingerprints"])
     to_model.transform(target_column=None, feature_list=["smiles"], description="Smiles to Morgan Fingerprints")
-    # Tautomerization Model
-    scripts_root = Path(__file__).resolve().parents[3] / "model_scripts"
-    my_script = scripts_root / "custom_models" / "chem_info" / "tautomerize.py"
-    input_name = "aqsol_features"
-    output_name = "tautomerize-v0"
-    to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
-    to_model.set_output_tags(["smiles", "tautomerization"])
-    to_model.transform(target_column=None, feature_list=["smiles"], description="Tautomerize Smiles")
-    """

workbench/core/transforms/model_to_endpoint/model_to_endpoint.py CHANGED Viewed

@@ -5,6 +5,7 @@ from sagemaker import ModelPackage
 from sagemaker.serializers import CSVSerializer
 from sagemaker.deserializers import CSVDeserializer
 from sagemaker.serverless import ServerlessInferenceConfig
+from sagemaker.model_monitor import DataCaptureConfig
 # Local Imports
 from workbench.core.transforms.transform import Transform, TransformInput, TransformOutput
@@ -51,27 +52,38 @@ class ModelToEndpoint(Transform):
         EndpointCore.managed_delete(self.output_name)
         # Get the Model Package ARN for our input model
-        input_model = ModelCore(self.input_name)
-        model_package_arn = input_model.model_package_arn()
+        workbench_model = ModelCore(self.input_name)
         # Deploy the model
-        self._deploy_model(model_package_arn, **kwargs)
+        self._deploy_model(workbench_model, **kwargs)
         # Add this endpoint to the set of registered endpoints for the model
-        input_model.register_endpoint(self.output_name)
+        workbench_model.register_endpoint(self.output_name)
         # This ensures that the endpoint is ready for use
         time.sleep(5)  # We wait for AWS Lag
         end = EndpointCore(self.output_name)
         self.log.important(f"Endpoint {end.name} is ready for use")
-    def _deploy_model(self, model_package_arn: str, mem_size: int = 2048, max_concurrency: int = 5):
+    def _deploy_model(
+        self,
+        workbench_model: ModelCore,
+        mem_size: int = 2048,
+        max_concurrency: int = 5,
+        data_capture: bool = False,
+        capture_percentage: int = 100,
+    ):
         """Internal Method: Deploy the Model
         Args:
-            model_package_arn(str): The Model Package ARN used to deploy the Endpoint
+            workbench_model(ModelCore): The Workbench ModelCore object to deploy
+            mem_size(int): Memory size for serverless deployment
+            max_concurrency(int): Max concurrency for serverless deployment
+            data_capture(bool): Enable data capture during deployment
+            capture_percentage(int): Percentage of data to capture. Defaults to 100.
         """
         # Grab the specified Model Package
+        model_package_arn = workbench_model.model_package_arn()
         model_package = ModelPackage(
             role=self.workbench_role_arn,
             model_package_arn=model_package_arn,
@@ -95,6 +107,23 @@ class ModelToEndpoint(Transform):
                 max_concurrency=max_concurrency,
             )
+        # Configure data capture if requested (and not serverless)
+        data_capture_config = None
+        if data_capture and not self.serverless:
+            # Set up the S3 path for data capture
+            base_endpoint_path = f"{workbench_model.endpoints_s3_path}/{self.output_name}"
+            data_capture_path = f"{base_endpoint_path}/data_capture"
+            self.log.important(f"Configuring Data Capture --> {data_capture_path}")
+            data_capture_config = DataCaptureConfig(
+                enable_capture=True,
+                sampling_percentage=capture_percentage,
+                destination_s3_uri=data_capture_path,
+            )
+        elif data_capture and self.serverless:
+            self.log.warning(
+                "Data capture is not supported for serverless endpoints. Skipping data capture configuration."
+            )
         # Deploy the Endpoint
         self.log.important(f"Deploying the Endpoint {self.output_name}...")
         model_package.deploy(
@@ -104,6 +133,7 @@ class ModelToEndpoint(Transform):
             endpoint_name=self.output_name,
             serializer=CSVSerializer(),
             deserializer=CSVDeserializer(),
+            data_capture_config=data_capture_config,
             tags=aws_tags,
         )

workbench/core/transforms/pandas_transforms/pandas_to_features.py CHANGED Viewed

@@ -327,9 +327,36 @@ class PandasToFeatures(Transform):
         self.delete_existing()
         self.output_feature_group = self.create_feature_group()
+    def mac_spawn_hack(self):
+        """Workaround for macOS Tahoe fork/spawn issue with SageMaker FeatureStore ingest.
+        See: https://github.com/aws/sagemaker-python-sdk/issues/5312
+        macOS Tahoe 26+ has issues with forked processes creating boto3 sessions.
+        This forces spawn mode on macOS to avoid the hang.
+        """
+        import platform
+        if platform.system() == "Darwin":  # macOS
+            self.log.warning("macOS detected, forcing 'spawn' mode for multiprocessing (Tahoe hang workaround)")
+            import multiprocessing
+            try:
+                import multiprocess
+                multiprocess.set_start_method("spawn", force=True)
+            except (RuntimeError, ImportError):
+                pass  # Already set or multiprocess not available
+            try:
+                multiprocessing.set_start_method("spawn", force=True)
+            except RuntimeError:
+                pass  # Already set
     def transform_impl(self):
         """Transform Implementation: Ingest the data into the Feature Group"""
+        # Workaround for macOS Tahoe hang issue
+        self.mac_spawn_hack()
         # Now we actually push the data into the Feature Group (called ingestion)
         self.log.important(f"Ingesting rows into Feature Group {self.output_name}...")
         ingest_manager = self.output_feature_group.ingest(self.output_df, max_workers=8, max_processes=4, wait=False)

workbench/core/views/training_view.py CHANGED Viewed

@@ -3,14 +3,18 @@
 from typing import Union
 # Workbench Imports
-from workbench.api import DataSource, FeatureSet
+from workbench.api import FeatureSet
 from workbench.core.views.view import View
 from workbench.core.views.create_view import CreateView
 from workbench.core.views.view_utils import get_column_list
 class TrainingView(CreateView):
-    """TrainingView Class: A View with an additional training column that marks holdout ids
+    """TrainingView Class: A View with an additional training column (80/20 or holdout ids).
+    The TrainingView class creates a SQL view that includes all columns from the source table
+    along with an additional boolean column named "training". This view can also include
+    a SQL filter expression to filter the rows included in the view.
     Common Usage:
         ```python
@@ -19,8 +23,9 @@ class TrainingView(CreateView):
         training_view = TrainingView.create(fs)
         df = training_view.pull_dataframe()
-        # Create a TrainingView with a specific set of columns
-        training_view = TrainingView.create(fs, column_list=["my_col1", "my_col2"])
+        # Create a TrainingView with a specific filter expression
+        training_view = TrainingView.create(fs, id_column="auto_id", filter_expression="age > 30")
+        df = training_view.pull_dataframe()
         # Query the view
         df = training_view.query(f"SELECT * FROM {training_view.table} where training = TRUE")
@@ -31,17 +36,21 @@ class TrainingView(CreateView):
     def create(
         cls,
         feature_set: FeatureSet,
-        source_table: str = None,
+        *,  # Enforce keyword arguments after feature_set
         id_column: str = None,
         holdout_ids: Union[list[str], list[int], None] = None,
+        filter_expression: str = None,
+        source_table: str = None,
     ) -> Union[View, None]:
         """Factory method to create and return a TrainingView instance.
         Args:
             feature_set (FeatureSet): A FeatureSet object
-            source_table (str, optional): The table/view to create the view from. Defaults to None.
             id_column (str, optional): The name of the id column. Defaults to None.
             holdout_ids (Union[list[str], list[int], None], optional): A list of holdout ids. Defaults to None.
+            filter_expression (str, optional): SQL filter expression (e.g., "age > 25 AND status = 'active'").
+                                               Defaults to None.
+            source_table (str, optional): The table/view to create the view from. Defaults to None.
         Returns:
             Union[View, None]: The created View object (or None if failed to create the view)
@@ -69,28 +78,36 @@ class TrainingView(CreateView):
                 else:
                     id_column = instance.auto_id_column
-        # If we don't have holdout ids, create a default training view
-        if not holdout_ids:
-            instance._default_training_view(instance.data_source, id_column)
-            return View(instance.data_source, instance.view_name, auto_create_view=False)
+        # Enclose each column name in double quotes
+        sql_columns = ", ".join([f'"{column}"' for column in column_list])
+        # Build the training assignment logic
+        if holdout_ids:
+            # Format the list of holdout ids for SQL IN clause
+            if all(isinstance(id, str) for id in holdout_ids):
+                formatted_holdout_ids = ", ".join(f"'{id}'" for id in holdout_ids)
+            else:
+                formatted_holdout_ids = ", ".join(map(str, holdout_ids))
-        # Format the list of holdout ids for SQL IN clause
-        if holdout_ids and all(isinstance(id, str) for id in holdout_ids):
-            formatted_holdout_ids = ", ".join(f"'{id}'" for id in holdout_ids)
+            training_logic = f"""CASE
+                WHEN {id_column} IN ({formatted_holdout_ids}) THEN False
+                ELSE True
+            END AS training"""
         else:
-            formatted_holdout_ids = ", ".join(map(str, holdout_ids))
+            # Default 80/20 split using modulo
+            training_logic = f"""CASE
+                WHEN MOD(ROW_NUMBER() OVER (ORDER BY {id_column}), 10) < 8 THEN True
+                ELSE False
+            END AS training"""
-        # Enclose each column name in double quotes
-        sql_columns = ", ".join([f'"{column}"' for column in column_list])
+        # Build WHERE clause if filter_expression is provided
+        where_clause = f"\nWHERE {filter_expression}" if filter_expression else ""
         # Construct the CREATE VIEW query
         create_view_query = f"""
         CREATE OR REPLACE VIEW {instance.table} AS
-        SELECT {sql_columns}, CASE
-            WHEN {id_column} IN ({formatted_holdout_ids}) THEN False
-            ELSE True
-        END AS training
-        FROM {instance.source_table}
+        SELECT {sql_columns}, {training_logic}
+        FROM {instance.source_table}{where_clause}
         """
         # Execute the CREATE VIEW query
@@ -99,35 +116,56 @@ class TrainingView(CreateView):
         # Return the View
         return View(instance.data_source, instance.view_name, auto_create_view=False)
-    # This is an internal method that's used to create a default training view
-    def _default_training_view(self, data_source: DataSource, id_column: str):
-        """Create a default view in Athena that assigns roughly 80% of the data to training
+    @classmethod
+    def create_with_sql(
+        cls,
+        feature_set: FeatureSet,
+        *,
+        sql_query: str,
+        id_column: str = None,
+    ) -> Union[View, None]:
+        """Factory method to create a TrainingView from a custom SQL query.
+        This method takes a complete SQL query and adds the default 80/20 training split.
+        Use this when you need complex queries like UNION ALL for oversampling.
         Args:
-            data_source (DataSource): The Workbench DataSource object
-            id_column (str): The name of the id column
+            feature_set (FeatureSet): A FeatureSet object
+            sql_query (str): Complete SELECT query (without the final semicolon)
+            id_column (str, optional): The name of the id column for training split. Defaults to None.
+        Returns:
+            Union[View, None]: The created View object (or None if failed)
         """
-        self.log.important(f"Creating default Training View {self.table}...")
+        # Instantiate the TrainingView
+        instance = cls("training", feature_set)
-        # Drop any columns generated from AWS
-        aws_cols = ["write_time", "api_invocation_time", "is_deleted", "event_time"]
-        column_list = [col for col in data_source.columns if col not in aws_cols]
+        # Sanity check on the id column
+        if not id_column:
+            instance.log.important("No id column specified, using auto_id_column")
+            if not instance.auto_id_column:
+                instance.log.error("No id column specified and no auto_id_column found, aborting")
+                return None
+            id_column = instance.auto_id_column
-        # Enclose each column name in double quotes
-        sql_columns = ", ".join([f'"{column}"' for column in column_list])
+        # Default 80/20 split using modulo
+        training_logic = f"""CASE
+            WHEN MOD(ROW_NUMBER() OVER (ORDER BY {id_column}), 10) < 8 THEN True
+            ELSE False
+        END AS training"""
-        # Construct the CREATE VIEW query with a simple modulo operation for the 80/20 split
+        # Wrap the custom query and add training column
         create_view_query = f"""
-        CREATE OR REPLACE VIEW "{self.table}" AS
-        SELECT {sql_columns}, CASE
-            WHEN MOD(ROW_NUMBER() OVER (ORDER BY {id_column}), 10) < 8 THEN True  -- Assign 80% to training
-            ELSE False  -- Assign roughly 20% to validation/test
-        END AS training
-        FROM {self.base_table_name}
+        CREATE OR REPLACE VIEW {instance.table} AS
+        SELECT *, {training_logic}
+        FROM ({sql_query}) AS custom_source
         """
         # Execute the CREATE VIEW query
-        data_source.execute_statement(create_view_query)
+        instance.data_source.execute_statement(create_view_query)
+        # Return the View
+        return View(instance.data_source, instance.view_name, auto_create_view=False)
 if __name__ == "__main__":
@@ -135,7 +173,7 @@ if __name__ == "__main__":
     from workbench.api import FeatureSet
     # Get the FeatureSet
-    fs = FeatureSet("test_features")
+    fs = FeatureSet("abalone_features")
     # Delete the existing training view
     training_view = TrainingView.create(fs)
@@ -152,9 +190,42 @@ if __name__ == "__main__":
     # Create a TrainingView with holdout ids
     my_holdout_ids = list(range(10))
-    training_view = TrainingView.create(fs, id_column="id", holdout_ids=my_holdout_ids)
+    training_view = TrainingView.create(fs, id_column="auto_id", holdout_ids=my_holdout_ids)
     # Pull the training data
     df = training_view.pull_dataframe()
     print(df.head())
     print(df["training"].value_counts())
+    print(f"Shape: {df.shape}")
+    print(f"Diameter min: {df['diameter'].min()}, max: {df['diameter'].max()}")
+    # Test the filter expression
+    training_view = TrainingView.create(fs, id_column="auto_id", filter_expression="diameter > 0.5")
+    df = training_view.pull_dataframe()
+    print(df.head())
+    print(f"Shape with filter: {df.shape}")
+    print(f"Diameter min: {df['diameter'].min()}, max: {df['diameter'].max()}")
+    # Test create_with_sql with a custom query (UNION ALL for oversampling)
+    print("\n--- Testing create_with_sql with oversampling ---")
+    base_table = fs.table
+    replicate_ids = [0, 1, 2]  # Oversample these IDs
+    custom_sql = f"""
+        SELECT * FROM {base_table}
+        UNION ALL
+        SELECT * FROM {base_table}
+        WHERE auto_id IN ({', '.join(map(str, replicate_ids))})
+    """
+    training_view = TrainingView.create_with_sql(fs, sql_query=custom_sql, id_column="auto_id")
+    df = training_view.pull_dataframe()
+    print(f"Shape with custom SQL: {df.shape}")
+    print(df["training"].value_counts())
+    # Verify oversampling - check if replicated IDs appear twice
+    for rep_id in replicate_ids:
+        count = len(df[df["auto_id"] == rep_id])
+        print(f"ID {rep_id} appears {count} times")

workbench/core/views/view.py CHANGED Viewed

@@ -91,11 +91,11 @@ class View:
             self.table, self.data_source.database, self.data_source.boto3_session
         )
-    def pull_dataframe(self, limit: int = 50000) -> Union[pd.DataFrame, None]:
+    def pull_dataframe(self, limit: int = 100000) -> Union[pd.DataFrame, None]:
         """Pull a DataFrame based on the view type
         Args:
-            limit (int): The maximum number of rows to pull (default: 50000)
+            limit (int): The maximum number of rows to pull (default: 100000)
         Returns:
             Union[pd.DataFrame, None]: The DataFrame for the view or None if it doesn't exist
@@ -196,12 +196,52 @@ class View:
         # The BaseView always exists
         if self.view_name == "base":
-            return True
+            return
         # Check the database directly
         if not self._check_database():
             self._auto_create_view()
+    def copy(self, dest_view_name: str) -> "View":
+        """Copy this view to a new view with a different name
+        Args:
+            dest_view_name (str): The destination view name (e.g. "training_v1")
+        Returns:
+            View: A new View object for the destination view
+        """
+        # Can't copy the base view
+        if self.view_name == "base":
+            self.log.error("Cannot copy the base view")
+            return None
+        # Get the view definition
+        get_view_query = f"""
+        SELECT view_definition
+        FROM information_schema.views
+        WHERE table_schema = '{self.database}'
+        AND table_name = '{self.table}'
+        """
+        df = self.data_source.query(get_view_query)
+        if df.empty:
+            self.log.error(f"View {self.table} not found")
+            return None
+        view_definition = df.iloc[0]["view_definition"]
+        # Create the new view with the destination name
+        dest_table = f"{self.base_table_name}___{dest_view_name.lower()}"
+        create_view_query = f'CREATE OR REPLACE VIEW "{dest_table}" AS {view_definition}'
+        self.log.important(f"Copying view {self.table} to {dest_table}...")
+        self.data_source.execute_statement(create_view_query)
+        # Return a new View object for the destination
+        artifact = FeatureSet(self.artifact_name) if self.is_feature_set else DataSource(self.artifact_name)
+        return View(artifact, dest_view_name, auto_create_view=False)
     def _check_database(self) -> bool:
         """Internal: Check if the view exists in the database
@@ -324,3 +364,13 @@ if __name__ == "__main__":
     # Test supplemental data tables deletion
     view = View(fs, "test_view")
     view.delete()
+    # Test copying a view
+    fs = FeatureSet("test_features")
+    display_view = View(fs, "display")
+    copied_view = display_view.copy("display_copy")
+    print(copied_view)
+    print(copied_view.pull_dataframe().head())
+    # Clean up copied view
+    copied_view.delete()

workbench/core/views/view_utils.py CHANGED Viewed

@@ -296,15 +296,15 @@ if __name__ == "__main__":
     print("View Details on the FeatureSet Table...")
     print(view_details(my_data_source.table, my_data_source.database, my_data_source.boto3_session))
-    print("View Details on the Training View...")
-    training_view = fs.view("training")
+    print("View Details on the Display View...")
+    training_view = fs.view("display")
     print(view_details(training_view.table, training_view.database, my_data_source.boto3_session))
     # Test get_column_list
     print(get_column_list(my_data_source))
-    # Test get_column_list (with training view)
-    training_table = fs.view("training").table
+    # Test get_column_list (with display view)
+    training_table = fs.view("display").table
     print(get_column_list(my_data_source, training_table))
     # Test list_views

workbench 0.8.162__py3-none-any.whl → 0.8.202__py3-none-any.whl

Potentially problematic release.

workbench 0.8.162__py3-none-any.whl → 0.8.202__py3-none-any.whl