workbench 0.8.193__py3-none-any.whl → 0.8.198__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/__init__.py +1 -2
- workbench/algorithms/dataframe/fingerprint_proximity.py +2 -2
- workbench/algorithms/dataframe/proximity.py +212 -234
- workbench/algorithms/graph/light/proximity_graph.py +8 -7
- workbench/api/endpoint.py +2 -3
- workbench/api/model.py +2 -5
- workbench/core/artifacts/endpoint_core.py +25 -16
- workbench/core/artifacts/feature_set_core.py +126 -4
- workbench/core/artifacts/model_core.py +9 -14
- workbench/core/transforms/features_to_model/features_to_model.py +3 -3
- workbench/core/views/training_view.py +75 -0
- workbench/core/views/view.py +1 -1
- workbench/model_scripts/__pycache__/script_generation.cpython-312.pyc +0 -0
- workbench/model_scripts/__pycache__/script_generation.cpython-313.pyc +0 -0
- workbench/model_scripts/custom_models/proximity/proximity.py +212 -234
- workbench/model_scripts/custom_models/uq_models/proximity.py +212 -234
- workbench/model_scripts/pytorch_model/generated_model_script.py +567 -0
- workbench/model_scripts/uq_models/generated_model_script.py +589 -0
- workbench/model_scripts/uq_models/mapie.template +103 -6
- workbench/model_scripts/xgb_model/generated_model_script.py +4 -4
- workbench/repl/workbench_shell.py +3 -3
- workbench/utils/model_utils.py +10 -7
- workbench/utils/xgboost_model_utils.py +95 -34
- workbench/web_interface/components/plugin_unit_test.py +5 -2
- workbench/web_interface/components/plugins/model_details.py +2 -5
- {workbench-0.8.193.dist-info → workbench-0.8.198.dist-info}/METADATA +1 -1
- {workbench-0.8.193.dist-info → workbench-0.8.198.dist-info}/RECORD +31 -27
- {workbench-0.8.193.dist-info → workbench-0.8.198.dist-info}/WHEEL +0 -0
- {workbench-0.8.193.dist-info → workbench-0.8.198.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.193.dist-info → workbench-0.8.198.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.193.dist-info → workbench-0.8.198.dist-info}/top_level.txt +0 -0
workbench/api/model.py
CHANGED
@@ -83,16 +83,13 @@ class Model(ModelCore):
         end.set_owner(self.get_owner())
         return end

-    def prox_model(self
+    def prox_model(self):
         """Create a local Proximity Model for this Model

-        Args:
-            filtered: bool, optional): Use filtered training data for the Proximity Model (default: True)
-
         Returns:
             Proximity: A local Proximity Model
         """
-        return proximity_model_local(self
+        return proximity_model_local(self)

     def uq_model(self, uq_model_name: str = None, train_all_data: bool = False) -> "Model":
         """Create a Uncertainty Quantification Model for this Model
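The change drops the old keyword argument from prox_model(). A minimal usage sketch against the new zero-argument signature (the model name is hypothetical):

from workbench.api import Model

# Build a local Proximity Model with the new zero-argument signature
model = Model("abalone-regression")
prox = model.prox_model()  # previously accepted a "filtered" keyword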
workbench/core/artifacts/endpoint_core.py
CHANGED
@@ -8,7 +8,7 @@ import pandas as pd
 import numpy as np
 from io import StringIO
 import awswrangler as wr
-from typing import Union, Optional
+from typing import Union, Optional
 import hashlib

 # Model Performance Scores
@@ -438,23 +438,27 @@ class EndpointCore(Artifact):
         # Return the prediction DataFrame
         return prediction_df

-    def cross_fold_inference(self, nfolds: int = 5) ->
+    def cross_fold_inference(self, nfolds: int = 5) -> pd.DataFrame:
         """Run cross-fold inference (only works for XGBoost models)

         Args:
             nfolds (int): Number of folds to use for cross-fold (default: 5)

         Returns:
-
+            pd.DataFrame: A DataFrame with cross fold predictions
         """

         # Grab our model
         model = ModelCore(self.model_name)

-        # Compute CrossFold Metrics
+        # Compute CrossFold (Metrics and Prediction Dataframe)
         cross_fold_metrics, out_of_fold_df = cross_fold_inference(model, nfolds=nfolds)
-
-
+
+        # If the metrics dataframe isn't empty save to the param store
+        if not cross_fold_metrics.empty:
+            # Convert to list of dictionaries
+            metrics = cross_fold_metrics.to_dict(orient="records")
+            self.param_store.upsert(f"/workbench/models/{model.name}/inference/cross_fold", metrics)

         # Capture the results
         capture_name = "full_cross_fold"
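The rewritten method now persists per-fold metrics to the Parameter Store and returns the out-of-fold predictions instead of None. A minimal usage sketch (the endpoint name is hypothetical; the param_store.get path mirrors the upsert path above, keyed by the model name):

from workbench.core.artifacts.endpoint_core import EndpointCore

# Run cross-fold inference; the method now returns the out-of-fold DataFrame
end = EndpointCore("abalone-regression-end")
oof_df = end.cross_fold_inference(nfolds=5)

# Per-fold metrics were upserted to the Parameter Store and can be read back
metrics = end.param_store.get("/workbench/models/abalone-regression/inference/cross_fold")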
@@ -478,7 +482,9 @@ class EndpointCore(Artifact):
         uq_df = self.inference(training_df)

         # Identify UQ-specific columns (quantiles and prediction_std)
-        uq_columns = [
+        uq_columns = [
+            col for col in uq_df.columns if col.startswith("q_") or col == "prediction_std" or col == "confidence"
+        ]

         # Merge UQ columns with out-of-fold predictions
         if uq_columns:
@@ -502,12 +508,12 @@ class EndpointCore(Artifact):
             out_of_fold_df,
             target_column,
             model_type,
-
+            cross_fold_metrics,
             description,
             features=additional_columns,
             id_column=id_column,
         )
-        return
+        return out_of_fold_df

     def fast_inference(self, eval_df: pd.DataFrame, threads: int = 4) -> pd.DataFrame:
         """Run inference on the Endpoint using the provided DataFrame
@@ -766,8 +772,8 @@ class EndpointCore(Artifact):
         # Add any _proba columns to the output columns
         output_columns += [col for col in pred_results_df.columns if col.endswith("_proba")]

-        # Add any
-        output_columns += [col for col in pred_results_df.columns if col.startswith("q_") or col
+        # Add any Uncertainty Quantile columns to the output columns
+        output_columns += [col for col in pred_results_df.columns if col.startswith("q_") or col == "confidence"]

         # Add the ID column
         if id_column and id_column in pred_results_df.columns:
@@ -896,7 +902,7 @@ class EndpointCore(Artifact):
         else:
             self.validate_proba_columns(prediction_df, class_labels)

-        # Calculate precision, recall,
+        # Calculate precision, recall, f1, and support, handling zero division
         prediction_col = "prediction" if "prediction" in prediction_df.columns else "predictions"
         scores = precision_recall_fscore_support(
             prediction_df[target_column],
@@ -931,7 +937,7 @@ class EndpointCore(Artifact):
             target_column: class_labels,
             "precision": scores[0],
             "recall": scores[1],
-            "
+            "f1": scores[2],
             "roc_auc": roc_auc_per_label,
             "support": scores[3],
         }
@@ -1039,7 +1045,7 @@ class EndpointCore(Artifact):
         # Recursively delete all endpoint S3 artifacts (inference, etc)
         # Note: We do not want to delete the data_capture/ files since these
         # might be used for collection and data drift analysis
-        base_endpoint_path = f"{cls.endpoints_s3_path}/{endpoint_name}"
+        base_endpoint_path = f"{cls.endpoints_s3_path}/{endpoint_name}/"
         all_s3_objects = wr.s3.list_objects(base_endpoint_path, boto3_session=cls.boto3_session)

         # Filter out objects that contain 'data_capture/' in their path
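The trailing slash added to the S3 prefix matters: without it, an object listing for one endpoint can also match sibling endpoints whose names merely share the prefix. A small self-contained illustration (bucket and endpoint names are hypothetical):

# Hypothetical S3 keys for two endpoints with a shared name prefix
paths = [
    "s3://bucket/endpoints/abalone-end/inference/results.csv",
    "s3://bucket/endpoints/abalone-end-v2/inference/results.csv",
]
# Old prefix ".../abalone-end" matches both keys; new ".../abalone-end/" matches only the first
print([p for p in paths if p.startswith("s3://bucket/endpoints/abalone-end/")])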
@@ -1194,7 +1200,8 @@ if __name__ == "__main__":

     # Test the cross_fold_inference method
     print("Running Cross-Fold Inference...")
-
+    all_results = my_endpoint.cross_fold_inference()
+    print(all_results)

     # Run Inference and metrics for a Classification Endpoint
     class_endpoint = EndpointCore("wine-classification")
@@ -1206,7 +1213,9 @@ if __name__ == "__main__":

     # Test the cross_fold_inference method
     print("Running Cross-Fold Inference...")
-
+    all_results = class_endpoint.cross_fold_inference()
+    print(all_results)
+    print("All done...")

     # Test the class method delete (commented out for now)
     # from workbench.api import Model
workbench/core/artifacts/feature_set_core.py
CHANGED
@@ -551,6 +551,75 @@ class FeatureSetCore(Artifact):
         # Apply the filter
         self.set_training_filter(filter_expression)

+    def set_training_sampling(
+        self,
+        exclude_ids: Optional[List[Union[str, int]]] = None,
+        replicate_ids: Optional[List[Union[str, int]]] = None,
+        replication_factor: int = 2,
+    ):
+        """Configure training view with ID exclusions and replications (oversampling).
+
+        Args:
+            exclude_ids: List of IDs to exclude from training view
+            replicate_ids: List of IDs to replicate in training view for oversampling
+            replication_factor: Number of times to replicate each ID (default: 2)
+
+        Note:
+            If an ID appears in both lists, exclusion takes precedence.
+        """
+        from workbench.core.views import TrainingView
+
+        # Normalize to empty lists if None
+        exclude_ids = exclude_ids or []
+        replicate_ids = replicate_ids or []
+
+        # Remove any replicate_ids that are also in exclude_ids (exclusion wins)
+        replicate_ids = [rid for rid in replicate_ids if rid not in exclude_ids]
+
+        # If no sampling needed, just create normal view
+        if not exclude_ids and not replicate_ids:
+            self.log.important("No sampling specified, creating standard training view")
+            TrainingView.create(self, id_column=self.id_column)
+            return
+
+        # Build the custom SQL query
+        self.log.important(
+            f"Excluding {len(exclude_ids)} IDs, Replicating {len(replicate_ids)} IDs "
+            f"(factor: {replication_factor}x)"
+        )
+
+        # Helper to format IDs for SQL
+        def format_ids(ids):
+            return ", ".join([repr(id) for id in ids])
+
+        # Start with base query
+        base_query = f"SELECT * FROM {self.table}"
+
+        # Add exclusions if needed
+        if exclude_ids:
+            base_query += f"\nWHERE {self.id_column} NOT IN ({format_ids(exclude_ids)})"
+
+        # Build full query with replication
+        if replicate_ids:
+            # Generate VALUES clause for CROSS JOIN: (1), (2), ..., (N-1)
+            # We want N-1 additional copies since the original row is already in base_query
+            values_clause = ", ".join([f"({i})" for i in range(1, replication_factor)])
+
+            custom_sql = f"""{base_query}
+
+            UNION ALL
+
+            SELECT t.*
+            FROM {self.table} t
+            CROSS JOIN (VALUES {values_clause}) AS n(num)
+            WHERE t.{self.id_column} IN ({format_ids(replicate_ids)})"""
+        else:
+            # Only exclusions, no UNION needed
+            custom_sql = base_query
+
+        # Create the training view with our custom SQL
+        TrainingView.create_with_sql(self, sql_query=custom_sql, id_column=self.id_column)
+
     @classmethod
     def delete_views(cls, table: str, database: str):
         """Delete any views associated with this FeatureSet
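For concreteness, this sketch assembles the same kind of SQL the new method builds, for exclude_ids=[1, 2], replicate_ids=[7], replication_factor=3 (table and column names are illustrative):

# Mirrors set_training_sampling's query construction
table, id_column = "abalone_features", "auto_id"
values_clause = ", ".join(f"({i})" for i in range(1, 3))  # "(1), (2)" -> two extra copies
custom_sql = f"""SELECT * FROM {table}
WHERE {id_column} NOT IN (1, 2)

UNION ALL

SELECT t.*
FROM {table} t
CROSS JOIN (VALUES {values_clause}) AS n(num)
WHERE t.{id_column} IN (7)"""
print(custom_sql)  # rows with auto_id = 7 appear 3x total in the training view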
@@ -709,7 +778,7 @@ if __name__ == "__main__":
     pd.set_option("display.width", 1000)

     # Grab a FeatureSet object and pull some information from it
-    my_features = LocalFeatureSetCore("
+    my_features = LocalFeatureSetCore("abalone_features")
     if not my_features.exists():
         print("FeatureSet not found!")
         sys.exit(1)
@@ -769,8 +838,8 @@ if __name__ == "__main__":
     # Set the holdout ids for the training view
     print("Setting hold out ids...")
     table = my_features.view("training").table
-    df = my_features.query(f'SELECT
-    my_holdout_ids = [id for id in df["
+    df = my_features.query(f'SELECT auto_id, length FROM "{table}"')
+    my_holdout_ids = [id for id in df["auto_id"] if id < 20]
     my_features.set_training_holdouts(my_holdout_ids)

     # Get the training data
@@ -780,7 +849,7 @@ if __name__ == "__main__":

     # Test the filter expression functionality
     print("Setting a filter expression...")
-    my_features.set_training_filter("
+    my_features.set_training_filter("auto_id < 50 AND length > 65.0")
     training_data = my_features.get_training_data()
     print(f"Training Data: {training_data.shape}")
     print(training_data)
@@ -803,3 +872,56 @@ if __name__ == "__main__":
     # print("Deleting Workbench Feature Set...")
     # my_features.delete()
     # print("Done")
+
+    # Test set_training_sampling with exclusions and replications
+    print("\n--- Testing set_training_sampling ---")
+    my_features.set_training_filter(None)  # Reset any existing filters
+    original_count = num_rows
+
+    # Get valid IDs from the table
+    all_data = my_features.query(f'SELECT auto_id, length FROM "{table}"')
+    valid_ids = sorted(all_data["auto_id"].tolist())
+    print(f"Valid IDs range from {valid_ids[0]} to {valid_ids[-1]}")
+
+    exclude_list = valid_ids[0:3]  # First 3 IDs
+    replicate_list = valid_ids[10:13]  # IDs at positions 10, 11, 12
+
+    print(f"Original row count: {original_count}")
+    print(f"Excluding IDs: {exclude_list}")
+    print(f"Replicating IDs: {replicate_list}")
+
+    # Test with default replication factor (2x)
+    print("\n--- Testing with replication_factor=2 (default) ---")
+    my_features.set_training_sampling(exclude_ids=exclude_list, replicate_ids=replicate_list)
+    training_data = my_features.get_training_data()
+    print(f"Training Data after sampling: {training_data.shape}")
+
+    # Verify exclusions
+    for exc_id in exclude_list:
+        count = len(training_data[training_data["auto_id"] == exc_id])
+        print(f"Excluded ID {exc_id} appears {count} times (should be 0)")
+
+    # Verify replications
+    for rep_id in replicate_list:
+        count = len(training_data[training_data["auto_id"] == rep_id])
+        print(f"Replicated ID {rep_id} appears {count} times (should be 2)")
+
+    # Test with replication factor of 5
+    print("\n--- Testing with replication_factor=5 ---")
+    replicate_list_5x = [20, 21]
+    my_features.set_training_sampling(exclude_ids=exclude_list, replicate_ids=replicate_list_5x, replication_factor=5)
+    training_data = my_features.get_training_data()
+    print(f"Training Data after sampling: {training_data.shape}")
+
+    # Verify 5x replication
+    for rep_id in replicate_list_5x:
+        count = len(training_data[training_data["auto_id"] == rep_id])
+        print(f"Replicated ID {rep_id} appears {count} times (should be 5)")
+
+    # Test with large replication list (simulate 100 IDs)
+    print("\n--- Testing with large ID list (100 IDs) ---")
+    large_replicate_list = list(range(30, 130))  # 100 IDs
+    my_features.set_training_sampling(replicate_ids=large_replicate_list, replication_factor=3)
+    training_data = my_features.get_training_data()
+    print(f"Training Data after sampling: {training_data.shape}")
+    print(f"Expected extra rows: {len(large_replicate_list) * 3}")
workbench/core/artifacts/model_core.py
CHANGED
@@ -54,14 +54,14 @@ class ModelImages:
     }

     @classmethod
-    def get_image_uri(cls, region, image_type, version="
+    def get_image_uri(cls, region, image_type, version="latest", architecture="x86_64"):
         """
         Dynamically construct ECR image URI.

         Args:
             region: AWS region (e.g., 'us-east-1', 'us-west-2')
             image_type: Type of image (e.g., 'training', 'inference', 'pytorch_training')
-            version: Image version (e.g., '0.1', '0.2')
+            version: Image version (e.g., '0.1', '0.2' defaults to 'latest')
             architecture: CPU architecture (default: 'x86_64', currently unused but kept for compatibility)

         Returns:
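A minimal call sketch for the updated signature (region and image type values are illustrative; version now defaults to "latest"):

from workbench.core.artifacts.model_core import ModelImages

# Unpinned: resolves to the "latest" image version
uri = ModelImages.get_image_uri("us-east-1", "inference")
# Pinned: explicit version; architecture is accepted but currently unused
uri_pinned = ModelImages.get_image_uri("us-east-1", "training", version="0.2", architecture="x86_64")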
@@ -589,7 +589,7 @@ class ModelCore(Artifact):
         fs = FeatureSetCore(self.get_input())

         # See if we have a training view for this model
-        my_model_training_view = f"{self.name.replace('-', '_')}_training"
+        my_model_training_view = f"{self.name.replace('-', '_')}_training".lower()
         view = View(fs, my_model_training_view, auto_create_view=False)
         if view.exists():
             return view
@@ -867,14 +867,6 @@ class ModelCore(Artifact):
             shap_data[key] = self.df_store.get(df_location)
         return shap_data or None

-    def cross_folds(self) -> dict:
-        """Retrieve the cross-fold inference results(only works for XGBoost models)
-
-        Returns:
-            dict: Dictionary with the cross-fold inference results
-        """
-        return self.param_store.get(f"/workbench/models/{self.name}/inference/cross_fold")
-
     def supported_inference_instances(self) -> Optional[list]:
         """Retrieve the supported endpoint inference instance types

@@ -1171,13 +1163,11 @@ if __name__ == "__main__":
     # Grab a ModelCore object and pull some information from it
     my_model = ModelCore("abalone-regression")

-    # Call the various methods
-
     # Let's do a check/validation of the Model
     print(f"Model Check: {my_model.exists()}")

     # Make sure the model is 'ready'
-
+    my_model.onboard()

     # Get the ARN of the Model Group
     print(f"Model Group ARN: {my_model.group_arn()}")
@@ -1243,5 +1233,10 @@ if __name__ == "__main__":
     # Delete the Model
     # ModelCore.managed_delete("wine-classification")

+    # Check the training view logic
+    model = ModelCore("wine-class-test-251112-BW")
+    training_view = model.training_view()
+    print(f"Training View Name: {training_view.name}")
+
     # Check for a model that doesn't exist
     my_model = ModelCore("empty-model-group")
workbench/core/transforms/features_to_model/features_to_model.py
CHANGED
@@ -210,7 +210,7 @@ class FeaturesToModel(Transform):
             raise ValueError(msg)

         # Dynamically create the metric definitions
-        metrics = ["precision", "recall", "
+        metrics = ["precision", "recall", "f1"]
         metric_definitions = []
         for t in self.class_labels:
             for m in metrics:
@@ -233,7 +233,7 @@ class FeaturesToModel(Transform):
         source_dir = str(Path(script_path).parent)

         # Create a Sagemaker Model with our script
-        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, self.training_image
+        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, self.training_image)
         self.estimator = Estimator(
             entry_point=entry_point,
             source_dir=source_dir,
@@ -306,7 +306,7 @@ class FeaturesToModel(Transform):

         # Register our model
         image = ModelImages.get_image_uri(
-            self.sm_session.boto_region_name, self.inference_image,
+            self.sm_session.boto_region_name, self.inference_image, architecture=self.inference_arch
         )
         self.log.important(f"Registering model {self.output_name} with Inference Image {image}...")
         model = self.estimator.create_model(role=self.workbench_role_arn)
workbench/core/views/training_view.py
CHANGED
@@ -116,6 +116,57 @@ class TrainingView(CreateView):
         # Return the View
         return View(instance.data_source, instance.view_name, auto_create_view=False)

+    @classmethod
+    def create_with_sql(
+        cls,
+        feature_set: FeatureSet,
+        *,
+        sql_query: str,
+        id_column: str = None,
+    ) -> Union[View, None]:
+        """Factory method to create a TrainingView from a custom SQL query.
+
+        This method takes a complete SQL query and adds the default 80/20 training split.
+        Use this when you need complex queries like UNION ALL for oversampling.
+
+        Args:
+            feature_set (FeatureSet): A FeatureSet object
+            sql_query (str): Complete SELECT query (without the final semicolon)
+            id_column (str, optional): The name of the id column for training split. Defaults to None.
+
+        Returns:
+            Union[View, None]: The created View object (or None if failed)
+        """
+        # Instantiate the TrainingView
+        instance = cls("training", feature_set)
+
+        # Sanity check on the id column
+        if not id_column:
+            instance.log.important("No id column specified, using auto_id_column")
+            if not instance.auto_id_column:
+                instance.log.error("No id column specified and no auto_id_column found, aborting")
+                return None
+            id_column = instance.auto_id_column
+
+        # Default 80/20 split using modulo
+        training_logic = f"""CASE
+            WHEN MOD(ROW_NUMBER() OVER (ORDER BY {id_column}), 10) < 8 THEN True
+            ELSE False
+        END AS training"""
+
+        # Wrap the custom query and add training column
+        create_view_query = f"""
+        CREATE OR REPLACE VIEW {instance.table} AS
+        SELECT *, {training_logic}
+        FROM ({sql_query}) AS custom_source
+        """
+
+        # Execute the CREATE VIEW query
+        instance.data_source.execute_statement(create_view_query)
+
+        # Return the View
+        return View(instance.data_source, instance.view_name, auto_create_view=False)
+

 if __name__ == "__main__":
     """Exercise the Training View functionality"""
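To make the wrapping concrete, this sketch renders the view definition the method would emit for a trivial sql_query (table and view names are illustrative); a fuller oversampling example appears in the __main__ hunk below:

# Mirrors create_with_sql's query wrapping
view_table = "abalone_features___training"
sql_query = "SELECT * FROM abalone_features"
training_logic = """CASE
    WHEN MOD(ROW_NUMBER() OVER (ORDER BY auto_id), 10) < 8 THEN True
    ELSE False
END AS training"""
create_view_query = f"""
CREATE OR REPLACE VIEW {view_table} AS
SELECT *, {training_logic}
FROM ({sql_query}) AS custom_source
"""
print(create_view_query)  # roughly an 80/20 deterministic train/holdout split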
@@ -154,3 +205,27 @@ if __name__ == "__main__":
     print(df.head())
     print(f"Shape with filter: {df.shape}")
     print(f"Diameter min: {df['diameter'].min()}, max: {df['diameter'].max()}")
+
+    # Test create_with_sql with a custom query (UNION ALL for oversampling)
+    print("\n--- Testing create_with_sql with oversampling ---")
+    base_table = fs.table
+    replicate_ids = [0, 1, 2]  # Oversample these IDs
+
+    custom_sql = f"""
+    SELECT * FROM {base_table}
+
+    UNION ALL
+
+    SELECT * FROM {base_table}
+    WHERE auto_id IN ({', '.join(map(str, replicate_ids))})
+    """
+
+    training_view = TrainingView.create_with_sql(fs, sql_query=custom_sql, id_column="auto_id")
+    df = training_view.pull_dataframe()
+    print(f"Shape with custom SQL: {df.shape}")
+    print(df["training"].value_counts())
+
+    # Verify oversampling - check if replicated IDs appear twice
+    for rep_id in replicate_ids:
+        count = len(df[df["auto_id"] == rep_id])
+        print(f"ID {rep_id} appears {count} times")
workbench/core/views/view.py
CHANGED
@@ -232,7 +232,7 @@ class View:
         view_definition = df.iloc[0]["view_definition"]

         # Create the new view with the destination name
-        dest_table = f"{self.base_table_name}___{dest_view_name}"
+        dest_table = f"{self.base_table_name}___{dest_view_name.lower()}"
         create_view_query = f'CREATE OR REPLACE VIEW "{dest_table}" AS {view_definition}'

         self.log.important(f"Copying view {self.table} to {dest_table}...")
workbench/model_scripts/__pycache__/script_generation.cpython-312.pyc
CHANGED
Binary file

workbench/model_scripts/__pycache__/script_generation.cpython-313.pyc
CHANGED
Binary file