workbench 0.8.217__py3-none-any.whl → 0.8.224__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
- workbench/algorithms/dataframe/fingerprint_proximity.py +190 -31
- workbench/algorithms/dataframe/projection_2d.py +8 -2
- workbench/algorithms/dataframe/proximity.py +3 -0
- workbench/algorithms/sql/outliers.py +3 -3
- workbench/api/feature_set.py +0 -1
- workbench/core/artifacts/endpoint_core.py +2 -2
- workbench/core/artifacts/feature_set_core.py +185 -230
- workbench/core/transforms/features_to_model/features_to_model.py +2 -8
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +2 -0
- workbench/model_script_utils/model_script_utils.py +15 -11
- workbench/model_scripts/chemprop/chemprop.template +195 -70
- workbench/model_scripts/chemprop/generated_model_script.py +198 -73
- workbench/model_scripts/chemprop/model_script_utils.py +15 -11
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +80 -43
- workbench/model_scripts/pytorch_model/generated_model_script.py +2 -2
- workbench/model_scripts/pytorch_model/model_script_utils.py +15 -11
- workbench/model_scripts/xgb_model/generated_model_script.py +7 -7
- workbench/model_scripts/xgb_model/model_script_utils.py +15 -11
- workbench/scripts/meta_model_sim.py +35 -0
- workbench/scripts/ml_pipeline_sqs.py +71 -2
- workbench/themes/light/custom.css +7 -1
- workbench/themes/midnight_blue/custom.css +34 -0
- workbench/utils/chem_utils/fingerprints.py +80 -43
- workbench/utils/chem_utils/projections.py +16 -6
- workbench/utils/meta_model_simulator.py +41 -13
- workbench/utils/model_utils.py +0 -1
- workbench/utils/plot_utils.py +146 -28
- workbench/utils/shap_utils.py +1 -55
- workbench/utils/theme_manager.py +95 -30
- workbench/web_interface/components/plugins/scatter_plot.py +152 -66
- workbench/web_interface/components/settings_menu.py +184 -0
- {workbench-0.8.217.dist-info → workbench-0.8.224.dist-info}/METADATA +4 -13
- {workbench-0.8.217.dist-info → workbench-0.8.224.dist-info}/RECORD +38 -37
- {workbench-0.8.217.dist-info → workbench-0.8.224.dist-info}/entry_points.txt +1 -0
- workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
- {workbench-0.8.217.dist-info → workbench-0.8.224.dist-info}/WHEEL +0 -0
- {workbench-0.8.217.dist-info → workbench-0.8.224.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.217.dist-info → workbench-0.8.224.dist-info}/top_level.txt +0 -0
workbench/core/artifacts/endpoint_core.py

```diff
@@ -410,7 +410,7 @@ class EndpointCore(Artifact):
         primary_target = targets

         # Sanity Check that the target column is present
-        if primary_target
+        if primary_target not in prediction_df.columns:
             self.log.important(f"Target Column {primary_target} not found in prediction_df!")
             self.log.important("In order to compute metrics, the target column must be present!")
             metrics = pd.DataFrame()
@@ -432,7 +432,7 @@ class EndpointCore(Artifact):
         print(metrics.head())

         # Capture the inference results and metrics
-        if
+        if primary_target and capture_name:

             # If we don't have an id_column, we'll pull it from the model's FeatureSet
             if id_column is None:
```
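Both fixes harden the same code path: metrics are computed only when the target column is actually present in `prediction_df`, and results are captured only when both a target and a capture name exist. A minimal standalone sketch of that guard pattern (illustrative names, not the EndpointCore implementation):

```python
# Hedged sketch of the guard pattern in the two fixes above
# (names are illustrative; this is not the EndpointCore code).
import pandas as pd

def metrics_and_capture(prediction_df: pd.DataFrame, primary_target: str, capture_name=None) -> pd.DataFrame:
    # Sanity check that the target column is present before computing metrics
    if primary_target not in prediction_df.columns:
        print(f"Target Column {primary_target} not found in prediction_df!")
        return pd.DataFrame()  # empty metrics, mirroring the diff's fallback
    metrics = prediction_df[[primary_target]].describe()  # stand-in metric computation
    # Capture only when we have both a target and a capture name
    if primary_target and capture_name:
        print(f"Capturing inference results as {capture_name}")
    return metrics
```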
workbench/core/artifacts/feature_set_core.py

```diff
@@ -7,7 +7,6 @@ from datetime import datetime, timezone
 import botocore.exceptions
 import pandas as pd
 import awswrangler as wr
-import numpy as np

 from sagemaker.feature_store.feature_group import FeatureGroup
 from sagemaker.feature_store.feature_store import FeatureStore
@@ -16,9 +15,8 @@ from sagemaker.feature_store.feature_store import FeatureStore
 from workbench.core.artifacts.artifact import Artifact
 from workbench.core.artifacts.data_source_factory import DataSourceFactory
 from workbench.core.artifacts.athena_source import AthenaSource
-from workbench.utils.deprecated_utils import deprecated

-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, List, Dict, Union

 from workbench.utils.aws_utils import aws_throttle
@@ -247,7 +245,7 @@ class FeatureSetCore(Artifact):

         # Set the compressed features in our FeatureSet metadata
         self.log.important(f"Setting Compressed Columns...{compressed_columns}")
-        self.upsert_workbench_meta({"
+        self.upsert_workbench_meta({"compressed_features": compressed_columns})

     def get_compressed_features(self) -> list[str]:
         """Get the compressed features for this FeatureSet
@@ -256,7 +254,7 @@ class FeatureSetCore(Artifact):
             list[str]: The compressed columns for this FeatureSet
         """
         # Get the compressed features from our FeatureSet metadata
-        return self.workbench_meta().get("
+        return self.workbench_meta().get("compressed_features", [])

     def num_columns(self) -> int:
         """Return the number of columns of the Feature Set"""
```
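The two truncated lines were completing the same round-trip: the compressed-feature list is stored in FeatureSet metadata under the `"compressed_features"` key and read back with a `[]` default. A hedged usage sketch (the FeatureSet name and the setter's name `set_compressed_features` are inferred from the surrounding comments, not shown in this hunk):

```python
# Hedged usage sketch: round-tripping the compressed-feature list through
# FeatureSet metadata. The setter name and FeatureSet name are assumptions;
# get_compressed_features() is shown verbatim in the hunk above.
from workbench.api import FeatureSet

fs = FeatureSet("aqsol_features")            # hypothetical FeatureSet
fs.set_compressed_features(["fingerprint"])  # upserts {"compressed_features": [...]}
print(fs.get_compressed_features())          # -> ["fingerprint"]; [] if never set
```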
workbench/core/artifacts/feature_set_core.py (continued)

```diff
@@ -483,18 +481,6 @@ class FeatureSetCore(Artifact):
             time.sleep(1)
         cls.log.info(f"FeatureSet {feature_group.name} successfully deleted")

-    def set_training_holdouts(self, holdout_ids: list[str]):
-        """Set the hold out ids for the training view for this FeatureSet
-
-        Args:
-            holdout_ids (list[str]): The list of holdout ids.
-        """
-        from workbench.core.views import TrainingView
-
-        # Create a NEW training view
-        self.log.important(f"Setting Training Holdouts: {len(holdout_ids)} ids...")
-        TrainingView.create(self, id_column=self.id_column, holdout_ids=holdout_ids)
-
     def get_training_holdouts(self) -> list[str]:
         """Get the hold out ids for the training view for this FeatureSet

@@ -510,183 +496,176 @@ class FeatureSetCore(Artifact):
         ].tolist()
         return hold_out_ids

-
+    # ---- Public methods for training configuration ----
+    def set_training_config(
         self,
-
+        holdout_ids: List[Union[str, int]] = None,
+        weight_dict: Dict[Union[str, int], float] = None,
         default_weight: float = 1.0,
         exclude_zero_weights: bool = True,
     ):
-        """Configure training view with
+        """Configure training view with holdout IDs and/or sample weights.
+
+        This method creates a training view that can include both:
+        - A 'training' column (True/False) based on holdout IDs
+        - A 'sample_weight' column for weighted training

         Args:
+            holdout_ids: List of IDs to mark as training=False (validation/holdout set)
             weight_dict: Mapping of ID to sample weight
                 - weight > 1.0: oversample/emphasize
                 - weight = 1.0: normal (default)
                 - 0 < weight < 1.0: downweight/de-emphasize
-                - weight = 0.0: exclude from training
+                - weight = 0.0: exclude from training (filtered out if exclude_zero_weights=True)
             default_weight: Weight for IDs not in weight_dict (default: 1.0)
             exclude_zero_weights: If True, filter out rows with sample_weight=0 (default: True)

         Example:
-
-
-
-            '
-
-            model.set_sample_weights(weights)  # zeros automatically excluded
-            model.set_sample_weights(weights, exclude_zero_weights=False)  # keep zeros
+            # Temporal split with sample weights
+            fs.set_training_config(
+                holdout_ids=temporal_hold_out_ids,  # IDs after cutoff date
+                weight_dict={'compound_42': 0.0, 'compound_99': 2.0},  # exclude/upweight
+            )
         """
-        from workbench.core.views import TrainingView
+        from workbench.core.views.training_view import TrainingView

-
-
+        # If neither is provided, create a standard training view
+        if not holdout_ids and not weight_dict:
+            self.log.important("No holdouts or weights specified, creating standard training view")
             TrainingView.create(self, id_column=self.id_column)
             return

-
+        # If only holdout_ids, delegate to set_training_holdouts
+        if holdout_ids and not weight_dict:
+            self.set_training_holdouts(holdout_ids)
+            return

-        #
-
-
+        # If only weight_dict, delegate to set_sample_weights
+        if weight_dict and not holdout_ids:
+            self.set_sample_weights(weight_dict, default_weight, exclude_zero_weights)
+            return

-        #
-
-            f"WHEN {self.id_column} = {format_id(id_val)} THEN {weight}" for id_val, weight in weight_dict.items()
-        ]
-        case_statement = "\n    ".join(case_conditions)
+        # Both holdout_ids and weight_dict provided - build combined view
+        self.log.important(f"Setting training config: {len(holdout_ids)} holdouts, {len(weight_dict)} weights")

-        #
-
-
-
-
-
-
+        # Get column list (excluding AWS-generated columns)
+        from workbench.core.views.view_utils import get_column_list
+
+        aws_cols = ["write_time", "api_invocation_time", "is_deleted", "event_time"]
+        source_columns = get_column_list(self.data_source, self.table)
+        column_list = [col for col in source_columns if col not in aws_cols]
+        sql_columns = ", ".join([f'"{column}"' for column in column_list])
+
+        # Build inner query with both columns
+        training_case = self._build_holdout_case(holdout_ids)
+        weight_case = self._build_weight_case(weight_dict, default_weight)
+        inner_sql = f"SELECT {sql_columns}, {training_case}, {weight_case} FROM {self.table}"

         # Optionally filter out zero weights
         if exclude_zero_weights:
-            zero_count = sum(1 for
-
-
+            zero_count = sum(1 for w in weight_dict.values() if w == 0.0)
+            if zero_count:
+                self.log.important(f"Filtering out {zero_count} rows with sample_weight = 0")
+            sql_query = f"SELECT * FROM ({inner_sql}) WHERE sample_weight > 0"
         else:
-
+            sql_query = inner_sql

-
+        self._create_training_view(sql_query)

-
-
-        """Set a filter expression for the training view for this FeatureSet
+    def set_training_holdouts(self, holdout_ids: list[str]):
+        """Set the hold out ids for the training view for this FeatureSet

         Args:
-
-            If None or empty string, will reset to training view with no filter
-            (default: None)
+            holdout_ids (list[str]): The list of holdout ids.
         """
         from workbench.core.views import TrainingView

-
-
-
-        # Create a NEW training view
-        self.log.important(f"Setting Training Filter: {filter_expression}")
-        TrainingView.create(
-            self, id_column=self.id_column, holdout_ids=holdout_ids, filter_expression=filter_expression
-        )
-
-    @deprecated(version="0.9")
-    def exclude_ids_from_training(self, ids: List[Union[str, int]], column_name: Optional[str] = None):
-        """Exclude a list of IDs from the training view
-
-        Args:
-            ids (List[Union[str, int]],): List of IDs to exclude from training
-            column_name (Optional[str]): Column name to filter on.
-                If None, uses self.id_column (default: None)
-        """
-        # Use the default id_column if not specified
-        column = column_name or self.id_column
-
-        # Handle empty list case
-        if not ids:
-            self.log.warning("No IDs provided to exclude")
-            return
-
-        # Build the filter expression with proper SQL quoting
-        quoted_ids = ", ".join([repr(id) for id in ids])
-        filter_expression = f"{column} NOT IN ({quoted_ids})"
-
-        # Apply the filter
-        self.set_training_filter(filter_expression)
+        self.log.important(f"Setting Training Holdouts: {len(holdout_ids)} ids...")
+        TrainingView.create(self, id_column=self.id_column, holdout_ids=holdout_ids)

-
-    def set_training_sampling(
+    def set_sample_weights(
         self,
-
-
-
+        weight_dict: Dict[Union[str, int], float],
+        default_weight: float = 1.0,
+        exclude_zero_weights: bool = True,
     ):
-        """Configure training view with
+        """Configure training view with sample weights for each ID.

         Args:
-
-
-
+            weight_dict: Mapping of ID to sample weight
+                - weight > 1.0: oversample/emphasize
+                - weight = 1.0: normal (default)
+                - 0 < weight < 1.0: downweight/de-emphasize
+                - weight = 0.0: exclude from training
+            default_weight: Weight for IDs not in weight_dict (default: 1.0)
+            exclude_zero_weights: If True, filter out rows with sample_weight=0 (default: True)

-
-
+        Example:
+            weights = {
+                'compound_42': 3.0,    # oversample 3x
+                'compound_99': 0.1,    # noisy, downweight
+                'compound_123': 0.0,   # exclude from training
+            }
+            fs.set_sample_weights(weights)  # zeros automatically excluded
+            fs.set_sample_weights(weights, exclude_zero_weights=False)  # keep zeros
         """
         from workbench.core.views import TrainingView

-
-
-        replicate_ids = replicate_ids or []
-
-        # Remove any replicate_ids that are also in exclude_ids (exclusion wins)
-        replicate_ids = [rid for rid in replicate_ids if rid not in exclude_ids]
-
-        # If no sampling needed, just create normal view
-        if not exclude_ids and not replicate_ids:
-            self.log.important("No sampling specified, creating standard training view")
+        if not weight_dict:
+            self.log.important("Empty weight_dict, creating standard training view")
             TrainingView.create(self, id_column=self.id_column)
             return

-
-        self.log.important(
-            f"Excluding {len(exclude_ids)} IDs, Replicating {len(replicate_ids)} IDs "
-            f"(factor: {replication_factor}x)"
-        )
-
-        # Helper to format IDs for SQL
-        def format_ids(ids):
-            return ", ".join([repr(id) for id in ids])
-
-        # Start with base query
-        base_query = f"SELECT * FROM {self.table}"
+        self.log.important(f"Setting sample weights for {len(weight_dict)} IDs")

-        #
-
-
+        # Build inner query with sample weights
+        weight_case = self._build_weight_case(weight_dict, default_weight)
+        inner_sql = f"SELECT *, {weight_case} FROM {self.table}"

-        #
-        if
-
-
-
+        # Optionally filter out zero weights
+        if exclude_zero_weights:
+            zero_count = sum(1 for w in weight_dict.values() if w == 0.0)
+            self.log.important(f"Filtering out {zero_count} rows with sample_weight = 0")
+            sql_query = f"SELECT * FROM ({inner_sql}) WHERE sample_weight > 0"
+        else:
+            sql_query = inner_sql

-
+        TrainingView.create_with_sql(self, sql_query=sql_query, id_column=self.id_column)

-
+    # ---- Internal helpers for training view SQL generation ----
+    @staticmethod
+    def _format_id_for_sql(id_val: Union[str, int]) -> str:
+        """Format an ID value for use in SQL."""
+        return repr(id_val)

-
-
-
-
+    def _build_holdout_case(self, holdout_ids: List[Union[str, int]]) -> str:
+        """Build SQL CASE statement for training column based on holdout IDs."""
+        if all(isinstance(id_val, str) for id_val in holdout_ids):
+            formatted_ids = ", ".join(f"'{id_val}'" for id_val in holdout_ids)
         else:
-
-
-
-
-
+            formatted_ids = ", ".join(map(str, holdout_ids))
+        return f"""CASE
+            WHEN {self.id_column} IN ({formatted_ids}) THEN False
+            ELSE True
+        END AS training"""
+
+    def _build_weight_case(self, weight_dict: Dict[Union[str, int], float], default_weight: float) -> str:
+        """Build SQL CASE statement for sample_weight column."""
+        conditions = [
+            f"WHEN {self.id_column} = {self._format_id_for_sql(id_val)} THEN {weight}"
+            for id_val, weight in weight_dict.items()
+        ]
+        case_body = "\n    ".join(conditions)
+        return f"""CASE
+            {case_body}
+            ELSE {default_weight}
+        END AS sample_weight"""
+
+    def _create_training_view(self, sql_query: str):
+        """Create the training view directly from a SQL query."""
+        view_table = f"{self.table}___training"
+        create_view_query = f"CREATE OR REPLACE VIEW {view_table} AS\n{sql_query}"
+        self.data_source.execute_statement(create_view_query)

     @classmethod
     def delete_views(cls, table: str, database: str):
```
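The `_build_weight_case` helper is the core of the new weighting support: it turns `weight_dict` into a SQL CASE expression that materializes a `sample_weight` column in the training view, and the combined path wraps that query in `CREATE OR REPLACE VIEW {table}___training`, so the weights live in the view definition rather than in the stored data. A standalone sketch of the string it emits, assuming `id_column` is `auto_id` (the diff's `_format_id_for_sql` uses `repr()`, so string IDs get single quotes and integer IDs stay bare):

```python
# Standalone sketch of the SQL that _build_weight_case produces
# (assumes id_column="auto_id"; not the FeatureSetCore method itself).
id_column = "auto_id"
weight_dict = {"compound_42": 0.0, "compound_99": 2.0}
default_weight = 1.0

conditions = [f"WHEN {id_column} = {id_val!r} THEN {weight}" for id_val, weight in weight_dict.items()]
case_body = "\n    ".join(conditions)
sql = f"""CASE
    {case_body}
    ELSE {default_weight}
END AS sample_weight"""
print(sql)
# CASE
#     WHEN auto_id = 'compound_42' THEN 0.0
#     WHEN auto_id = 'compound_99' THEN 2.0
#     ELSE 1.0
# END AS sample_weight
```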
workbench/core/artifacts/feature_set_core.py (continued)

```diff
@@ -737,20 +716,6 @@ class FeatureSetCore(Artifact):
         """
         return self.data_source.smart_sample()

-    def anomalies(self) -> pd.DataFrame:
-        """Get a set of anomalous data from the underlying DataSource
-        Returns:
-            pd.DataFrame: A dataframe of anomalies from the underlying DataSource
-        """
-
-        # FIXME: Mock this for now
-        anom_df = self.sample().copy()
-        anom_df["anomaly_score"] = np.random.rand(anom_df.shape[0])
-        anom_df["cluster"] = np.random.randint(0, 10, anom_df.shape[0])
-        anom_df["x"] = np.random.rand(anom_df.shape[0])
-        anom_df["y"] = np.random.rand(anom_df.shape[0])
-        return anom_df
-
     def value_counts(self) -> dict:
         """Get the value counts for the string columns of the underlying DataSource

@@ -915,81 +880,71 @@ if __name__ == "__main__":
     training_data = my_features.get_training_data()
     print(f"Training Data: {training_data.shape}")

-    # Test
-    print("
-
-
-
-
-
-    print("
-
-
-
-
-
-
-
-
-
+    # Test set_sample_weights
+    print("\n--- Testing set_sample_weights ---")
+    sample_ids = df["auto_id"].tolist()[:5]
+    weight_dict = {sample_ids[0]: 0.0, sample_ids[1]: 0.5, sample_ids[2]: 2.0}
+    my_features.set_sample_weights(weight_dict)
+    training_view = my_features.view("training")
+    training_df = training_view.pull_dataframe()
+    print(f"Training view shape after set_sample_weights: {training_df.shape}")
+    print(f"Columns: {training_df.columns.tolist()}")
+    assert "sample_weight" in training_df.columns, "sample_weight column missing!"
+    assert "training" in training_df.columns, "training column missing!"
+    # Verify zero-weight row was excluded
+    assert sample_ids[0] not in training_df["auto_id"].values, "Zero-weight ID should be excluded!"
+    print("set_sample_weights test passed!")
+
+    # Test set_training_config with both holdouts and weights
+    print("\n--- Testing set_training_config (combined) ---")
+    holdout_ids = [id for id in df["auto_id"] if id >= 100 and id < 120]
+    weight_dict = {sample_ids[3]: 0.0, sample_ids[4]: 3.0}  # exclude one, upweight another
+    my_features.set_training_config(holdout_ids=holdout_ids, weight_dict=weight_dict)
+    training_view = my_features.view("training")
+    training_df = training_view.pull_dataframe()
+    print(f"Training view shape after set_training_config: {training_df.shape}")
+    print(f"Columns: {training_df.columns.tolist()}")
+    assert "sample_weight" in training_df.columns, "sample_weight column missing!"
+    assert "training" in training_df.columns, "training column missing!"
+    # Verify holdout IDs are marked as training=False
+    holdout_rows = training_df[training_df["auto_id"].isin(holdout_ids)]
+    assert all(holdout_rows["training"] == False), "Holdout IDs should have training=False!"  # noqa: E712
+    # Verify zero-weight row was excluded
+    assert sample_ids[3] not in training_df["auto_id"].values, "Zero-weight ID should be excluded!"
+    # Verify upweighted row has correct weight
+    upweight_row = training_df[training_df["auto_id"] == sample_ids[4]]
+    assert upweight_row["sample_weight"].iloc[0] == 3.0, "Upweighted ID should have weight=3.0!"
+    print("set_training_config (combined) test passed!")
+
+    # Test set_training_config with only holdouts (should delegate to set_training_holdouts)
+    print("\n--- Testing set_training_config (holdouts only) ---")
+    my_features.set_training_config(holdout_ids=holdout_ids)
+    training_view = my_features.view("training")
+    training_df = training_view.pull_dataframe()
+    assert "training" in training_df.columns, "training column missing!"
+    holdout_rows = training_df[training_df["auto_id"].isin(holdout_ids)]
+    assert all(holdout_rows["training"] == False), "Holdout IDs should have training=False!"  # noqa: E712
+    print("set_training_config (holdouts only) test passed!")
+
+    # Test set_training_config with only weights (should delegate to set_sample_weights)
+    print("\n--- Testing set_training_config (weights only) ---")
+    my_features.set_training_config(weight_dict={sample_ids[0]: 0.5, sample_ids[1]: 2.0})
+    training_view = my_features.view("training")
+    training_df = training_view.pull_dataframe()
+    assert "sample_weight" in training_df.columns, "sample_weight column missing!"
+    print("set_training_config (weights only) test passed!")
+
+    # Test set_training_config with neither (should create standard training view)
+    print("\n--- Testing set_training_config (neither) ---")
+    my_features.set_training_config()
+    training_view = my_features.view("training")
+    training_df = training_view.pull_dataframe()
+    assert "training" in training_df.columns, "training column missing!"
+    print("set_training_config (neither) test passed!")
+
+    print("\n=== All training config tests passed! ===")

     # Now delete the AWS artifacts associated with this Feature Set
     # print("Deleting Workbench Feature Set...")
     # my_features.delete()
     # print("Done")
-
-    # Test set_training_sampling with exclusions and replications
-    print("\n--- Testing set_training_sampling ---")
-    my_features.set_training_filter(None)  # Reset any existing filters
-    original_count = num_rows
-
-    # Get valid IDs from the table
-    all_data = my_features.query(f'SELECT auto_id, length FROM "{table}"')
-    valid_ids = sorted(all_data["auto_id"].tolist())
-    print(f"Valid IDs range from {valid_ids[0]} to {valid_ids[-1]}")
-
-    exclude_list = valid_ids[0:3]  # First 3 IDs
-    replicate_list = valid_ids[10:13]  # IDs at positions 10, 11, 12
-
-    print(f"Original row count: {original_count}")
-    print(f"Excluding IDs: {exclude_list}")
-    print(f"Replicating IDs: {replicate_list}")
-
-    # Test with default replication factor (2x)
-    print("\n--- Testing with replication_factor=2 (default) ---")
-    my_features.set_training_sampling(exclude_ids=exclude_list, replicate_ids=replicate_list)
-    training_data = my_features.get_training_data()
-    print(f"Training Data after sampling: {training_data.shape}")
-
-    # Verify exclusions
-    for exc_id in exclude_list:
-        count = len(training_data[training_data["auto_id"] == exc_id])
-        print(f"Excluded ID {exc_id} appears {count} times (should be 0)")
-
-    # Verify replications
-    for rep_id in replicate_list:
-        count = len(training_data[training_data["auto_id"] == rep_id])
-        print(f"Replicated ID {rep_id} appears {count} times (should be 2)")
-
-    # Test with replication factor of 5
-    print("\n--- Testing with replication_factor=5 ---")
-    replicate_list_5x = [20, 21]
-    my_features.set_training_sampling(exclude_ids=exclude_list, replicate_ids=replicate_list_5x, replication_factor=5)
-    training_data = my_features.get_training_data()
-    print(f"Training Data after sampling: {training_data.shape}")
-
-    # Verify 5x replication
-    for rep_id in replicate_list_5x:
-        count = len(training_data[training_data["auto_id"] == rep_id])
-        print(f"Replicated ID {rep_id} appears {count} times (should be 5)")
-
-    # Test with large replication list (simulate 100 IDs)
-    print("\n--- Testing with large ID list (100 IDs) ---")
-    large_replicate_list = list(range(30, 130))  # 100 IDs
-    my_features.set_training_sampling(replicate_ids=large_replicate_list, replication_factor=3)
-    training_data = my_features.get_training_data()
-    print(f"Training Data after sampling: {training_data.shape}")
-    print(f"Expected extra rows: {len(large_replicate_list) * 3}")
```
workbench/core/transforms/features_to_model/features_to_model.py

```diff
@@ -227,20 +227,14 @@ class FeaturesToModel(Transform):
            self.log.critical(msg)
            raise ValueError(msg)

-            # Dynamically create the metric definitions
+            # Dynamically create the metric definitions (per-class precision/recall/f1/support)
+            # Note: Confusion matrix metrics are skipped to stay under SageMaker's 40 metric limit
            metrics = ["precision", "recall", "f1", "support"]
            metric_definitions = []
            for t in self.class_labels:
                for m in metrics:
                    metric_definitions.append({"Name": f"Metrics:{t}:{m}", "Regex": f"Metrics:{t}:{m} ([0-9.]+)"})

-            # Add the confusion matrix metrics
-            for row in self.class_labels:
-                for col in self.class_labels:
-                    metric_definitions.append(
-                        {"Name": f"ConfusionMatrix:{row}:{col}", "Regex": f"ConfusionMatrix:{row}:{col} ([0-9.]+)"}
-                    )
-
        # If the model type is UNKNOWN, our metric_definitions will be empty
        else:
            self.log.important(f"ModelType is {self.model_type}, skipping metric_definitions...")
```
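The deletion follows directly from SageMaker's cap of 40 metric definitions per training job (cited in the added comment): per-class metrics cost 4·k definitions, and a k×k confusion matrix pushes the total past the cap as soon as k = 5. A quick arithmetic check:

```python
# Quick check of the arithmetic behind the change: SageMaker allows at most
# 40 metric definitions per training job. Per-class metrics cost 4*k; the
# confusion matrix would add k*k more.
for k in range(2, 8):
    per_class = 4 * k
    with_cm = per_class + k * k
    print(f"{k} classes: per-class only = {per_class:2d}, with confusion matrix = {with_cm:2d}")
# 5 classes already breaks the limit with the confusion matrix (45 > 40),
# which is why the ConfusionMatrix:* definitions were dropped.
```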
workbench/core/transforms/model_to_endpoint/model_to_endpoint.py

```diff
@@ -148,6 +148,7 @@ class ModelToEndpoint(Transform):
                deserializer=CSVDeserializer(),
                data_capture_config=data_capture_config,
                tags=aws_tags,
+                container_startup_health_check_timeout=300,
            )
        except ClientError as e:
            # Check if this is the "endpoint config already exists" error
@@ -164,6 +165,7 @@ class ModelToEndpoint(Transform):
                deserializer=CSVDeserializer(),
                data_capture_config=data_capture_config,
                tags=aws_tags,
+                container_startup_health_check_timeout=300,
            )
        else:
            raise
```
workbench/model_script_utils/model_script_utils.py (the same +15 −11 change is mirrored into the model_scripts copies listed above)

```diff
@@ -148,12 +148,16 @@ def convert_categorical_types(
 def decompress_features(
     df: pd.DataFrame, features: list[str], compressed_features: list[str]
 ) -> tuple[pd.DataFrame, list[str]]:
-    """Decompress
+    """Decompress compressed features (bitstrings or count vectors) into individual columns.
+
+    Supports two formats (auto-detected):
+    - Bitstrings: "10110010..." → individual uint8 columns (0 or 1)
+    - Count vectors: "0,3,0,1,5,..." → individual uint8 columns (0-255)

     Args:
         df: The features DataFrame
         features: Full list of feature names
-        compressed_features: List of feature names to decompress
+        compressed_features: List of feature names to decompress

     Returns:
         Tuple of (DataFrame with decompressed features, updated feature list)
@@ -178,18 +182,18 @@ def decompress_features(
         # Remove the feature from the list to avoid duplication
         decompressed_features.remove(feature)

-        #
-
-
+        # Auto-detect format and parse: comma-separated counts or bitstring
+        sample = str(df[feature].dropna().iloc[0]) if not df[feature].dropna().empty else ""
+        parse_fn = (lambda s: list(map(int, s.split(",")))) if "," in sample else list
+        feature_matrix = np.array([parse_fn(s) for s in df[feature]], dtype=np.uint8)

-        # Create
-
-
+        # Create new columns with prefix from feature name
+        prefix = feature[:3]
+        new_col_names = [f"{prefix}_{i}" for i in range(feature_matrix.shape[1])]
+        new_df = pd.DataFrame(feature_matrix, columns=new_col_names, index=df.index)

-        #
+        # Update features list and dataframe
         decompressed_features.extend(new_col_names)
-
-        # Drop original column and concatenate new ones
         df = df.drop(columns=[feature])
         df = pd.concat([df, new_df], axis=1)
```
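The new auto-detection keys off a single sampled value: if it contains a comma it is parsed as a count vector, otherwise each character is treated as one bit. A small self-contained sketch of that logic (toy DataFrame, not the Workbench utility itself):

```python
# Self-contained sketch of the auto-detection above: a sampled value containing
# a comma is parsed as a count vector; otherwise each character is one bit.
import numpy as np
import pandas as pd

df = pd.DataFrame({"fingerprint": ["1011", "0110"], "counts": ["0,3,1", "2,0,5"]})

for feature in ["fingerprint", "counts"]:
    sample = str(df[feature].dropna().iloc[0]) if not df[feature].dropna().empty else ""
    parse_fn = (lambda s: list(map(int, s.split(",")))) if "," in sample else list
    matrix = np.array([parse_fn(s) for s in df[feature]], dtype=np.uint8)
    cols = [f"{feature[:3]}_{i}" for i in range(matrix.shape[1])]
    print(feature, "->", cols, matrix.tolist())
# fingerprint -> ['fin_0', 'fin_1', 'fin_2', 'fin_3'] [[1, 0, 1, 1], [0, 1, 1, 0]]
# counts -> ['cou_0', 'cou_1', 'cou_2'] [[0, 3, 1], [2, 0, 5]]
```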