workbench-0.8.176-py3-none-any.whl → workbench-0.8.178-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of workbench has been flagged as possibly problematic.
- workbench/core/artifacts/endpoint_core.py +4 -1
- workbench/core/artifacts/feature_set_core.py +37 -8
- workbench/core/artifacts/model_core.py +8 -29
- workbench/core/views/training_view.py +38 -48
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +19 -7
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +80 -58
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +11 -15
- workbench/model_scripts/custom_models/uq_models/mapie.template +10 -14
- workbench/model_scripts/xgb_model/generated_model_script.py +3 -3
- workbench/scripts/ml_pipeline_sqs.py +14 -2
- workbench/utils/chem_utils/mol_descriptors.py +19 -7
- workbench/utils/chem_utils/mol_standardize.py +80 -58
- workbench/utils/model_utils.py +37 -25
- workbench/utils/xgboost_model_utils.py +1 -1
- {workbench-0.8.176.dist-info → workbench-0.8.178.dist-info}/METADATA +1 -1
- {workbench-0.8.176.dist-info → workbench-0.8.178.dist-info}/RECORD +20 -21
- workbench/utils/fast_inference.py +0 -167
- {workbench-0.8.176.dist-info → workbench-0.8.178.dist-info}/WHEEL +0 -0
- {workbench-0.8.176.dist-info → workbench-0.8.178.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.176.dist-info → workbench-0.8.178.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.176.dist-info → workbench-0.8.178.dist-info}/top_level.txt +0 -0
--- workbench/core/artifacts/endpoint_core.py (0.8.176)
+++ workbench/core/artifacts/endpoint_core.py (0.8.178)
@@ -32,11 +32,11 @@ from sagemaker import Predictor
 from workbench.core.artifacts.artifact import Artifact
 from workbench.core.artifacts import FeatureSetCore, ModelCore, ModelType
 from workbench.utils.endpoint_metrics import EndpointMetrics
-from workbench.utils.fast_inference import fast_inference
 from workbench.utils.cache import Cache
 from workbench.utils.s3_utils import compute_s3_object_hash
 from workbench.utils.model_utils import uq_metrics
 from workbench.utils.xgboost_model_utils import cross_fold_inference
+from workbench_bridges.endpoints.fast_inference import fast_inference


 class EndpointCore(Artifact):
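Taken together with the outright deletion of workbench/utils/fast_inference.py (+0 -167 in the file list above), this import swap is a breaking change for any downstream code that imported the helper directly. A minimal migration sketch, assuming fast_inference keeps the same name and call style in the separate workbench_bridges package (this diff only shows the import path, not the function body):

    # 0.8.176 (module deleted in 0.8.178):
    # from workbench.utils.fast_inference import fast_inference

    # 0.8.178 - requires the workbench_bridges package to be installed:
    from workbench_bridges.endpoints.fast_inference import fast_inference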
@@ -1061,6 +1061,9 @@ if __name__ == "__main__":
     assert len(pred_results) == len(my_eval_df), "Predictions should match the number of sent rows"

     # Now we put in an invalid value
+    print("*" * 80)
+    print("NOW TESTING ERROR CONDITIONS...")
+    print("*" * 80)
     my_eval_df.at[42, "length"] = "invalid_value"
     pred_results = my_endpoint.inference(my_eval_df, drop_error_rows=True)
     print(f"Sent rows: {len(my_eval_df)}")
--- workbench/core/artifacts/feature_set_core.py (0.8.176)
+++ workbench/core/artifacts/feature_set_core.py (0.8.178)
@@ -17,7 +17,7 @@ from workbench.core.artifacts.artifact import Artifact
 from workbench.core.artifacts.data_source_factory import DataSourceFactory
 from workbench.core.artifacts.athena_source import AthenaSource

-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional

 from workbench.utils.aws_utils import aws_throttle
@@ -509,6 +509,25 @@ class FeatureSetCore(Artifact):
         ].tolist()
         return hold_out_ids

+    def set_training_filter(self, filter_expression: Optional[str] = None):
+        """Set a filter expression for the training view for this FeatureSet
+
+        Args:
+            filter_expression (Optional[str]): A SQL filter expression (e.g., "age > 25 AND status = 'active'")
+                If None or empty string, will reset to default training view with no filter
+                (default: None)
+        """
+        from workbench.core.views import TrainingView
+
+        # Grab the existing holdout ids
+        holdout_ids = self.get_training_holdouts()
+
+        # Create a NEW training view
+        self.log.important(f"Setting Training Filter: {filter_expression}")
+        TrainingView.create(
+            self, id_column=self.id_column, holdout_ids=holdout_ids, filter_expression=filter_expression
+        )
+
     @classmethod
     def delete_views(cls, table: str, database: str):
         """Delete any views associated with this FeatureSet
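The new set_training_filter() on FeatureSetCore is the public entry point for the filter_expression plumbing added to TrainingView later in this diff. A minimal usage sketch, reusing the FeatureSet name and filter style from the test code in these hunks:

    from workbench.api import FeatureSet

    fs = FeatureSet("abalone_features")
    fs.set_training_filter("diameter > 0.5")  # recreate the training view with a WHERE clause
    fs.set_training_filter(None)              # reset to the default, unfiltered training view

Note that existing holdouts survive the rebuild: the method re-reads them with get_training_holdouts() before calling TrainingView.create().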
@@ -707,7 +726,7 @@ if __name__ == "__main__":

     # Test getting the holdout ids
     print("Getting the hold out ids...")
-    holdout_ids = my_features.get_training_holdouts(
+    holdout_ids = my_features.get_training_holdouts()
     print(f"Holdout IDs: {holdout_ids}")

     # Get a sample of the data
@@ -729,16 +748,26 @@ if __name__ == "__main__":
     table = my_features.view("training").table
     df = my_features.query(f'SELECT id, name FROM "{table}"')
     my_holdout_ids = [id for id in df["id"] if id < 20]
-    my_features.set_training_holdouts(
-
-    # Test the hold out set functionality with strings
-    print("Setting hold out ids (strings)...")
-    my_holdout_ids = [name for name in df["name"] if int(name.split(" ")[1]) > 80]
-    my_features.set_training_holdouts("name", my_holdout_ids)
+    my_features.set_training_holdouts(my_holdout_ids)

     # Get the training data
     print("Getting the training data...")
     training_data = my_features.get_training_data()
+    print(f"Training Data: {training_data.shape}")
+
+    # Test the filter expression functionality
+    print("Setting a filter expression...")
+    my_features.set_training_filter("id < 50 AND height > 65.0")
+    training_data = my_features.get_training_data()
+    print(f"Training Data: {training_data.shape}")
+    print(training_data)
+
+    # Remove training filter
+    print("Removing the filter expression...")
+    my_features.set_training_filter(None)
+    training_data = my_features.get_training_data()
+    print(f"Training Data: {training_data.shape}")
+    print(training_data)


     # Now delete the AWS artifacts associated with this Feature Set
     # print("Deleting Workbench Feature Set...")
--- workbench/core/artifacts/model_core.py (0.8.176)
+++ workbench/core/artifacts/model_core.py (0.8.178)
@@ -37,35 +37,6 @@ class ModelType(Enum):
     UNKNOWN = "unknown"


-# Deprecated Images
-"""
-# US East 1 images
-"py312-general-ml-training"
-("us-east-1", "training", "0.1", "x86_64"): (
-    "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1"
-),
-("us-east-1", "inference", "0.1", "x86_64"): (
-    "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
-),
-
-# US West 2 images
-("us-west-2", "training", "0.1", "x86_64"): (
-    "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1"
-),
-("us-west-2", "inference", "0.1", "x86_64"): (
-    "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
-),
-
-# ARM64 images
-("us-east-1", "inference", "0.1", "arm64"): (
-    "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1-arm64"
-),
-("us-west-2", "inference", "0.1", "arm64"): (
-    "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1-arm64"
-),
-"""
-
-
 class ModelImages:
     """Class for retrieving workbench inference images"""
@@ -890,6 +861,14 @@ class ModelCore(Artifact):
             shap_data[key] = self.df_store.get(df_location)
         return shap_data or None

+    def cross_folds(self) -> dict:
+        """Retrieve the cross-fold inference results(only works for XGBoost models)
+
+        Returns:
+            dict: Dictionary with the cross-fold inference results
+        """
+        return self.param_store.get(f"/workbench/models/{self.name}/inference/cross_fold")
+
     def supported_inference_instances(self) -> Optional[list]:
         """Retrieve the supported endpoint inference instance types
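The new cross_folds() accessor reads a single key back from the parameter store; presumably cross_fold_inference() (imported in endpoint_core.py above) is the producer side. A sketch, assuming ModelCore can be constructed from a model name and that param_store.get() returns None for a missing key:

    from workbench.core.artifacts.model_core import ModelCore

    model = ModelCore("my-xgb-model")  # hypothetical model name
    cv = model.cross_folds()  # reads /workbench/models/my-xgb-model/inference/cross_fold
    if cv is None:
        print("No cross-fold results (non-XGBoost model, or inference never run)")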
--- workbench/core/views/training_view.py (0.8.176)
+++ workbench/core/views/training_view.py (0.8.178)
@@ -3,7 +3,7 @@
 from typing import Union

 # Workbench Imports
-from workbench.api import
+from workbench.api import FeatureSet
 from workbench.core.views.view import View
 from workbench.core.views.create_view import CreateView
 from workbench.core.views.view_utils import get_column_list
@@ -34,6 +34,7 @@ class TrainingView(CreateView):
         source_table: str = None,
         id_column: str = None,
         holdout_ids: Union[list[str], list[int], None] = None,
+        filter_expression: str = None,
     ) -> Union[View, None]:
         """Factory method to create and return a TrainingView instance.

@@ -42,6 +43,8 @@
             source_table (str, optional): The table/view to create the view from. Defaults to None.
             id_column (str, optional): The name of the id column. Defaults to None.
             holdout_ids (Union[list[str], list[int], None], optional): A list of holdout ids. Defaults to None.
+            filter_expression (str, optional): SQL filter expression (e.g., "age > 25 AND status = 'active'").
+                Defaults to None.

         Returns:
             Union[View, None]: The created View object (or None if failed to create the view)
@@ -69,28 +72,36 @@
         else:
             id_column = instance.auto_id_column

-        #
-
-
+        # Enclose each column name in double quotes
+        sql_columns = ", ".join([f'"{column}"' for column in column_list])
+
+        # Build the training assignment logic
+        if holdout_ids:
+            # Format the list of holdout ids for SQL IN clause
+            if all(isinstance(id, str) for id in holdout_ids):
+                formatted_holdout_ids = ", ".join(f"'{id}'" for id in holdout_ids)
+            else:
+                formatted_holdout_ids = ", ".join(map(str, holdout_ids))

-
-
-
+            training_logic = f"""CASE
+                WHEN {id_column} IN ({formatted_holdout_ids}) THEN False
+                ELSE True
+            END AS training"""
         else:
-
+            # Default 80/20 split using modulo
+            training_logic = f"""CASE
+                WHEN MOD(ROW_NUMBER() OVER (ORDER BY {id_column}), 10) < 8 THEN True
+                ELSE False
+            END AS training"""

-        #
-
+        # Build WHERE clause if filter_expression is provided
+        where_clause = f"\nWHERE {filter_expression}" if filter_expression else ""

         # Construct the CREATE VIEW query
         create_view_query = f"""
         CREATE OR REPLACE VIEW {instance.table} AS
-        SELECT {sql_columns},
-
-        ELSE True
-        END AS training
-        FROM {instance.source_table}
+        SELECT {sql_columns}, {training_logic}
+        FROM {instance.source_table}{where_clause}
         """

         # Execute the CREATE VIEW query
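To make the refactor concrete: a sketch of roughly what create_view_query renders to when string holdout_ids and a filter_expression are both supplied (the view and source table names here are hypothetical):

    # holdout_ids=["A1", "B2"], filter_expression="diameter > 0.5"
    #
    #   CREATE OR REPLACE VIEW my_features_training AS
    #   SELECT "id", "diameter", CASE
    #       WHEN id IN ('A1', 'B2') THEN False
    #       ELSE True
    #   END AS training
    #   FROM my_features
    #   WHERE diameter > 0.5

The one-line where_clause also explains why set_training_filter(None) above acts as a reset: with no filter_expression it renders as an empty string and the view covers every row.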
@@ -99,43 +110,13 @@ class TrainingView(CreateView):
         # Return the View
         return View(instance.data_source, instance.view_name, auto_create_view=False)

-    # This is an internal method that's used to create a default training view
-    def _default_training_view(self, data_source: DataSource, id_column: str):
-        """Create a default view in Athena that assigns roughly 80% of the data to training
-
-        Args:
-            data_source (DataSource): The Workbench DataSource object
-            id_column (str): The name of the id column
-        """
-        self.log.important(f"Creating default Training View {self.table}...")
-
-        # Drop any columns generated from AWS
-        aws_cols = ["write_time", "api_invocation_time", "is_deleted", "event_time"]
-        column_list = [col for col in data_source.columns if col not in aws_cols]
-
-        # Enclose each column name in double quotes
-        sql_columns = ", ".join([f'"{column}"' for column in column_list])
-
-        # Construct the CREATE VIEW query with a simple modulo operation for the 80/20 split
-        create_view_query = f"""
-        CREATE OR REPLACE VIEW "{self.table}" AS
-        SELECT {sql_columns}, CASE
-            WHEN MOD(ROW_NUMBER() OVER (ORDER BY {id_column}), 10) < 8 THEN True  -- Assign 80% to training
-            ELSE False  -- Assign roughly 20% to validation/test
-        END AS training
-        FROM {self.base_table_name}
-        """
-
-        # Execute the CREATE VIEW query
-        data_source.execute_statement(create_view_query)
-

 if __name__ == "__main__":
     """Exercise the Training View functionality"""
     from workbench.api import FeatureSet

     # Get the FeatureSet
-    fs = FeatureSet("
+    fs = FeatureSet("abalone_features")

     # Delete the existing training view
     training_view = TrainingView.create(fs)
@@ -152,9 +133,18 @@ if __name__ == "__main__":

     # Create a TrainingView with holdout ids
     my_holdout_ids = list(range(10))
-    training_view = TrainingView.create(fs, id_column="
+    training_view = TrainingView.create(fs, id_column="auto_id", holdout_ids=my_holdout_ids)

     # Pull the training data
     df = training_view.pull_dataframe()
     print(df.head())
     print(df["training"].value_counts())
+    print(f"Shape: {df.shape}")
+    print(f"Diameter min: {df['diameter'].min()}, max: {df['diameter'].max()}")
+
+    # Test the filter expression
+    training_view = TrainingView.create(fs, id_column="auto_id", filter_expression="diameter > 0.5")
+    df = training_view.pull_dataframe()
+    print(df.head())
+    print(f"Shape with filter: {df.shape}")
+    print(f"Diameter min: {df['diameter'].min()}, max: {df['diameter'].max()}")
(The following changes appear identically in workbench/model_scripts/custom_models/chem_info/mol_descriptors.py and workbench/utils/chem_utils/mol_descriptors.py; the file list above shows the same +19 -7 counts for both copies.)

--- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py (0.8.176)
+++ workbench/model_scripts/custom_models/chem_info/mol_descriptors.py (0.8.178)
@@ -91,16 +91,27 @@ import logging
 import pandas as pd
 import numpy as np
 import re
+import time
+from contextlib import contextmanager
 from rdkit import Chem
 from rdkit.Chem import Descriptors, rdCIPLabeler
 from rdkit.ML.Descriptors import MoleculeDescriptors
 from mordred import Calculator as MordredCalculator
 from mordred import AcidBase, Aromatic, Constitutional, Chi, CarbonTypes

+
 logger = logging.getLogger("workbench")
 logger.setLevel(logging.DEBUG)


+# Helper context manager for timing
+@contextmanager
+def timer(name):
+    start = time.time()
+    yield
+    print(f"{name}: {time.time() - start:.2f}s")
+
+
 def compute_stereochemistry_features(mol):
     """
     Compute stereochemistry descriptors using modern RDKit methods.
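The timer() helper added here is not exercised anywhere in the hunks shown; presumably it exists for ad-hoc profiling of the descriptor stages. A usage sketch:

    with timer("mordred descriptors"):
        mordred_df = calc.pandas(valid_mols, nproc=1)
    # prints something like: mordred descriptors: 12.34s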
@@ -280,9 +291,11 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
         descriptor_values.append([np.nan] * len(all_descriptors))

     # Create RDKit features DataFrame
-    rdkit_features_df = pd.DataFrame(descriptor_values, columns=calc.GetDescriptorNames()
+    rdkit_features_df = pd.DataFrame(descriptor_values, columns=calc.GetDescriptorNames())

     # Add RDKit features to result
+    # Remove any columns from result that exist in rdkit_features_df
+    result = result.drop(columns=result.columns.intersection(rdkit_features_df.columns))
     result = pd.concat([result, rdkit_features_df], axis=1)

     # Compute Mordred descriptors
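The drop-then-concat pattern introduced here (and repeated below for the Mordred and stereochemistry frames) keeps pd.concat from producing duplicate column labels when the incoming DataFrame already carries a descriptor column, e.g. from a prior run. A self-contained sketch of the behavior:

    import pandas as pd

    result = pd.DataFrame({"smiles": ["CCO"], "MolWt": [0.0]})    # stale column from a prior run
    features = pd.DataFrame({"MolWt": [46.07], "TPSA": [20.23]})  # freshly computed

    result = result.drop(columns=result.columns.intersection(features.columns))
    result = pd.concat([result, features], axis=1)
    print(list(result.columns))  # ['smiles', 'MolWt', 'TPSA'] - a single, fresh MolWt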
@@ -299,7 +312,7 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_

     # Compute Mordred descriptors
     valid_mols = [mol if mol is not None else Chem.MolFromSmiles("C") for mol in molecules]
-    mordred_df = calc.pandas(valid_mols, nproc=1)  #
+    mordred_df = calc.pandas(valid_mols, nproc=1)  # Endpoint multiprocessing will fail with nproc>1

     # Replace values for invalid molecules with NaN
     for i, mol in enumerate(molecules):
@@ -310,10 +323,9 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
     for col in mordred_df.columns:
         mordred_df[col] = pd.to_numeric(mordred_df[col], errors="coerce")

-    # Set index to match result DataFrame
-    mordred_df.index = result.index
-
     # Add Mordred features to result
+    # Remove any columns from result that exist in mordred
+    result = result.drop(columns=result.columns.intersection(mordred_df.columns))
     result = pd.concat([result, mordred_df], axis=1)

     # Compute stereochemistry features if requested
@@ -326,9 +338,10 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
         stereo_features.append(stereo_dict)

     # Create stereochemistry DataFrame
-    stereo_df = pd.DataFrame(stereo_features
+    stereo_df = pd.DataFrame(stereo_features)

     # Add stereochemistry features to result
+    result = result.drop(columns=result.columns.intersection(stereo_df.columns))
     result = pd.concat([result, stereo_df], axis=1)

     logger.info(f"Added {len(stereo_df.columns)} stereochemistry descriptors")
@@ -357,7 +370,6 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_


 if __name__ == "__main__":
-    import time
     from mol_standardize import standardize
     from workbench.api import DataSource

(The following changes appear identically in workbench/model_scripts/custom_models/chem_info/mol_standardize.py and workbench/utils/chem_utils/mol_standardize.py; the file list above shows the same +80 -58 counts for both copies.)

--- workbench/model_scripts/custom_models/chem_info/mol_standardize.py (0.8.176)
+++ workbench/model_scripts/custom_models/chem_info/mol_standardize.py (0.8.178)
@@ -81,6 +81,8 @@ Usage:
 import logging
 from typing import Optional, Tuple
 import pandas as pd
+import time
+from contextlib import contextmanager
 from rdkit import Chem
 from rdkit.Chem import Mol
 from rdkit.Chem.MolStandardize import rdMolStandardize
@@ -90,6 +92,14 @@ log = logging.getLogger("workbench")
 RDLogger.DisableLog("rdApp.warning")


+# Helper context manager for timing
+@contextmanager
+def timer(name):
+    start = time.time()
+    yield
+    print(f"{name}: {time.time() - start:.2f}s")
+
+
 class MolStandardizer:
     """
     Streamlined molecular standardizer for ADMET preprocessing
@@ -116,6 +126,7 @@ class MolStandardizer:
         Pipeline:
             1. Cleanup (remove Hs, disconnect metals, normalize)
             2. Get largest fragment (optional - only if remove_salts=True)
+            2a. Extract salt information BEFORE further modifications
             3. Neutralize charges
             4. Canonicalize tautomer (optional)

@@ -130,18 +141,24 @@ class MolStandardizer:

         try:
             # Step 1: Cleanup
-
-            if
+            cleaned_mol = rdMolStandardize.Cleanup(mol, self.params)
+            if cleaned_mol is None:
                 return None, None

+            # If not doing any transformations, return early
+            if not self.remove_salts and not self.canonicalize_tautomer:
+                return cleaned_mol, None
+
             salt_smiles = None
+            mol = cleaned_mol

             # Step 2: Fragment handling (conditional based on remove_salts)
             if self.remove_salts:
-                # Get parent molecule
-                parent_mol = rdMolStandardize.FragmentParent(
+                # Get parent molecule
+                parent_mol = rdMolStandardize.FragmentParent(cleaned_mol, self.params)
                 if parent_mol:
-
+                    # Extract salt BEFORE any modifications to parent
+                    salt_smiles = self._extract_salt(cleaned_mol, parent_mol)
                     mol = parent_mol
                 else:
                     return None, None
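The reworked pipeline now returns the cleaned molecule immediately when neither salt removal nor tautomer canonicalization is requested, so a Cleanup-only pass no longer pays for fragment handling. At the DataFrame level, a sketch using the module's standardize() entry point as exercised in the tests below (assuming the default SMILES column name is "smiles"):

    import pandas as pd

    df = pd.DataFrame({"smiles": ["CC(=O)[O-].[Na+]"], "compound_id": ["C001"]})
    out = standardize(df, extract_salts=False, canonicalize_tautomer=False)  # Cleanup only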
@@ -153,7 +170,7 @@ class MolStandardizer:
             if mol is None:
                 return None, salt_smiles

-            # Step 4: Canonicalize tautomer
+            # Step 4: Canonicalize tautomer (LAST STEP)
             if self.canonicalize_tautomer:
                 mol = self.tautomer_enumerator.Canonicalize(mol)

@@ -172,13 +189,22 @@ class MolStandardizer:
            - Mixtures: multiple large neutral organic fragments

         Args:
-            orig_mol: Original molecule (before FragmentParent)
-            parent_mol: Parent molecule (after FragmentParent)
+            orig_mol: Original molecule (after Cleanup, before FragmentParent)
+            parent_mol: Parent molecule (after FragmentParent, before tautomerization)

         Returns:
             SMILES string of salt components or None if no salts/mixture detected
         """
         try:
+            # Quick atom count check
+            if orig_mol.GetNumAtoms() == parent_mol.GetNumAtoms():
+                return None
+
+            # Quick heavy atom difference check
+            heavy_diff = orig_mol.GetNumHeavyAtoms() - parent_mol.GetNumHeavyAtoms()
+            if heavy_diff <= 0:
+                return None
+
             # Get all fragments from original molecule
             orig_frags = Chem.GetMolFrags(orig_mol, asMols=True)

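The two new guards make the common no-salt case cheap: when FragmentParent stripped nothing, there is nothing to extract and the fragment enumeration below never runs. The checks in isolation, assuming plain RDKit mols:

    from rdkit import Chem

    orig_mol = Chem.MolFromSmiles("CC(=O)[O-].[Na+]")  # e.g. sodium acetate after Cleanup
    parent_mol = Chem.MolFromSmiles("CC(=O)[O-]")      # largest-fragment parent

    orig_mol.GetNumAtoms() == parent_mol.GetNumAtoms()            # False -> something stripped
    orig_mol.GetNumHeavyAtoms() - parent_mol.GetNumHeavyAtoms()   # 1 -> proceed to fragment scan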
@@ -268,7 +294,7 @@ def standardize(
     if "orig_smiles" not in result.columns:
         result["orig_smiles"] = result[smiles_column]

-    # Initialize standardizer
+    # Initialize standardizer
     standardizer = MolStandardizer(canonicalize_tautomer=canonicalize_tautomer, remove_salts=extract_salts)

     def process_smiles(smiles: str) -> pd.Series:
@@ -286,6 +312,11 @@ def standardize(
             log.error("Encountered missing or empty SMILES string")
             return pd.Series({"smiles": None, "salt": None})

+        # Early check for unreasonably long SMILES
+        if len(smiles) > 1000:
+            log.error(f"SMILES too long ({len(smiles)} chars): {smiles[:50]}...")
+            return pd.Series({"smiles": None, "salt": None})
+
         # Parse molecule
         mol = Chem.MolFromSmiles(smiles)
         if mol is None:
@@ -299,7 +330,9 @@ def standardize(
         if std_mol is not None:
             # Check if molecule is reasonable
             if std_mol.GetNumAtoms() == 0 or std_mol.GetNumAtoms() > 200:  # Arbitrary limits
-                log.error(f"
+                log.error(f"Rejecting molecule size: {std_mol.GetNumAtoms()} atoms")
+                log.error(f"Original SMILES: {smiles}")
+                return pd.Series({"smiles": None, "salt": salt_smiles})

         if std_mol is None:
             return pd.Series(
@@ -325,8 +358,11 @@ def standardize(


 if __name__ == "__main__":
-
-
+
+    # Pandas display options for better readability
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 1000)
+    pd.set_option("display.max_colwidth", 100)

     # Test with DataFrame including various salt forms
     test_data = pd.DataFrame(
@@ -362,67 +398,53 @@ if __name__ == "__main__":
     )

     # General test
+    print("Testing standardization with full dataset...")
     standardize(test_data)

     # Remove the last two rows to avoid errors with None and INVALID
     test_data = test_data.iloc[:-2].reset_index(drop=True)

     # Test WITHOUT salt removal (keeps full molecule)
-    print("\nStandardization KEEPING salts (extract_salts=False):")
-    print("This preserves the full molecule including counterions")
+    print("\nStandardization KEEPING salts (extract_salts=False) Tautomerization: True")
     result_keep = standardize(test_data, extract_salts=False, canonicalize_tautomer=True)
-
-    print(result_keep[
+    display_order = ["compound_id", "orig_smiles", "smiles", "salt"]
+    print(result_keep[display_order])

     # Test WITH salt removal
     print("\n" + "=" * 70)
     print("Standardization REMOVING salts (extract_salts=True):")
-    print("This extracts parent molecule and records salt information")
     result_remove = standardize(test_data, extract_salts=True, canonicalize_tautomer=True)
-    print(result_remove[
+    print(result_remove[display_order])

-    # Test
+    # Test with problematic cases specifically
     print("\n" + "=" * 70)
-    print("
-
-
+    print("Testing specific problematic cases:")
+    problem_cases = pd.DataFrame(
+        {
+            "smiles": [
+                "CC(=O)O.CCN",  # Should extract CC(=O)O as salt
+                "CCO.CC",  # Should return CC as salt
+            ],
+            "compound_id": ["TEST_C002", "TEST_C005"],
+        }
+    )
+
+    problem_result = standardize(problem_cases, extract_salts=True, canonicalize_tautomer=True)
+    print(problem_result[display_order])
+
+    # Performance test with larger dataset
+    from workbench.api import DataSource

-    # Show the difference for salt-containing molecules
-    print("\n" + "=" * 70)
-    print("Comparison showing differences:")
-    for idx, row in result_keep.iterrows():
-        keep_smiles = row["smiles"]
-        remove_smiles = result_remove.loc[idx, "smiles"]
-        no_taut_smiles = result_no_taut.loc[idx, "smiles"]
-        salt = result_remove.loc[idx, "salt"]
-
-        # Show differences when they exist
-        if keep_smiles != remove_smiles or keep_smiles != no_taut_smiles:
-            print(f"\n{row['compound_id']} ({row['orig_smiles']}):")
-            if keep_smiles != no_taut_smiles:
-                print(f" With salt + taut: {keep_smiles}")
-                print(f" With salt, no taut: {no_taut_smiles}")
-            if keep_smiles != remove_smiles:
-                print(f" Parent only + taut: {remove_smiles}")
-            if salt:
-                print(f" Extracted salt: {salt}")
-
-    # Summary statistics
     print("\n" + "=" * 70)
-    print("Summary:")
-    print(f"Total molecules: {len(result_remove)}")
-    print(f"Molecules with salts: {result_remove['salt'].notna().sum()}")
-    unique_salts = result_remove["salt"].dropna().unique()
-    print(f"Unique salts found: {unique_salts[:5].tolist()}")

-    # Get a real dataset from Workbench and time the standardization
     ds = DataSource("aqsol_data")
-    df = ds.pull_dataframe()[["id", "smiles"]]
-
-
-
-
-
-
-
-
+    df = ds.pull_dataframe()[["id", "smiles"]][:1000]
+
+    for tautomer in [True, False]:
+        for extract in [True, False]:
+            print(f"Performance test with AQSol dataset: tautomer={tautomer} extract_salts={extract}:")
+            start_time = time.time()
+            std_df = standardize(df, canonicalize_tautomer=tautomer, extract_salts=extract)
+            elapsed = time.time() - start_time
+            mol_per_sec = len(df) / elapsed
+            print(f"{elapsed:.2f}s ({mol_per_sec:.0f} mol/s)")