workbench-0.8.176-py3-none-any.whl → workbench-0.8.177-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/core/artifacts/endpoint_core.py +4 -1
- workbench/core/artifacts/model_core.py +8 -29
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +19 -7
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +80 -58
- workbench/model_scripts/custom_models/uq_models/mapie.template +3 -3
- workbench/model_scripts/xgb_model/generated_model_script.py +3 -3
- workbench/utils/chem_utils/mol_descriptors.py +19 -7
- workbench/utils/chem_utils/mol_standardize.py +80 -58
- workbench/utils/model_utils.py +28 -25
- {workbench-0.8.176.dist-info → workbench-0.8.177.dist-info}/METADATA +1 -1
- {workbench-0.8.176.dist-info → workbench-0.8.177.dist-info}/RECORD +15 -16
- workbench/utils/fast_inference.py +0 -167
- {workbench-0.8.176.dist-info → workbench-0.8.177.dist-info}/WHEEL +0 -0
- {workbench-0.8.176.dist-info → workbench-0.8.177.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.176.dist-info → workbench-0.8.177.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.176.dist-info → workbench-0.8.177.dist-info}/top_level.txt +0 -0
workbench/core/artifacts/endpoint_core.py
CHANGED
@@ -32,11 +32,11 @@ from sagemaker import Predictor
 from workbench.core.artifacts.artifact import Artifact
 from workbench.core.artifacts import FeatureSetCore, ModelCore, ModelType
 from workbench.utils.endpoint_metrics import EndpointMetrics
-from workbench.utils.fast_inference import fast_inference
 from workbench.utils.cache import Cache
 from workbench.utils.s3_utils import compute_s3_object_hash
 from workbench.utils.model_utils import uq_metrics
 from workbench.utils.xgboost_model_utils import cross_fold_inference
+from workbench_bridges.endpoints.fast_inference import fast_inference


 class EndpointCore(Artifact):
@@ -1061,6 +1061,9 @@ if __name__ == "__main__":
     assert len(pred_results) == len(my_eval_df), "Predictions should match the number of sent rows"

     # Now we put in an invalid value
+    print("*" * 80)
+    print("NOW TESTING ERROR CONDITIONS...")
+    print("*" * 80)
     my_eval_df.at[42, "length"] = "invalid_value"
     pred_results = my_endpoint.inference(my_eval_df, drop_error_rows=True)
     print(f"Sent rows: {len(my_eval_df)}")
workbench/core/artifacts/model_core.py
CHANGED
@@ -37,35 +37,6 @@ class ModelType(Enum):
     UNKNOWN = "unknown"


-# Deprecated Images
-"""
-# US East 1 images
-"py312-general-ml-training"
-("us-east-1", "training", "0.1", "x86_64"): (
-    "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1"
-),
-("us-east-1", "inference", "0.1", "x86_64"): (
-    "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
-),
-
-# US West 2 images
-("us-west-2", "training", "0.1", "x86_64"): (
-    "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1"
-),
-("us-west-2", "inference", "0.1", "x86_64"): (
-    "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
-),
-
-# ARM64 images
-("us-east-1", "inference", "0.1", "arm64"): (
-    "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1-arm64"
-),
-("us-west-2", "inference", "0.1", "arm64"): (
-    "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1-arm64"
-),
-"""
-
-
 class ModelImages:
     """Class for retrieving workbench inference images"""

@@ -890,6 +861,14 @@ class ModelCore(Artifact):
             shap_data[key] = self.df_store.get(df_location)
         return shap_data or None

+    def cross_folds(self) -> dict:
+        """Retrieve the cross-fold inference results(only works for XGBoost models)
+
+        Returns:
+            dict: Dictionary with the cross-fold inference results
+        """
+        return self.param_store.get(f"/workbench/models/{self.name}/inference/cross_fold")
+
     def supported_inference_instances(self) -> Optional[list]:
         """Retrieve the supported endpoint inference instance types

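The new `cross_folds()` is a thin accessor over the parameter store: it returns whatever `cross_fold_inference` previously stored under the model's `/inference/cross_fold` key (presumably None if nothing was ever captured). A usage sketch, with a hypothetical model name, assuming `Model` is exposed via `workbench.api` the way `DataSource` and `Endpoint` are elsewhere in this diff:

    from workbench.api import Model

    model = Model("aqsol-regression")          # hypothetical XGBoost model
    cross_fold_results = model.cross_folds()   # dict of cross-fold inference results, or None
    if cross_fold_results:
        print(cross_fold_results.keys())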
workbench/model_scripts/custom_models/chem_info/mol_descriptors.py
CHANGED
@@ -91,16 +91,27 @@ import logging
 import pandas as pd
 import numpy as np
 import re
+import time
+from contextlib import contextmanager
 from rdkit import Chem
 from rdkit.Chem import Descriptors, rdCIPLabeler
 from rdkit.ML.Descriptors import MoleculeDescriptors
 from mordred import Calculator as MordredCalculator
 from mordred import AcidBase, Aromatic, Constitutional, Chi, CarbonTypes

+
 logger = logging.getLogger("workbench")
 logger.setLevel(logging.DEBUG)


+# Helper context manager for timing
+@contextmanager
+def timer(name):
+    start = time.time()
+    yield
+    print(f"{name}: {time.time() - start:.2f}s")
+
+
 def compute_stereochemistry_features(mol):
     """
     Compute stereochemistry descriptors using modern RDKit methods.
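The `timer` helper added above is the standard context-manager timing idiom: the start time is captured before `yield`, and the `print` fires when the `with` block exits. A usage sketch (the call inside the block is illustrative):

    with timer("descriptor computation"):
        features_df = compute_descriptors(df)  # prints "descriptor computation: 12.34s" on exit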
@@ -280,9 +291,11 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
             descriptor_values.append([np.nan] * len(all_descriptors))

     # Create RDKit features DataFrame
-    rdkit_features_df = pd.DataFrame(descriptor_values, columns=calc.GetDescriptorNames()
+    rdkit_features_df = pd.DataFrame(descriptor_values, columns=calc.GetDescriptorNames())

     # Add RDKit features to result
+    # Remove any columns from result that exist in rdkit_features_df
+    result = result.drop(columns=result.columns.intersection(rdkit_features_df.columns))
     result = pd.concat([result, rdkit_features_df], axis=1)

     # Compute Mordred descriptors
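The drop-then-concat pattern introduced here (and repeated below for the Mordred and stereochemistry frames) exists because `pd.concat(..., axis=1)` happily produces duplicate column labels when the incoming frame shares names with `result`; dropping the intersection first keeps only the freshly computed values. A toy illustration (column names are made up):

    import pandas as pd

    result = pd.DataFrame({"smiles": ["CCO"], "molwt": [0.0]})   # stale molwt
    new = pd.DataFrame({"molwt": [46.07], "tpsa": [20.23]})      # freshly computed
    result = result.drop(columns=result.columns.intersection(new.columns))
    result = pd.concat([result, new], axis=1)
    print(result.columns.tolist())  # ['smiles', 'molwt', 'tpsa'], a single molwt column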
@@ -299,7 +312,7 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_

         # Compute Mordred descriptors
         valid_mols = [mol if mol is not None else Chem.MolFromSmiles("C") for mol in molecules]
-        mordred_df = calc.pandas(valid_mols, nproc=1)  #
+        mordred_df = calc.pandas(valid_mols, nproc=1)  # Endpoint multiprocessing will fail with nproc>1

         # Replace values for invalid molecules with NaN
         for i, mol in enumerate(molecules):
@@ -310,10 +323,9 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
         for col in mordred_df.columns:
             mordred_df[col] = pd.to_numeric(mordred_df[col], errors="coerce")

-        # Set index to match result DataFrame
-        mordred_df.index = result.index
-
         # Add Mordred features to result
+        # Remove any columns from result that exist in mordred
+        result = result.drop(columns=result.columns.intersection(mordred_df.columns))
         result = pd.concat([result, mordred_df], axis=1)

     # Compute stereochemistry features if requested
@@ -326,9 +338,10 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
             stereo_features.append(stereo_dict)

         # Create stereochemistry DataFrame
-        stereo_df = pd.DataFrame(stereo_features
+        stereo_df = pd.DataFrame(stereo_features)

         # Add stereochemistry features to result
+        result = result.drop(columns=result.columns.intersection(stereo_df.columns))
         result = pd.concat([result, stereo_df], axis=1)

         logger.info(f"Added {len(stereo_df.columns)} stereochemistry descriptors")
@@ -357,7 +370,6 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_


 if __name__ == "__main__":
-    import time
     from mol_standardize import standardize
     from workbench.api import DataSource

workbench/model_scripts/custom_models/chem_info/mol_standardize.py
CHANGED
@@ -81,6 +81,8 @@ Usage:
 import logging
 from typing import Optional, Tuple
 import pandas as pd
+import time
+from contextlib import contextmanager
 from rdkit import Chem
 from rdkit.Chem import Mol
 from rdkit.Chem.MolStandardize import rdMolStandardize
@@ -90,6 +92,14 @@ log = logging.getLogger("workbench")
 RDLogger.DisableLog("rdApp.warning")


+# Helper context manager for timing
+@contextmanager
+def timer(name):
+    start = time.time()
+    yield
+    print(f"{name}: {time.time() - start:.2f}s")
+
+
 class MolStandardizer:
     """
     Streamlined molecular standardizer for ADMET preprocessing
@@ -116,6 +126,7 @@ class MolStandardizer:
     Pipeline:
         1. Cleanup (remove Hs, disconnect metals, normalize)
         2. Get largest fragment (optional - only if remove_salts=True)
+        2a. Extract salt information BEFORE further modifications
         3. Neutralize charges
         4. Canonicalize tautomer (optional)

@@ -130,18 +141,24 @@ class MolStandardizer:

         try:
             # Step 1: Cleanup
-
-            if
+            cleaned_mol = rdMolStandardize.Cleanup(mol, self.params)
+            if cleaned_mol is None:
                 return None, None

+            # If not doing any transformations, return early
+            if not self.remove_salts and not self.canonicalize_tautomer:
+                return cleaned_mol, None
+
             salt_smiles = None
+            mol = cleaned_mol

             # Step 2: Fragment handling (conditional based on remove_salts)
             if self.remove_salts:
-                # Get parent molecule
-                parent_mol = rdMolStandardize.FragmentParent(
+                # Get parent molecule
+                parent_mol = rdMolStandardize.FragmentParent(cleaned_mol, self.params)
                 if parent_mol:
-
+                    # Extract salt BEFORE any modifications to parent
+                    salt_smiles = self._extract_salt(cleaned_mol, parent_mol)
                     mol = parent_mol
                 else:
                     return None, None
@@ -153,7 +170,7 @@ class MolStandardizer:
             if mol is None:
                 return None, salt_smiles

-            # Step 4: Canonicalize tautomer
+            # Step 4: Canonicalize tautomer (LAST STEP)
             if self.canonicalize_tautomer:
                 mol = self.tautomer_enumerator.Canonicalize(mol)

@@ -172,13 +189,22 @@ class MolStandardizer:
         - Mixtures: multiple large neutral organic fragments

         Args:
-            orig_mol: Original molecule (before FragmentParent)
-            parent_mol: Parent molecule (after FragmentParent)
+            orig_mol: Original molecule (after Cleanup, before FragmentParent)
+            parent_mol: Parent molecule (after FragmentParent, before tautomerization)

         Returns:
             SMILES string of salt components or None if no salts/mixture detected
         """
         try:
+            # Quick atom count check
+            if orig_mol.GetNumAtoms() == parent_mol.GetNumAtoms():
+                return None
+
+            # Quick heavy atom difference check
+            heavy_diff = orig_mol.GetNumHeavyAtoms() - parent_mol.GetNumHeavyAtoms()
+            if heavy_diff <= 0:
+                return None
+
             # Get all fragments from original molecule
             orig_frags = Chem.GetMolFrags(orig_mol, asMols=True)

@@ -268,7 +294,7 @@ def standardize(
     if "orig_smiles" not in result.columns:
         result["orig_smiles"] = result[smiles_column]

-    # Initialize standardizer
+    # Initialize standardizer
     standardizer = MolStandardizer(canonicalize_tautomer=canonicalize_tautomer, remove_salts=extract_salts)

     def process_smiles(smiles: str) -> pd.Series:
@@ -286,6 +312,11 @@ def standardize(
             log.error("Encountered missing or empty SMILES string")
             return pd.Series({"smiles": None, "salt": None})

+        # Early check for unreasonably long SMILES
+        if len(smiles) > 1000:
+            log.error(f"SMILES too long ({len(smiles)} chars): {smiles[:50]}...")
+            return pd.Series({"smiles": None, "salt": None})
+
         # Parse molecule
         mol = Chem.MolFromSmiles(smiles)
         if mol is None:
@@ -299,7 +330,9 @@ def standardize(
         if std_mol is not None:
             # Check if molecule is reasonable
             if std_mol.GetNumAtoms() == 0 or std_mol.GetNumAtoms() > 200:  # Arbitrary limits
-                log.error(f"
+                log.error(f"Rejecting molecule size: {std_mol.GetNumAtoms()} atoms")
+                log.error(f"Original SMILES: {smiles}")
+                return pd.Series({"smiles": None, "salt": salt_smiles})

         if std_mol is None:
             return pd.Series(
@@ -325,8 +358,11 @@ def standardize(


 if __name__ == "__main__":
-
-
+
+    # Pandas display options for better readability
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 1000)
+    pd.set_option("display.max_colwidth", 100)

     # Test with DataFrame including various salt forms
     test_data = pd.DataFrame(
@@ -362,67 +398,53 @@ if __name__ == "__main__":
     )

     # General test
+    print("Testing standardization with full dataset...")
     standardize(test_data)

     # Remove the last two rows to avoid errors with None and INVALID
     test_data = test_data.iloc[:-2].reset_index(drop=True)

     # Test WITHOUT salt removal (keeps full molecule)
-    print("\nStandardization KEEPING salts (extract_salts=False):")
-    print("This preserves the full molecule including counterions")
+    print("\nStandardization KEEPING salts (extract_salts=False) Tautomerization: True")
     result_keep = standardize(test_data, extract_salts=False, canonicalize_tautomer=True)
-
-    print(result_keep[
+    display_order = ["compound_id", "orig_smiles", "smiles", "salt"]
+    print(result_keep[display_order])

     # Test WITH salt removal
     print("\n" + "=" * 70)
     print("Standardization REMOVING salts (extract_salts=True):")
-    print("This extracts parent molecule and records salt information")
     result_remove = standardize(test_data, extract_salts=True, canonicalize_tautomer=True)
-    print(result_remove[
+    print(result_remove[display_order])

-    # Test
+    # Test with problematic cases specifically
     print("\n" + "=" * 70)
-    print("
-
-
+    print("Testing specific problematic cases:")
+    problem_cases = pd.DataFrame(
+        {
+            "smiles": [
+                "CC(=O)O.CCN",  # Should extract CC(=O)O as salt
+                "CCO.CC",  # Should return CC as salt
+            ],
+            "compound_id": ["TEST_C002", "TEST_C005"],
+        }
+    )
+
+    problem_result = standardize(problem_cases, extract_salts=True, canonicalize_tautomer=True)
+    print(problem_result[display_order])
+
+    # Performance test with larger dataset
+    from workbench.api import DataSource

-    # Show the difference for salt-containing molecules
-    print("\n" + "=" * 70)
-    print("Comparison showing differences:")
-    for idx, row in result_keep.iterrows():
-        keep_smiles = row["smiles"]
-        remove_smiles = result_remove.loc[idx, "smiles"]
-        no_taut_smiles = result_no_taut.loc[idx, "smiles"]
-        salt = result_remove.loc[idx, "salt"]
-
-        # Show differences when they exist
-        if keep_smiles != remove_smiles or keep_smiles != no_taut_smiles:
-            print(f"\n{row['compound_id']} ({row['orig_smiles']}):")
-            if keep_smiles != no_taut_smiles:
-                print(f"  With salt + taut: {keep_smiles}")
-                print(f"  With salt, no taut: {no_taut_smiles}")
-            if keep_smiles != remove_smiles:
-                print(f"  Parent only + taut: {remove_smiles}")
-            if salt:
-                print(f"  Extracted salt: {salt}")
-
-    # Summary statistics
     print("\n" + "=" * 70)
-    print("Summary:")
-    print(f"Total molecules: {len(result_remove)}")
-    print(f"Molecules with salts: {result_remove['salt'].notna().sum()}")
-    unique_salts = result_remove["salt"].dropna().unique()
-    print(f"Unique salts found: {unique_salts[:5].tolist()}")

-    # Get a real dataset from Workbench and time the standardization
     ds = DataSource("aqsol_data")
-    df = ds.pull_dataframe()[["id", "smiles"]]
-
-
-
-
-
-
-
-
+    df = ds.pull_dataframe()[["id", "smiles"]][:1000]
+
+    for tautomer in [True, False]:
+        for extract in [True, False]:
+            print(f"Performance test with AQSol dataset: tautomer={tautomer} extract_salts={extract}:")
+            start_time = time.time()
+            std_df = standardize(df, canonicalize_tautomer=tautomer, extract_salts=extract)
+            elapsed = time.time() - start_time
+            mol_per_sec = len(df) / elapsed
+            print(f"{elapsed:.2f}s ({mol_per_sec:.0f} mol/s)")
workbench/model_scripts/custom_models/uq_models/mapie.template
CHANGED
@@ -472,9 +472,9 @@ def predict_fn(df, models) -> pd.DataFrame:
     # Add median (q_50) from XGBoost prediction
     df["q_50"] = df["prediction"]

-    # Calculate uncertainty metrics based on
-    interval_width = df["
-    df["prediction_std"] = interval_width /
+    # Calculate uncertainty metrics based on 50% interval
+    interval_width = df["q_75"] - df["q_25"]
+    df["prediction_std"] = interval_width / 1.348

     # Reorder the quantile columns for easier reading
     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
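The 1.348 divisor is the normal-theory conversion from a 50% interquantile width to a standard deviation: for a Gaussian, q_75 − q_25 = 2 × 0.6745σ ≈ 1.349σ. A quick sanity check of the constant (assumes scipy is available, as it is elsewhere in these model scripts):

    from scipy.stats import norm

    # Width of the central 50% interval of a standard normal, in units of sigma
    iqr_in_sigmas = norm.ppf(0.75) - norm.ppf(0.25)
    print(iqr_in_sigmas)  # ~1.349, hence prediction_std = (q_75 - q_25) / ~1.35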
workbench/model_scripts/xgb_model/generated_model_script.py
CHANGED
@@ -28,11 +28,11 @@ from typing import List, Tuple

 # Template Parameters
 TEMPLATE_PARAMS = {
-    "model_type": "
-    "target": "
+    "model_type": "regressor",
+    "target": "udm_asy_res_value",
"features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 
'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
     "compressed_features": [],
-    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/pka-a1-reg-0-nightly-100-test/training",
     "train_all_data": True
 }

workbench/utils/chem_utils/mol_descriptors.py
CHANGED
workbench/utils/chem_utils/mol_standardize.py
CHANGED
(The changes to these two files are byte-for-byte identical to the changes shown above for workbench/model_scripts/custom_models/chem_info/mol_descriptors.py and mol_standardize.py; the RECORD entries below confirm the matching file hashes.)
workbench/utils/model_utils.py
CHANGED
@@ -226,12 +226,18 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
     elif "prediction_std" in df.columns:
         lower_95 = df["prediction"] - 1.96 * df["prediction_std"]
         upper_95 = df["prediction"] + 1.96 * df["prediction_std"]
+        lower_90 = df["prediction"] - 1.645 * df["prediction_std"]
+        upper_90 = df["prediction"] + 1.645 * df["prediction_std"]
+        lower_80 = df["prediction"] - 1.282 * df["prediction_std"]
+        upper_80 = df["prediction"] + 1.282 * df["prediction_std"]
         lower_50 = df["prediction"] - 0.674 * df["prediction_std"]
         upper_50 = df["prediction"] + 0.674 * df["prediction_std"]
     else:
         raise ValueError(
             "Either quantile columns (q_025, q_975, q_25, q_75) or 'prediction_std' column must be present."
         )
+    avg_std = df["prediction_std"].mean()
+    median_std = df["prediction_std"].median()
     coverage_95 = np.mean((df[target_col] >= lower_95) & (df[target_col] <= upper_95))
     coverage_90 = np.mean((df[target_col] >= lower_90) & (df[target_col] <= upper_90))
     coverage_80 = np.mean((df[target_col] >= lower_80) & (df[target_col] <= upper_80))
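The interval multipliers here are the two-sided standard normal quantiles z_(1−α/2): 1.96 for 95%, 1.645 for 90%, 1.282 for 80%, and 0.674 for 50%. A one-liner to regenerate them (assumes scipy, which this module presumably already imports for `norm` below):

    from scipy.stats import norm

    for level in (0.95, 0.90, 0.80, 0.50):
        print(level, round(norm.ppf(0.5 + level / 2), 3))  # 1.96, 1.645, 1.282, 0.674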
@@ -242,12 +248,9 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
     avg_width_50 = np.mean(upper_50 - lower_50)

     # --- CRPS (measures calibration + sharpness) ---
-
-
-
-        mean_crps = np.mean(crps)
-    else:
-        mean_crps = np.nan
+    z = (df[target_col] - df["prediction"]) / df["prediction_std"]
+    crps = df["prediction_std"] * (z * (2 * norm.cdf(z) - 1) + 2 * norm.pdf(z) - 1 / np.sqrt(np.pi))
+    mean_crps = np.mean(crps)

     # --- Interval Score @ 95% (penalizes miscoverage) ---
     alpha_95 = 0.05
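The three new lines are the closed-form CRPS of a Gaussian predictive distribution (the standard result from Gneiting & Raftery, 2007). With z = (y − μ)/σ, Φ the standard normal CDF, and φ its density:

    \mathrm{CRPS}\left(\mathcal{N}(\mu,\sigma^2),\, y\right) = \sigma\left[ z\,(2\Phi(z) - 1) + 2\varphi(z) - \tfrac{1}{\sqrt{\pi}} \right], \qquad z = \frac{y - \mu}{\sigma}

Here μ is the `prediction` column, σ is `prediction_std`, and `norm` is presumably `scipy.stats.norm`; lower is better, and as σ → 0 the score reduces to the absolute error |y − μ|.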
@@ -265,27 +268,33 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:

     # Collect results
     results = {
-        "coverage_95": coverage_95,
-        "coverage_90": coverage_90,
-        "coverage_80": coverage_80,
         "coverage_50": coverage_50,
-        "
+        "coverage_80": coverage_80,
+        "coverage_90": coverage_90,
+        "coverage_95": coverage_95,
+        "avg_std": avg_std,
+        "median_std": median_std,
         "avg_width_50": avg_width_50,
-        "
-        "
-        "
+        "avg_width_80": avg_width_80,
+        "avg_width_90": avg_width_90,
+        "avg_width_95": avg_width_95,
+        # "crps": mean_crps,
+        # "interval_score_95": mean_is_95,
+        # "adaptive_calibration": adaptive_calibration,
         "n_samples": len(df),
     }

     print("\n=== UQ Metrics ===")
-    print(f"Coverage @ 95%: {coverage_95:.3f} (target: 0.95)")
-    print(f"Coverage @ 90%: {coverage_90:.3f} (target: 0.90)")
-    print(f"Coverage @ 80%: {coverage_80:.3f} (target: 0.80)")
     print(f"Coverage @ 50%: {coverage_50:.3f} (target: 0.50)")
-    print(f"
-    print(f"
-    print(f"
+    print(f"Coverage @ 80%: {coverage_80:.3f} (target: 0.80)")
+    print(f"Coverage @ 90%: {coverage_90:.3f} (target: 0.90)")
+    print(f"Coverage @ 95%: {coverage_95:.3f} (target: 0.95)")
+    print(f"Avg Prediction StdDev: {avg_std:.3f}")
+    print(f"Median Prediction StdDev: {median_std:.3f}")
     print(f"Average 50% Width: {avg_width_50:.3f}")
+    print(f"Average 80% Width: {avg_width_80:.3f}")
+    print(f"Average 90% Width: {avg_width_90:.3f}")
+    print(f"Average 95% Width: {avg_width_95:.3f}")
     print(f"CRPS: {mean_crps:.3f} (lower is better)")
     print(f"Interval Score 95%: {mean_is_95:.3f} (lower is better)")
     print(f"Adaptive Calibration: {adaptive_calibration:.3f} (higher is better, target: >0.5)")
@@ -325,9 +334,3 @@ if __name__ == "__main__":
     df = end.auto_inference(capture=True)
     results = uq_metrics(df, target_col="solubility")
     print(results)
-
-    # Test the uq_metrics function
-    end = Endpoint("aqsol-uq-100")
-    df = end.auto_inference(capture=True)
-    results = uq_metrics(df, target_col="solubility")
-    print(results)
{workbench-0.8.176.dist-info → workbench-0.8.177.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: workbench
-Version: 0.8.176
+Version: 0.8.177
 Summary: Workbench: A Dashboard and Python API for creating and deploying AWS SageMaker Model Pipelines
 Author-email: SuperCowPowers LLC <support@supercowpowers.com>
 License-Expression: MIT
{workbench-0.8.176.dist-info → workbench-0.8.177.dist-info}/RECORD
CHANGED
@@ -54,9 +54,9 @@ workbench/core/artifacts/cached_artifact_mixin.py,sha256=ngqFLZ4cQx_TFouXZgXZQsv
 workbench/core/artifacts/data_capture_core.py,sha256=q8f79rRTYiZ7T4IQRWXl8ZvPpcvZyNxYERwvo8o0OQc,14858
 workbench/core/artifacts/data_source_abstract.py,sha256=5IRCzFVK-17cd4NXPMRfx99vQAmQ0WHE5jcm5RfsVTg,10619
 workbench/core/artifacts/data_source_factory.py,sha256=YL_tA5fsgubbB3dPF6T4tO0rGgz-6oo3ge4i_YXVC-M,2380
-workbench/core/artifacts/endpoint_core.py,sha256=
+workbench/core/artifacts/endpoint_core.py,sha256=Q6wL0IpMgCkVssX-BvPwawgogQjq9klSaoBUZ6tEIuc,49146
 workbench/core/artifacts/feature_set_core.py,sha256=055VdSYR09HP4ygAuYvIYtHQ7Ec4XxsZygpgEl5H5jQ,29136
-workbench/core/artifacts/model_core.py,sha256=
+workbench/core/artifacts/model_core.py,sha256=ECDwQ0qM5qb1yGJ07U70BVdfkrW9m7p9e6YJWib3uR0,50855
 workbench/core/artifacts/monitor_core.py,sha256=M307yz7tEzOEHgv-LmtVy9jKjSbM98fHW3ckmNYrwlU,27897
 workbench/core/cloud_platform/cloud_meta.py,sha256=-g4-LTC3D0PXb3VfaXdLR1ERijKuHdffeMK_zhD-koQ,8809
 workbench/core/cloud_platform/aws/README.md,sha256=QT5IQXoUHbIA0qQ2wO6_2P2lYjYQFVYuezc22mWY4i8,97
@@ -124,8 +124,8 @@ workbench/core/views/view_utils.py,sha256=y0YuPW-90nAfgAD1UW_49-j7Mvncfm7-5rV8I_
 workbench/core/views/storage/mdq_view.py,sha256=qf_ep1KwaXOIfO930laEwNIiCYP7VNOqjE3VdHfopRE,5195
 workbench/model_scripts/script_generation.py,sha256=dL23XYwEsHIStc7i53DtF_47FqOrI9gq0kQAT6sNpZ8,7923
 workbench/model_scripts/custom_models/chem_info/Readme.md,sha256=mH1lxJ4Pb7F5nBnVXaiuxpi8zS_yjUw_LBJepVKXhlA,574
-workbench/model_scripts/custom_models/chem_info/mol_descriptors.py,sha256=
-workbench/model_scripts/custom_models/chem_info/mol_standardize.py,sha256
+workbench/model_scripts/custom_models/chem_info/mol_descriptors.py,sha256=c8gkHZ-8s3HJaW9zN9pnYGK7YVW8Y0xFqQ1G_ysrF2Y,18789
+workbench/model_scripts/custom_models/chem_info/mol_standardize.py,sha256=qPLCdVMSXMOWN-01O1isg2zq7eQyFAI0SNatHkRq1uw,17524
 workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py,sha256=xljMjdfh4Idi4v1Afq1zZxvF1SDa7pDOLSAhvGBEj88,2891
 workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py,sha256=tMyMmeN1xajVWkqkV5mobYB8CYkzW9FRH8Vi3t81uo8,3231
 workbench/model_scripts/custom_models/chem_info/requirements.txt,sha256=7HBUzvNiM8lOir-UfQabXYlUp3gxdGJ42u18EuSMGjc,39
@@ -141,7 +141,7 @@ workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template,sha256=U
 workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template,sha256=0IJnSBACQ556ldEiPqR7yPCOOLJs1hQhHmPBvB2d9tY,13491
 workbench/model_scripts/custom_models/uq_models/gaussian_process.template,sha256=QbDUfkiPCwJ-c-4Twgu4utZuYZaAyeW_3T1IP-_tutw,6683
 workbench/model_scripts/custom_models/uq_models/generated_model_script.py,sha256=AcLf-vXOmn_vpTeiKpNKCW_dRhR8Co1sMFC84EPT4IE,22392
-workbench/model_scripts/custom_models/uq_models/mapie.template,sha256=
+workbench/model_scripts/custom_models/uq_models/mapie.template,sha256=Vou_g0ux-KOrs36S98g27Y8ckU9sdYrKWwypJjasQX4,18180
 workbench/model_scripts/custom_models/uq_models/meta_uq.template,sha256=eawh0Fp3DhbdCXzWN6KloczT5ZS_ou4ayW65yUTTE4o,14109
 workbench/model_scripts/custom_models/uq_models/ngboost.template,sha256=9-O6P-SW50ul5Wl6es2DMWXSbrwOg7HWsdc8Qdln0MM,8278
 workbench/model_scripts/custom_models/uq_models/proximity.py,sha256=zqmNlX70LnWXr5fdtFFQppSNTLjlOciQVrjGr-g9jRE,13716
@@ -159,7 +159,7 @@ workbench/model_scripts/quant_regression/requirements.txt,sha256=jWlGc7HH7vqyukT
 workbench/model_scripts/scikit_learn/generated_model_script.py,sha256=c73ZpJBlU5k13Nx-ZDkLXu7da40CYyhwjwwmuPq6uLg,12870
 workbench/model_scripts/scikit_learn/requirements.txt,sha256=aVvwiJ3LgBUhM_PyFlb2gHXu_kpGPho3ANBzlOkfcvs,107
 workbench/model_scripts/scikit_learn/scikit_learn.template,sha256=d4pgeZYFezUQsB-7iIsjsUgB1FM6d27651wpfDdXmI0,12640
-workbench/model_scripts/xgb_model/generated_model_script.py,sha256=
+workbench/model_scripts/xgb_model/generated_model_script.py,sha256=BPhr2gfJQC1C26knsyktfLGL7Jp0YBKCIQjplCuHUg0,22218
 workbench/model_scripts/xgb_model/requirements.txt,sha256=jWlGc7HH7vqyukTm38LN4EyDi8jDUPEay4n45z-30uc,104
 workbench/model_scripts/xgb_model/xgb_model.template,sha256=HViJRsMWn393hP8VJRS45UQBzUVBhwR5sKc8Ern-9f4,17963
 workbench/repl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -211,7 +211,6 @@ workbench/utils/ecs_info.py,sha256=Gs9jNb4vcj2pziufIOI4BVIH1J-3XBMtWm1phVh8oRY,2
 workbench/utils/endpoint_metrics.py,sha256=_4WVU6cLLuV0t_i0PSvhi0EoA5ss5aDFe7ZDpumx2R8,7822
 workbench/utils/endpoint_utils.py,sha256=3-njrhMSAIOaEEiH7qMA9vgD3I7J2S9iUAcqXKx3OBo,7104
 workbench/utils/extract_model_artifact.py,sha256=sFwkJd5mfJ1PU37pIHVmUIQS-taIUJdqi3D9-qRmy8g,7870
-workbench/utils/fast_inference.py,sha256=Sm0EV1oPsYYGqiDBVUu3Nj6Ti68JV-UR2S0ZliBDPTk,6148
 workbench/utils/glue_utils.py,sha256=dslfXQcJ4C-mGmsD6LqeK8vsXBez570t3fZBVZLV7HA,2039
 workbench/utils/graph_utils.py,sha256=T4aslYVbzPmFe0_qKCQP6PZnaw1KATNXQNVO-yDGBxY,10839
 workbench/utils/ipython_utils.py,sha256=skbdbBwUT-iuY3FZwy3ACS7-FWSe9M2qVXfLlQWnikE,700
@@ -220,7 +219,7 @@ workbench/utils/lambda_utils.py,sha256=7GhGRPyXn9o-toWb9HBGSnI8-DhK9YRkwhCSk_mNK
 workbench/utils/license_manager.py,sha256=sDuhk1mZZqUbFmnuFXehyGnui_ALxrmYBg7gYwoo7ho,6975
 workbench/utils/log_utils.py,sha256=7n1NJXO_jUX82e6LWAQug6oPo3wiPDBYsqk9gsYab_A,3167
 workbench/utils/markdown_utils.py,sha256=4lEqzgG4EVmLcvvKKNUwNxVCySLQKJTJmWDiaDroI1w,8306
-workbench/utils/model_utils.py,sha256=
+workbench/utils/model_utils.py,sha256=7TYxTa2KCoLJfJ47QcnzmibMwKHX3bP37-sPvfqgdVM,12273
 workbench/utils/monitor_utils.py,sha256=kVaJ7BgUXs3VPMFYfLC03wkIV4Dq-pEhoXS0wkJFxCc,7858
 workbench/utils/pandas_utils.py,sha256=uTUx-d1KYfjbS9PMQp2_9FogCV7xVZR6XLzU5YAGmfs,39371
 workbench/utils/performance_utils.py,sha256=WDNvz-bOdC99cDuXl0urAV4DJ7alk_V3yzKPwvqgST4,1329
@@ -247,8 +246,8 @@ workbench/utils/xgboost_model_utils.py,sha256=iiDJH0O81aO6aOTwgssqQygvTgjE7lRDRz
 workbench/utils/chem_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 workbench/utils/chem_utils/fingerprints.py,sha256=Qvs8jaUwguWUq3Q3j695MY0t0Wk3BvroW-oWBwalMUo,5255
 workbench/utils/chem_utils/misc.py,sha256=Nevf8_opu-uIPrv_1_0ubuFVVo2_fGUkMoLAHB3XAeo,7372
-workbench/utils/chem_utils/mol_descriptors.py,sha256=
-workbench/utils/chem_utils/mol_standardize.py,sha256
+workbench/utils/chem_utils/mol_descriptors.py,sha256=c8gkHZ-8s3HJaW9zN9pnYGK7YVW8Y0xFqQ1G_ysrF2Y,18789
+workbench/utils/chem_utils/mol_standardize.py,sha256=qPLCdVMSXMOWN-01O1isg2zq7eQyFAI0SNatHkRq1uw,17524
 workbench/utils/chem_utils/mol_tagging.py,sha256=8Bt6gHvyN8B2jvVuz12JgYMHVLDkCLnEPAfqkyMEoMc,9995
 workbench/utils/chem_utils/projections.py,sha256=smV-VTB-pqRrgn4DXyDIpuCYcopJdPZ54YoCQv60JY0,7480
 workbench/utils/chem_utils/salts.py,sha256=ZzFb6Z71Z_kMjVF-PKwHx0fn9pN9rPMj-oEY8Nt5JWA,9095
@@ -288,9 +287,9 @@ workbench/web_interface/page_views/main_page.py,sha256=X4-KyGTKLAdxR-Zk2niuLJB2Y
 workbench/web_interface/page_views/models_page_view.py,sha256=M0bdC7bAzLyIaE2jviY12FF4abdMFZmg6sFuOY_LaGI,2650
 workbench/web_interface/page_views/page_view.py,sha256=Gh6YnpOGlUejx-bHZAf5pzqoQ1H1R0OSwOpGhOBO06w,455
 workbench/web_interface/page_views/pipelines_page_view.py,sha256=v2pxrIbsHBcYiblfius3JK766NZ7ciD2yPx0t3E5IJo,2656
-workbench-0.8.
-workbench-0.8.
-workbench-0.8.
-workbench-0.8.
-workbench-0.8.
-workbench-0.8.
+workbench-0.8.177.dist-info/licenses/LICENSE,sha256=z4QMMPlLJkZjU8VOKqJkZiQZCEZ--saIU2Z8-p3aVc0,1080
+workbench-0.8.177.dist-info/METADATA,sha256=sjKEEHLha3-tDo9uYsRtpjPTHV_pj5PkucHuc2WWxBM,9210
+workbench-0.8.177.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+workbench-0.8.177.dist-info/entry_points.txt,sha256=zPFPruY9uayk8-wsKrhfnIyIB6jvZOW_ibyllEIsLWo,356
+workbench-0.8.177.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
+workbench-0.8.177.dist-info/RECORD,,
workbench/utils/fast_inference.py
DELETED
@@ -1,167 +0,0 @@
-"""Fast Inference on SageMaker Endpoints"""
-
-import pandas as pd
-from io import StringIO
-import logging
-from concurrent.futures import ThreadPoolExecutor
-
-# Sagemaker Imports
-import sagemaker
-from sagemaker.serializers import CSVSerializer
-from sagemaker.deserializers import CSVDeserializer
-from sagemaker import Predictor
-
-log = logging.getLogger("workbench")
-
-_CACHED_SM_SESSION = None
-
-
-def get_or_create_sm_session():
-    global _CACHED_SM_SESSION
-    if _CACHED_SM_SESSION is None:
-        _CACHED_SM_SESSION = sagemaker.Session()
-    return _CACHED_SM_SESSION
-
-
-def fast_inference(endpoint_name: str, eval_df: pd.DataFrame, sm_session=None, threads: int = 4) -> pd.DataFrame:
-    """Run inference on the Endpoint using the provided DataFrame
-
-    Args:
-        endpoint_name (str): The name of the Endpoint
-        eval_df (pd.DataFrame): The DataFrame to run predictions on
-        sm_session (sagemaker.session.Session, optional): SageMaker Session. If None, a cached session is created.
-        threads (int): The number of threads to use (default: 4)
-
-    Returns:
-        pd.DataFrame: The DataFrame with predictions
-    """
-    # Use cached session if none is provided
-    if sm_session is None:
-        sm_session = get_or_create_sm_session()
-
-    predictor = Predictor(
-        endpoint_name,
-        sagemaker_session=sm_session,
-        serializer=CSVSerializer(),
-        deserializer=CSVDeserializer(),
-    )
-
-    total_rows = len(eval_df)
-
-    def process_chunk(chunk_df: pd.DataFrame, start_index: int) -> pd.DataFrame:
-        log.info(f"Processing {start_index}:{min(start_index + chunk_size, total_rows)} out of {total_rows} rows...")
-        csv_buffer = StringIO()
-        chunk_df.to_csv(csv_buffer, index=False)
-        response = predictor.predict(csv_buffer.getvalue())
-        # CSVDeserializer returns a nested list: first row is headers
-        return pd.DataFrame.from_records(response[1:], columns=response[0])
-
-    # Sagemaker has a connection pool limit of 10
-    if threads > 10:
-        log.warning("Sagemaker has a connection pool limit of 10. Reducing threads to 10.")
-        threads = 10
-
-    # Compute the chunk size (divide number of threads)
-    chunk_size = max(1, total_rows // threads)
-
-    # We also need to ensure that the chunk size is not too big
-    if chunk_size > 100:
-        chunk_size = 100
-
-    # Split DataFrame into chunks and process them concurrently
-    chunks = [(eval_df[i : i + chunk_size], i) for i in range(0, total_rows, chunk_size)]
-    with ThreadPoolExecutor(max_workers=threads) as executor:
-        df_list = list(executor.map(lambda p: process_chunk(*p), chunks))
-
-    combined_df = pd.concat(df_list, ignore_index=True)
-
-    # Convert the types of the dataframe
-    combined_df = df_type_conversions(combined_df)
-    return combined_df
-
-
-def df_type_conversions(df: pd.DataFrame) -> pd.DataFrame:
-    """Convert the types of the dataframe that we get from an endpoint
-
-    Args:
-        df (pd.DataFrame): DataFrame to convert
-
-    Returns:
-        pd.DataFrame: Converted DataFrame
-    """
-    # Some endpoints will put in "N/A" values (for CSV serialization)
-    # We need to convert these to NaN and then run the conversions below
-    # Report on the number of N/A values in each column in the DataFrame
-    # For any count above 0 list the column name and the number of N/A values
-    na_counts = df.isin(["N/A"]).sum()
-    for column, count in na_counts.items():
-        if count > 0:
-            log.warning(f"{column} has {count} N/A values, converting to NaN")
-    pd.set_option("future.no_silent_downcasting", True)
-    df = df.replace("N/A", float("nan"))
-
-    # Convert data to numeric
-    # Note: Since we're using CSV serializers numeric columns often get changed to generic 'object' types
-
-    # Hard Conversion
-    # Note: We explicitly catch exceptions for columns that cannot be converted to numeric
-    for column in df.columns:
-        try:
-            df[column] = pd.to_numeric(df[column])
-        except ValueError:
-            # If a ValueError is raised, the column cannot be converted to numeric, so we keep it as is
-            pass
-        except TypeError:
-            # This typically means a duplicated column name, so confirm duplicate (more than 1) and log it
-            column_count = (df.columns == column).sum()
-            log.critical(f"{column} occurs {column_count} times in the DataFrame.")
-            pass
-
-    # Soft Conversion
-    # Convert columns to the best possible dtype that supports the pd.NA missing value.
-    df = df.convert_dtypes()
-
-    # Convert pd.NA placeholders to pd.NA
-    # Note: CSV serialization converts pd.NA to blank strings, so we have to put in placeholders
-    df.replace("__NA__", pd.NA, inplace=True)
-
-    # Check for True/False values in the string columns
-    for column in df.select_dtypes(include=["string"]).columns:
-        if df[column].str.lower().isin(["true", "false"]).all():
-            df[column] = df[column].str.lower().map({"true": True, "false": False})
-
-    # Return the Dataframe
-    return df
-
-
-if __name__ == "__main__":
-    """Exercise the Endpoint Utilities"""
-    import time
-    from workbench.api.endpoint import Endpoint
-    from workbench.utils.endpoint_utils import fs_training_data, fs_evaluation_data
-
-    # Create an Endpoint
-    my_endpoint_name = "abalone-regression"
-    my_endpoint = Endpoint(my_endpoint_name)
-    if not my_endpoint.exists():
-        print(f"Endpoint {my_endpoint_name} does not exist.")
-        exit(1)
-
-    # Get the training data
-    my_train_df = fs_training_data(my_endpoint)
-    print(my_train_df)
-
-    # Run Fast Inference and time it
-    my_sm_session = my_endpoint.sm_session
-    my_eval_df = fs_evaluation_data(my_endpoint)
-    start_time = time.time()
-    my_results_df = fast_inference(my_endpoint_name, my_eval_df, my_sm_session)
-    end_time = time.time()
-    print(f"Fast Inference took {end_time - start_time} seconds")
-    print(my_results_df)
-    print(my_results_df.info())
-
-    # Test with no session
-    my_results_df = fast_inference(my_endpoint_name, my_eval_df)
-    print(my_results_df)
-    print(my_results_df.info())
{workbench-0.8.176.dist-info → workbench-0.8.177.dist-info}/WHEEL
File without changes
{workbench-0.8.176.dist-info → workbench-0.8.177.dist-info}/entry_points.txt
File without changes
{workbench-0.8.176.dist-info → workbench-0.8.177.dist-info}/licenses/LICENSE
File without changes
{workbench-0.8.176.dist-info → workbench-0.8.177.dist-info}/top_level.txt
File without changes