workbench 0.8.176-py3-none-any.whl → 0.8.178-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -22,7 +22,7 @@ from typing import List, Tuple
 
 # Template Placeholders
 TEMPLATE_PARAMS = {
-    "target": "udm_asy_res_value",
+    "target": "logs",
  "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 
'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
     "compressed_features": [],
     "train_all_data": True
@@ -242,7 +242,7 @@ if __name__ == "__main__":
     print(f"R2: {xgb_r2:.3f}")
 
     # Define confidence levels we want to model
-    confidence_levels = [0.50, 0.80, 0.90, 0.95]  # 50%, 80%, 90%, 95% confidence intervals
+    confidence_levels = [0.50, 0.68, 0.80, 0.90, 0.95]  # 50%, 68%, 80%, 90%, 95% confidence intervals
 
     # Store MAPIE models for each confidence level
     mapie_models = {}
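For context on how these levels are consumed: MAPIE expresses a confidence level as `alpha = 1 - confidence` at prediction time and returns paired lower/upper bounds. A minimal sketch under that assumption — `estimator`, `X_train`, `y_train`, and `X_test` are hypothetical names, not taken from this diff:

```python
from mapie.regression import MapieRegressor

# Sketch only: fit one MAPIE wrapper per confidence level.
mapie_models = {}
for conf_level in confidence_levels:
    mapie = MapieRegressor(estimator)  # estimator: the XGBoost regressor (assumed)
    mapie.fit(X_train, y_train)
    mapie_models[conf_level] = mapie

# predict() returns (point predictions, intervals); y_pis has shape
# (n_samples, 2, n_alpha), hence the y_pis[:, 0, 0] / y_pis[:, 1, 0]
# indexing seen in the predict_fn hunk below.
y_pred, y_pis = mapie_models[0.68].predict(X_test, alpha=1 - 0.68)
```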
@@ -459,6 +459,9 @@ def predict_fn(df, models) -> pd.DataFrame:
         if conf_level == 0.50:  # 50% CI
             df["q_25"] = y_pis[:, 0, 0]
             df["q_75"] = y_pis[:, 1, 0]
+        elif conf_level == 0.68:  # 68% CI
+            df["q_16"] = y_pis[:, 0, 0]
+            df["q_84"] = y_pis[:, 1, 0]
         elif conf_level == 0.80:  # 80% CI
             df["q_10"] = y_pis[:, 0, 0]
             df["q_90"] = y_pis[:, 1, 0]
@@ -472,23 +475,16 @@ def predict_fn(df, models) -> pd.DataFrame:
     # Add median (q_50) from XGBoost prediction
     df["q_50"] = df["prediction"]
 
-    # Calculate uncertainty metrics based on 95% interval
-    interval_width = df["q_975"] - df["q_025"]
-    df["prediction_std"] = interval_width / 3.92
+    # Calculate a pseudo-standard deviation from the 68% interval width
+    df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0
 
     # Reorder the quantile columns for easier reading
-    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    quantile_cols = ["q_025", "q_05", "q_10", "q_16", "q_25", "q_75", "q_84", "q_90", "q_95", "q_975"]
     other_cols = [col for col in df.columns if col not in quantile_cols]
     df = df[other_cols + quantile_cols]
 
-    # Uncertainty score
-    df["uncertainty_score"] = interval_width / (np.abs(df["prediction"]) + 1e-6)
-
-    # Confidence bands
-    df["confidence_band"] = pd.cut(
-        df["uncertainty_score"],
-        bins=[0, 0.5, 1.0, 2.0, np.inf],
-        labels=["high", "medium", "low", "very_low"]
-    )
+    # Adjust the outer quantiles to ensure they encompass the prediction
+    df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+    df["q_975"] = np.maximum(df["q_975"], df["prediction"])
 
     return df
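Both divisors come straight from normal quantiles: for a Gaussian, the central 95% interval spans 2 × 1.96σ = 3.92σ, while the central 68% interval (q_16 to q_84) spans roughly 2σ. A quick sanity check (assumes scipy is available):

```python
from scipy.stats import norm

# Central interval widths for a standard normal (sigma = 1):
print(norm.ppf(0.975) - norm.ppf(0.025))  # ~3.92, the old divisor
print(norm.ppf(0.84) - norm.ppf(0.16))    # ~1.99, the new divisor of 2.0
```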
@@ -28,11 +28,11 @@ from typing import List, Tuple
 
 # Template Parameters
 TEMPLATE_PARAMS = {
-    "model_type": "classifier",
-    "target": "class",
+    "model_type": "regressor",
+    "target": "udm_asy_res_value",
  "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 
'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
     "compressed_features": [],
-    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/sol-class-f1-100/training",
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/pka-a1-reg-0-nightly-100-test/training",
     "train_all_data": True
 }
 
@@ -13,12 +13,13 @@ cm = ConfigManager()
 workbench_bucket = cm.get_config("WORKBENCH_BUCKET")
 
 
-def submit_to_sqs(script_path: str, size: str = "small") -> None:
+def submit_to_sqs(script_path: str, size: str = "small", realtime: bool = False) -> None:
     """
     Upload script to S3 and submit message to SQS queue for processing.
     Args:
         script_path: Local path to the ML pipeline script
         size: Job size tier - "small" (default), "medium", or "large"
+        realtime: If True, sets serverless=False for real-time processing (default: False, meaning serverless=True)
     """
     print(f"\n{'=' * 60}")
     print("🚀 SUBMITTING ML PIPELINE JOB")
@@ -33,6 +34,7 @@ def submit_to_sqs(script_path: str, size: str = "small") -> None:
 
     print(f"📄 Script: {script_file.name}")
     print(f"📏 Size tier: {size}")
+    print(f"⚡ Mode: {'Real-time' if realtime else 'Serverless'} (serverless={'False' if realtime else 'True'})")
     print(f"🪣 Bucket: {workbench_bucket}")
     sqs = AWSAccountClamp().boto3_session.client("sqs")
     script_name = script_file.name
@@ -88,6 +90,10 @@ def submit_to_sqs(script_path: str, size: str = "small") -> None:
 
     # Prepare message
     message = {"script_path": s3_path, "size": size}
+
+    # Set serverless environment variable (defaults to True, False if --realtime)
+    message["environment"] = {"SERVERLESS": "False" if realtime else "True"}
+
     print("\n📨 Sending message to SQS...")
 
     # Send the message to SQS
@@ -110,6 +116,7 @@ def submit_to_sqs(script_path: str, size: str = "small") -> None:
     print(f"{'=' * 60}")
     print(f"📄 Script: {script_name}")
     print(f"📏 Size: {size}")
+    print(f"⚡ Mode: {'Real-time' if realtime else 'Serverless'} (SERVERLESS={'False' if realtime else 'True'})")
     print(f"🆔 Message ID: {message_id}")
     print("\n🔍 MONITORING LOCATIONS:")
     print(f"  • SQS Queue: AWS Console → SQS → {queue_name}")
@@ -126,9 +133,14 @@ def main():
     parser.add_argument(
         "--size", default="small", choices=["small", "medium", "large"], help="Job size tier (default: small)"
     )
+    parser.add_argument(
+        "--realtime",
+        action="store_true",
+        help="Run in real-time mode (sets serverless=False). Default is serverless mode (serverless=True)",
+    )
     args = parser.parse_args()
     try:
-        submit_to_sqs(args.script_file, args.size)
+        submit_to_sqs(args.script_file, args.size, realtime=args.realtime)
     except Exception as e:
         print(f"\n❌ ERROR: {e}")
         log.error(f"Error: {e}")
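As a hedged usage sketch (the submitter script's filename isn't shown in this diff), the new flag flows from the CLI through to the SQS message:

```python
# CLI (hypothetical script name):
#   python submit_to_sqs.py my_pipeline.py --size medium --realtime
# Equivalent direct call; per the code above, this sends
# {"script_path": <s3 path>, "size": "medium",
#  "environment": {"SERVERLESS": "False"}} to the queue.
submit_to_sqs("my_pipeline.py", size="medium", realtime=True)
```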
@@ -91,16 +91,27 @@ import logging
 import pandas as pd
 import numpy as np
 import re
+import time
+from contextlib import contextmanager
 from rdkit import Chem
 from rdkit.Chem import Descriptors, rdCIPLabeler
 from rdkit.ML.Descriptors import MoleculeDescriptors
 from mordred import Calculator as MordredCalculator
 from mordred import AcidBase, Aromatic, Constitutional, Chi, CarbonTypes
 
+
 logger = logging.getLogger("workbench")
 logger.setLevel(logging.DEBUG)
 
 
+# Helper context manager for timing
+@contextmanager
+def timer(name):
+    start = time.time()
+    yield
+    print(f"{name}: {time.time() - start:.2f}s")
+
+
 def compute_stereochemistry_features(mol):
     """
     Compute stereochemistry descriptors using modern RDKit methods.
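A usage note on the `timer` helper (illustrative call, not part of the diff): it wraps any block and prints elapsed wall-clock time.

```python
# Hypothetical timing of the descriptor computation defined in this script:
with timer("compute_descriptors"):
    features_df = compute_descriptors(df, include_mordred=True)
```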
@@ -280,9 +291,11 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
             descriptor_values.append([np.nan] * len(all_descriptors))
 
     # Create RDKit features DataFrame
-    rdkit_features_df = pd.DataFrame(descriptor_values, columns=calc.GetDescriptorNames(), index=result.index)
+    rdkit_features_df = pd.DataFrame(descriptor_values, columns=calc.GetDescriptorNames())
 
     # Add RDKit features to result
+    # Remove any columns from result that exist in rdkit_features_df
+    result = result.drop(columns=result.columns.intersection(rdkit_features_df.columns))
     result = pd.concat([result, rdkit_features_df], axis=1)
 
     # Compute Mordred descriptors
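The drop-then-concat pattern introduced here (and in the Mordred and stereochemistry hunks below) prevents pandas from producing duplicate column labels when a feature is recomputed. A self-contained illustration with made-up values:

```python
import pandas as pd

result = pd.DataFrame({"smiles": ["CCO"], "molwt": [0.0]})    # stale molwt
features = pd.DataFrame({"molwt": [46.07], "tpsa": [20.23]})  # fresh values

# Without the drop, concat would keep both copies of "molwt"
result = result.drop(columns=result.columns.intersection(features.columns))
result = pd.concat([result, features], axis=1)
print(list(result.columns))  # ['smiles', 'molwt', 'tpsa']
```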
@@ -299,7 +312,7 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
 
     # Compute Mordred descriptors
     valid_mols = [mol if mol is not None else Chem.MolFromSmiles("C") for mol in molecules]
-    mordred_df = calc.pandas(valid_mols, nproc=1)  # For serverless, use nproc=1
+    mordred_df = calc.pandas(valid_mols, nproc=1)  # Endpoint multiprocessing will fail with nproc>1
 
     # Replace values for invalid molecules with NaN
     for i, mol in enumerate(molecules):
@@ -310,10 +323,9 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
     for col in mordred_df.columns:
         mordred_df[col] = pd.to_numeric(mordred_df[col], errors="coerce")
 
-    # Set index to match result DataFrame
-    mordred_df.index = result.index
-
     # Add Mordred features to result
+    # Remove any columns from result that exist in mordred
+    result = result.drop(columns=result.columns.intersection(mordred_df.columns))
     result = pd.concat([result, mordred_df], axis=1)
 
     # Compute stereochemistry features if requested
@@ -326,9 +338,10 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
             stereo_features.append(stereo_dict)
 
     # Create stereochemistry DataFrame
-    stereo_df = pd.DataFrame(stereo_features, index=result.index)
+    stereo_df = pd.DataFrame(stereo_features)
 
     # Add stereochemistry features to result
+    result = result.drop(columns=result.columns.intersection(stereo_df.columns))
     result = pd.concat([result, stereo_df], axis=1)
 
     logger.info(f"Added {len(stereo_df.columns)} stereochemistry descriptors")
@@ -357,7 +370,6 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
 
 
 if __name__ == "__main__":
-    import time
     from mol_standardize import standardize
     from workbench.api import DataSource
 
@@ -81,6 +81,8 @@ Usage:
 import logging
 from typing import Optional, Tuple
 import pandas as pd
+import time
+from contextlib import contextmanager
 from rdkit import Chem
 from rdkit.Chem import Mol
 from rdkit.Chem.MolStandardize import rdMolStandardize
@@ -90,6 +92,14 @@ log = logging.getLogger("workbench")
 RDLogger.DisableLog("rdApp.warning")
 
 
+# Helper context manager for timing
+@contextmanager
+def timer(name):
+    start = time.time()
+    yield
+    print(f"{name}: {time.time() - start:.2f}s")
+
+
 class MolStandardizer:
     """
     Streamlined molecular standardizer for ADMET preprocessing
@@ -116,6 +126,7 @@ class MolStandardizer:
         Pipeline:
         1. Cleanup (remove Hs, disconnect metals, normalize)
         2. Get largest fragment (optional - only if remove_salts=True)
+        2a. Extract salt information BEFORE further modifications
         3. Neutralize charges
         4. Canonicalize tautomer (optional)
 
@@ -130,18 +141,24 @@ class MolStandardizer:
 
         try:
             # Step 1: Cleanup
-            mol = rdMolStandardize.Cleanup(mol, self.params)
-            if mol is None:
+            cleaned_mol = rdMolStandardize.Cleanup(mol, self.params)
+            if cleaned_mol is None:
                 return None, None
 
+            # If not doing any transformations, return early
+            if not self.remove_salts and not self.canonicalize_tautomer:
+                return cleaned_mol, None
+
             salt_smiles = None
+            mol = cleaned_mol
 
             # Step 2: Fragment handling (conditional based on remove_salts)
             if self.remove_salts:
-                # Get parent molecule and extract salt information
-                parent_mol = rdMolStandardize.FragmentParent(mol, self.params)
+                # Get parent molecule
+                parent_mol = rdMolStandardize.FragmentParent(cleaned_mol, self.params)
                 if parent_mol:
-                    salt_smiles = self._extract_salt(mol, parent_mol)
+                    # Extract salt BEFORE any modifications to parent
+                    salt_smiles = self._extract_salt(cleaned_mol, parent_mol)
                     mol = parent_mol
                 else:
                     return None, None
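For readers unfamiliar with the RDKit calls involved, a minimal, illustrative sketch of the Cleanup → FragmentParent sequence this pipeline builds on (the example molecule is ours, not from the package):

```python
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize

# Sodium acetate: FragmentParent keeps the largest organic fragment,
# so the [Na+] counterion is what _extract_salt would report.
mol = Chem.MolFromSmiles("CC(=O)[O-].[Na+]")
cleaned = rdMolStandardize.Cleanup(mol)
parent = rdMolStandardize.FragmentParent(cleaned)
print(Chem.MolToSmiles(parent))  # the acetate fragment, counterion removed
```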
@@ -153,7 +170,7 @@ class MolStandardizer:
             if mol is None:
                 return None, salt_smiles
 
-            # Step 4: Canonicalize tautomer
+            # Step 4: Canonicalize tautomer (LAST STEP)
             if self.canonicalize_tautomer:
                 mol = self.tautomer_enumerator.Canonicalize(mol)
 
@@ -172,13 +189,22 @@ class MolStandardizer:
         - Mixtures: multiple large neutral organic fragments
 
         Args:
-            orig_mol: Original molecule (before FragmentParent)
-            parent_mol: Parent molecule (after FragmentParent)
+            orig_mol: Original molecule (after Cleanup, before FragmentParent)
+            parent_mol: Parent molecule (after FragmentParent, before tautomerization)
 
         Returns:
             SMILES string of salt components or None if no salts/mixture detected
         """
         try:
+            # Quick atom count check
+            if orig_mol.GetNumAtoms() == parent_mol.GetNumAtoms():
+                return None
+
+            # Quick heavy atom difference check
+            heavy_diff = orig_mol.GetNumHeavyAtoms() - parent_mol.GetNumHeavyAtoms()
+            if heavy_diff <= 0:
+                return None
+
             # Get all fragments from original molecule
             orig_frags = Chem.GetMolFrags(orig_mol, asMols=True)
 
@@ -268,7 +294,7 @@ def standardize(
     if "orig_smiles" not in result.columns:
         result["orig_smiles"] = result[smiles_column]
 
-    # Initialize standardizer with salt removal control
+    # Initialize standardizer
     standardizer = MolStandardizer(canonicalize_tautomer=canonicalize_tautomer, remove_salts=extract_salts)
 
     def process_smiles(smiles: str) -> pd.Series:
@@ -286,6 +312,11 @@ def standardize(
             log.error("Encountered missing or empty SMILES string")
             return pd.Series({"smiles": None, "salt": None})
 
+        # Early check for unreasonably long SMILES
+        if len(smiles) > 1000:
+            log.error(f"SMILES too long ({len(smiles)} chars): {smiles[:50]}...")
+            return pd.Series({"smiles": None, "salt": None})
+
         # Parse molecule
         mol = Chem.MolFromSmiles(smiles)
         if mol is None:
@@ -299,7 +330,9 @@ def standardize(
         if std_mol is not None:
             # Check if molecule is reasonable
             if std_mol.GetNumAtoms() == 0 or std_mol.GetNumAtoms() > 200:  # Arbitrary limits
-                log.error(f"Unusual molecule size: {std_mol.GetNumAtoms()} atoms")
+                log.error(f"Rejecting molecule size: {std_mol.GetNumAtoms()} atoms")
+                log.error(f"Original SMILES: {smiles}")
+                return pd.Series({"smiles": None, "salt": salt_smiles})
 
         if std_mol is None:
             return pd.Series(
@@ -325,8 +358,11 @@ def standardize(
 
 
 if __name__ == "__main__":
-    import time
-    from workbench.api import DataSource
+
+    # Pandas display options for better readability
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 1000)
+    pd.set_option("display.max_colwidth", 100)
 
     # Test with DataFrame including various salt forms
     test_data = pd.DataFrame(
@@ -362,67 +398,53 @@ if __name__ == "__main__":
     )
 
     # General test
+    print("Testing standardization with full dataset...")
     standardize(test_data)
 
     # Remove the last two rows to avoid errors with None and INVALID
     test_data = test_data.iloc[:-2].reset_index(drop=True)
 
     # Test WITHOUT salt removal (keeps full molecule)
-    print("\nStandardization KEEPING salts (extract_salts=False):")
-    print("This preserves the full molecule including counterions")
+    print("\nStandardization KEEPING salts (extract_salts=False) Tautomerization: True")
     result_keep = standardize(test_data, extract_salts=False, canonicalize_tautomer=True)
-    display_cols = ["compound_id", "orig_smiles", "smiles", "salt"]
-    print(result_keep[display_cols].to_string())
+    display_order = ["compound_id", "orig_smiles", "smiles", "salt"]
+    print(result_keep[display_order])
 
     # Test WITH salt removal
     print("\n" + "=" * 70)
     print("Standardization REMOVING salts (extract_salts=True):")
-    print("This extracts parent molecule and records salt information")
     result_remove = standardize(test_data, extract_salts=True, canonicalize_tautomer=True)
-    print(result_remove[display_cols].to_string())
+    print(result_remove[display_order])
 
-    # Test WITHOUT tautomerization (keeping salts)
+    # Test with problematic cases specifically
     print("\n" + "=" * 70)
-    print("Standardization KEEPING salts, NO tautomerization:")
-    result_no_taut = standardize(test_data, extract_salts=False, canonicalize_tautomer=False)
-    print(result_no_taut[display_cols].to_string())
+    print("Testing specific problematic cases:")
+    problem_cases = pd.DataFrame(
+        {
+            "smiles": [
+                "CC(=O)O.CCN",  # Should extract CC(=O)O as salt
+                "CCO.CC",  # Should return CC as salt
+            ],
+            "compound_id": ["TEST_C002", "TEST_C005"],
+        }
+    )
+
+    problem_result = standardize(problem_cases, extract_salts=True, canonicalize_tautomer=True)
+    print(problem_result[display_order])
+
+    # Performance test with larger dataset
+    from workbench.api import DataSource
 
-    # Show the difference for salt-containing molecules
-    print("\n" + "=" * 70)
-    print("Comparison showing differences:")
-    for idx, row in result_keep.iterrows():
-        keep_smiles = row["smiles"]
-        remove_smiles = result_remove.loc[idx, "smiles"]
-        no_taut_smiles = result_no_taut.loc[idx, "smiles"]
-        salt = result_remove.loc[idx, "salt"]
-
-        # Show differences when they exist
-        if keep_smiles != remove_smiles or keep_smiles != no_taut_smiles:
-            print(f"\n{row['compound_id']} ({row['orig_smiles']}):")
-            if keep_smiles != no_taut_smiles:
-                print(f"  With salt + taut: {keep_smiles}")
-                print(f"  With salt, no taut: {no_taut_smiles}")
-            if keep_smiles != remove_smiles:
-                print(f"  Parent only + taut: {remove_smiles}")
-            if salt:
-                print(f"  Extracted salt: {salt}")
-
-    # Summary statistics
     print("\n" + "=" * 70)
-    print("Summary:")
-    print(f"Total molecules: {len(result_remove)}")
-    print(f"Molecules with salts: {result_remove['salt'].notna().sum()}")
-    unique_salts = result_remove["salt"].dropna().unique()
-    print(f"Unique salts found: {unique_salts[:5].tolist()}")
 
-    # Get a real dataset from Workbench and time the standardization
     ds = DataSource("aqsol_data")
-    df = ds.pull_dataframe()[["id", "smiles"]]
-    start_time = time.time()
-    std_df = standardize(df, extract_salts=True, canonicalize_tautomer=True)
-    end_time = time.time()
-    print(f"\nStandardized {len(std_df)} molecules from Workbench in {end_time - start_time:.2f} seconds")
-    print(std_df.head())
-    print(f"Molecules with salts: {std_df['salt'].notna().sum()}")
-    unique_salts = std_df["salt"].dropna().unique()
-    print(f"Unique salts found: {unique_salts[:5].tolist()}")
+    df = ds.pull_dataframe()[["id", "smiles"]][:1000]
+
+    for tautomer in [True, False]:
+        for extract in [True, False]:
+            print(f"Performance test with AQSol dataset: tautomer={tautomer} extract_salts={extract}:")
+            start_time = time.time()
+            std_df = standardize(df, canonicalize_tautomer=tautomer, extract_salts=extract)
+            elapsed = time.time() - start_time
+            mol_per_sec = len(df) / elapsed
+            print(f"{elapsed:.2f}s ({mol_per_sec:.0f} mol/s)")
@@ -222,32 +222,40 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
         lower_95, upper_95 = df["q_025"], df["q_975"]
         lower_90, upper_90 = df["q_05"], df["q_95"]
         lower_80, upper_80 = df["q_10"], df["q_90"]
+        lower_68, upper_68 = df["q_16"], df["q_84"]
         lower_50, upper_50 = df["q_25"], df["q_75"]
     elif "prediction_std" in df.columns:
         lower_95 = df["prediction"] - 1.96 * df["prediction_std"]
         upper_95 = df["prediction"] + 1.96 * df["prediction_std"]
+        lower_90 = df["prediction"] - 1.645 * df["prediction_std"]
+        upper_90 = df["prediction"] + 1.645 * df["prediction_std"]
+        lower_80 = df["prediction"] - 1.282 * df["prediction_std"]
+        upper_80 = df["prediction"] + 1.282 * df["prediction_std"]
+        lower_68 = df["prediction"] - 1.0 * df["prediction_std"]
+        upper_68 = df["prediction"] + 1.0 * df["prediction_std"]
         lower_50 = df["prediction"] - 0.674 * df["prediction_std"]
         upper_50 = df["prediction"] + 0.674 * df["prediction_std"]
     else:
         raise ValueError(
             "Either quantile columns (q_025, q_975, q_25, q_75) or 'prediction_std' column must be present."
         )
+    avg_std = df["prediction_std"].mean()
+    median_std = df["prediction_std"].median()
     coverage_95 = np.mean((df[target_col] >= lower_95) & (df[target_col] <= upper_95))
     coverage_90 = np.mean((df[target_col] >= lower_90) & (df[target_col] <= upper_90))
     coverage_80 = np.mean((df[target_col] >= lower_80) & (df[target_col] <= upper_80))
+    coverage_68 = np.mean((df[target_col] >= lower_68) & (df[target_col] <= upper_68))
     coverage_50 = np.mean((df[target_col] >= lower_50) & (df[target_col] <= upper_50))
     avg_width_95 = np.mean(upper_95 - lower_95)
     avg_width_90 = np.mean(upper_90 - lower_90)
     avg_width_80 = np.mean(upper_80 - lower_80)
     avg_width_50 = np.mean(upper_50 - lower_50)
+    avg_width_68 = np.mean(upper_68 - lower_68)
 
     # --- CRPS (measures calibration + sharpness) ---
-    if "prediction_std" in df.columns:
-        z = (df[target_col] - df["prediction"]) / df["prediction_std"]
-        crps = df["prediction_std"] * (z * (2 * norm.cdf(z) - 1) + 2 * norm.pdf(z) - 1 / np.sqrt(np.pi))
-        mean_crps = np.mean(crps)
-    else:
-        mean_crps = np.nan
+    z = (df[target_col] - df["prediction"]) / df["prediction_std"]
+    crps = df["prediction_std"] * (z * (2 * norm.cdf(z) - 1) + 2 * norm.pdf(z) - 1 / np.sqrt(np.pi))
+    mean_crps = np.mean(crps)
 
     # --- Interval Score @ 95% (penalizes miscoverage) ---
     alpha_95 = 0.05
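For reference, the CRPS line above is the standard closed form for a Gaussian predictive distribution (Gneiting & Raftery), with $z = (y - \mu)/\sigma$:

$$\mathrm{CRPS}\bigl(\mathcal{N}(\mu,\sigma^2),\,y\bigr) = \sigma\left[z\bigl(2\Phi(z)-1\bigr) + 2\varphi(z) - \frac{1}{\sqrt{\pi}}\right]$$

where $\Phi$ and $\varphi$ are the standard normal CDF and PDF; the code's `norm.cdf` and `norm.pdf` terms map one-to-one. Note that the refactor now computes CRPS unconditionally, which assumes the incoming dataframe always carries `prediction_std` — true once the updated `predict_fn` above writes it from the 68% interval.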
@@ -265,27 +273,37 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
 
     # Collect results
     results = {
-        "coverage_95": coverage_95,
-        "coverage_90": coverage_90,
-        "coverage_80": coverage_80,
         "coverage_50": coverage_50,
-        "avg_width_95": avg_width_95,
+        "coverage_68": coverage_68,
+        "coverage_80": coverage_80,
+        "coverage_90": coverage_90,
+        "coverage_95": coverage_95,
+        "median_std": median_std,
+        "avg_std": avg_std,
         "avg_width_50": avg_width_50,
-        "crps": mean_crps,
-        "interval_score_95": mean_is_95,
-        "adaptive_calibration": adaptive_calibration,
+        "avg_width_68": avg_width_68,
+        "avg_width_80": avg_width_80,
+        "avg_width_90": avg_width_90,
+        "avg_width_95": avg_width_95,
+        # "crps": mean_crps,
+        # "interval_score_95": mean_is_95,
+        # "adaptive_calibration": adaptive_calibration,
         "n_samples": len(df),
     }
 
     print("\n=== UQ Metrics ===")
-    print(f"Coverage @ 95%: {coverage_95:.3f} (target: 0.95)")
-    print(f"Coverage @ 90%: {coverage_90:.3f} (target: 0.90)")
-    print(f"Coverage @ 80%: {coverage_80:.3f} (target: 0.80)")
     print(f"Coverage @ 50%: {coverage_50:.3f} (target: 0.50)")
-    print(f"Average 95% Width: {avg_width_95:.3f}")
-    print(f"Average 90% Width: {avg_width_90:.3f}")
-    print(f"Average 80% Width: {avg_width_80:.3f}")
+    print(f"Coverage @ 68%: {coverage_68:.3f} (target: 0.68)")
+    print(f"Coverage @ 80%: {coverage_80:.3f} (target: 0.80)")
+    print(f"Coverage @ 90%: {coverage_90:.3f} (target: 0.90)")
+    print(f"Coverage @ 95%: {coverage_95:.3f} (target: 0.95)")
+    print(f"Median Prediction StdDev: {median_std:.3f}")
+    print(f"Avg Prediction StdDev: {avg_std:.3f}")
     print(f"Average 50% Width: {avg_width_50:.3f}")
+    print(f"Average 68% Width: {avg_width_68:.3f}")
+    print(f"Average 80% Width: {avg_width_80:.3f}")
+    print(f"Average 90% Width: {avg_width_90:.3f}")
+    print(f"Average 95% Width: {avg_width_95:.3f}")
     print(f"CRPS: {mean_crps:.3f} (lower is better)")
     print(f"Interval Score 95%: {mean_is_95:.3f} (lower is better)")
     print(f"Adaptive Calibration: {adaptive_calibration:.3f} (higher is better, target: >0.5)")
@@ -325,9 +343,3 @@ if __name__ == "__main__":
     df = end.auto_inference(capture=True)
     results = uq_metrics(df, target_col="solubility")
     print(results)
-
-    # Test the uq_metrics function
-    end = Endpoint("aqsol-uq-100")
-    df = end.auto_inference(capture=True)
-    results = uq_metrics(df, target_col="solubility")
-    print(results)
@@ -259,7 +259,7 @@ def cross_fold_inference(workbench_model: Any, nfolds: int = 5) -> Dict[str, Any]:
     xgb_model._Booster = loaded_booster
     # Prepare data
     fs = FeatureSet(workbench_model.get_input())
-    df = fs.pull_dataframe()
+    df = fs.view("training").pull_dataframe()
     feature_cols = workbench_model.features()
     # Convert string features to categorical
     for col in feature_cols: