workbench 0.8.198__py3-none-any.whl → 0.8.203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53):
  1. workbench/algorithms/dataframe/proximity.py +11 -4
  2. workbench/api/__init__.py +2 -1
  3. workbench/api/df_store.py +17 -108
  4. workbench/api/feature_set.py +48 -11
  5. workbench/api/model.py +1 -1
  6. workbench/api/parameter_store.py +3 -52
  7. workbench/core/artifacts/__init__.py +11 -2
  8. workbench/core/artifacts/artifact.py +5 -5
  9. workbench/core/artifacts/df_store_core.py +114 -0
  10. workbench/core/artifacts/endpoint_core.py +261 -78
  11. workbench/core/artifacts/feature_set_core.py +69 -1
  12. workbench/core/artifacts/model_core.py +48 -14
  13. workbench/core/artifacts/parameter_store_core.py +98 -0
  14. workbench/core/transforms/features_to_model/features_to_model.py +50 -33
  15. workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
  16. workbench/core/views/view.py +2 -2
  17. workbench/model_scripts/chemprop/chemprop.template +933 -0
  18. workbench/model_scripts/chemprop/generated_model_script.py +933 -0
  19. workbench/model_scripts/chemprop/requirements.txt +11 -0
  20. workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
  21. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  22. workbench/model_scripts/custom_models/proximity/proximity.py +11 -4
  23. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +11 -5
  24. workbench/model_scripts/custom_models/uq_models/meta_uq.template +11 -5
  25. workbench/model_scripts/custom_models/uq_models/ngboost.template +11 -5
  26. workbench/model_scripts/custom_models/uq_models/proximity.py +11 -4
  27. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +11 -5
  28. workbench/model_scripts/pytorch_model/generated_model_script.py +365 -173
  29. workbench/model_scripts/pytorch_model/pytorch.template +362 -170
  30. workbench/model_scripts/scikit_learn/generated_model_script.py +302 -0
  31. workbench/model_scripts/script_generation.py +10 -7
  32. workbench/model_scripts/uq_models/generated_model_script.py +43 -27
  33. workbench/model_scripts/uq_models/mapie.template +40 -24
  34. workbench/model_scripts/xgb_model/generated_model_script.py +36 -7
  35. workbench/model_scripts/xgb_model/xgb_model.template +36 -7
  36. workbench/repl/workbench_shell.py +14 -5
  37. workbench/resources/open_source_api.key +1 -1
  38. workbench/scripts/endpoint_test.py +162 -0
  39. workbench/scripts/{lambda_launcher.py → lambda_test.py} +10 -0
  40. workbench/utils/chemprop_utils.py +761 -0
  41. workbench/utils/pytorch_utils.py +527 -0
  42. workbench/utils/xgboost_model_utils.py +10 -5
  43. workbench/web_interface/components/model_plot.py +7 -1
  44. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/METADATA +3 -3
  45. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/RECORD +49 -43
  46. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/entry_points.txt +2 -1
  47. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  48. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
  49. workbench/model_scripts/__pycache__/script_generation.cpython-312.pyc +0 -0
  50. workbench/model_scripts/__pycache__/script_generation.cpython-313.pyc +0 -0
  51. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/WHEEL +0 -0
  52. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/licenses/LICENSE +0 -0
  53. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,11 @@
1
+ # Requirements for ChemProp model scripts
2
+ # Note: These are the local dev requirements. The Docker images have their own requirements.txt
3
+ chemprop==2.2.1
4
+ rdkit==2025.9.1
5
+ torch>=2.0.0
6
+ lightning>=2.0.0
7
+ pandas>=2.0.0
8
+ numpy>=1.24.0
9
+ scikit-learn>=1.3.0
10
+ awswrangler>=3.0.0
11
+ joblib>=1.3.0
@@ -0,0 +1,134 @@
1
+ """Molecular fingerprint computation utilities"""
2
+
3
+ import logging
4
+ import pandas as pd
5
+
6
+ # Molecular Descriptor Imports
7
+ from rdkit import Chem
8
+ from rdkit.Chem import rdFingerprintGenerator
9
+ from rdkit.Chem.MolStandardize import rdMolStandardize
10
+
11
+ # Set up the logger
12
+ log = logging.getLogger("workbench")
13
+
14
+
15
def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=True) -> pd.DataFrame:
    """Compute and add Morgan fingerprints to the DataFrame.

    Args:
        df (pd.DataFrame): Input DataFrame containing SMILES strings (column name
            matched case-insensitively). NOTE: the DataFrame is modified in place —
            a 'fingerprint' column is added, and rows whose SMILES fail to parse
            are dropped from the returned frame.
        radius (int): Radius for the Morgan fingerprint.
        n_bits (int): Number of bits for the fingerprint. When counts=True the
            effective fingerprint size is n_bits * 4 (count simulation).
        counts (bool): Enable count simulation for the fingerprint.

    Returns:
        pd.DataFrame: The input DataFrame with the Morgan fingerprints added as bit strings.

    Raises:
        ValueError: If no 'smiles' column (case-insensitive) is present.

    Note:
        See: https://greglandrum.github.io/rdkit-blog/posts/2021-07-06-simulating-counts.html
    """
    delete_mol_column = False

    # Check for the SMILES column (case-insensitive)
    smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
    if smiles_column is None:
        raise ValueError("Input DataFrame must have a 'smiles' column")

    # Sanity check the molecule column (sometimes it gets serialized, which doesn't work)
    if "molecule" in df.columns and df["molecule"].dtype == "string":
        log.warning("Detected serialized molecules in 'molecule' column. Removing...")
        del df["molecule"]

    # Convert SMILES to RDKit molecule objects (vectorized)
    if "molecule" not in df.columns:
        log.info("Converting SMILES to RDKit Molecules...")
        delete_mol_column = True
        df["molecule"] = df[smiles_column].apply(Chem.MolFromSmiles)
        # Make sure our molecules are not None
        failed_smiles = df[df["molecule"].isnull()][smiles_column].tolist()
        if failed_smiles:
            log.error(f"Failed to convert the following SMILES to molecules: {failed_smiles}")
        df = df.dropna(subset=["molecule"])

    # If we have fragments in our compounds, get the largest fragment before computing fingerprints.
    # Instantiate the chooser ONCE — constructing a LargestFragmentChooser per row inside the
    # lambda (as before) pays the object-construction cost for every molecule with no benefit.
    frag_chooser = rdMolStandardize.LargestFragmentChooser()
    largest_frags = df["molecule"].apply(lambda mol: frag_chooser.choose(mol) if mol else None)

    # Create a Morgan fingerprint generator
    if counts:
        n_bits *= 4  # Multiply by 4 to simulate counts
    morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits, countSimulation=counts)

    # Compute Morgan fingerprints (vectorized)
    fingerprints = largest_frags.apply(
        lambda mol: (morgan_generator.GetFingerprint(mol).ToBitString() if mol else pd.NA)
    )

    # Add the fingerprints to the DataFrame
    df["fingerprint"] = fingerprints

    # Drop the intermediate 'molecule' column if it was added
    if delete_mol_column:
        del df["molecule"]
    return df
75
+
76
+
77
if __name__ == "__main__":
    # Smoke-test driver for compute_morgan_fingerprints: run directly (not via pytest),
    # prints results rather than asserting. Requires rdkit to be installed.
    print("Running molecular fingerprint tests...")
    print("Note: This requires molecular_screening module to be available")

    # Test molecules: a small set covering common structural edge cases
    test_molecules = {
        "aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
        "caffeine": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
        "glucose": "C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O",  # With stereochemistry
        "sodium_acetate": "CC(=O)[O-].[Na+]",  # Salt
        "benzene": "c1ccccc1",
        "butene_e": "C/C=C/C",  # E-butene
        "butene_z": "C/C=C\\C",  # Z-butene
    }

    # Test 1: Morgan Fingerprints (default bit-vector mode, counts off)
    print("\n1. Testing Morgan fingerprint generation...")

    test_df = pd.DataFrame({"SMILES": list(test_molecules.values()), "name": list(test_molecules.keys())})

    fp_df = compute_morgan_fingerprints(test_df.copy(), radius=2, n_bits=512, counts=False)

    print("    Fingerprint generation results:")
    for _, row in fp_df.iterrows():
        fp = row.get("fingerprint", "N/A")
        # Fingerprints come back as bit strings, so len() is the bit count
        fp_len = len(fp) if fp != "N/A" else 0
        print(f"      {row['name']:15} → {fp_len} bits")

    # Test 2: Different fingerprint parameters
    print("\n2. Testing different fingerprint parameters...")

    # Test with counts enabled (effective size should be n_bits * 4)
    fp_counts_df = compute_morgan_fingerprints(test_df.copy(), radius=3, n_bits=256, counts=True)

    print("    With count simulation (256 bits * 4):")
    for _, row in fp_counts_df.iterrows():
        fp = row.get("fingerprint", "N/A")
        fp_len = len(fp) if fp != "N/A" else 0
        print(f"      {row['name']:15} → {fp_len} bits")

    # Test 3: Edge cases
    print("\n3. Testing edge cases...")

    # Invalid SMILES: unparseable rows should be dropped (or an error raised)
    invalid_df = pd.DataFrame({"SMILES": ["INVALID", ""]})
    try:
        fp_invalid = compute_morgan_fingerprints(invalid_df.copy())
        print(f"    ✓ Invalid SMILES handled: {len(fp_invalid)} valid molecules")
    except Exception as e:
        print(f"    ✓ Invalid SMILES properly raised error: {type(e).__name__}")

    # Test with pre-existing molecule column (conversion step should be skipped)
    mol_df = test_df.copy()
    mol_df["molecule"] = mol_df["SMILES"].apply(Chem.MolFromSmiles)
    fp_with_mol = compute_morgan_fingerprints(mol_df)
    print(f"    ✓ Pre-existing molecule column handled: {len(fp_with_mol)} fingerprints generated")

    print("\n✅ All fingerprint tests completed!")
@@ -15,7 +15,7 @@ import pandas as pd
15
15
  import json
16
16
 
17
17
  # Local imports
18
- from local_utils import compute_morgan_fingerprints
18
+ from fingerprints import compute_morgan_fingerprints
19
19
 
20
20
 
21
21
  # TRAINING SECTION
@@ -69,6 +69,7 @@ class Proximity:
69
69
  top_percent: float = 1.0,
70
70
  min_delta: Optional[float] = None,
71
71
  k_neighbors: int = 4,
72
+ only_coincident: bool = False,
72
73
  ) -> pd.DataFrame:
73
74
  """
74
75
  Find compounds with steep target gradients (data quality issues and activity cliffs).
@@ -81,6 +82,7 @@ class Proximity:
81
82
  top_percent: Percentage of compounds with steepest gradients to return (e.g., 1.0 = top 1%)
82
83
  min_delta: Minimum absolute target difference to consider. If None, defaults to target_range/100
83
84
  k_neighbors: Number of neighbors to use for median calculation (default: 4)
85
+ only_coincident: If True, only consider compounds that are coincident (default: False)
84
86
 
85
87
  Returns:
86
88
  DataFrame of compounds with steepest gradients, sorted by gradient (descending)
@@ -99,10 +101,15 @@ class Proximity:
99
101
  min_delta = self.target_range / 100.0 if self.target_range > 0 else 0.0
100
102
  candidates = candidates[candidates["nn_target_diff"] >= min_delta]
101
103
 
102
- # Get top X% by initial gradient
103
- percentile = 100 - top_percent
104
- threshold = np.percentile(candidates["gradient"], percentile)
105
- candidates = candidates[candidates["gradient"] >= threshold].copy()
104
+ # Filter based on mode
105
+ if only_coincident:
106
+ # Only keep coincident points (nn_distance ~= 0)
107
+ candidates = candidates[candidates["nn_distance"] < epsilon].copy()
108
+ else:
109
+ # Get top X% by initial gradient
110
+ percentile = 100 - top_percent
111
+ threshold = np.percentile(candidates["gradient"], percentile)
112
+ candidates = candidates[candidates["gradient"] >= threshold].copy()
106
113
 
107
114
  # Phase 2: Verify with k-neighbor median to filter out cases where nearest neighbor is the outlier
108
115
  results = []
@@ -4,9 +4,10 @@ import awswrangler as wr
4
4
  import numpy as np
5
5
 
6
6
  # Model Performance Scores
7
- from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
7
+ from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
8
8
  from sklearn.model_selection import KFold
9
9
  from scipy.optimize import minimize
10
+ from scipy.stats import spearmanr
10
11
 
11
12
  from io import StringIO
12
13
  import json
@@ -217,11 +218,16 @@ if __name__ == "__main__":
217
218
  # Report Performance Metrics
218
219
  rmse = root_mean_squared_error(result_df[target], result_df["prediction"])
219
220
  mae = mean_absolute_error(result_df[target], result_df["prediction"])
221
+ medae = median_absolute_error(result_df[target], result_df["prediction"])
220
222
  r2 = r2_score(result_df[target], result_df["prediction"])
221
- print(f"RMSE: {rmse:.3f}")
222
- print(f"MAE: {mae:.3f}")
223
- print(f"R2: {r2:.3f}")
224
- print(f"NumRows: {len(result_df)}")
223
+ spearman_corr = spearmanr(result_df[target], result_df["prediction"]).correlation
224
+ support = len(result_df)
225
+ print(f"rmse: {rmse:.3f}")
226
+ print(f"mae: {mae:.3f}")
227
+ print(f"medae: {medae:.3f}")
228
+ print(f"r2: {r2:.3f}")
229
+ print(f"spearmanr: {spearman_corr:.3f}")
230
+ print(f"support: {support}")
225
231
 
226
232
  # Now save the models
227
233
  for name, model in models.items():
@@ -5,7 +5,8 @@ from xgboost import XGBRegressor # Point Estimator
5
5
  from sklearn.model_selection import train_test_split
6
6
 
7
7
  # Model Performance Scores
8
- from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
8
+ from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
9
+ from scipy.stats import spearmanr
9
10
 
10
11
  from io import StringIO
11
12
  import json
@@ -238,11 +239,16 @@ if __name__ == "__main__":
238
239
  # Calculate various model performance metrics (regression)
239
240
  rmse = root_mean_squared_error(y_validate, preds)
240
241
  mae = mean_absolute_error(y_validate, preds)
242
+ medae = median_absolute_error(y_validate, preds)
241
243
  r2 = r2_score(y_validate, preds)
242
- print(f"RMSE: {rmse:.3f}")
243
- print(f"MAE: {mae:.3f}")
244
- print(f"R2: {r2:.3f}")
245
- print(f"NumRows: {len(df_val)}")
244
+ spearman_corr = spearmanr(y_validate, preds).correlation
245
+ support = len(df_val)
246
+ print(f"rmse: {rmse:.3f}")
247
+ print(f"mae: {mae:.3f}")
248
+ print(f"medae: {medae:.3f}")
249
+ print(f"r2: {r2:.3f}")
250
+ print(f"spearmanr: {spearman_corr:.3f}")
251
+ print(f"support: {support}")
246
252
 
247
253
  # Save the trained XGBoost model
248
254
  xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
@@ -3,7 +3,8 @@ from ngboost import NGBRegressor
3
3
  from sklearn.model_selection import train_test_split
4
4
 
5
5
  # Model Performance Scores
6
- from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
6
+ from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
7
+ from scipy.stats import spearmanr
7
8
 
8
9
  from io import StringIO
9
10
  import json
@@ -129,11 +130,16 @@ if __name__ == "__main__":
129
130
  # Calculate various model performance metrics (regression)
130
131
  rmse = root_mean_squared_error(y_validate, preds)
131
132
  mae = mean_absolute_error(y_validate, preds)
133
+ medae = median_absolute_error(y_validate, preds)
132
134
  r2 = r2_score(y_validate, preds)
133
- print(f"RMSE: {rmse:.3f}")
134
- print(f"MAE: {mae:.3f}")
135
- print(f"R2: {r2:.3f}")
136
- print(f"NumRows: {len(df_val)}")
135
+ spearman_corr = spearmanr(y_validate, preds).correlation
136
+ support = len(df_val)
137
+ print(f"rmse: {rmse:.3f}")
138
+ print(f"mae: {mae:.3f}")
139
+ print(f"medae: {medae:.3f}")
140
+ print(f"r2: {r2:.3f}")
141
+ print(f"spearmanr: {spearman_corr:.3f}")
142
+ print(f"support: {support}")
137
143
 
138
144
  # Save the trained NGBoost model
139
145
  joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))
@@ -69,6 +69,7 @@ class Proximity:
69
69
  top_percent: float = 1.0,
70
70
  min_delta: Optional[float] = None,
71
71
  k_neighbors: int = 4,
72
+ only_coincident: bool = False,
72
73
  ) -> pd.DataFrame:
73
74
  """
74
75
  Find compounds with steep target gradients (data quality issues and activity cliffs).
@@ -81,6 +82,7 @@ class Proximity:
81
82
  top_percent: Percentage of compounds with steepest gradients to return (e.g., 1.0 = top 1%)
82
83
  min_delta: Minimum absolute target difference to consider. If None, defaults to target_range/100
83
84
  k_neighbors: Number of neighbors to use for median calculation (default: 4)
85
+ only_coincident: If True, only consider compounds that are coincident (default: False)
84
86
 
85
87
  Returns:
86
88
  DataFrame of compounds with steepest gradients, sorted by gradient (descending)
@@ -99,10 +101,15 @@ class Proximity:
99
101
  min_delta = self.target_range / 100.0 if self.target_range > 0 else 0.0
100
102
  candidates = candidates[candidates["nn_target_diff"] >= min_delta]
101
103
 
102
- # Get top X% by initial gradient
103
- percentile = 100 - top_percent
104
- threshold = np.percentile(candidates["gradient"], percentile)
105
- candidates = candidates[candidates["gradient"] >= threshold].copy()
104
+ # Filter based on mode
105
+ if only_coincident:
106
+ # Only keep coincident points (nn_distance ~= 0)
107
+ candidates = candidates[candidates["nn_distance"] < epsilon].copy()
108
+ else:
109
+ # Get top X% by initial gradient
110
+ percentile = 100 - top_percent
111
+ threshold = np.percentile(candidates["gradient"], percentile)
112
+ candidates = candidates[candidates["gradient"] >= threshold].copy()
106
113
 
107
114
  # Phase 2: Verify with k-neighbor median to filter out cases where nearest neighbor is the outlier
108
115
  results = []
@@ -12,7 +12,8 @@ import awswrangler as wr
12
12
  import numpy as np
13
13
 
14
14
  # Model Performance Scores
15
- from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
15
+ from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
16
+ from scipy.stats import spearmanr
16
17
 
17
18
  from io import StringIO
18
19
  import json
@@ -153,11 +154,16 @@ if __name__ == "__main__":
153
154
  # Report Performance Metrics
154
155
  rmse = root_mean_squared_error(result_df[target], result_df["prediction"])
155
156
  mae = mean_absolute_error(result_df[target], result_df["prediction"])
157
+ medae = median_absolute_error(result_df[target], result_df["prediction"])
156
158
  r2 = r2_score(result_df[target], result_df["prediction"])
157
- print(f"RMSE: {rmse:.3f}")
158
- print(f"MAE: {mae:.3f}")
159
- print(f"R2: {r2:.3f}")
160
- print(f"NumRows: {len(result_df)}")
159
+ spearman_corr = spearmanr(result_df[target], result_df["prediction"]).correlation
160
+ support = len(result_df)
161
+ print(f"rmse: {rmse:.3f}")
162
+ print(f"mae: {mae:.3f}")
163
+ print(f"medae: {medae:.3f}")
164
+ print(f"r2: {r2:.3f}")
165
+ print(f"spearmanr: {spearman_corr:.3f}")
166
+ print(f"support: {support}")
161
167
 
162
168
  # Now save the models
163
169
  for name, model in models.items():