workbench 0.8.217__py3-none-any.whl → 0.8.219__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/sql/outliers.py +3 -3
- workbench/core/artifacts/endpoint_core.py +2 -2
- workbench/core/artifacts/feature_set_core.py +2 -2
- workbench/model_script_utils/model_script_utils.py +15 -11
- workbench/model_scripts/chemprop/chemprop.template +2 -2
- workbench/model_scripts/chemprop/generated_model_script.py +6 -6
- workbench/model_scripts/chemprop/model_script_utils.py +15 -11
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +80 -43
- workbench/model_scripts/pytorch_model/generated_model_script.py +3 -3
- workbench/model_scripts/pytorch_model/model_script_utils.py +15 -11
- workbench/model_scripts/xgb_model/generated_model_script.py +6 -6
- workbench/model_scripts/xgb_model/model_script_utils.py +15 -11
- workbench/scripts/meta_model_sim.py +35 -0
- workbench/utils/chem_utils/fingerprints.py +80 -43
- workbench/utils/meta_model_simulator.py +41 -13
- workbench/utils/shap_utils.py +1 -55
- {workbench-0.8.217.dist-info → workbench-0.8.219.dist-info}/METADATA +1 -1
- {workbench-0.8.217.dist-info → workbench-0.8.219.dist-info}/RECORD +22 -23
- {workbench-0.8.217.dist-info → workbench-0.8.219.dist-info}/entry_points.txt +1 -0
- workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
- {workbench-0.8.217.dist-info → workbench-0.8.219.dist-info}/WHEEL +0 -0
- {workbench-0.8.217.dist-info → workbench-0.8.219.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.217.dist-info → workbench-0.8.219.dist-info}/top_level.txt +0 -0
workbench/algorithms/sql/outliers.py

@@ -209,9 +209,9 @@ class Outliers:
             else:
                 return group.nlargest(n, col)

-        # Group by 'outlier_group' and apply the helper function, explicitly selecting columns
-        top_outliers = outlier_df.groupby("outlier_group", group_keys=False).apply(
-            get_extreme_values
+        # Group by 'outlier_group' and apply the helper function, explicitly selecting columns to silence warning
+        top_outliers = outlier_df.groupby("outlier_group", group_keys=False)[outlier_df.columns].apply(
+            get_extreme_values
         )
         return top_outliers.reset_index(drop=True)
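For context, the `[outlier_df.columns]` selection works around the pandas 2.2+ `DataFrameGroupBy.apply` deprecation, which warns when `apply` operates on the grouping column itself. A minimal standalone repro sketch (not from the package):

```python
import pandas as pd

df = pd.DataFrame({"outlier_group": ["a", "a", "b"], "value": [1.0, 9.0, 4.0]})

# pandas 2.2+ emits a FutureWarning here: apply() operates on the
# grouping column, and the default will change to exclude it.
top = df.groupby("outlier_group", group_keys=False).apply(lambda g: g.nlargest(1, "value"))

# Selecting the columns explicitly keeps the grouping column available
# inside the function and silences the warning.
top = df.groupby("outlier_group", group_keys=False)[df.columns].apply(lambda g: g.nlargest(1, "value"))
print(top)
```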
workbench/core/artifacts/endpoint_core.py

@@ -410,7 +410,7 @@ class EndpointCore(Artifact):
             primary_target = targets

         # Sanity Check that the target column is present
-        if primary_target
+        if primary_target not in prediction_df.columns:
             self.log.important(f"Target Column {primary_target} not found in prediction_df!")
             self.log.important("In order to compute metrics, the target column must be present!")
             metrics = pd.DataFrame()

@@ -432,7 +432,7 @@ class EndpointCore(Artifact):
         print(metrics.head())

         # Capture the inference results and metrics
-        if
+        if primary_target and capture_name:

             # If we don't have an id_column, we'll pull it from the model's FeatureSet
             if id_column is None:
workbench/core/artifacts/feature_set_core.py

@@ -247,7 +247,7 @@ class FeatureSetCore(Artifact):

         # Set the compressed features in our FeatureSet metadata
         self.log.important(f"Setting Compressed Columns...{compressed_columns}")
-        self.upsert_workbench_meta({"
+        self.upsert_workbench_meta({"compressed_features": compressed_columns})

     def get_compressed_features(self) -> list[str]:
         """Get the compressed features for this FeatureSet

@@ -256,7 +256,7 @@ class FeatureSetCore(Artifact):
             list[str]: The compressed columns for this FeatureSet
         """
         # Get the compressed features from our FeatureSet metadata
-        return self.workbench_meta().get("
+        return self.workbench_meta().get("compressed_features", [])

     def num_columns(self) -> int:
         """Return the number of columns of the Feature Set"""
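These two accessors form a simple metadata round trip on the FeatureSet: the setter upserts a `compressed_features` key into the Workbench metadata, and the getter reads it back with an empty-list default. A hedged usage sketch (the import path, setter name, and FeatureSet name are assumptions, not verified against the wheel):

```python
from workbench.api import FeatureSet  # import path assumed

fs = FeatureSet("aqsol_features")  # hypothetical FeatureSet name
# The setter shown in the hunk above stores the list under "compressed_features"
fs.set_compressed_features(["fingerprint"])  # method name assumed from context
print(fs.get_compressed_features())  # ['fingerprint'], or [] if never set
```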
workbench/model_script_utils/model_script_utils.py

@@ -148,12 +148,16 @@ def convert_categorical_types(
 def decompress_features(
     df: pd.DataFrame, features: list[str], compressed_features: list[str]
 ) -> tuple[pd.DataFrame, list[str]]:
-    """Decompress
+    """Decompress compressed features (bitstrings or count vectors) into individual columns.
+
+    Supports two formats (auto-detected):
+    - Bitstrings: "10110010..." → individual uint8 columns (0 or 1)
+    - Count vectors: "0,3,0,1,5,..." → individual uint8 columns (0-255)

     Args:
         df: The features DataFrame
         features: Full list of feature names
-        compressed_features: List of feature names to decompress
+        compressed_features: List of feature names to decompress

     Returns:
         Tuple of (DataFrame with decompressed features, updated feature list)

@@ -178,18 +182,18 @@ def decompress_features(
         # Remove the feature from the list to avoid duplication
         decompressed_features.remove(feature)

-        #
-
-
+        # Auto-detect format and parse: comma-separated counts or bitstring
+        sample = str(df[feature].dropna().iloc[0]) if not df[feature].dropna().empty else ""
+        parse_fn = (lambda s: list(map(int, s.split(",")))) if "," in sample else list
+        feature_matrix = np.array([parse_fn(s) for s in df[feature]], dtype=np.uint8)

-        # Create
-
-
+        # Create new columns with prefix from feature name
+        prefix = feature[:3]
+        new_col_names = [f"{prefix}_{i}" for i in range(feature_matrix.shape[1])]
+        new_df = pd.DataFrame(feature_matrix, columns=new_col_names, index=df.index)

-        #
+        # Update features list and dataframe
         decompressed_features.extend(new_col_names)
-
-        # Drop original column and concatenate new ones
         df = df.drop(columns=[feature])
         df = pd.concat([df, new_df], axis=1)
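To make the auto-detection concrete: a comma in the first non-null value selects the count-vector parser, otherwise each character of the bitstring becomes one column. A hedged round-trip sketch with illustrative column names (assumes `decompress_features` from the hunk above is in scope):

```python
import pandas as pd

df = pd.DataFrame({
    "id": ["mol_1", "mol_2"],
    "fingerprint": ["0,3,0,1", "2,0,0,5"],  # commas present, so count-vector format
})

df_out, feats = decompress_features(df, ["fingerprint"], compressed_features=["fingerprint"])
print(feats)          # ['fin_0', 'fin_1', 'fin_2', 'fin_3'] (prefix comes from feature[:3])
print(df_out.dtypes)  # the expanded columns are uint8

# A bitstring value like "0110" (no commas) would instead be split
# character-by-character into four columns.
```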
workbench/model_scripts/chemprop/chemprop.template

@@ -481,8 +481,8 @@ if __name__ == "__main__":
     val_dataset.normalize_targets(target_scaler)
     output_transform = nn.UnscaleTransform.from_standard_scaler(target_scaler)

-    train_loader = data.build_dataloader(train_dataset, batch_size=batch_size, shuffle=True)
-    val_loader = data.build_dataloader(val_dataset, batch_size=batch_size, shuffle=False)
+    train_loader = data.build_dataloader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=3)
+    val_loader = data.build_dataloader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=3)

     # Build and train model
     pl.seed_everything(hyperparameters["seed"] + fold_idx)
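Chemprop's `build_dataloader` is a thin wrapper around a PyTorch `DataLoader`, so `num_workers=3` moves batch preparation (molecule featurization) into three background processes. The equivalent knob on a plain PyTorch loader, as a standalone sketch with toy data:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.randn(256, 8), torch.randn(256, 1))  # toy data

# num_workers=3 prepares batches in three worker processes, overlapping
# data loading with the training step instead of blocking the main process.
loader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=3)
```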
workbench/model_scripts/chemprop/generated_model_script.py

@@ -50,10 +50,10 @@ DEFAULT_HYPERPARAMETERS = {
 # Template parameters (filled in by Workbench)
 TEMPLATE_PARAMS = {
     "model_type": "uq_regressor",
-    "targets": ['
-    "feature_list": ['smiles'],
-    "id_column": "
-    "model_metrics_s3_path": "s3://
+    "targets": ['logd'],
+    "feature_list": ['smiles', 'mollogp', 'fr_halogen', 'nbase', 'peoe_vsa6', 'bcut2d_mrlow', 'peoe_vsa7', 'peoe_vsa9', 'vsa_estate1', 'peoe_vsa1', 'numhdonors', 'vsa_estate5', 'smr_vsa3', 'slogp_vsa1', 'vsa_estate7', 'bcut2d_mwhi', 'axp_2dv', 'axp_3dv', 'mi', 'smr_vsa9', 'vsa_estate3', 'estate_vsa9', 'bcut2d_mwlow', 'tpsa', 'vsa_estate10', 'xch_5dv', 'slogp_vsa2', 'nhohcount', 'bcut2d_logplow', 'hallkieralpha', 'c2sp2', 'bcut2d_chglo', 'smr_vsa4', 'maxabspartialcharge', 'estate_vsa6', 'qed', 'slogp_vsa6', 'vsa_estate2', 'bcut2d_logphi', 'vsa_estate8', 'xch_7dv', 'fpdensitymorgan3', 'xpc_6d', 'smr_vsa10', 'axp_0d', 'fr_nh1', 'axp_4dv', 'peoe_vsa2', 'estate_vsa8', 'peoe_vsa5', 'vsa_estate6'],
+    "id_column": "molecule_name",
+    "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/logd-reg-chemprop-hybrid/training",
     "hyperparameters": {},
 }

@@ -481,8 +481,8 @@ if __name__ == "__main__":
     val_dataset.normalize_targets(target_scaler)
     output_transform = nn.UnscaleTransform.from_standard_scaler(target_scaler)

-    train_loader = data.build_dataloader(train_dataset, batch_size=batch_size, shuffle=True)
-    val_loader = data.build_dataloader(val_dataset, batch_size=batch_size, shuffle=False)
+    train_loader = data.build_dataloader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=3)
+    val_loader = data.build_dataloader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=3)

     # Build and train model
     pl.seed_everything(hyperparameters["seed"] + fold_idx)
workbench/model_scripts/chemprop/model_script_utils.py

@@ -148,12 +148,16 @@ def convert_categorical_types(
 def decompress_features(
     df: pd.DataFrame, features: list[str], compressed_features: list[str]
 ) -> tuple[pd.DataFrame, list[str]]:
-    """Decompress
+    """Decompress compressed features (bitstrings or count vectors) into individual columns.
+
+    Supports two formats (auto-detected):
+    - Bitstrings: "10110010..." → individual uint8 columns (0 or 1)
+    - Count vectors: "0,3,0,1,5,..." → individual uint8 columns (0-255)

     Args:
         df: The features DataFrame
         features: Full list of feature names
-        compressed_features: List of feature names to decompress
+        compressed_features: List of feature names to decompress

     Returns:
         Tuple of (DataFrame with decompressed features, updated feature list)

@@ -178,18 +182,18 @@ def decompress_features(
         # Remove the feature from the list to avoid duplication
         decompressed_features.remove(feature)

-        #
-
-
+        # Auto-detect format and parse: comma-separated counts or bitstring
+        sample = str(df[feature].dropna().iloc[0]) if not df[feature].dropna().empty else ""
+        parse_fn = (lambda s: list(map(int, s.split(",")))) if "," in sample else list
+        feature_matrix = np.array([parse_fn(s) for s in df[feature]], dtype=np.uint8)

-        # Create
-
-
+        # Create new columns with prefix from feature name
+        prefix = feature[:3]
+        new_col_names = [f"{prefix}_{i}" for i in range(feature_matrix.shape[1])]
+        new_df = pd.DataFrame(feature_matrix, columns=new_col_names, index=df.index)

-        #
+        # Update features list and dataframe
         decompressed_features.extend(new_col_names)
-
-        # Drop original column and concatenate new ones
         df = df.drop(columns=[feature])
         df = pd.concat([df, new_df], axis=1)
workbench/model_scripts/custom_models/chem_info/fingerprints.py

@@ -1,11 +1,19 @@
-"""Molecular fingerprint computation utilities
+"""Molecular fingerprint computation utilities for ADMET modeling.
+
+This module provides Morgan count fingerprints, the standard for ADMET prediction.
+Count fingerprints outperform binary fingerprints for molecular property prediction.
+
+References:
+    - Count vs Binary: https://pubs.acs.org/doi/10.1021/acs.est.3c02198
+    - ECFP/Morgan: https://pubs.acs.org/doi/10.1021/ci100050t
+"""

 import logging
-import pandas as pd

-
+import numpy as np
+import pandas as pd
 from rdkit import Chem, RDLogger
-from rdkit.Chem import
+from rdkit.Chem import AllChem
 from rdkit.Chem.MolStandardize import rdMolStandardize

 # Suppress RDKit warnings (e.g., "not removing hydrogen atom without neighbors")

@@ -16,20 +24,25 @@ RDLogger.DisableLog("rdApp.warning")
 log = logging.getLogger("workbench")


-def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048
-    """Compute
+def compute_morgan_fingerprints(df: pd.DataFrame, radius: int = 2, n_bits: int = 2048) -> pd.DataFrame:
+    """Compute Morgan count fingerprints for ADMET modeling.
+
+    Generates true count fingerprints where each bit position contains the
+    number of times that substructure appears in the molecule (clamped to 0-255).
+    This is the recommended approach for ADMET prediction per 2025 research.

     Args:
-        df
-        radius
-        n_bits
-        counts (bool): Count simulation for the fingerprint.
+        df: Input DataFrame containing SMILES strings.
+        radius: Radius for the Morgan fingerprint (default 2 = ECFP4 equivalent).
+        n_bits: Number of bits for the fingerprint (default 2048).

     Returns:
-        pd.DataFrame:
+        pd.DataFrame: Input DataFrame with 'fingerprint' column added.
+            Values are comma-separated uint8 counts.

     Note:
-
+        Count fingerprints outperform binary for ADMET prediction.
+        See: https://pubs.acs.org/doi/10.1021/acs.est.3c02198
     """
     delete_mol_column = False

@@ -43,7 +56,7 @@ def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=
         log.warning("Detected serialized molecules in 'molecule' column. Removing...")
         del df["molecule"]

-    # Convert SMILES to RDKit molecule objects
+    # Convert SMILES to RDKit molecule objects
     if "molecule" not in df.columns:
         log.info("Converting SMILES to RDKit Molecules...")
         delete_mol_column = True

@@ -59,15 +72,24 @@ def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=
         lambda mol: rdMolStandardize.LargestFragmentChooser().choose(mol) if mol else None
     )

-
-
-
-
+    def mol_to_count_string(mol):
+        """Convert molecule to comma-separated count fingerprint string."""
+        if mol is None:
+            return pd.NA

-
-
-
-
+        # Get hashed Morgan fingerprint with counts
+        fp = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=n_bits)
+
+        # Initialize array and populate with counts (clamped to uint8 range)
+        counts = np.zeros(n_bits, dtype=np.uint8)
+        for idx, count in fp.GetNonzeroElements().items():
+            counts[idx] = min(count, 255)
+
+        # Return as comma-separated string
+        return ",".join(map(str, counts))
+
+    # Compute Morgan count fingerprints
+    fingerprints = largest_frags.apply(mol_to_count_string)

     # Add the fingerprints to the DataFrame
     df["fingerprint"] = fingerprints
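For readers comparing the two formats: binary Morgan fingerprints only record substructure presence, while the hashed count form used above also records how often each substructure occurs, which `mol_to_count_string` then serializes. A standalone RDKit sketch of the two calls (not part of the package):

```python
from rdkit import Chem
from rdkit.Chem import AllChem

mol = Chem.MolFromSmiles("CC(=O)OC1=CC=CC=C1C(=O)O")  # aspirin

# Binary form: presence/absence only.
bits = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)

# Count form: sparse {bit_index: count} mapping, as used by the new helper.
counts = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=2048)
repeated = {i: c for i, c in counts.GetNonzeroElements().items() if c > 1}

print(f"{bits.GetNumOnBits()} bits set; {len(repeated)} substructures occur more than once")
```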
@@ -75,59 +97,62 @@ def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=
     # Drop the intermediate 'molecule' column if it was added
     if delete_mol_column:
         del df["molecule"]
+
     return df


 if __name__ == "__main__":
-    print("Running
-    print("Note: This requires molecular_screening module to be available")
+    print("Running Morgan count fingerprint tests...")

     # Test molecules
     test_molecules = {
         "aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
         "caffeine": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
         "glucose": "C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O",  # With stereochemistry
-        "sodium_acetate": "CC(=O)[O-].[Na+]",  # Salt
+        "sodium_acetate": "CC(=O)[O-].[Na+]",  # Salt (largest fragment used)
         "benzene": "c1ccccc1",
         "butene_e": "C/C=C/C",  # E-butene
         "butene_z": "C/C=C\\C",  # Z-butene
     }

-    # Test 1: Morgan Fingerprints
-    print("\n1. Testing Morgan fingerprint generation...")
+    # Test 1: Morgan Count Fingerprints (default parameters)
+    print("\n1. Testing Morgan fingerprint generation (radius=2, n_bits=2048)...")

     test_df = pd.DataFrame({"SMILES": list(test_molecules.values()), "name": list(test_molecules.keys())})
-
-    fp_df = compute_morgan_fingerprints(test_df.copy(), radius=2, n_bits=512, counts=False)
+    fp_df = compute_morgan_fingerprints(test_df.copy())

     print("  Fingerprint generation results:")
     for _, row in fp_df.iterrows():
         fp = row.get("fingerprint", "N/A")
-
-
+        if pd.notna(fp):
+            counts = [int(x) for x in fp.split(",")]
+            non_zero = sum(1 for c in counts if c > 0)
+            max_count = max(counts)
+            print(f"    {row['name']:15} → {len(counts)} features, {non_zero} non-zero, max={max_count}")
+        else:
+            print(f"    {row['name']:15} → N/A")

-    # Test 2: Different
-    print("\n2. Testing different
+    # Test 2: Different parameters
+    print("\n2. Testing with different parameters (radius=3, n_bits=1024)...")

-
-    fp_counts_df = compute_morgan_fingerprints(test_df.copy(), radius=3, n_bits=256, counts=True)
+    fp_df_custom = compute_morgan_fingerprints(test_df.copy(), radius=3, n_bits=1024)

-
-    for _, row in fp_counts_df.iterrows():
+    for _, row in fp_df_custom.iterrows():
         fp = row.get("fingerprint", "N/A")
-
-
+        if pd.notna(fp):
+            counts = [int(x) for x in fp.split(",")]
+            non_zero = sum(1 for c in counts if c > 0)
+            print(f"    {row['name']:15} → {len(counts)} features, {non_zero} non-zero")
+        else:
+            print(f"    {row['name']:15} → N/A")

     # Test 3: Edge cases
     print("\n3. Testing edge cases...")

     # Invalid SMILES
     invalid_df = pd.DataFrame({"SMILES": ["INVALID", ""]})
-
-
-        print(f"  ✓ Invalid SMILES handled: {len(fp_invalid)} valid molecules")
-    except Exception as e:
-        print(f"  ✓ Invalid SMILES properly raised error: {type(e).__name__}")
+    fp_invalid = compute_morgan_fingerprints(invalid_df.copy())
+    print(f"  ✓ Invalid SMILES handled: {len(fp_invalid)} rows returned")

     # Test with pre-existing molecule column
     mol_df = test_df.copy()
@@ -135,4 +160,16 @@ if __name__ == "__main__":
     fp_with_mol = compute_morgan_fingerprints(mol_df)
     print(f"  ✓ Pre-existing molecule column handled: {len(fp_with_mol)} fingerprints generated")

+    # Test 4: Verify count values are reasonable
+    print("\n4. Verifying count distribution...")
+    all_counts = []
+    for _, row in fp_df.iterrows():
+        fp = row.get("fingerprint", "N/A")
+        if pd.notna(fp):
+            counts = [int(x) for x in fp.split(",")]
+            all_counts.extend([c for c in counts if c > 0])
+
+    if all_counts:
+        print(f"  Non-zero counts: min={min(all_counts)}, max={max(all_counts)}, mean={np.mean(all_counts):.2f}")
+
     print("\n✅ All fingerprint tests completed!")
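Taken together, the fingerprint and decompression changes form a pipeline: SMILES → comma-separated count string → uint8 feature columns. A hedged end-to-end sketch (import paths inferred from the file list above, not verified against the wheel):

```python
import pandas as pd

from workbench.utils.chem_utils.fingerprints import compute_morgan_fingerprints  # path inferred
from workbench.model_script_utils.model_script_utils import decompress_features  # path inferred

df = pd.DataFrame({"SMILES": ["CC(=O)OC1=CC=CC=C1C(=O)O"], "id": ["aspirin"]})
df = compute_morgan_fingerprints(df)  # adds a comma-separated count 'fingerprint' column

# The model scripts then expand that single string column into 2048 uint8
# columns (fin_0 ... fin_2047) before training.
df, features = decompress_features(df, ["fingerprint"], compressed_features=["fingerprint"])
print(len(features))  # 2048
```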
workbench/model_scripts/pytorch_model/generated_model_script.py

@@ -61,10 +61,10 @@ DEFAULT_HYPERPARAMETERS = {
 TEMPLATE_PARAMS = {
     "model_type": "uq_regressor",
     "target": "udm_asy_res_efflux_ratio",
-    "features": ['
+    "features": ['fingerprint'],
     "id_column": "udm_mol_bat_id",
-    "compressed_features": [],
-    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-pytorch/training",
+    "compressed_features": ['fingerprint'],
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-fp-pytorch/training",
     "hyperparameters": {},
 }
workbench/model_scripts/pytorch_model/model_script_utils.py

@@ -148,12 +148,16 @@ def convert_categorical_types(
 def decompress_features(
     df: pd.DataFrame, features: list[str], compressed_features: list[str]
 ) -> tuple[pd.DataFrame, list[str]]:
-    """Decompress
+    """Decompress compressed features (bitstrings or count vectors) into individual columns.
+
+    Supports two formats (auto-detected):
+    - Bitstrings: "10110010..." → individual uint8 columns (0 or 1)
+    - Count vectors: "0,3,0,1,5,..." → individual uint8 columns (0-255)

     Args:
         df: The features DataFrame
         features: Full list of feature names
-        compressed_features: List of feature names to decompress
+        compressed_features: List of feature names to decompress

     Returns:
         Tuple of (DataFrame with decompressed features, updated feature list)

@@ -178,18 +182,18 @@ def decompress_features(
         # Remove the feature from the list to avoid duplication
         decompressed_features.remove(feature)

-        #
-
-
+        # Auto-detect format and parse: comma-separated counts or bitstring
+        sample = str(df[feature].dropna().iloc[0]) if not df[feature].dropna().empty else ""
+        parse_fn = (lambda s: list(map(int, s.split(",")))) if "," in sample else list
+        feature_matrix = np.array([parse_fn(s) for s in df[feature]], dtype=np.uint8)

-        # Create
-
-
+        # Create new columns with prefix from feature name
+        prefix = feature[:3]
+        new_col_names = [f"{prefix}_{i}" for i in range(feature_matrix.shape[1])]
+        new_df = pd.DataFrame(feature_matrix, columns=new_col_names, index=df.index)

-        #
+        # Update features list and dataframe
         decompressed_features.extend(new_col_names)
-
-        # Drop original column and concatenate new ones
         df = df.drop(columns=[feature])
         df = pd.concat([df, new_df], axis=1)
workbench/model_scripts/xgb_model/generated_model_script.py

@@ -63,12 +63,12 @@ REGRESSION_ONLY_PARAMS = {"objective"}

 # Template parameters (filled in by Workbench)
 TEMPLATE_PARAMS = {
-    "model_type": "
-    "target": "
-    "features": ['
-    "id_column": "
-    "compressed_features": [],
-    "model_metrics_s3_path": "s3://
+    "model_type": "uq_regressor",
+    "target": "udm_asy_res_efflux_ratio",
+    "features": ['fingerprint'],
+    "id_column": "udm_mol_bat_id",
+    "compressed_features": ['fingerprint'],
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-fp/training",
     "hyperparameters": {},
 }
|
@@ -148,12 +148,16 @@ def convert_categorical_types(
|
|
|
148
148
|
def decompress_features(
|
|
149
149
|
df: pd.DataFrame, features: list[str], compressed_features: list[str]
|
|
150
150
|
) -> tuple[pd.DataFrame, list[str]]:
|
|
151
|
-
"""Decompress
|
|
151
|
+
"""Decompress compressed features (bitstrings or count vectors) into individual columns.
|
|
152
|
+
|
|
153
|
+
Supports two formats (auto-detected):
|
|
154
|
+
- Bitstrings: "10110010..." → individual uint8 columns (0 or 1)
|
|
155
|
+
- Count vectors: "0,3,0,1,5,..." → individual uint8 columns (0-255)
|
|
152
156
|
|
|
153
157
|
Args:
|
|
154
158
|
df: The features DataFrame
|
|
155
159
|
features: Full list of feature names
|
|
156
|
-
compressed_features: List of feature names to decompress
|
|
160
|
+
compressed_features: List of feature names to decompress
|
|
157
161
|
|
|
158
162
|
Returns:
|
|
159
163
|
Tuple of (DataFrame with decompressed features, updated feature list)
|
|
@@ -178,18 +182,18 @@ def decompress_features(
|
|
|
178
182
|
# Remove the feature from the list to avoid duplication
|
|
179
183
|
decompressed_features.remove(feature)
|
|
180
184
|
|
|
181
|
-
#
|
|
182
|
-
|
|
183
|
-
|
|
185
|
+
# Auto-detect format and parse: comma-separated counts or bitstring
|
|
186
|
+
sample = str(df[feature].dropna().iloc[0]) if not df[feature].dropna().empty else ""
|
|
187
|
+
parse_fn = (lambda s: list(map(int, s.split(",")))) if "," in sample else list
|
|
188
|
+
feature_matrix = np.array([parse_fn(s) for s in df[feature]], dtype=np.uint8)
|
|
184
189
|
|
|
185
|
-
# Create
|
|
186
|
-
|
|
187
|
-
|
|
190
|
+
# Create new columns with prefix from feature name
|
|
191
|
+
prefix = feature[:3]
|
|
192
|
+
new_col_names = [f"{prefix}_{i}" for i in range(feature_matrix.shape[1])]
|
|
193
|
+
new_df = pd.DataFrame(feature_matrix, columns=new_col_names, index=df.index)
|
|
188
194
|
|
|
189
|
-
#
|
|
195
|
+
# Update features list and dataframe
|
|
190
196
|
decompressed_features.extend(new_col_names)
|
|
191
|
-
|
|
192
|
-
# Drop original column and concatenate new ones
|
|
193
197
|
df = df.drop(columns=[feature])
|
|
194
198
|
df = pd.concat([df, new_df], axis=1)
|
|
195
199
|
|
|
workbench/scripts/meta_model_sim.py (new file)

@@ -0,0 +1,35 @@
+"""MetaModelSimulator: Simulate and analyze ensemble model performance.
+
+This class helps evaluate whether a meta model (ensemble) would outperform
+individual child models by analyzing endpoint inference predictions.
+"""
+
+import argparse
+from workbench.utils.meta_model_simulator import MetaModelSimulator
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Simulate and analyze ensemble model performance using MetaModelSimulator."
+    )
+    parser.add_argument(
+        "models",
+        nargs="+",
+        help="List of model endpoint names to include in the ensemble simulation.",
+    )
+    parser.add_argument(
+        "--id-column",
+        default="molecule_name",
+        help="Name of the ID column (default: molecule_name)",
+    )
+    args = parser.parse_args()
+    models = args.models
+    id_column = args.id_column
+
+    # Create MetaModelSimulator instance and generate report
+    sim = MetaModelSimulator(models, id_column=id_column)
+    sim.report()
+
+
+if __name__ == "__main__":
+    main()
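The one-line `entry_points.txt` addition above suggests this script is also exposed as a console command. From Python, the same call path the CLI wraps looks like this (endpoint names are illustrative, not real deployments):

```python
from workbench.utils.meta_model_simulator import MetaModelSimulator

# Hypothetical endpoint names; any deployed Workbench endpoints would do.
sim = MetaModelSimulator(["logd-reg-chemprop-hybrid", "caco2-er-reg-fp"], id_column="molecule_name")
sim.report()  # per the docstring: reports whether the simulated ensemble beats the child models
```

Invoking the module directly, e.g. `python -m workbench.scripts.meta_model_sim model-a model-b --id-column molecule_name`, runs the same simulation.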