workbench 0.8.171__py3-none-any.whl → 0.8.173__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  2. workbench/api/compound.py +1 -1
  3. workbench/api/feature_set.py +4 -4
  4. workbench/api/monitor.py +1 -16
  5. workbench/core/artifacts/artifact.py +11 -3
  6. workbench/core/artifacts/data_capture_core.py +315 -0
  7. workbench/core/artifacts/endpoint_core.py +9 -3
  8. workbench/core/artifacts/model_core.py +37 -14
  9. workbench/core/artifacts/monitor_core.py +33 -249
  10. workbench/core/cloud_platform/aws/aws_account_clamp.py +4 -1
  11. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  12. workbench/core/transforms/features_to_model/features_to_model.py +4 -4
  13. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +471 -0
  14. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +428 -0
  15. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  16. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +19 -9
  17. workbench/model_scripts/custom_models/uq_models/mapie.template +502 -0
  18. workbench/model_scripts/custom_models/uq_models/meta_uq.template +8 -5
  19. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  20. workbench/model_scripts/script_generation.py +5 -0
  21. workbench/model_scripts/xgb_model/generated_model_script.py +5 -5
  22. workbench/repl/workbench_shell.py +3 -3
  23. workbench/utils/chem_utils/__init__.py +0 -0
  24. workbench/utils/chem_utils/fingerprints.py +134 -0
  25. workbench/utils/chem_utils/misc.py +194 -0
  26. workbench/utils/chem_utils/mol_descriptors.py +471 -0
  27. workbench/utils/chem_utils/mol_standardize.py +428 -0
  28. workbench/utils/chem_utils/mol_tagging.py +348 -0
  29. workbench/utils/chem_utils/projections.py +209 -0
  30. workbench/utils/chem_utils/salts.py +256 -0
  31. workbench/utils/chem_utils/sdf.py +292 -0
  32. workbench/utils/chem_utils/toxicity.py +250 -0
  33. workbench/utils/chem_utils/vis.py +253 -0
  34. workbench/utils/model_utils.py +1 -1
  35. workbench/utils/monitor_utils.py +49 -56
  36. workbench/utils/pandas_utils.py +3 -3
  37. workbench/utils/workbench_sqs.py +1 -1
  38. workbench/utils/xgboost_model_utils.py +1 -0
  39. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  40. {workbench-0.8.171.dist-info → workbench-0.8.173.dist-info}/METADATA +1 -1
  41. {workbench-0.8.171.dist-info → workbench-0.8.173.dist-info}/RECORD +45 -34
  42. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  43. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  44. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  45. workbench/utils/chem_utils.py +0 -1556
  46. {workbench-0.8.171.dist-info → workbench-0.8.173.dist-info}/WHEEL +0 -0
  47. {workbench-0.8.171.dist-info → workbench-0.8.173.dist-info}/entry_points.txt +0 -0
  48. {workbench-0.8.171.dist-info → workbench-0.8.173.dist-info}/licenses/LICENSE +0 -0
  49. {workbench-0.8.171.dist-info → workbench-0.8.173.dist-info}/top_level.txt +0 -0
workbench/model_scripts/custom_models/uq_models/mapie.template (new file)
@@ -0,0 +1,502 @@
+ # Model: XGBoost for point predictions + LightGBM with MAPIE for conformalized intervals
+ from mapie.regression import ConformalizedQuantileRegressor
+ from lightgbm import LGBMRegressor
+ from xgboost import XGBRegressor
+ from sklearn.model_selection import train_test_split
+
+ # Model Performance Scores
+ from sklearn.metrics import (
+     mean_absolute_error,
+     r2_score,
+     root_mean_squared_error
+ )
+
+ from io import StringIO
+ import json
+ import argparse
+ import joblib
+ import os
+ import numpy as np
+ import pandas as pd
+ from typing import List, Tuple
+
+ # Template Placeholders
+ TEMPLATE_PARAMS = {
+     "target": "{{target_column}}",
+     "features": "{{feature_list}}",
+     "compressed_features": "{{compressed_features}}",
+     "train_all_data": "{{train_all_data}}"
+ }
+
+
+ # Function to check if dataframe is empty
+ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
+     """
+     Check if the provided dataframe is empty and raise an exception if it is.
+
+     Args:
+         df (pd.DataFrame): DataFrame to check
+         df_name (str): Name of the DataFrame
+     """
+     if df.empty:
+         msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
+         print(msg)
+         raise ValueError(msg)
+
+
+ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
+     """
+     Matches and renames DataFrame columns to match model feature names (case-insensitive).
+     Prioritizes exact matches, then case-insensitive matches.
+
+     Raises ValueError if any model features cannot be matched.
+     """
+     df_columns_lower = {col.lower(): col for col in df.columns}
+     rename_dict = {}
+     missing = []
+     for feature in model_features:
+         if feature in df.columns:
+             continue # Exact match
+         elif feature.lower() in df_columns_lower:
+             rename_dict[df_columns_lower[feature.lower()]] = feature
+         else:
+             missing.append(feature)
+
+     if missing:
+         raise ValueError(f"Features not found: {missing}")
+
+     # Rename the DataFrame columns to match the model features
+     return df.rename(columns=rename_dict)
+
+
+ def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
+     """
+     Converts appropriate columns to categorical type with consistent mappings.
+
+     Args:
+         df (pd.DataFrame): The DataFrame to process.
+         features (list): List of feature names to consider for conversion.
+         category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+             training mode. If populated, we're in inference mode.
+
+     Returns:
+         tuple: (processed DataFrame, category mappings dictionary)
+     """
+     # Training mode
+     if category_mappings == {}:
+         for col in df.select_dtypes(include=["object", "string"]):
+             if col in features and df[col].nunique() < 20:
+                 print(f"Training mode: Converting {col} to category")
+                 df[col] = df[col].astype("category")
+                 category_mappings[col] = df[col].cat.categories.tolist() # Store category mappings
+
+     # Inference mode
+     else:
+         for col, categories in category_mappings.items():
+             if col in df.columns:
+                 print(f"Inference mode: Applying categorical mapping for {col}")
+                 df[col] = pd.Categorical(df[col], categories=categories) # Apply consistent categorical mapping
+
+     return df, category_mappings
+
+
+ def decompress_features(
+     df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+     """Prepare features for the model by decompressing bitstring features
+
+     Args:
+         df (pd.DataFrame): The features DataFrame
+         features (List[str]): Full list of feature names
+         compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+     Returns:
+         pd.DataFrame: DataFrame with the decompressed features
+         List[str]: Updated list of feature names after decompression
+
+     Raises:
+         ValueError: If any missing values are found in the specified features
+     """
+
+     # Check for any missing values in the required features
+     missing_counts = df[features].isna().sum()
+     if missing_counts.any():
+         missing_features = missing_counts[missing_counts > 0]
+         print(
+             f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+             "WARNING: You might want to remove/replace all NaN values before processing."
+         )
+
+     # Decompress the specified compressed features
+     decompressed_features = features.copy()
+     for feature in compressed_features:
+         if (feature not in df.columns) or (feature not in features):
+             print(f"Feature '{feature}' not in the features list, skipping decompression.")
+             continue
+
+         # Remove the feature from the list of features to avoid duplication
+         decompressed_features.remove(feature)
+
+         # Handle all compressed features as bitstrings
+         bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+         prefix = feature[:3]
+
+         # Create all new columns at once - avoids fragmentation
+         new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+         new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+
+         # Add to features list
+         decompressed_features.extend(new_col_names)
+
+         # Drop original column and concatenate new ones
+         df = df.drop(columns=[feature])
+         df = pd.concat([df, new_df], axis=1)
+
+     return df, decompressed_features
+
+
+ if __name__ == "__main__":
+     # Template Parameters
+     target = TEMPLATE_PARAMS["target"]
+     features = TEMPLATE_PARAMS["features"]
+     orig_features = features.copy()
+     compressed_features = TEMPLATE_PARAMS["compressed_features"]
+     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+     validation_split = 0.2
+
+     # Script arguments for input/output directories
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+     parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+     parser.add_argument(
+         "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
+     )
+     args = parser.parse_args()
+
+     # Read the training data into DataFrames
+     training_files = [
+         os.path.join(args.train, file)
+         for file in os.listdir(args.train)
+         if file.endswith(".csv")
+     ]
+     print(f"Training Files: {training_files}")
+
+     # Combine files and read them all into a single pandas dataframe
+     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+     # Check if the dataframe is empty
+     check_dataframe(all_df, "training_df")
+
+     # Features/Target output
+     print(f"Target: {target}")
+     print(f"Features: {str(features)}")
+
+     # Convert any features that might be categorical to 'category' type
+     all_df, category_mappings = convert_categorical_types(all_df, features)
+
+     # If we have compressed features, decompress them
+     if compressed_features:
+         print(f"Decompressing features {compressed_features}...")
+         all_df, features = decompress_features(all_df, features, compressed_features)
+
+     # Do we want to train on all the data?
+     if train_all_data:
+         print("Training on ALL of the data")
+         df_train = all_df.copy()
+         df_val = all_df.copy()
+
+     # Does the dataframe have a training column?
+     elif "training" in all_df.columns:
+         print("Found training column, splitting data based on training column")
+         df_train = all_df[all_df["training"]]
+         df_val = all_df[~all_df["training"]]
+     else:
+         # Just do a random training Split
+         print("WARNING: No training column found, splitting data with random state=42")
+         df_train, df_val = train_test_split(
+             all_df, test_size=validation_split, random_state=42
+         )
+     print(f"FIT/TRAIN: {df_train.shape}")
+     print(f"VALIDATION: {df_val.shape}")
+
+     # Prepare features and targets for training
+     X_train = df_train[features]
+     X_validate = df_val[features]
+     y_train = df_train[target]
+     y_validate = df_val[target]
+
+     # Train XGBoost for point predictions
+     print("\nTraining XGBoost for point predictions...")
+     xgb_model = XGBRegressor(
+         n_estimators=1000,
+         max_depth=6,
+         learning_rate=0.01,
+         subsample=0.8,
+         colsample_bytree=0.8,
+         random_state=42,
+         verbosity=0
+     )
+     xgb_model.fit(X_train, y_train)
+
+     # Evaluate XGBoost performance
+     y_pred_xgb = xgb_model.predict(X_validate)
+     xgb_rmse = root_mean_squared_error(y_validate, y_pred_xgb)
+     xgb_mae = mean_absolute_error(y_validate, y_pred_xgb)
+     xgb_r2 = r2_score(y_validate, y_pred_xgb)
+
+     print(f"\nXGBoost Point Prediction Performance:")
+     print(f"RMSE: {xgb_rmse:.3f}")
+     print(f"MAE: {xgb_mae:.3f}")
+     print(f"R2: {xgb_r2:.3f}")
+
+     # Define confidence levels we want to model
+     confidence_levels = [0.50, 0.80, 0.90, 0.95] # 50%, 80%, 90%, 95% confidence intervals
+
+     # Store MAPIE models for each confidence level
+     mapie_models = {}
+
+     # Train models for each confidence level
+     for confidence_level in confidence_levels:
+         alpha = 1 - confidence_level
+         lower_q = alpha / 2
+         upper_q = 1 - alpha / 2
+
+         print(f"\nTraining quantile models for {confidence_level * 100:.0f}% confidence interval...")
+         print(f" Quantiles: {lower_q:.3f}, {upper_q:.3f}, 0.500")
+
+         # Train three models for this confidence level
+         quantile_estimators = []
+         for q in [lower_q, upper_q, 0.5]:
+             print(f" Training model for quantile {q:.3f}...")
+             est = LGBMRegressor(
+                 objective="quantile",
+                 alpha=q,
+                 n_estimators=1000,
+                 max_depth=6,
+                 learning_rate=0.01,
+                 num_leaves=31,
+                 min_child_samples=20,
+                 subsample=0.8,
+                 colsample_bytree=0.8,
+                 random_state=42,
+                 verbose=-1,
+                 force_col_wise=True
+             )
+             est.fit(X_train, y_train)
+             quantile_estimators.append(est)
+
+         # Create MAPIE CQR model for this confidence level
+         print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
+         mapie_model = ConformalizedQuantileRegressor(
+             quantile_estimators,
+             confidence_level=confidence_level,
+             prefit=True
+         )
+
+         # Conformalize the model
+         print(f" Conformalizing with validation data...")
+         mapie_model.conformalize(X_validate, y_validate)
+
+         # Store the model
+         mapie_models[f"mapie_{confidence_level:.2f}"] = mapie_model
+
+         # Validate coverage for this confidence level
+         y_pred, y_pis = mapie_model.predict_interval(X_validate)
+         coverage = np.mean((y_validate >= y_pis[:, 0, 0]) & (y_validate <= y_pis[:, 1, 0]))
+         print(f" Coverage: Target={confidence_level * 100:.0f}%, Empirical={coverage * 100:.1f}%")
+
+     print(f"\nOverall Model Performance Summary:")
+     print(f"XGBoost RMSE: {xgb_rmse:.3f}")
+     print(f"XGBoost MAE: {xgb_mae:.3f}")
+     print(f"XGBoost R2: {xgb_r2:.3f}")
+     print(f"NumRows: {len(df_val)}")
+
+     # Analyze interval widths across confidence levels
+     print(f"\nInterval Width Analysis:")
+     for conf_level in confidence_levels:
+         model = mapie_models[f"mapie_{conf_level:.2f}"]
+         _, y_pis = model.predict_interval(X_validate)
+         widths = y_pis[:, 1, 0] - y_pis[:, 0, 0]
+         print(f" {conf_level * 100:.0f}% CI: Mean width={np.mean(widths):.3f}, Std={np.std(widths):.3f}")
+
+     # Save the trained XGBoost model
+     xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
+
+     # Save all MAPIE models
+     for model_name, model in mapie_models.items():
+         joblib.dump(model, os.path.join(args.model_dir, f"{model_name}.joblib"))
+
+     # Save the feature list
+     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
+         json.dump(features, fp)
+
+     # Save category mappings if any
+     if category_mappings:
+         with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
+             json.dump(category_mappings, fp)
+
+     # Save model configuration
+     model_config = {
+         "model_type": "XGBoost_MAPIE_CQR_LightGBM",
+         "confidence_levels": confidence_levels,
+         "n_features": len(features),
+         "target": target,
+         "validation_metrics": {
+             "xgb_rmse": float(xgb_rmse),
+             "xgb_mae": float(xgb_mae),
+             "xgb_r2": float(xgb_r2),
+             "n_validation": len(df_val)
+         }
+     }
+     with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
+         json.dump(model_config, fp, indent=2)
+
+     print(f"\nModel training complete!")
+     print(f"Saved 1 XGBoost model and {len(mapie_models)} MAPIE models to {args.model_dir}")
+
+
+ #
+ # Inference Section
+ #
+ def model_fn(model_dir) -> dict:
+     """Load XGBoost and all MAPIE models from the specified directory."""
+
+     # Load model configuration to know which models to load
+     with open(os.path.join(model_dir, "model_config.json")) as fp:
+         config = json.load(fp)
+
+     # Load XGBoost regressor
+     xgb_path = os.path.join(model_dir, "xgb_model.json")
+     xgb_model = XGBRegressor(enable_categorical=True)
+     xgb_model.load_model(xgb_path)
+
+     # Load all MAPIE models
+     mapie_models = {}
+     for conf_level in config["confidence_levels"]:
+         model_name = f"mapie_{conf_level:.2f}"
+         mapie_models[model_name] = joblib.load(os.path.join(model_dir, f"{model_name}.joblib"))
+
+     # Load category mappings if they exist
+     category_mappings = {}
+     category_path = os.path.join(model_dir, "category_mappings.json")
+     if os.path.exists(category_path):
+         with open(category_path) as fp:
+             category_mappings = json.load(fp)
+
+     return {
+         "xgb_model": xgb_model,
+         "mapie_models": mapie_models,
+         "confidence_levels": config["confidence_levels"],
+         "category_mappings": category_mappings
+     }
+
+
+ def input_fn(input_data, content_type):
+     """Parse input data and return a DataFrame."""
+     if not input_data:
+         raise ValueError("Empty input data is not supported!")
+
+     # Decode bytes to string if necessary
+     if isinstance(input_data, bytes):
+         input_data = input_data.decode("utf-8")
+
+     if "text/csv" in content_type:
+         return pd.read_csv(StringIO(input_data))
+     elif "application/json" in content_type:
+         return pd.DataFrame(json.loads(input_data))
+     else:
+         raise ValueError(f"{content_type} not supported!")
+
+
+ def output_fn(output_df, accept_type):
+     """Supports both CSV and JSON output formats."""
+     if "text/csv" in accept_type:
+         # Convert categorical columns to string to avoid fillna issues
+         for col in output_df.select_dtypes(include=['category']).columns:
+             output_df[col] = output_df[col].astype(str)
+         csv_output = output_df.fillna("N/A").to_csv(index=False)
+         return csv_output, "text/csv"
+     elif "application/json" in accept_type:
+         return output_df.to_json(orient="records"), "application/json"
+     else:
+         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
+
+
+ def predict_fn(df, models) -> pd.DataFrame:
+     """Make predictions using XGBoost for point estimates and MAPIE for conformalized intervals
+
+     Args:
+         df (pd.DataFrame): The input DataFrame
+         models (dict): Dictionary containing XGBoost and MAPIE models
+
+     Returns:
+         pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
+     """
+
+     # Grab our feature columns (from training)
+     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
+         model_features = json.load(fp)
+
+     # Match features in a case-insensitive manner
+     matched_df = match_features_case_insensitive(df, model_features)
+
+     # Apply categorical mappings if they exist
+     if models.get("category_mappings"):
+         matched_df, _ = convert_categorical_types(
+             matched_df,
+             model_features,
+             models["category_mappings"]
+         )
+
+     # Get features for prediction
+     X = matched_df[model_features]
+
+     # Get XGBoost point predictions
+     df["prediction"] = models["xgb_model"].predict(X)
+
+     # Get predictions from each MAPIE model for conformalized intervals
+     for conf_level in models["confidence_levels"]:
+         model_name = f"mapie_{conf_level:.2f}"
+         model = models["mapie_models"][model_name]
+
+         # Get conformalized predictions
+         y_pred, y_pis = model.predict_interval(X)
+
+         # Map confidence levels to quantile names
+         if conf_level == 0.50: # 50% CI
+             df["q_25"] = y_pis[:, 0, 0]
+             df["q_75"] = y_pis[:, 1, 0]
+         elif conf_level == 0.80: # 80% CI
+             df["q_10"] = y_pis[:, 0, 0]
+             df["q_90"] = y_pis[:, 1, 0]
+         elif conf_level == 0.90: # 90% CI
+             df["q_05"] = y_pis[:, 0, 0]
+             df["q_95"] = y_pis[:, 1, 0]
+         elif conf_level == 0.95: # 95% CI
+             df["q_025"] = y_pis[:, 0, 0]
+             df["q_975"] = y_pis[:, 1, 0]
+
+     # Add median (q_50) from XGBoost prediction
+     df["q_50"] = df["prediction"]
+
+     # Calculate uncertainty metrics based on 95% interval
+     interval_width = df["q_975"] - df["q_025"]
+     df["prediction_std"] = interval_width / 3.92
+
+     # Reorder the quantile columns for easier reading
+     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+     other_cols = [col for col in df.columns if col not in quantile_cols]
+     df = df[other_cols + quantile_cols]
+
+     # Uncertainty score
+     df["uncertainty_score"] = interval_width / (np.abs(df["prediction"]) + 1e-6)
+
+     # Confidence bands
+     df["confidence_band"] = pd.cut(
+         df["uncertainty_score"],
+         bins=[0, 0.5, 1.0, 2.0, np.inf],
+         labels=["high", "medium", "low", "very_low"]
+     )
+
+     return df
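
A note for readers of the new template: the repeated y_pis[:, 0, 0] / y_pis[:, 1, 0] indexing and the interval_width / 3.92 conversion in predict_fn are easy to misread. Here is a minimal standalone sketch with made-up numbers; the array shape follows the (n_samples, 2, n_levels) convention the template itself relies on, and the values and names are illustrative, not package code:

import numpy as np

# Hypothetical output of mapie_model.predict_interval(X): shape (n_samples, 2, n_levels)
# axis 1 holds [lower, upper]; axis 2 indexes the confidence level(s) of that model.
y_pis = np.array([[[1.2], [3.0]],
                  [[0.5], [1.1]]])

lower, upper = y_pis[:, 0, 0], y_pis[:, 1, 0]  # same indexing as predict_fn above
width = upper - lower

# Normal approximation behind prediction_std: a 95% interval spans about 2 * 1.96 = 3.92 sigma
prediction_std = width / 3.92
print(prediction_std)  # [0.45918367 0.15306122]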
workbench/model_scripts/custom_models/uq_models/meta_uq.template
@@ -1,6 +1,7 @@
  # Model: NGBoost Regressor with Distribution output
  from ngboost import NGBRegressor
- from xgboost import XGBRegressor # Base Estimator
+ from ngboost.distns import Cauchy
+ from xgboost import XGBRegressor # Point Estimator
  from sklearn.model_selection import train_test_split

  # Model Performance Scores
@@ -106,8 +107,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
      return df, category_mappings


- def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
-     """Prepare features for the XGBoost model
+ def decompress_features(
+     df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+     """Prepare features for the model by decompressing bitstring features

      Args:
          df (pd.DataFrame): The features DataFrame
@@ -132,7 +135,7 @@ def decompress_features(df: pd.DataFrame, features: List[str], compressed_featur
          )

      # Decompress the specified compressed features
-     decompressed_features = features
+     decompressed_features = features.copy()
      for feature in compressed_features:
          if (feature not in df.columns) or (feature not in features):
              print(f"Feature '{feature}' not in the features list, skipping decompression.")
@@ -227,7 +230,7 @@ if __name__ == "__main__":

      # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
      xgb_model = XGBRegressor()
-     ngb_model = NGBRegressor()
+     ngb_model = NGBRegressor() # Dist=Cauchy) Seems to give HUGE prediction intervals
  
      # Prepare features and targets for training
      X_train = df_train[features]
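
The new Cauchy import and the trailing comment suggest a heavy-tailed output distribution was tried and then backed out. With NGBoost the output distribution is selected via the Dist argument; a sketch of what that toggle looks like (the template itself keeps the default, as shown above):

from ngboost import NGBRegressor
from ngboost.distns import Cauchy

ngb_default = NGBRegressor()            # default Normal output distribution
ngb_cauchy = NGBRegressor(Dist=Cauchy)  # heavy tails, typically much wider prediction intervals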
workbench/model_scripts/custom_models/uq_models/requirements.txt
@@ -1,3 +1 @@
- # Note: NGBoost is not included in the default inference image, so it must be specified here.
- ngboost
- mapie
+ # Note: Most libs are already in the training/inference images, ONLY specify additional libs here
workbench/model_scripts/script_generation.py
@@ -70,6 +70,11 @@ def fill_template(template_path: str, params: dict, output_script: str) -> str:
      # Sanity check to ensure all placeholders were replaced
      if "{{" in template and "}}" in template:
          msg = "Not all template placeholders were replaced. Please check your params."
+
+         # Show which placeholders are still present
+         start = template.index("{{")
+         end = template.index("}}", start) + 2
+         msg += f" Unreplaced placeholder: {template[start:end]}"
          log.critical(msg)
          raise ValueError(msg)

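The five added lines in fill_template report which placeholder was missed instead of only saying that one was. Run standalone against a hypothetical template string (illustrative, not package code), the same logic behaves like this:

template = 'features = "{{feature_list}}"'

if "{{" in template and "}}" in template:
    msg = "Not all template placeholders were replaced. Please check your params."
    start = template.index("{{")
    end = template.index("}}", start) + 2
    msg += f" Unreplaced placeholder: {template[start:end]}"
    print(msg)  # ... Unreplaced placeholder: {{feature_list}}
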
workbench/model_scripts/xgb_model/generated_model_script.py
@@ -28,12 +28,12 @@ from typing import List, Tuple

  # Template Parameters
  TEMPLATE_PARAMS = {
-     "model_type": "regressor",
-     "target": "udm_asy_res_intrinsic_clearance_ul_per_min_per_mg_protein",
- "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v'],
+     "model_type": "classifier",
+     "target": "class",
+ "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 
'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
      "compressed_features": [],
-     "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/temp-hlm-phase1-reg-0-80/training",
-     "train_all_data": False
+     "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/sol-class-f1-100/training",
+     "train_all_data": True
  }

  # Function to check if dataframe is empty
workbench/repl/workbench_shell.py
@@ -41,7 +41,7 @@ from workbench.cached.cached_meta import CachedMeta
  try:
      import rdkit # noqa
      import mordred # noqa
-     from workbench.utils import chem_utils
+     from workbench.utils.chem_utils import vis

      HAVE_CHEM_UTILS = True
  except ImportError:
@@ -178,12 +178,12 @@ class WorkbenchShell:

          # Add cheminformatics utils if available
          if HAVE_CHEM_UTILS:
-             self.commands["show"] = chem_utils.show
+             self.commands["show"] = vis.show

      def start(self):
          """Start the Workbench IPython shell"""
          cprint("magenta", "\nWelcome to Workbench!")
-         if self.aws_status is False:
+         if not self.aws_status:
              cprint("red", "AWS Account Connection Failed...Review/Fix the Workbench Config:")
              cprint("red", f"Path: {self.cm.site_config_path}")
              self.show_config()
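
One behavioral nuance in the workbench_shell.py change: "if not self.aws_status:" treats every falsy value the same, whereas the old "is False" check only matched an explicit False. Illustrative values (not package code):

for aws_status in (False, None, 0):
    old = aws_status is False  # only an explicit False triggered the warning path
    new = not aws_status       # any falsy value (False, None, 0, "") now triggers it
    print(aws_status, old, new)
# False True True
# None False True
# 0 False True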