workbench 0.8.177__py3-none-any.whl → 0.8.227__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of workbench might be problematic. Click here for more details.

Files changed (140) hide show
  1. workbench/__init__.py +1 -0
  2. workbench/algorithms/dataframe/__init__.py +1 -2
  3. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  4. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  5. workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
  6. workbench/algorithms/dataframe/projection_2d.py +44 -21
  7. workbench/algorithms/dataframe/proximity.py +259 -305
  8. workbench/algorithms/graph/light/proximity_graph.py +12 -11
  9. workbench/algorithms/models/cleanlab_model.py +382 -0
  10. workbench/algorithms/models/noise_model.py +388 -0
  11. workbench/algorithms/sql/column_stats.py +0 -1
  12. workbench/algorithms/sql/correlations.py +0 -1
  13. workbench/algorithms/sql/descriptive_stats.py +0 -1
  14. workbench/algorithms/sql/outliers.py +3 -3
  15. workbench/api/__init__.py +5 -1
  16. workbench/api/df_store.py +17 -108
  17. workbench/api/endpoint.py +14 -12
  18. workbench/api/feature_set.py +117 -11
  19. workbench/api/meta.py +0 -1
  20. workbench/api/meta_model.py +289 -0
  21. workbench/api/model.py +52 -21
  22. workbench/api/parameter_store.py +3 -52
  23. workbench/cached/cached_meta.py +0 -1
  24. workbench/cached/cached_model.py +49 -11
  25. workbench/core/artifacts/__init__.py +11 -2
  26. workbench/core/artifacts/artifact.py +5 -5
  27. workbench/core/artifacts/df_store_core.py +114 -0
  28. workbench/core/artifacts/endpoint_core.py +319 -204
  29. workbench/core/artifacts/feature_set_core.py +249 -45
  30. workbench/core/artifacts/model_core.py +135 -82
  31. workbench/core/artifacts/parameter_store_core.py +98 -0
  32. workbench/core/cloud_platform/cloud_meta.py +0 -1
  33. workbench/core/pipelines/pipeline_executor.py +1 -1
  34. workbench/core/transforms/features_to_model/features_to_model.py +60 -44
  35. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +43 -10
  36. workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
  37. workbench/core/views/training_view.py +113 -42
  38. workbench/core/views/view.py +53 -3
  39. workbench/core/views/view_utils.py +4 -4
  40. workbench/model_script_utils/model_script_utils.py +339 -0
  41. workbench/model_script_utils/pytorch_utils.py +405 -0
  42. workbench/model_script_utils/uq_harness.py +277 -0
  43. workbench/model_scripts/chemprop/chemprop.template +774 -0
  44. workbench/model_scripts/chemprop/generated_model_script.py +774 -0
  45. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  46. workbench/model_scripts/chemprop/requirements.txt +3 -0
  47. workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
  48. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +0 -1
  49. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
  50. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -2
  51. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  52. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
  53. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  54. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  55. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  56. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  57. workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -16
  58. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  59. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  60. workbench/model_scripts/meta_model/meta_model.template +209 -0
  61. workbench/model_scripts/pytorch_model/generated_model_script.py +443 -499
  62. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  63. workbench/model_scripts/pytorch_model/pytorch.template +440 -496
  64. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  65. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  66. workbench/model_scripts/pytorch_model/uq_harness.py +277 -0
  67. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  68. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  69. workbench/model_scripts/script_generation.py +15 -12
  70. workbench/model_scripts/uq_models/generated_model_script.py +248 -0
  71. workbench/model_scripts/xgb_model/generated_model_script.py +371 -403
  72. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  73. workbench/model_scripts/xgb_model/uq_harness.py +277 -0
  74. workbench/model_scripts/xgb_model/xgb_model.template +367 -399
  75. workbench/repl/workbench_shell.py +18 -14
  76. workbench/resources/open_source_api.key +1 -1
  77. workbench/scripts/endpoint_test.py +162 -0
  78. workbench/scripts/lambda_test.py +73 -0
  79. workbench/scripts/meta_model_sim.py +35 -0
  80. workbench/scripts/ml_pipeline_sqs.py +122 -6
  81. workbench/scripts/training_test.py +85 -0
  82. workbench/themes/dark/custom.css +59 -0
  83. workbench/themes/dark/plotly.json +5 -5
  84. workbench/themes/light/custom.css +153 -40
  85. workbench/themes/light/plotly.json +9 -9
  86. workbench/themes/midnight_blue/custom.css +59 -0
  87. workbench/utils/aws_utils.py +0 -1
  88. workbench/utils/chem_utils/fingerprints.py +87 -46
  89. workbench/utils/chem_utils/mol_descriptors.py +0 -1
  90. workbench/utils/chem_utils/projections.py +16 -6
  91. workbench/utils/chem_utils/vis.py +25 -27
  92. workbench/utils/chemprop_utils.py +141 -0
  93. workbench/utils/config_manager.py +2 -6
  94. workbench/utils/endpoint_utils.py +5 -7
  95. workbench/utils/license_manager.py +2 -6
  96. workbench/utils/markdown_utils.py +57 -0
  97. workbench/utils/meta_model_simulator.py +499 -0
  98. workbench/utils/metrics_utils.py +256 -0
  99. workbench/utils/model_utils.py +260 -76
  100. workbench/utils/pipeline_utils.py +0 -1
  101. workbench/utils/plot_utils.py +159 -34
  102. workbench/utils/pytorch_utils.py +87 -0
  103. workbench/utils/shap_utils.py +11 -57
  104. workbench/utils/theme_manager.py +95 -30
  105. workbench/utils/xgboost_local_crossfold.py +267 -0
  106. workbench/utils/xgboost_model_utils.py +127 -220
  107. workbench/web_interface/components/experiments/outlier_plot.py +0 -1
  108. workbench/web_interface/components/model_plot.py +16 -2
  109. workbench/web_interface/components/plugin_unit_test.py +5 -3
  110. workbench/web_interface/components/plugins/ag_table.py +2 -4
  111. workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
  112. workbench/web_interface/components/plugins/model_details.py +48 -80
  113. workbench/web_interface/components/plugins/scatter_plot.py +192 -92
  114. workbench/web_interface/components/settings_menu.py +184 -0
  115. workbench/web_interface/page_views/main_page.py +0 -1
  116. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/METADATA +31 -17
  117. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/RECORD +121 -106
  118. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/entry_points.txt +4 -0
  119. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/licenses/LICENSE +1 -1
  120. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  121. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
  122. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  123. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  124. workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
  125. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -494
  126. workbench/model_scripts/custom_models/uq_models/mapie.template +0 -494
  127. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -386
  128. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
  129. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  130. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  131. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  132. workbench/themes/quartz/base_css.url +0 -1
  133. workbench/themes/quartz/custom.css +0 -117
  134. workbench/themes/quartz/plotly.json +0 -642
  135. workbench/themes/quartz_dark/base_css.url +0 -1
  136. workbench/themes/quartz_dark/custom.css +0 -131
  137. workbench/themes/quartz_dark/plotly.json +0 -642
  138. workbench/utils/resource_utils.py +0 -39
  139. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/WHEEL +0 -0
  140. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,339 @@
1
+ """Shared utility functions for model training scripts (templates).
2
+
3
+ These functions are used across multiple model templates (XGBoost, PyTorch, ChemProp)
4
+ to reduce code duplication and ensure consistent behavior.
5
+ """
6
+
7
+ from io import StringIO
8
+ import json
9
+ import numpy as np
10
+ import pandas as pd
11
+ from sklearn.metrics import (
12
+ confusion_matrix,
13
+ mean_absolute_error,
14
+ median_absolute_error,
15
+ precision_recall_fscore_support,
16
+ r2_score,
17
+ root_mean_squared_error,
18
+ )
19
+ from scipy.stats import spearmanr
20
+
21
+
22
def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
    """Abort early when a DataFrame that is about to be used has no data.

    Args:
        df: DataFrame to validate
        df_name: Label for the DataFrame, interpolated into the error message

    Raises:
        ValueError: If ``df.empty`` is true
    """
    if not df.empty:
        return

    # The message is printed as well as raised, so it shows up on stdout too
    error_text = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
    print(error_text)
    raise ValueError(error_text)
36
+
37
+
38
def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFrame:
    """Expand the "pred_proba" list column into one column per class label.

    Rows whose "pred_proba" entry is missing (``None``, a scalar NaN, or ``pd.NA``)
    are expanded into a row of NaNs, so rows without a prediction survive.

    Args:
        df: DataFrame containing a "pred_proba" column
        class_labels: List of class labels (one output column per label)

    Returns:
        DataFrame with "pred_proba" replaced by "<label>_proba" columns. The index
        of the returned DataFrame is reset to a fresh RangeIndex.

    Raises:
        ValueError: If DataFrame does not contain a "pred_proba" column
    """
    proba_column = "pred_proba"
    if proba_column not in df.columns:
        raise ValueError('DataFrame does not contain a "pred_proba" column')

    proba_splits = [f"{label}_proba" for label in class_labels]
    nan_row = [np.nan] * len(class_labels)

    def _is_missing(val) -> bool:
        # Only scalars can be "missing"; lists/arrays of probabilities never are.
        # np.float64 subclasses float, so NaNs produced by pandas are covered too.
        return val is None or val is pd.NA or (isinstance(val, float) and np.isnan(val))

    proba_values = [nan_row if _is_missing(val) else val for val in df[proba_column]]
    proba_df = pd.DataFrame(proba_values, columns=proba_splits)

    # Drop any existing proba columns and reset the index so the positional
    # concat below lines up row-for-row with proba_df
    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
    df = df.reset_index(drop=True)
    return pd.concat([df, proba_df], axis=1)
75
+
76
+
77
def match_features_case_insensitive(df: pd.DataFrame, model_features: list[str]) -> pd.DataFrame:
    """Rename DataFrame columns so they match the model's feature names.

    Exact matches are left untouched; otherwise a column whose name differs only
    by case is renamed to the model's spelling.

    Args:
        df: Input DataFrame
        model_features: List of feature names expected by the model

    Returns:
        DataFrame with columns renamed to match model features

    Raises:
        ValueError: If any model features cannot be matched
    """
    # Only string labels can take part in a case-insensitive match; skipping the
    # rest keeps frames with integer (or other non-string) labels from crashing.
    df_columns_lower = {col.lower(): col for col in df.columns if isinstance(col, str)}

    rename_dict = {}
    missing = []
    for feature in model_features:
        if feature in df.columns:
            continue  # Exact match, nothing to rename
        lowered = feature.lower()
        if lowered in df_columns_lower:
            rename_dict[df_columns_lower[lowered]] = feature
        else:
            missing.append(feature)

    if missing:
        raise ValueError(f"Features not found: {missing}")

    return df.rename(columns=rename_dict)
107
+
108
+
109
+ def convert_categorical_types(
110
+ df: pd.DataFrame, features: list[str], category_mappings: dict[str, list[str]] | None = None
111
+ ) -> tuple[pd.DataFrame, dict[str, list[str]]]:
112
+ """Converts appropriate columns to categorical type with consistent mappings.
113
+
114
+ In training mode (category_mappings is None or empty), detects object/string columns
115
+ with <20 unique values and converts them to categorical.
116
+ In inference mode (category_mappings provided), applies the stored mappings.
117
+
118
+ Args:
119
+ df: The DataFrame to process
120
+ features: List of feature names to consider for conversion
121
+ category_mappings: Existing category mappings. If None or empty, training mode.
122
+ If populated, inference mode.
123
+
124
+ Returns:
125
+ Tuple of (processed DataFrame, category mappings dictionary)
126
+ """
127
+ if category_mappings is None:
128
+ category_mappings = {}
129
+
130
+ # Training mode
131
+ if not category_mappings:
132
+ for col in df.select_dtypes(include=["object", "string"]):
133
+ if col in features and df[col].nunique() < 20:
134
+ print(f"Training mode: Converting {col} to category")
135
+ df[col] = df[col].astype("category")
136
+ category_mappings[col] = df[col].cat.categories.tolist()
137
+
138
+ # Inference mode
139
+ else:
140
+ for col, categories in category_mappings.items():
141
+ if col in df.columns:
142
+ print(f"Inference mode: Applying categorical mapping for {col}")
143
+ df[col] = pd.Categorical(df[col], categories=categories)
144
+
145
+ return df, category_mappings
146
+
147
+
148
def decompress_features(
    df: pd.DataFrame, features: list[str], compressed_features: list[str]
) -> tuple[pd.DataFrame, list[str]]:
    """Explode packed feature strings into one uint8 column per position.

    The packing format is detected per feature from its first non-null value:
      - contains a comma -> comma-separated integers ("0,3,0,1,5,...")
      - otherwise        -> one character per column ("10110010...")

    New columns are named "<first three characters of the feature>_<position>".

    Args:
        df: The features DataFrame
        features: Full list of feature names
        compressed_features: List of feature names to decompress

    Returns:
        Tuple of (DataFrame with decompressed features, updated feature list)
    """
    # Missing values are only reported here, not repaired
    nan_counts = df[features].isna().sum()
    if nan_counts.any():
        with_nans = nan_counts[nan_counts > 0]
        print(
            f"WARNING: Found missing values in features: {with_nans.to_dict()}. "
            "WARNING: You might want to remove/replace all NaN values before processing."
        )

    def _split_counts(text):
        return [int(token) for token in text.split(",")]

    # Work on a copy so the caller's feature list is left alone
    remaining = features.copy()

    for feature in compressed_features:
        if not (feature in df.columns and feature in remaining):
            print(f"Feature '{feature}' not in the features list, skipping decompression.")
            continue

        # The packed column is replaced by its expanded columns
        remaining.remove(feature)

        # Sniff the format from the first non-null value
        non_null = df[feature].dropna()
        sample = "" if non_null.empty else str(non_null.iloc[0])
        parser = _split_counts if "," in sample else list
        matrix = np.array([parser(cell) for cell in df[feature]], dtype=np.uint8)

        stem = feature[:3]
        expanded_names = [f"{stem}_{position}" for position in range(matrix.shape[1])]
        expanded = pd.DataFrame(matrix, columns=expanded_names, index=df.index)

        remaining.extend(expanded_names)
        df = pd.concat([df.drop(columns=[feature]), expanded], axis=1)

    return df, remaining
201
+
202
+
203
def input_fn(input_data, content_type: str) -> pd.DataFrame:
    """Turn a raw request payload into a DataFrame.

    Args:
        input_data: Raw payload (bytes are decoded as UTF-8; strings are used as-is)
        content_type: MIME type of the payload; matched by substring, so values
            carrying extra parameters (e.g. a charset) are accepted

    Returns:
        Parsed DataFrame

    Raises:
        ValueError: If the payload is empty or the content type is unsupported
    """
    if not input_data:
        raise ValueError("Empty input data is not supported!")

    payload = input_data.decode("utf-8") if isinstance(input_data, bytes) else input_data

    if "text/csv" in content_type:
        return pd.read_csv(StringIO(payload))
    if "application/json" in content_type:
        return pd.DataFrame(json.loads(payload))
    raise ValueError(f"{content_type} not supported!")
228
+
229
+
230
def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
    """Serialize the output DataFrame in the requested format.

    Args:
        output_df: DataFrame to convert
        accept_type: Requested MIME type (matched by substring)

    Returns:
        Tuple of (formatted output string, MIME type)

    Raises:
        RuntimeError: If accept_type is not supported
    """
    if "text/csv" in accept_type:
        # Missing values are written as "N/A" by the CSV writer itself. Filling
        # the frame first (fillna("N/A")) fails on categorical columns whose
        # categories do not include "N/A", e.g. columns built by
        # convert_categorical_types when an unseen category shows up.
        csv_output = output_df.to_csv(index=False, na_rep="N/A")
        return csv_output, "text/csv"
    if "application/json" in accept_type:
        return output_df.to_json(orient="records"), "application/json"
    raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
250
+
251
+
252
def compute_regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict[str, float]:
    """Score regression predictions against ground truth.

    Args:
        y_true: Ground truth target values
        y_pred: Predicted values

    Returns:
        Dictionary with keys: rmse, mae, medae, r2, spearmanr, support
        (support is the number of ground-truth values)
    """
    rmse = root_mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    medae = median_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    rank_correlation = spearmanr(y_true, y_pred).correlation
    support = len(y_true)

    return {
        "rmse": rmse,
        "mae": mae,
        "medae": medae,
        "r2": r2,
        "spearmanr": rank_correlation,
        "support": support,
    }
270
+
271
+
272
def print_regression_metrics(metrics: dict[str, float]) -> None:
    """Emit regression metrics as "name: value" lines on stdout.

    The float metrics are written with three decimals; support is written as-is.

    Args:
        metrics: Dictionary of metric name -> value (must contain rmse, mae,
            medae, r2, spearmanr and support)
    """
    for key in ("rmse", "mae", "medae", "r2", "spearmanr"):
        print(f"{key}: {metrics[key]:.3f}")
    print(f"support: {metrics['support']}")
284
+
285
+
286
def compute_classification_metrics(
    y_true: np.ndarray, y_pred: np.ndarray, label_names: list[str], target_col: str
) -> pd.DataFrame:
    """Build a per-class table of precision, recall, F1 and support.

    Args:
        y_true: Ground truth labels
        y_pred: Predicted labels
        label_names: Class labels to score, in the order the rows should appear
        target_col: Column name under which the class labels are stored

    Returns:
        DataFrame with columns: target_col, precision, recall, f1, support
    """
    # average=None requests one value per label rather than an aggregate
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, average=None, labels=label_names
    )
    per_class = {
        target_col: label_names,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "support": support,
    }
    return pd.DataFrame(per_class)
310
+
311
+
312
def print_classification_metrics(score_df: pd.DataFrame, target_col: str, label_names: list[str]) -> None:
    """Emit per-class metrics as "Metrics:<label>:<metric> <value>" lines on stdout.

    Args:
        score_df: DataFrame from compute_classification_metrics
        target_col: Name of the column holding the class labels
        label_names: Class labels to report, in output order
    """
    for label in label_names:
        # Rows belonging to this class; the first matching row is reported
        row_mask = score_df[target_col] == label
        for metric in ("precision", "recall", "f1", "support"):
            print(f"Metrics:{label}:{metric} {score_df.loc[row_mask, metric].iloc[0]}")
325
+
326
+
327
def print_confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray, label_names: list[str]) -> None:
    """Emit the confusion matrix as "ConfusionMatrix:<row>:<col> <count>" lines on stdout.

    Args:
        y_true: Ground truth labels
        y_pred: Predicted labels
        label_names: Class labels; fixes the row/column order of the matrix
    """
    matrix = confusion_matrix(y_true, y_pred, labels=label_names)
    for row_name, matrix_row in zip(label_names, matrix):
        for col_name, value in zip(label_names, matrix_row):
            print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
@@ -0,0 +1,3 @@
1
+ # Requirements for ChemProp model scripts
2
+ # Note: The training and inference images already have torch and chemprop installed.
3
+ # So we only need to install packages that are not already included in the images.
@@ -0,0 +1,175 @@
1
+ """Molecular fingerprint computation utilities for ADMET modeling.
2
+
3
+ This module provides Morgan count fingerprints, the standard for ADMET prediction.
4
+ Count fingerprints outperform binary fingerprints for molecular property prediction.
5
+
6
+ References:
7
+ - Count vs Binary: https://pubs.acs.org/doi/10.1021/acs.est.3c02198
8
+ - ECFP/Morgan: https://pubs.acs.org/doi/10.1021/ci100050t
9
+ """
10
+
11
+ import logging
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ from rdkit import Chem, RDLogger
16
+ from rdkit.Chem import AllChem
17
+ from rdkit.Chem.MolStandardize import rdMolStandardize
18
+
19
+ # Suppress RDKit warnings (e.g., "not removing hydrogen atom without neighbors")
20
+ # Keep errors enabled so we see actual problems
21
+ RDLogger.DisableLog("rdApp.warning")
22
+
23
+ # Set up the logger
24
+ log = logging.getLogger("workbench")
25
+
26
+
27
def compute_morgan_fingerprints(df: pd.DataFrame, radius: int = 2, n_bits: int = 2048) -> pd.DataFrame:
    """Compute hashed Morgan count fingerprints for the molecules in a DataFrame.

    Each fingerprint position holds the count reported for that hashed position,
    clamped to the uint8 range (0-255). Fingerprints are computed on the largest
    fragment of each molecule.

    Args:
        df: Input DataFrame containing a SMILES column (name matched case-insensitively).
        radius: Radius for the Morgan fingerprint (default 2).
        n_bits: Length of the hashed fingerprint (default 2048).

    Returns:
        pd.DataFrame: DataFrame with a 'fingerprint' column added. Values are
            comma-separated counts (pd.NA where no molecule is available).
            When molecules are built from SMILES, rows whose SMILES cannot be
            parsed are dropped and a copy is returned.

    Raises:
        ValueError: If no SMILES column is present.

    Note:
        Count vs binary fingerprints: https://pubs.acs.org/doi/10.1021/acs.est.3c02198
    """
    # Check for the SMILES column (case-insensitive)
    smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
    if smiles_column is None:
        raise ValueError("Input DataFrame must have a 'smiles' column")

    # Sanity check the molecule column (sometimes it gets serialized, which doesn't work)
    if "molecule" in df.columns and df["molecule"].dtype == "string":
        log.warning("Detected serialized molecules in 'molecule' column. Removing...")
        del df["molecule"]

    # Convert SMILES to RDKit molecule objects
    # NOTE(review): the 'molecule' column is written onto the frame that was passed
    # in before the copy below is taken -- confirm callers do not depend on that.
    delete_mol_column = False
    if "molecule" not in df.columns:
        log.info("Converting SMILES to RDKit Molecules...")
        delete_mol_column = True
        df["molecule"] = df[smiles_column].apply(Chem.MolFromSmiles)

        # Report, then drop, the rows whose SMILES could not be parsed
        failed_smiles = df[df["molecule"].isnull()][smiles_column].tolist()
        if failed_smiles:
            log.warning(f"Failed to convert {len(failed_smiles)} SMILES to molecules ({failed_smiles})")
        df = df.dropna(subset=["molecule"]).copy()

    # If we have fragments in our compounds, get the largest fragment before computing
    # fingerprints. The chooser is built once and reused for every row.
    fragment_chooser = rdMolStandardize.LargestFragmentChooser()
    largest_frags = df["molecule"].apply(lambda mol: fragment_chooser.choose(mol) if mol else None)

    def mol_to_count_string(mol):
        """Convert molecule to comma-separated count fingerprint string."""
        if mol is None:
            return pd.NA

        # Get hashed Morgan fingerprint with counts
        fp = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=n_bits)

        # Initialize array and populate with counts (clamped to uint8 range)
        counts = np.zeros(n_bits, dtype=np.uint8)
        for idx, count in fp.GetNonzeroElements().items():
            counts[idx] = min(count, 255)

        # Return as comma-separated string
        return ",".join(map(str, counts))

    # Compute the fingerprints and attach them to the DataFrame
    df["fingerprint"] = largest_frags.apply(mol_to_count_string)

    # Drop the intermediate 'molecule' column if it was added here
    if delete_mol_column:
        del df["molecule"]

    return df
102
+
103
+
104
if __name__ == "__main__":
    # Smoke tests: exercise compute_morgan_fingerprints and print a summary of the results
    print("Running Morgan count fingerprint tests...")

    def _counts_of(result_row):
        """Return the parsed counts for a result row, or None when the fingerprint is missing."""
        fp_value = result_row.get("fingerprint", "N/A")
        if pd.notna(fp_value):
            return [int(token) for token in fp_value.split(",")]
        return None

    # Test molecules
    test_molecules = {
        "aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
        "caffeine": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
        "glucose": "C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O",  # With stereochemistry
        "sodium_acetate": "CC(=O)[O-].[Na+]",  # Salt (largest fragment used)
        "benzene": "c1ccccc1",
        "butene_e": "C/C=C/C",  # E-butene
        "butene_z": "C/C=C\\C",  # Z-butene
    }
    test_df = pd.DataFrame({"SMILES": list(test_molecules.values()), "name": list(test_molecules.keys())})

    # Test 1: Morgan Count Fingerprints (default parameters)
    print("\n1. Testing Morgan fingerprint generation (radius=2, n_bits=2048)...")
    fp_df = compute_morgan_fingerprints(test_df.copy())

    print(" Fingerprint generation results:")
    for _, row in fp_df.iterrows():
        counts = _counts_of(row)
        if counts is None:
            print(f" {row['name']:15} → N/A")
            continue
        non_zero = sum(1 for c in counts if c > 0)
        max_count = max(counts)
        print(f" {row['name']:15} → {len(counts)} features, {non_zero} non-zero, max={max_count}")

    # Test 2: Different parameters
    print("\n2. Testing with different parameters (radius=3, n_bits=1024)...")
    fp_df_custom = compute_morgan_fingerprints(test_df.copy(), radius=3, n_bits=1024)

    for _, row in fp_df_custom.iterrows():
        counts = _counts_of(row)
        if counts is None:
            print(f" {row['name']:15} → N/A")
            continue
        non_zero = sum(1 for c in counts if c > 0)
        print(f" {row['name']:15} → {len(counts)} features, {non_zero} non-zero")

    # Test 3: Edge cases
    print("\n3. Testing edge cases...")

    # Invalid SMILES
    invalid_df = pd.DataFrame({"SMILES": ["INVALID", ""]})
    fp_invalid = compute_morgan_fingerprints(invalid_df.copy())
    print(f" ✓ Invalid SMILES handled: {len(fp_invalid)} rows returned")

    # Pre-existing molecule column
    mol_df = test_df.copy()
    mol_df["molecule"] = mol_df["SMILES"].apply(Chem.MolFromSmiles)
    fp_with_mol = compute_morgan_fingerprints(mol_df)
    print(f" ✓ Pre-existing molecule column handled: {len(fp_with_mol)} fingerprints generated")

    # Test 4: Verify count values are reasonable (uses the Test 1 results)
    print("\n4. Verifying count distribution...")
    all_counts = []
    for _, row in fp_df.iterrows():
        counts = _counts_of(row)
        if counts is not None:
            all_counts.extend(c for c in counts if c > 0)

    if all_counts:
        print(f" Non-zero counts: min={min(all_counts)}, max={max(all_counts)}, mean={np.mean(all_counts):.2f}")

    print("\n✅ All fingerprint tests completed!")
@@ -99,7 +99,6 @@ from rdkit.ML.Descriptors import MoleculeDescriptors
99
99
  from mordred import Calculator as MordredCalculator
100
100
  from mordred import AcidBase, Aromatic, Constitutional, Chi, CarbonTypes
101
101
 
102
-
103
102
  logger = logging.getLogger("workbench")
104
103
  logger.setLevel(logging.DEBUG)
105
104
 
@@ -15,7 +15,6 @@ import json
15
15
  from mol_standardize import standardize
16
16
  from mol_descriptors import compute_descriptors
17
17
 
18
-
19
18
  # TRAINING SECTION
20
19
  #
21
20
  # This section (__main__) is where SageMaker will execute the training job
@@ -15,8 +15,7 @@ import pandas as pd
15
15
  import json
16
16
 
17
17
  # Local imports
18
- from local_utils import compute_morgan_fingerprints
19
-
18
+ from fingerprints import compute_morgan_fingerprints
20
19
 
21
20
  # TRAINING SECTION
22
21
  #