workbench 0.8.205__py3-none-any.whl → 0.8.213__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44):
  1. workbench/algorithms/models/noise_model.py +388 -0
  2. workbench/api/endpoint.py +3 -6
  3. workbench/api/feature_set.py +1 -1
  4. workbench/api/model.py +5 -11
  5. workbench/cached/cached_model.py +4 -4
  6. workbench/core/artifacts/endpoint_core.py +63 -153
  7. workbench/core/artifacts/model_core.py +21 -19
  8. workbench/core/transforms/features_to_model/features_to_model.py +2 -2
  9. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +1 -1
  10. workbench/model_script_utils/model_script_utils.py +335 -0
  11. workbench/model_script_utils/pytorch_utils.py +395 -0
  12. workbench/model_script_utils/uq_harness.py +278 -0
  13. workbench/model_scripts/chemprop/chemprop.template +289 -666
  14. workbench/model_scripts/chemprop/generated_model_script.py +292 -669
  15. workbench/model_scripts/chemprop/model_script_utils.py +335 -0
  16. workbench/model_scripts/chemprop/requirements.txt +2 -10
  17. workbench/model_scripts/pytorch_model/generated_model_script.py +355 -612
  18. workbench/model_scripts/pytorch_model/model_script_utils.py +335 -0
  19. workbench/model_scripts/pytorch_model/pytorch.template +350 -607
  20. workbench/model_scripts/pytorch_model/pytorch_utils.py +395 -0
  21. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  22. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  23. workbench/model_scripts/script_generation.py +2 -5
  24. workbench/model_scripts/uq_models/generated_model_script.py +65 -422
  25. workbench/model_scripts/xgb_model/generated_model_script.py +349 -412
  26. workbench/model_scripts/xgb_model/model_script_utils.py +335 -0
  27. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  28. workbench/model_scripts/xgb_model/xgb_model.template +344 -407
  29. workbench/scripts/training_test.py +85 -0
  30. workbench/utils/chemprop_utils.py +18 -656
  31. workbench/utils/metrics_utils.py +172 -0
  32. workbench/utils/model_utils.py +104 -47
  33. workbench/utils/pytorch_utils.py +32 -472
  34. workbench/utils/xgboost_local_crossfold.py +267 -0
  35. workbench/utils/xgboost_model_utils.py +49 -356
  36. workbench/web_interface/components/plugins/model_details.py +30 -68
  37. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/METADATA +5 -5
  38. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/RECORD +42 -31
  39. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/entry_points.txt +1 -0
  40. workbench/model_scripts/uq_models/mapie.template +0 -605
  41. workbench/model_scripts/uq_models/requirements.txt +0 -1
  42. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/WHEEL +0 -0
  43. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/licenses/LICENSE +0 -0
  44. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,335 @@
1
+ """Shared utility functions for model training scripts (templates).
2
+
3
+ These functions are used across multiple model templates (XGBoost, PyTorch, ChemProp)
4
+ to reduce code duplication and ensure consistent behavior.
5
+ """
6
+
7
+ from io import StringIO
8
+ import json
9
+ import numpy as np
10
+ import pandas as pd
11
+ from sklearn.metrics import (
12
+ confusion_matrix,
13
+ mean_absolute_error,
14
+ median_absolute_error,
15
+ precision_recall_fscore_support,
16
+ r2_score,
17
+ root_mean_squared_error,
18
+ )
19
+ from scipy.stats import spearmanr
20
+
21
+
22
def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
    """Validate that a training DataFrame contains at least one row.

    Args:
        df: DataFrame to validate
        df_name: Human-readable name used in the error message

    Raises:
        ValueError: If the DataFrame has zero rows
    """
    if not df.empty:
        return
    # Print before raising so the message also shows up in training job logs
    msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
    print(msg)
    raise ValueError(msg)
36
+
37
+
38
def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFrame:
    """Expands a column containing a list of probabilities into separate columns.

    Handles None values (and scalar NaNs) for rows where predictions couldn't be made.

    Args:
        df: DataFrame containing a "pred_proba" column
        class_labels: List of class labels

    Returns:
        DataFrame with the "pred_proba" expanded into separate columns (e.g., "class1_proba")

    Raises:
        ValueError: If DataFrame does not contain a "pred_proba" column
    """
    proba_column = "pred_proba"
    if proba_column not in df.columns:
        raise ValueError('DataFrame does not contain a "pred_proba" column')

    proba_splits = [f"{label}_proba" for label in class_labels]
    n_classes = len(class_labels)
    nan_row = [np.nan] * n_classes

    def _is_missing(val) -> bool:
        # Missing predictions may arrive as None, or as a scalar NaN after a
        # pandas round-trip; either way substitute a full row of NaNs so the
        # DataFrame construction below doesn't mix scalars with lists.
        return val is None or (np.isscalar(val) and pd.isna(val))

    proba_values = [nan_row if _is_missing(val) else val for val in df[proba_column]]
    proba_df = pd.DataFrame(proba_values, columns=proba_splits)

    # Drop any pre-existing proba columns and reset the index so concat
    # aligns the new probability columns row-by-row
    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
    df = df.reset_index(drop=True)
    return pd.concat([df, proba_df], axis=1)
75
+
76
+
77
def match_features_case_insensitive(df: pd.DataFrame, model_features: list[str]) -> pd.DataFrame:
    """Matches and renames DataFrame columns to match model feature names (case-insensitive).

    Exact column-name matches win; otherwise a case-insensitive match is used.

    Args:
        df: Input DataFrame
        model_features: List of feature names expected by the model

    Returns:
        DataFrame with columns renamed to match model features

    Raises:
        ValueError: If any model features cannot be matched
    """
    lower_to_actual = {col.lower(): col for col in df.columns}
    rename_dict: dict[str, str] = {}
    missing: list[str] = []

    for feature in model_features:
        if feature in df.columns:
            continue  # Already an exact match; nothing to rename
        actual = lower_to_actual.get(feature.lower())
        if actual is not None:
            rename_dict[actual] = feature
        else:
            missing.append(feature)

    if missing:
        raise ValueError(f"Features not found: {missing}")

    return df.rename(columns=rename_dict)
107
+
108
+
109
+ def convert_categorical_types(
110
+ df: pd.DataFrame, features: list[str], category_mappings: dict[str, list[str]] | None = None
111
+ ) -> tuple[pd.DataFrame, dict[str, list[str]]]:
112
+ """Converts appropriate columns to categorical type with consistent mappings.
113
+
114
+ In training mode (category_mappings is None or empty), detects object/string columns
115
+ with <20 unique values and converts them to categorical.
116
+ In inference mode (category_mappings provided), applies the stored mappings.
117
+
118
+ Args:
119
+ df: The DataFrame to process
120
+ features: List of feature names to consider for conversion
121
+ category_mappings: Existing category mappings. If None or empty, training mode.
122
+ If populated, inference mode.
123
+
124
+ Returns:
125
+ Tuple of (processed DataFrame, category mappings dictionary)
126
+ """
127
+ if category_mappings is None:
128
+ category_mappings = {}
129
+
130
+ # Training mode
131
+ if not category_mappings:
132
+ for col in df.select_dtypes(include=["object", "string"]):
133
+ if col in features and df[col].nunique() < 20:
134
+ print(f"Training mode: Converting {col} to category")
135
+ df[col] = df[col].astype("category")
136
+ category_mappings[col] = df[col].cat.categories.tolist()
137
+
138
+ # Inference mode
139
+ else:
140
+ for col, categories in category_mappings.items():
141
+ if col in df.columns:
142
+ print(f"Inference mode: Applying categorical mapping for {col}")
143
+ df[col] = pd.Categorical(df[col], categories=categories)
144
+
145
+ return df, category_mappings
146
+
147
+
148
def decompress_features(
    df: pd.DataFrame, features: list[str], compressed_features: list[str]
) -> tuple[pd.DataFrame, list[str]]:
    """Decompress bitstring features into individual bit columns.

    Args:
        df: The features DataFrame
        features: Full list of feature names
        compressed_features: List of feature names to decompress (bitstrings)

    Returns:
        Tuple of (DataFrame with decompressed features, updated feature list)
    """
    # Warn (but don't fail) if any required feature has missing values
    nan_counts = df[features].isna().sum()
    if nan_counts.any():
        missing_features = nan_counts[nan_counts > 0]
        print(
            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
            "WARNING: You might want to remove/replace all NaN values before processing."
        )

    # Work on a copy so the caller's feature list is untouched
    updated_features = list(features)

    for feature in compressed_features:
        if feature not in df.columns or feature not in updated_features:
            print(f"Feature '{feature}' not in the features list, skipping decompression.")
            continue

        # The bitstring column is replaced by its per-bit columns
        updated_features.remove(feature)

        # Expand every bitstring into a (rows x bits) uint8 matrix in one shot
        bit_matrix = np.array([list(bits) for bits in df[feature]], dtype=np.uint8)
        prefix = feature[:3]
        bit_cols = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
        bit_df = pd.DataFrame(bit_matrix, columns=bit_cols, index=df.index)

        updated_features.extend(bit_cols)

        # Single drop+concat avoids DataFrame fragmentation from per-column inserts
        df = pd.concat([df.drop(columns=[feature]), bit_df], axis=1)

    return df, updated_features
197
+
198
+
199
def input_fn(input_data, content_type: str) -> pd.DataFrame:
    """Parse input data and return a DataFrame.

    Args:
        input_data: Raw input data (bytes or string)
        content_type: MIME type of the input data

    Returns:
        Parsed DataFrame

    Raises:
        ValueError: If input is empty or content_type is not supported
    """
    if not input_data:
        raise ValueError("Empty input data is not supported!")

    # Normalize bytes payloads to text before parsing
    text = input_data.decode("utf-8") if isinstance(input_data, bytes) else input_data

    if "text/csv" in content_type:
        return pd.read_csv(StringIO(text))
    if "application/json" in content_type:
        return pd.DataFrame(json.loads(text))
    raise ValueError(f"{content_type} not supported!")
224
+
225
+
226
def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
    """Convert output DataFrame to requested format.

    Args:
        output_df: DataFrame to convert
        accept_type: Requested MIME type

    Returns:
        Tuple of (formatted output string, MIME type)

    Raises:
        RuntimeError: If accept_type is not supported
    """
    if "text/csv" in accept_type:
        # The "N/A" placeholder keeps missing values visible in CSV output
        return output_df.fillna("N/A").to_csv(index=False), "text/csv"
    if "application/json" in accept_type:
        return output_df.to_json(orient="records"), "application/json"
    raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
246
+
247
+
248
def compute_regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict[str, float]:
    """Compute standard regression metrics.

    Args:
        y_true: Ground truth target values
        y_pred: Predicted values

    Returns:
        Dictionary with keys: rmse, mae, medae, r2, spearmanr, support
    """
    # sklearn scorers all share the (y_true, y_pred) signature
    scorers = {
        "rmse": root_mean_squared_error,
        "mae": mean_absolute_error,
        "medae": median_absolute_error,
        "r2": r2_score,
    }
    metrics = {name: scorer(y_true, y_pred) for name, scorer in scorers.items()}
    metrics["spearmanr"] = spearmanr(y_true, y_pred).correlation
    metrics["support"] = len(y_true)
    return metrics
266
+
267
+
268
def print_regression_metrics(metrics: dict[str, float]) -> None:
    """Print regression metrics in the format expected by SageMaker metric definitions.

    Args:
        metrics: Dictionary of metric name -> value
    """
    # support is an integer count, so it is printed without decimal formatting
    for key in ("rmse", "mae", "medae", "r2", "spearmanr"):
        print(f"{key}: {metrics[key]:.3f}")
    print(f"support: {metrics['support']}")
280
+
281
+
282
def compute_classification_metrics(
    y_true: np.ndarray, y_pred: np.ndarray, label_names: list[str], target_col: str
) -> pd.DataFrame:
    """Compute per-class classification metrics.

    Args:
        y_true: Ground truth labels
        y_pred: Predicted labels
        label_names: List of class label names
        target_col: Name of the target column (for DataFrame output)

    Returns:
        DataFrame with columns: target_col, precision, recall, f1, support
    """
    # average=None yields one score per label, ordered by label_names
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, average=None, labels=label_names
    )
    return pd.DataFrame(
        {
            target_col: label_names,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": support,
        }
    )
306
+
307
+
308
def print_classification_metrics(score_df: pd.DataFrame, target_col: str, label_names: list[str]) -> None:
    """Print per-class classification metrics in the format expected by SageMaker.

    Args:
        score_df: DataFrame from compute_classification_metrics
        target_col: Name of the target column
        label_names: List of class label names
    """
    for label in label_names:
        # One row per class label; grab it once and print all four metrics
        row = score_df.loc[score_df[target_col] == label].iloc[0]
        for metric in ("precision", "recall", "f1", "support"):
            print(f"Metrics:{label}:{metric} {row[metric]}")
321
+
322
+
323
def print_confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray, label_names: list[str]) -> None:
    """Print confusion matrix in the format expected by SageMaker.

    Args:
        y_true: Ground truth labels
        y_pred: Predicted labels
        label_names: List of class label names
    """
    conf_mtx = confusion_matrix(y_true, y_pred, labels=label_names)
    for i, row_name in enumerate(label_names):
        row = conf_mtx[i]
        for j, col_name in enumerate(label_names):
            print(f"ConfusionMatrix:{row_name}:{col_name} {row[j]}")
@@ -1,11 +1,3 @@
1
1
  # Requirements for ChemProp model scripts
2
- # Note: These are the local dev requirements. The Docker images have their own requirements.txt
3
- chemprop==2.2.1
4
- rdkit==2025.9.1
5
- torch>=2.0.0
6
- lightning>=2.0.0
7
- pandas>=2.0.0
8
- numpy>=1.24.0
9
- scikit-learn>=1.3.0
10
- awswrangler>=3.0.0
11
- joblib>=1.3.0
2
+ # Note: The training and inference images already have torch and chemprop installed.
3
+ # So we only need to install packages that are not already included in the images.