workbench 0.8.161__py3-none-any.whl → 0.8.192__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. workbench/algorithms/dataframe/proximity.py +143 -102
  2. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  3. workbench/api/compound.py +1 -1
  4. workbench/api/endpoint.py +12 -0
  5. workbench/api/feature_set.py +4 -4
  6. workbench/api/meta.py +5 -2
  7. workbench/api/model.py +16 -12
  8. workbench/api/monitor.py +1 -16
  9. workbench/core/artifacts/artifact.py +11 -3
  10. workbench/core/artifacts/data_capture_core.py +355 -0
  11. workbench/core/artifacts/endpoint_core.py +168 -78
  12. workbench/core/artifacts/feature_set_core.py +72 -13
  13. workbench/core/artifacts/model_core.py +50 -15
  14. workbench/core/artifacts/monitor_core.py +33 -248
  15. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  16. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  17. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  18. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  19. workbench/core/transforms/features_to_model/features_to_model.py +9 -4
  20. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  21. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  22. workbench/core/views/training_view.py +49 -53
  23. workbench/core/views/view.py +51 -1
  24. workbench/core/views/view_utils.py +4 -4
  25. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  26. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  27. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  28. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  29. workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
  30. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  31. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  32. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  33. workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
  34. workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
  35. workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
  36. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  37. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  38. workbench/model_scripts/pytorch_model/pytorch.template +19 -20
  39. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  40. workbench/model_scripts/script_generation.py +7 -2
  41. workbench/model_scripts/uq_models/mapie.template +492 -0
  42. workbench/model_scripts/uq_models/requirements.txt +1 -0
  43. workbench/model_scripts/xgb_model/xgb_model.template +31 -40
  44. workbench/repl/workbench_shell.py +11 -6
  45. workbench/scripts/lambda_launcher.py +63 -0
  46. workbench/scripts/ml_pipeline_batch.py +137 -0
  47. workbench/scripts/ml_pipeline_sqs.py +186 -0
  48. workbench/scripts/monitor_cloud_watch.py +20 -100
  49. workbench/utils/aws_utils.py +4 -3
  50. workbench/utils/chem_utils/__init__.py +0 -0
  51. workbench/utils/chem_utils/fingerprints.py +134 -0
  52. workbench/utils/chem_utils/misc.py +194 -0
  53. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  54. workbench/utils/chem_utils/mol_standardize.py +450 -0
  55. workbench/utils/chem_utils/mol_tagging.py +348 -0
  56. workbench/utils/chem_utils/projections.py +209 -0
  57. workbench/utils/chem_utils/salts.py +256 -0
  58. workbench/utils/chem_utils/sdf.py +292 -0
  59. workbench/utils/chem_utils/toxicity.py +250 -0
  60. workbench/utils/chem_utils/vis.py +253 -0
  61. workbench/utils/cloudwatch_handler.py +1 -1
  62. workbench/utils/cloudwatch_utils.py +137 -0
  63. workbench/utils/config_manager.py +3 -7
  64. workbench/utils/endpoint_utils.py +5 -7
  65. workbench/utils/license_manager.py +2 -6
  66. workbench/utils/model_utils.py +76 -30
  67. workbench/utils/monitor_utils.py +44 -62
  68. workbench/utils/pandas_utils.py +3 -3
  69. workbench/utils/shap_utils.py +10 -2
  70. workbench/utils/workbench_logging.py +0 -3
  71. workbench/utils/workbench_sqs.py +1 -1
  72. workbench/utils/xgboost_model_utils.py +283 -145
  73. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  74. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  75. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  76. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/METADATA +4 -4
  77. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/RECORD +81 -76
  78. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/entry_points.txt +3 -0
  79. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  80. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  81. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  82. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  83. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  84. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  85. workbench/model_scripts/pytorch_model/generated_model_script.py +0 -565
  86. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  87. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  88. workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
  89. workbench/model_scripts/xgb_model/generated_model_script.py +0 -477
  90. workbench/utils/chem_utils.py +0 -1556
  91. workbench/utils/execution_environment.py +0 -211
  92. workbench/utils/fast_inference.py +0 -167
  93. workbench/utils/resource_utils.py +0 -39
  94. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/WHEEL +0 -0
  95. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/licenses/LICENSE +0 -0
  96. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/top_level.txt +0 -0
@@ -1,34 +1,33 @@
  # Model: NGBoost Regressor with Distribution output
  from ngboost import NGBRegressor
- from xgboost import XGBRegressor  # Base Estimator
+ from ngboost.distns import Cauchy
+ from xgboost import XGBRegressor  # Point Estimator
  from sklearn.model_selection import train_test_split

  # Model Performance Scores
- from sklearn.metrics import (
-     mean_absolute_error,
-     r2_score,
-     root_mean_squared_error
- )
+ from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

  from io import StringIO
  import json
  import argparse
  import joblib
  import os
+ import numpy as np
  import pandas as pd
+ from typing import List, Tuple

  # Local Imports
  from proximity import Proximity


-
  # Template Placeholders
  TEMPLATE_PARAMS = {
      "id_column": "{{id_column}}",
-     "features": "{{feature_list}}",
      "target": "{{target_column}}",
+     "features": "{{feature_list}}",
+     "compressed_features": "{{compressed_features}}",
      "train_all_data": "{{train_all_data}}",
-     "track_columns": "{{track_columns}}"
+     "track_columns": "{{track_columns}}",
  }


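For context, the `{{...}}` tokens above are placeholders that the package's script generation step (see workbench/model_scripts/script_generation.py in the file list) presumably substitutes before the training script runs. A minimal sketch of that substitution, with made-up parameter values and a hypothetical render helper, not the package's actual implementation:

    # Hypothetical sketch of template rendering; the values below are illustrative only.
    template_params = {
        "id_column": "compound_id",               # assumed example value
        "target_column": "solubility",            # assumed example value
        "feature_list": ["logp", "tpsa", "fp"],   # assumed example value
        "compressed_features": ["fp"],            # bitstring feature to expand at train time
        "train_all_data": False,
        "track_columns": None,
    }

    def render_template(template_text: str, params: dict) -> str:
        """Replace each quoted "{{key}}" placeholder with a Python literal."""
        for key, value in params.items():
            template_text = template_text.replace('"{{' + key + '}}"', repr(value))
        return template_text

    # e.g. rendered = render_template(template_source, template_params)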
@@ -72,16 +71,99 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
      return df.rename(columns=rename_dict)


- # TRAINING SECTION
- #
- # This section (__main__) is where SageMaker will execute the training job
- # and save the model artifacts to the model directory.
- #
+ def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
+     """
+     Converts appropriate columns to categorical type with consistent mappings.
+
+     Args:
+         df (pd.DataFrame): The DataFrame to process.
+         features (list): List of feature names to consider for conversion.
+         category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+             training mode. If populated, we're in inference mode.
+
+     Returns:
+         tuple: (processed DataFrame, category mappings dictionary)
+     """
+     # Training mode
+     if category_mappings == {}:
+         for col in df.select_dtypes(include=["object", "string"]):
+             if col in features and df[col].nunique() < 20:
+                 print(f"Training mode: Converting {col} to category")
+                 df[col] = df[col].astype("category")
+                 category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+
+     # Inference mode
+     else:
+         for col, categories in category_mappings.items():
+             if col in df.columns:
+                 print(f"Inference mode: Applying categorical mapping for {col}")
+                 df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
+
+     return df, category_mappings
+
+
+ def decompress_features(
+     df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+     """Prepare features for the model by decompressing bitstring features
+
+     Args:
+         df (pd.DataFrame): The features DataFrame
+         features (List[str]): Full list of feature names
+         compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+     Returns:
+         pd.DataFrame: DataFrame with the decompressed features
+         List[str]: Updated list of feature names after decompression
+
+     Raises:
+         ValueError: If any missing values are found in the specified features
+     """
+
+     # Check for any missing values in the required features
+     missing_counts = df[features].isna().sum()
+     if missing_counts.any():
+         missing_features = missing_counts[missing_counts > 0]
+         print(
+             f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+             "WARNING: You might want to remove/replace all NaN values before processing."
+         )
+
+     # Decompress the specified compressed features
+     decompressed_features = features.copy()
+     for feature in compressed_features:
+         if (feature not in df.columns) or (feature not in features):
+             print(f"Feature '{feature}' not in the features list, skipping decompression.")
+             continue
+
+         # Remove the feature from the list of features to avoid duplication
+         decompressed_features.remove(feature)
+
+         # Handle all compressed features as bitstrings
+         bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+         prefix = feature[:3]
+
+         # Create all new columns at once - avoids fragmentation
+         new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+         new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+
+         # Add to features list
+         decompressed_features.extend(new_col_names)
+
+         # Drop original column and concatenate new ones
+         df = df.drop(columns=[feature])
+         df = pd.concat([df, new_df], axis=1)
+
+     return df, decompressed_features
+
+
  if __name__ == "__main__":
      # Template Parameters
      id_column = TEMPLATE_PARAMS["id_column"]
-     features = TEMPLATE_PARAMS["features"]
      target = TEMPLATE_PARAMS["target"]
+     features = TEMPLATE_PARAMS["features"]
+     orig_features = features.copy()
+     compressed_features = TEMPLATE_PARAMS["compressed_features"]
      train_all_data = TEMPLATE_PARAMS["train_all_data"]
      track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
      validation_split = 0.2
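To make the new decompress_features helper concrete, here is a small usage sketch; the column names and 8-bit "fp" bitstrings are invented for illustration (real fingerprint bitstrings are much longer):

    import numpy as np
    import pandas as pd

    # Toy frame: one numeric feature plus a compressed bitstring feature "fp"
    df = pd.DataFrame(
        {
            "id": [1, 2],
            "logp": [1.2, 3.4],
            "fp": ["10110010", "01100101"],
        }
    )
    features = ["logp", "fp"]

    df, features = decompress_features(df, features, compressed_features=["fp"])

    print(features)
    # ['logp', 'fp_0', 'fp_1', 'fp_2', 'fp_3', 'fp_4', 'fp_5', 'fp_6', 'fp_7']
    print(df.columns.tolist())
    # ['id', 'logp', 'fp_0', ..., 'fp_7']  -- the original 'fp' column is dropped

Each bit becomes its own uint8 column, which is what the XGBoost/NGBoost models train on downstream, while the saved feature list (see the orig_features change later in this hunk) keeps the compressed names for input validation.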
@@ -95,53 +177,62 @@ if __name__ == "__main__":
      )
      args = parser.parse_args()

-     # Load training data from the specified directory
-     training_files = [
-         os.path.join(args.train, file)
-         for file in os.listdir(args.train) if file.endswith(".csv")
-     ]
+     # Read the training data into DataFrames
+     training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
      print(f"Training Files: {training_files}")

      # Combine files and read them all into a single pandas dataframe
-     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+     # Check if the dataframe is empty
+     check_dataframe(all_df, "training_df")
+
+     # Features/Target output
+     print(f"Target: {target}")
+     print(f"Features: {str(features)}")

-     # Check if the DataFrame is empty
-     check_dataframe(df, "training_df")
+     # Convert any features that might be categorical to 'category' type
+     all_df, category_mappings = convert_categorical_types(all_df, features)

-     # Training data split logic
+     # If we have compressed features, decompress them
+     if compressed_features:
+         print(f"Decompressing features {compressed_features}...")
+         all_df, features = decompress_features(all_df, features, compressed_features)
+
+     # Do we want to train on all the data?
      if train_all_data:
-         # Use all data for both training and validation
-         print("Training on all data...")
-         df_train = df.copy()
-         df_val = df.copy()
-     elif "training" in df.columns:
-         # Split data based on a 'training' column if it exists
-         print("Splitting data based on 'training' column...")
-         df_train = df[df["training"]].copy()
-         df_val = df[~df["training"]].copy()
+         print("Training on ALL of the data")
+         df_train = all_df.copy()
+         df_val = all_df.copy()
+
+     # Does the dataframe have a training column?
+     elif "training" in all_df.columns:
+         print("Found training column, splitting data based on training column")
+         df_train = all_df[all_df["training"]]
+         df_val = all_df[~all_df["training"]]
      else:
-         # Perform a random split if no 'training' column is found
-         print("Splitting data randomly...")
-         df_train, df_val = train_test_split(df, test_size=validation_split, random_state=42)
+         # Just do a random training Split
+         print("WARNING: No training column found, splitting data with random state=42")
+         df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
+     print(f"FIT/TRAIN: {df_train.shape}")
+     print(f"VALIDATION: {df_val.shape}")

      # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
      xgb_model = XGBRegressor()
-     ngb_model = NGBRegressor()
+     ngb_model = NGBRegressor()  # Dist=Cauchy seems to give HUGE prediction intervals

      # Prepare features and targets for training
      X_train = df_train[features]
-     X_val = df_val[features]
+     X_validate = df_val[features]
      y_train = df_train[target]
-     y_val = df_val[target]
+     y_validate = df_val[target]

      # Train both models using the training data
      xgb_model.fit(X_train, y_train)
-     ngb_model.fit(X_train, y_train, X_val=X_val, Y_val=y_val)
+     ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)

      # Make Predictions on the Validation Set
      print(f"Making Predictions on Validation Set...")
-     y_validate = df_val[target]
-     X_validate = df_val[features]
      preds = xgb_model.predict(X_validate)

      # Calculate various model performance metrics (regression)
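A minimal end-to-end sketch of the hybrid scheme this training section sets up, with XGBoost supplying point predictions and NGBoost supplying the predictive distribution; the synthetic data below is purely illustrative and is not part of the package:

    import numpy as np
    import pandas as pd
    from ngboost import NGBRegressor
    from sklearn.model_selection import train_test_split
    from xgboost import XGBRegressor

    # Synthetic regression data (illustration only)
    rng = np.random.default_rng(42)
    X = pd.DataFrame({"x1": rng.normal(size=500), "x2": rng.normal(size=500)})
    y = 3.0 * X["x1"] - 2.0 * X["x2"] + rng.normal(scale=0.5, size=500)
    X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the point estimator and the distributional estimator on the same split
    xgb_model = XGBRegressor().fit(X_train, y_train)
    ngb_model = NGBRegressor().fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)

    point_preds = xgb_model.predict(X_validate)   # point predictions
    y_dists = ngb_model.pred_dist(X_validate)     # per-row predictive distributions
    print(point_preds[:3], y_dists.params["loc"][:3], y_dists.params["scale"][:3])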
@@ -159,9 +250,9 @@ if __name__ == "__main__":
      # Save the trained NGBoost model
      joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))

-     # Save the feature list to validate input during predictions
+     # Save the features (this will validate input during predictions)
      with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-         json.dump(features, fp)
+         json.dump(orig_features, fp)  # We save the original features, not the decompressed ones

      # Now the Proximity model
      model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
@@ -187,11 +278,7 @@ def model_fn(model_dir) -> dict:
      # Deserialize the proximity model
      prox_model = Proximity.deserialize(model_dir)

-     return {
-         "xgboost": xgb_model,
-         "ngboost": ngb_model,
-         "proximity": prox_model
-     }
+     return {"xgboost": xgb_model, "ngboost": ngb_model, "proximity": prox_model}


  def input_fn(input_data, content_type):
@@ -251,20 +338,31 @@ def predict_fn(df, models) -> pd.DataFrame:
      dist_params = y_dists.params

      # Extract mean and std from distribution parameters
-     df["prediction_uq"] = dist_params['loc']  # mean
-     df["prediction_std"] = dist_params['scale']  # standard deviation
+     df["prediction_uq"] = dist_params["loc"]  # mean
+     df["prediction_std"] = dist_params["scale"]  # standard deviation

      # Add 95% prediction intervals using ppf (percent point function)
-     df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
-     df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile
+     # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
+     # so we need to adjust the bounds to include the point prediction
+     df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
+     df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
+
+     # Add 90% prediction intervals
+     df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+     df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+     # Add 80% prediction intervals
+     df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+     df["q_90"] = y_dists.ppf(0.90)  # 90th percentile

      # Add 50% prediction intervals
-     df["q_25"] = y_dists.ppf(0.25) # 25th percentile
-     df["q_75"] = y_dists.ppf(0.75) # 75th percentile
+     df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
+     df["q_75"] = y_dists.ppf(0.75)  # 75th percentile

-     # Adjust prediction intervals to include point predictions
-     df["q_025"] = df[["q_025", "prediction"]].min(axis=1)
-     df["q_975"] = df[["q_975", "prediction"]].max(axis=1)
+     # Reorder the quantile columns for easier reading
+     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+     other_cols = [col for col in df.columns if col not in quantile_cols]
+     df = df[other_cols + quantile_cols]

      # Compute Nearest neighbors with Proximity model
      models["proximity"].neighbors(df)
@@ -3,11 +3,7 @@ from ngboost import NGBRegressor
  from sklearn.model_selection import train_test_split

  # Model Performance Scores
- from sklearn.metrics import (
-     mean_absolute_error,
-     r2_score,
-     root_mean_squared_error
- )
+ from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

  from io import StringIO
  import json
@@ -21,7 +17,7 @@ import pandas as pd
  TEMPLATE_PARAMS = {
      "features": "{{feature_list}}",
      "target": "{{target_column}}",
-     "train_all_data": "{{train_all_data}}"
+     "train_all_data": "{{train_all_data}}",
  }


@@ -87,10 +83,7 @@ if __name__ == "__main__":
      args = parser.parse_args()

      # Load training data from the specified directory
-     training_files = [
-         os.path.join(args.train, file)
-         for file in os.listdir(args.train) if file.endswith(".csv")
-     ]
+     training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
      print(f"Training Files: {training_files}")

      # Combine files and read them all into a single pandas dataframe
@@ -212,16 +205,29 @@ def predict_fn(df, model) -> pd.DataFrame:
      dist_params = y_dists.params

      # Extract mean and std from distribution parameters
-     df["prediction"] = dist_params['loc']  # mean
-     df["prediction_std"] = dist_params['scale']  # standard deviation
+     df["prediction"] = dist_params["loc"]  # mean
+     df["prediction_std"] = dist_params["scale"]  # standard deviation

      # Add 95% prediction intervals using ppf (percent point function)
      df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
      df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile

+     # Add 90% prediction intervals
+     df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+     df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+     # Add 80% prediction intervals
+     df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+     df["q_90"] = y_dists.ppf(0.90)  # 90th percentile
+
      # Add 50% prediction intervals
-     df["q_25"] = y_dists.ppf(0.25) # 25th percentile
-     df["q_75"] = y_dists.ppf(0.75) # 75th percentile
+     df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
+     df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
+
+     # Reorder the quantile columns for easier reading
+     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+     other_cols = [col for col in df.columns if col not in quantile_cols]
+     df = df[other_cols + quantile_cols]

      # Return the modified DataFrame
      return df