workbench 0.8.162__py3-none-any.whl → 0.8.202__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of workbench might be problematic. Click here for more details.

Files changed (113)
  1. workbench/algorithms/dataframe/__init__.py +1 -2
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +2 -2
  3. workbench/algorithms/dataframe/proximity.py +261 -235
  4. workbench/algorithms/graph/light/proximity_graph.py +10 -8
  5. workbench/api/__init__.py +2 -1
  6. workbench/api/compound.py +1 -1
  7. workbench/api/endpoint.py +11 -0
  8. workbench/api/feature_set.py +11 -8
  9. workbench/api/meta.py +5 -2
  10. workbench/api/model.py +16 -15
  11. workbench/api/monitor.py +1 -16
  12. workbench/core/artifacts/__init__.py +11 -2
  13. workbench/core/artifacts/artifact.py +11 -3
  14. workbench/core/artifacts/data_capture_core.py +355 -0
  15. workbench/core/artifacts/endpoint_core.py +256 -118
  16. workbench/core/artifacts/feature_set_core.py +265 -16
  17. workbench/core/artifacts/model_core.py +107 -60
  18. workbench/core/artifacts/monitor_core.py +33 -248
  19. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  20. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  21. workbench/core/cloud_platform/aws/aws_parameter_store.py +18 -2
  22. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  23. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  24. workbench/core/transforms/features_to_model/features_to_model.py +42 -32
  25. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  26. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  27. workbench/core/views/training_view.py +113 -42
  28. workbench/core/views/view.py +53 -3
  29. workbench/core/views/view_utils.py +4 -4
  30. workbench/model_scripts/chemprop/chemprop.template +852 -0
  31. workbench/model_scripts/chemprop/generated_model_script.py +852 -0
  32. workbench/model_scripts/chemprop/requirements.txt +11 -0
  33. workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
  34. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  35. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  36. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  37. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  38. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  39. workbench/model_scripts/custom_models/proximity/proximity.py +261 -235
  40. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  41. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  42. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  43. workbench/model_scripts/custom_models/uq_models/meta_uq.template +166 -62
  44. workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
  45. workbench/model_scripts/custom_models/uq_models/proximity.py +261 -235
  46. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  47. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  48. workbench/model_scripts/pytorch_model/generated_model_script.py +373 -190
  49. workbench/model_scripts/pytorch_model/pytorch.template +370 -187
  50. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  51. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  52. workbench/model_scripts/script_generation.py +17 -9
  53. workbench/model_scripts/uq_models/generated_model_script.py +605 -0
  54. workbench/model_scripts/uq_models/mapie.template +605 -0
  55. workbench/model_scripts/uq_models/requirements.txt +1 -0
  56. workbench/model_scripts/xgb_model/generated_model_script.py +37 -46
  57. workbench/model_scripts/xgb_model/xgb_model.template +44 -46
  58. workbench/repl/workbench_shell.py +28 -14
  59. workbench/scripts/endpoint_test.py +162 -0
  60. workbench/scripts/lambda_test.py +73 -0
  61. workbench/scripts/ml_pipeline_batch.py +137 -0
  62. workbench/scripts/ml_pipeline_sqs.py +186 -0
  63. workbench/scripts/monitor_cloud_watch.py +20 -100
  64. workbench/utils/aws_utils.py +4 -3
  65. workbench/utils/chem_utils/__init__.py +0 -0
  66. workbench/utils/chem_utils/fingerprints.py +134 -0
  67. workbench/utils/chem_utils/misc.py +194 -0
  68. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  69. workbench/utils/chem_utils/mol_standardize.py +450 -0
  70. workbench/utils/chem_utils/mol_tagging.py +348 -0
  71. workbench/utils/chem_utils/projections.py +209 -0
  72. workbench/utils/chem_utils/salts.py +256 -0
  73. workbench/utils/chem_utils/sdf.py +292 -0
  74. workbench/utils/chem_utils/toxicity.py +250 -0
  75. workbench/utils/chem_utils/vis.py +253 -0
  76. workbench/utils/chemprop_utils.py +760 -0
  77. workbench/utils/cloudwatch_handler.py +1 -1
  78. workbench/utils/cloudwatch_utils.py +137 -0
  79. workbench/utils/config_manager.py +3 -7
  80. workbench/utils/endpoint_utils.py +5 -7
  81. workbench/utils/license_manager.py +2 -6
  82. workbench/utils/model_utils.py +95 -34
  83. workbench/utils/monitor_utils.py +44 -62
  84. workbench/utils/pandas_utils.py +3 -3
  85. workbench/utils/pytorch_utils.py +526 -0
  86. workbench/utils/shap_utils.py +10 -2
  87. workbench/utils/workbench_logging.py +0 -3
  88. workbench/utils/workbench_sqs.py +1 -1
  89. workbench/utils/xgboost_model_utils.py +371 -156
  90. workbench/web_interface/components/model_plot.py +7 -1
  91. workbench/web_interface/components/plugin_unit_test.py +5 -2
  92. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  93. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  94. workbench/web_interface/components/plugins/model_details.py +9 -7
  95. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  96. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/METADATA +27 -6
  97. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/RECORD +101 -85
  98. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/entry_points.txt +4 -0
  99. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/licenses/LICENSE +1 -1
  100. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  101. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  102. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  103. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  104. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  105. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  106. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  107. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  108. workbench/utils/chem_utils.py +0 -1556
  109. workbench/utils/execution_environment.py +0 -211
  110. workbench/utils/fast_inference.py +0 -167
  111. workbench/utils/resource_utils.py +0 -39
  112. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/WHEEL +0 -0
  113. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/top_level.txt +0 -0
@@ -1,279 +0,0 @@
1
- # Imports for XGB Model
2
- import xgboost as xgb
3
- import awswrangler as wr
4
- from sklearn.model_selection import train_test_split
5
-
6
- # Model Performance Scores
7
- from sklearn.metrics import (
8
- mean_absolute_error,
9
- r2_score,
10
- root_mean_squared_error
11
- )
12
-
13
- from io import StringIO
14
- import json
15
- import argparse
16
- import os
17
- import pandas as pd
18
-
19
- # Template Placeholders
20
- TEMPLATE_PARAMS = {
21
- "model_type": "{{model_type}}",
22
- "target_column": "{{target_column}}",
23
- "features": "{{feature_list}}",
24
- "model_metrics_s3_path": "{{model_metrics_s3_path}}",
25
- "train_all_data": "{{train_all_data}}"
26
- }
27
-
28
- # Function to check if dataframe is empty
29
- def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
30
- """
31
- Check if the provided dataframe is empty and raise an exception if it is.
32
-
33
- Args:
34
- df (pd.DataFrame): DataFrame to check
35
- df_name (str): Name of the DataFrame
36
- """
37
- if df.empty:
38
- msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
39
- print(msg)
40
- raise ValueError(msg)
41
-
42
-
43
- def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
44
- """
45
- Matches and renames DataFrame columns to match model feature names (case-insensitive).
46
- Prioritizes exact matches, then case-insensitive matches.
47
-
48
- Raises ValueError if any model features cannot be matched.
49
- """
50
- df_columns_lower = {col.lower(): col for col in df.columns}
51
- rename_dict = {}
52
- missing = []
53
- for feature in model_features:
54
- if feature in df.columns:
55
- continue # Exact match
56
- elif feature.lower() in df_columns_lower:
57
- rename_dict[df_columns_lower[feature.lower()]] = feature
58
- else:
59
- missing.append(feature)
60
-
61
- if missing:
62
- raise ValueError(f"Features not found: {missing}")
63
-
64
- # Rename the DataFrame columns to match the model features
65
- return df.rename(columns=rename_dict)
66
-
67
- if __name__ == "__main__":
68
- """The main function is for training the XGBoost Quantile Regression models"""
69
-
70
- # Harness Template Parameters
71
- target = TEMPLATE_PARAMS["target_column"]
72
- features = TEMPLATE_PARAMS["features"]
73
- model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
74
- train_all_data = TEMPLATE_PARAMS["train_all_data"]
75
- validation_split = 0.2
76
- quantiles = [0.025, 0.25, 0.50, 0.75, 0.975]
77
- q_models = {}
78
-
79
- # Script arguments for input/output directories
80
- parser = argparse.ArgumentParser()
81
- parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
82
- parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
83
- parser.add_argument(
84
- "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
85
- )
86
- args = parser.parse_args()
87
-
88
- # Load training data from the specified directory
89
- training_files = [
90
- os.path.join(args.train, file)
91
- for file in os.listdir(args.train) if file.endswith(".csv")
92
- ]
93
- print(f"Training Files: {training_files}")
94
-
95
- # Combine files and read them all into a single pandas dataframe
96
- df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
97
-
98
- # Check if the DataFrame is empty
99
- check_dataframe(df, "training_df")
100
-
101
- # Training data split logic
102
- if train_all_data:
103
- # Use all data for both training and validation
104
- print("Training on all data...")
105
- df_train = df.copy()
106
- df_val = df.copy()
107
- elif "training" in df.columns:
108
- # Split data based on a 'training' column if it exists
109
- print("Splitting data based on 'training' column...")
110
- df_train = df[df["training"]].copy()
111
- df_val = df[~df["training"]].copy()
112
- else:
113
- # Perform a random split if no 'training' column is found
114
- print("Splitting data randomly...")
115
- df_train, df_val = train_test_split(df, test_size=validation_split, random_state=42)
116
-
117
- # Features/Target output
118
- print(f"Target: {target}")
119
- print(f"Features: {str(features)}")
120
- print(f"Data Shape: {df.shape}")
121
-
122
- # Prepare features and targets for training
123
- X_train = df_train[features]
124
- X_val = df_val[features]
125
- y_train = df_train[target]
126
- y_val = df_val[target]
127
-
128
- # Train models for each of the quantiles
129
- for q in quantiles:
130
- params = {
131
- "objective": "reg:quantileerror",
132
- "quantile_alpha": q,
133
- }
134
- model = xgb.XGBRegressor(**params)
135
- model.fit(X_train, y_train)
136
-
137
- # Convert quantile to string
138
- q_str = f"q_{int(q * 100)}" if (q * 100) == int(q * 100) else f"q_{int(q * 1000):03d}"
139
-
140
- # Store the model
141
- q_models[q_str] = model
142
-
143
- # Run predictions for each quantile
144
- quantile_predictions = {q: model.predict(X_val) for q, model in q_models.items()}
145
-
146
- # Create a copy of the validation DataFrame and add the new columns
147
- result_df = df_val[[target]].copy()
148
-
149
- # Add the quantile predictions to the DataFrame
150
- for name, preds in quantile_predictions.items():
151
- result_df[name] = preds
152
-
153
- # Add the median as the main prediction
154
- result_df["prediction"] = result_df["q_50"]
155
-
156
- # Now compute residuals on the prediction
157
- result_df["residual"] = result_df[target] - result_df["prediction"]
158
- result_df["residual_abs"] = result_df["residual"].abs()
159
-
160
- # Save the results dataframe to S3
161
- wr.s3.to_csv(
162
- result_df,
163
- path=f"{model_metrics_s3_path}/validation_predictions.csv",
164
- index=False,
165
- )
166
-
167
- # Report Performance Metrics
168
- rmse = root_mean_squared_error(result_df[target], result_df["prediction"])
169
- mae = mean_absolute_error(result_df[target], result_df["prediction"])
170
- r2 = r2_score(result_df[target], result_df["prediction"])
171
- print(f"RMSE: {rmse:.3f}")
172
- print(f"MAE: {mae:.3f}")
173
- print(f"R2: {r2:.3f}")
174
- print(f"NumRows: {len(result_df)}")
175
-
176
- # Now save the quantile models
177
- for name, model in q_models.items():
178
- model_path = os.path.join(args.model_dir, f"{name}.json")
179
- print(f"Saving model: {model_path}")
180
- model.save_model(model_path)
181
-
182
- # Also save the features (this will validate input during predictions)
183
- with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
184
- json.dump(features, fp)
185
-
186
-
187
- def model_fn(model_dir) -> dict:
188
- """Deserialized and return all the fitted models from the model directory.
189
-
190
- Args:
191
- model_dir (str): The directory where the models are stored.
192
-
193
- Returns:
194
- dict: A dictionary of the models.
195
- """
196
-
197
- # Load ALL the Quantile models from the model directory
198
- models = {}
199
- for file in os.listdir(model_dir):
200
- if file.startswith("q") and file.endswith(".json"): # The Quantile models
201
- # Load the model
202
- model_path = os.path.join(model_dir, file)
203
- print(f"Loading model: {model_path}")
204
- model = xgb.XGBRegressor()
205
- model.load_model(model_path)
206
-
207
- # Store the quantile model
208
- q_name = os.path.splitext(file)[0]
209
- models[q_name] = model
210
-
211
- # Return all the models
212
- return models
213
-
214
-
215
- def input_fn(input_data, content_type):
216
- """Parse input data and return a DataFrame."""
217
- if not input_data:
218
- raise ValueError("Empty input data is not supported!")
219
-
220
- # Decode bytes to string if necessary
221
- if isinstance(input_data, bytes):
222
- input_data = input_data.decode("utf-8")
223
-
224
- if "text/csv" in content_type:
225
- return pd.read_csv(StringIO(input_data))
226
- elif "application/json" in content_type:
227
- return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
228
- else:
229
- raise ValueError(f"{content_type} not supported!")
230
-
231
-
232
- def output_fn(output_df, accept_type):
233
- """Supports both CSV and JSON output formats."""
234
- if "text/csv" in accept_type:
235
- csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
236
- return csv_output, "text/csv"
237
- elif "application/json" in accept_type:
238
- return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
239
- else:
240
- raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
241
-
242
-
243
- def predict_fn(df, models) -> pd.DataFrame:
244
- """Make Predictions with our XGB Quantile Regression Model
245
-
246
- Args:
247
- df (pd.DataFrame): The input DataFrame
248
- models (dict): The dictionary of models to use for predictions
249
-
250
- Returns:
251
- pd.DataFrame: The DataFrame with the predictions added
252
- """
253
-
254
- # Grab our feature columns (from training)
255
- model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
256
- with open(os.path.join(model_dir, "feature_columns.json")) as fp:
257
- model_features = json.load(fp)
258
- print(f"Model Features: {model_features}")
259
-
260
- # We're going match features in a case-insensitive manner, accounting for all the permutations
261
- # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
262
- # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
263
- matched_df = match_features_case_insensitive(df, model_features)
264
-
265
- # Predict the features against all the models
266
- for name, model in models.items():
267
- df[name] = model.predict(matched_df[model_features])
268
-
269
- # Use the median prediction as the main prediction
270
- df["prediction"] = df["q_50"]
271
-
272
- # Estimate the standard deviation of the predictions using the interquartile range
273
- df["prediction_std"] = (df["q_75"] - df["q_25"]) / 1.35
274
-
275
- # Reorganize the columns so they are in alphabetical order
276
- df = df.reindex(sorted(df.columns), axis=1)
277
-
278
- # All done, return the DataFrame
279
- return df
@@ -1 +0,0 @@
1
- # Note: In general this file should be empty (as the default inference image has all required libraries)