workbench 0.8.213__py3-none-any.whl → 0.8.219__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +257 -80
  3. workbench/algorithms/dataframe/projection_2d.py +38 -21
  4. workbench/algorithms/dataframe/proximity.py +75 -150
  5. workbench/algorithms/graph/light/proximity_graph.py +5 -5
  6. workbench/algorithms/models/cleanlab_model.py +382 -0
  7. workbench/algorithms/models/noise_model.py +2 -2
  8. workbench/algorithms/sql/outliers.py +3 -3
  9. workbench/api/__init__.py +3 -0
  10. workbench/api/endpoint.py +10 -5
  11. workbench/api/feature_set.py +76 -6
  12. workbench/api/meta_model.py +289 -0
  13. workbench/api/model.py +43 -4
  14. workbench/core/artifacts/endpoint_core.py +65 -117
  15. workbench/core/artifacts/feature_set_core.py +3 -3
  16. workbench/core/artifacts/model_core.py +6 -4
  17. workbench/core/pipelines/pipeline_executor.py +1 -1
  18. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +30 -10
  19. workbench/model_script_utils/model_script_utils.py +15 -11
  20. workbench/model_script_utils/pytorch_utils.py +11 -1
  21. workbench/model_scripts/chemprop/chemprop.template +147 -71
  22. workbench/model_scripts/chemprop/generated_model_script.py +151 -75
  23. workbench/model_scripts/chemprop/model_script_utils.py +15 -11
  24. workbench/model_scripts/custom_models/chem_info/fingerprints.py +87 -46
  25. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  26. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
  27. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  28. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  29. workbench/model_scripts/meta_model/meta_model.template +209 -0
  30. workbench/model_scripts/pytorch_model/generated_model_script.py +45 -27
  31. workbench/model_scripts/pytorch_model/model_script_utils.py +15 -11
  32. workbench/model_scripts/pytorch_model/pytorch.template +42 -24
  33. workbench/model_scripts/pytorch_model/pytorch_utils.py +11 -1
  34. workbench/model_scripts/script_generation.py +4 -0
  35. workbench/model_scripts/xgb_model/generated_model_script.py +167 -156
  36. workbench/model_scripts/xgb_model/model_script_utils.py +15 -11
  37. workbench/model_scripts/xgb_model/xgb_model.template +163 -152
  38. workbench/repl/workbench_shell.py +0 -5
  39. workbench/scripts/endpoint_test.py +2 -2
  40. workbench/scripts/meta_model_sim.py +35 -0
  41. workbench/utils/chem_utils/fingerprints.py +87 -46
  42. workbench/utils/chemprop_utils.py +23 -5
  43. workbench/utils/meta_model_simulator.py +499 -0
  44. workbench/utils/metrics_utils.py +94 -10
  45. workbench/utils/model_utils.py +91 -9
  46. workbench/utils/pytorch_utils.py +1 -1
  47. workbench/utils/shap_utils.py +1 -55
  48. workbench/web_interface/components/plugins/scatter_plot.py +4 -8
  49. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/METADATA +2 -1
  50. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/RECORD +54 -50
  51. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/entry_points.txt +1 -0
  52. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  53. workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
  54. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
  55. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
  56. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/WHEEL +0 -0
  57. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/licenses/LICENSE +0 -0
  58. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/top_level.txt +0 -0
workbench/model_scripts/custom_models/uq_models/meta_uq.template (deleted)
@@ -1,377 +0,0 @@
- # Model: NGBoost Regressor with Distribution output
- from ngboost import NGBRegressor
- from ngboost.distns import Cauchy
- from xgboost import XGBRegressor  # Point Estimator
- from sklearn.model_selection import train_test_split
-
- # Model Performance Scores
- from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
- from scipy.stats import spearmanr
-
- from io import StringIO
- import json
- import argparse
- import joblib
- import os
- import numpy as np
- import pandas as pd
- from typing import List, Tuple
-
- # Local Imports
- from proximity import Proximity
-
-
- # Template Placeholders
- TEMPLATE_PARAMS = {
-     "id_column": "{{id_column}}",
-     "target": "{{target_column}}",
-     "features": "{{feature_list}}",
-     "compressed_features": "{{compressed_features}}",
-     "train_all_data": "{{train_all_data}}",
-     "track_columns": "{{track_columns}}",
- }
-
-
- # Function to check if dataframe is empty
- def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
-     """
-     Check if the provided dataframe is empty and raise an exception if it is.
-
-     Args:
-         df (pd.DataFrame): DataFrame to check
-         df_name (str): Name of the DataFrame
-     """
-     if df.empty:
-         msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
-         print(msg)
-         raise ValueError(msg)
-
-
- def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
-     """
-     Matches and renames DataFrame columns to match model feature names (case-insensitive).
-     Prioritizes exact matches, then case-insensitive matches.
-
-     Raises ValueError if any model features cannot be matched.
-     """
-     df_columns_lower = {col.lower(): col for col in df.columns}
-     rename_dict = {}
-     missing = []
-     for feature in model_features:
-         if feature in df.columns:
-             continue  # Exact match
-         elif feature.lower() in df_columns_lower:
-             rename_dict[df_columns_lower[feature.lower()]] = feature
-         else:
-             missing.append(feature)
-
-     if missing:
-         raise ValueError(f"Features not found: {missing}")
-
-     # Rename the DataFrame columns to match the model features
-     return df.rename(columns=rename_dict)
-
-
- def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
-     """
-     Converts appropriate columns to categorical type with consistent mappings.
-
-     Args:
-         df (pd.DataFrame): The DataFrame to process.
-         features (list): List of feature names to consider for conversion.
-         category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
-             training mode. If populated, we're in inference mode.
-
-     Returns:
-         tuple: (processed DataFrame, category mappings dictionary)
-     """
-     # Training mode
-     if category_mappings == {}:
-         for col in df.select_dtypes(include=["object", "string"]):
-             if col in features and df[col].nunique() < 20:
-                 print(f"Training mode: Converting {col} to category")
-                 df[col] = df[col].astype("category")
-                 category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
-
-     # Inference mode
-     else:
-         for col, categories in category_mappings.items():
-             if col in df.columns:
-                 print(f"Inference mode: Applying categorical mapping for {col}")
-                 df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
-
-     return df, category_mappings
-
-
- def decompress_features(
-     df: pd.DataFrame, features: List[str], compressed_features: List[str]
- ) -> Tuple[pd.DataFrame, List[str]]:
-     """Prepare features for the model by decompressing bitstring features
-
-     Args:
-         df (pd.DataFrame): The features DataFrame
-         features (List[str]): Full list of feature names
-         compressed_features (List[str]): List of feature names to decompress (bitstrings)
-
-     Returns:
-         pd.DataFrame: DataFrame with the decompressed features
-         List[str]: Updated list of feature names after decompression
-
-     Raises:
-         ValueError: If any missing values are found in the specified features
-     """
-
-     # Check for any missing values in the required features
-     missing_counts = df[features].isna().sum()
-     if missing_counts.any():
-         missing_features = missing_counts[missing_counts > 0]
-         print(
-             f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
-             "WARNING: You might want to remove/replace all NaN values before processing."
-         )
-
-     # Decompress the specified compressed features
-     decompressed_features = features.copy()
-     for feature in compressed_features:
-         if (feature not in df.columns) or (feature not in features):
-             print(f"Feature '{feature}' not in the features list, skipping decompression.")
-             continue
-
-         # Remove the feature from the list of features to avoid duplication
-         decompressed_features.remove(feature)
-
-         # Handle all compressed features as bitstrings
-         bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
-         prefix = feature[:3]
-
-         # Create all new columns at once - avoids fragmentation
-         new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
-         new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
-
-         # Add to features list
-         decompressed_features.extend(new_col_names)
-
-         # Drop original column and concatenate new ones
-         df = df.drop(columns=[feature])
-         df = pd.concat([df, new_df], axis=1)
-
-     return df, decompressed_features
-
-
- if __name__ == "__main__":
-     # Template Parameters
-     id_column = TEMPLATE_PARAMS["id_column"]
-     target = TEMPLATE_PARAMS["target"]
-     features = TEMPLATE_PARAMS["features"]
-     orig_features = features.copy()
-     compressed_features = TEMPLATE_PARAMS["compressed_features"]
-     train_all_data = TEMPLATE_PARAMS["train_all_data"]
-     track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
-     validation_split = 0.2
-
-     # Script arguments for input/output directories
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
-     parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
-     parser.add_argument(
-         "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
-     )
-     args = parser.parse_args()
-
-     # Read the training data into DataFrames
-     training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
-     print(f"Training Files: {training_files}")
-
-     # Combine files and read them all into a single pandas dataframe
-     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
-
-     # Check if the dataframe is empty
-     check_dataframe(all_df, "training_df")
-
-     # Features/Target output
-     print(f"Target: {target}")
-     print(f"Features: {str(features)}")
-
-     # Convert any features that might be categorical to 'category' type
-     all_df, category_mappings = convert_categorical_types(all_df, features)
-
-     # If we have compressed features, decompress them
-     if compressed_features:
-         print(f"Decompressing features {compressed_features}...")
-         all_df, features = decompress_features(all_df, features, compressed_features)
-
-     # Do we want to train on all the data?
-     if train_all_data:
-         print("Training on ALL of the data")
-         df_train = all_df.copy()
-         df_val = all_df.copy()
-
-     # Does the dataframe have a training column?
-     elif "training" in all_df.columns:
-         print("Found training column, splitting data based on training column")
-         df_train = all_df[all_df["training"]]
-         df_val = all_df[~all_df["training"]]
-     else:
-         # Just do a random training Split
-         print("WARNING: No training column found, splitting data with random state=42")
-         df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
-     print(f"FIT/TRAIN: {df_train.shape}")
-     print(f"VALIDATION: {df_val.shape}")
-
-     # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
-     xgb_model = XGBRegressor()
-     ngb_model = NGBRegressor()  # Dist=Cauchy) Seems to give HUGE prediction intervals
-
-     # Prepare features and targets for training
-     X_train = df_train[features]
-     X_validate = df_val[features]
-     y_train = df_train[target]
-     y_validate = df_val[target]
-
-     # Train both models using the training data
-     xgb_model.fit(X_train, y_train)
-     ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)
-
-     # Make Predictions on the Validation Set
-     print(f"Making Predictions on Validation Set...")
-     preds = xgb_model.predict(X_validate)
-
-     # Calculate various model performance metrics (regression)
-     rmse = root_mean_squared_error(y_validate, preds)
-     mae = mean_absolute_error(y_validate, preds)
-     medae = median_absolute_error(y_validate, preds)
-     r2 = r2_score(y_validate, preds)
-     spearman_corr = spearmanr(y_validate, preds).correlation
-     support = len(df_val)
-     print(f"rmse: {rmse:.3f}")
-     print(f"mae: {mae:.3f}")
-     print(f"medae: {medae:.3f}")
-     print(f"r2: {r2:.3f}")
-     print(f"spearmanr: {spearman_corr:.3f}")
-     print(f"support: {support}")
-
-     # Save the trained XGBoost model
-     xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
-
-     # Save the trained NGBoost model
-     joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))
-
-     # Save the features (this will validate input during predictions)
-     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-         json.dump(orig_features, fp)  # We save the original features, not the decompressed ones
-
-     # Now the Proximity model
-     model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
-
-     # Now serialize the model
-     model.serialize(args.model_dir)
-
-
- #
- # Inference Section
- #
- def model_fn(model_dir) -> dict:
-     """Load and return XGBoost, NGBoost, and Prox Model from model directory."""
-
-     # Load XGBoost regressor
-     xgb_path = os.path.join(model_dir, "xgb_model.json")
-     xgb_model = XGBRegressor(enable_categorical=True)
-     xgb_model.load_model(xgb_path)
-
-     # Load NGBoost regressor
-     ngb_model = joblib.load(os.path.join(model_dir, "ngb_model.joblib"))
-
-     # Deserialize the proximity model
-     prox_model = Proximity.deserialize(model_dir)
-
-     return {"xgboost": xgb_model, "ngboost": ngb_model, "proximity": prox_model}
-
-
- def input_fn(input_data, content_type):
-     """Parse input data and return a DataFrame."""
-     if not input_data:
-         raise ValueError("Empty input data is not supported!")
-
-     # Decode bytes to string if necessary
-     if isinstance(input_data, bytes):
-         input_data = input_data.decode("utf-8")
-
-     if "text/csv" in content_type:
-         return pd.read_csv(StringIO(input_data))
-     elif "application/json" in content_type:
-         return pd.DataFrame(json.loads(input_data))  # Assumes JSON array of records
-     else:
-         raise ValueError(f"{content_type} not supported!")
-
-
- def output_fn(output_df, accept_type):
-     """Supports both CSV and JSON output formats."""
-     if "text/csv" in accept_type:
-         csv_output = output_df.fillna("N/A").to_csv(index=False)  # CSV with N/A for missing values
-         return csv_output, "text/csv"
-     elif "application/json" in accept_type:
-         return output_df.to_json(orient="records"), "application/json"  # JSON array of records (NaNs -> null)
-     else:
-         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
-
-
- def predict_fn(df, models) -> pd.DataFrame:
-     """Make Predictions with our XGB Quantile Regression Model
-
-     Args:
-         df (pd.DataFrame): The input DataFrame
-         models (dict): The dictionary of models to use for predictions
-
-     Returns:
-         pd.DataFrame: The DataFrame with the predictions added
-     """
-
-     # Grab our feature columns (from training)
-     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
-         model_features = json.load(fp)
-
-     # Match features in a case-insensitive manner
-     matched_df = match_features_case_insensitive(df, model_features)
-
-     # Use XGBoost for point predictions
-     df["prediction"] = models["xgboost"].predict(matched_df[model_features])
-
-     # NGBoost predict returns distribution objects
-     y_dists = models["ngboost"].pred_dist(matched_df[model_features])
-
-     # Extract parameters from distribution
-     dist_params = y_dists.params
-
-     # Extract mean and std from distribution parameters
-     df["prediction_uq"] = dist_params["loc"]  # mean
-     df["prediction_std"] = dist_params["scale"]  # standard deviation
-
-     # Add 95% prediction intervals using ppf (percent point function)
-     # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
-     # so we need to adjust the bounds to include the point prediction
-     df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
-     df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
-
-     # Add 90% prediction intervals
-     df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
-     df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
-
-     # Add 80% prediction intervals
-     df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
-     df["q_90"] = y_dists.ppf(0.90)  # 90th percentile
-
-     # Add 50% prediction intervals
-     df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
-     df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
-
-     # Reorder the quantile columns for easier reading
-     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
-     other_cols = [col for col in df.columns if col not in quantile_cols]
-     df = df[other_cols + quantile_cols]
-
-     # Compute Nearest neighbors with Proximity model
-     models["proximity"].neighbors(df)
-
-     # Return the modified DataFrame
-     return df
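
For reference, the removed meta_uq.template follows the standard SageMaker script-mode handler contract (model_fn, input_fn, predict_fn, output_fn). The sketch below shows how those handlers could be exercised locally against a trained model directory; it is illustrative only — the module name meta_uq_script, the ./model path, and the feature columns are hypothetical and not part of the package.

import os

# predict_fn reads feature_columns.json from SM_MODEL_DIR, so point it at a local copy (assumption)
os.environ["SM_MODEL_DIR"] = "./model"

import meta_uq_script  # hypothetical module name for a rendered copy of the template above

# Load the XGBoost, NGBoost, and Proximity artifacts saved by the training section
models = meta_uq_script.model_fn("./model")

# Parse a small JSON payload into a DataFrame (columns are illustrative)
df = meta_uq_script.input_fn('[{"id": 1, "feat_a": 0.12, "feat_b": 3.4}]', "application/json")

# Run the hybrid point + UQ prediction, then serialize the result as CSV
result = meta_uq_script.predict_fn(df, models)
body, mime = meta_uq_script.output_fn(result, "text/csv")
print(mime)                  # "text/csv"
print(body.splitlines()[0])  # header includes prediction, prediction_uq, prediction_std, and q_* columns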