workbench 0.8.168__py3-none-any.whl → 0.8.192__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. workbench/algorithms/dataframe/proximity.py +143 -102
  2. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  3. workbench/api/compound.py +1 -1
  4. workbench/api/endpoint.py +3 -2
  5. workbench/api/feature_set.py +4 -4
  6. workbench/api/model.py +16 -12
  7. workbench/api/monitor.py +1 -16
  8. workbench/core/artifacts/artifact.py +11 -3
  9. workbench/core/artifacts/data_capture_core.py +355 -0
  10. workbench/core/artifacts/endpoint_core.py +113 -27
  11. workbench/core/artifacts/feature_set_core.py +72 -13
  12. workbench/core/artifacts/model_core.py +50 -15
  13. workbench/core/artifacts/monitor_core.py +33 -249
  14. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  15. workbench/core/cloud_platform/aws/aws_meta.py +11 -4
  16. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  17. workbench/core/transforms/features_to_model/features_to_model.py +9 -4
  18. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  19. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  20. workbench/core/views/training_view.py +49 -53
  21. workbench/core/views/view.py +51 -1
  22. workbench/core/views/view_utils.py +4 -4
  23. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  24. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  25. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  26. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  27. workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
  28. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  29. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  30. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  31. workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
  32. workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
  33. workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
  34. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  35. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  36. workbench/model_scripts/pytorch_model/pytorch.template +9 -18
  37. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  38. workbench/model_scripts/script_generation.py +7 -2
  39. workbench/model_scripts/uq_models/mapie.template +492 -0
  40. workbench/model_scripts/uq_models/requirements.txt +1 -0
  41. workbench/model_scripts/xgb_model/xgb_model.template +31 -40
  42. workbench/repl/workbench_shell.py +4 -4
  43. workbench/scripts/lambda_launcher.py +63 -0
  44. workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +49 -51
  45. workbench/scripts/ml_pipeline_sqs.py +186 -0
  46. workbench/utils/chem_utils/__init__.py +0 -0
  47. workbench/utils/chem_utils/fingerprints.py +134 -0
  48. workbench/utils/chem_utils/misc.py +194 -0
  49. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  50. workbench/utils/chem_utils/mol_standardize.py +450 -0
  51. workbench/utils/chem_utils/mol_tagging.py +348 -0
  52. workbench/utils/chem_utils/projections.py +209 -0
  53. workbench/utils/chem_utils/salts.py +256 -0
  54. workbench/utils/chem_utils/sdf.py +292 -0
  55. workbench/utils/chem_utils/toxicity.py +250 -0
  56. workbench/utils/chem_utils/vis.py +253 -0
  57. workbench/utils/config_manager.py +2 -6
  58. workbench/utils/endpoint_utils.py +5 -7
  59. workbench/utils/license_manager.py +2 -6
  60. workbench/utils/model_utils.py +76 -30
  61. workbench/utils/monitor_utils.py +44 -62
  62. workbench/utils/pandas_utils.py +3 -3
  63. workbench/utils/shap_utils.py +10 -2
  64. workbench/utils/workbench_sqs.py +1 -1
  65. workbench/utils/xgboost_model_utils.py +283 -145
  66. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  67. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  68. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  69. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/METADATA +2 -1
  70. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/RECORD +74 -70
  71. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/entry_points.txt +3 -1
  72. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  73. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  74. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  75. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  76. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  77. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  78. workbench/model_scripts/pytorch_model/generated_model_script.py +0 -576
  79. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  80. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  81. workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
  82. workbench/model_scripts/xgb_model/generated_model_script.py +0 -477
  83. workbench/utils/chem_utils.py +0 -1556
  84. workbench/utils/fast_inference.py +0 -167
  85. workbench/utils/resource_utils.py +0 -39
  86. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/WHEEL +0 -0
  87. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/licenses/LICENSE +0 -0
  88. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/top_level.txt +0 -0
@@ -1,477 +0,0 @@
- # Imports for XGB Model
- import xgboost as xgb
- import awswrangler as wr
- import numpy as np
-
- # Model Performance Scores
- from sklearn.metrics import (
-     mean_absolute_error,
-     r2_score,
-     root_mean_squared_error,
-     precision_recall_fscore_support,
-     confusion_matrix,
- )
-
- # Classification Encoder
- from sklearn.preprocessing import LabelEncoder
-
- # Scikit Learn Imports
- from sklearn.model_selection import train_test_split
-
- from io import StringIO
- import json
- import argparse
- import joblib
- import os
- import pandas as pd
- from typing import List, Tuple
-
- # Template Parameters
- TEMPLATE_PARAMS = {
-     "model_type": "classifier",
-     "target_column": "class",
- "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v', 'pred_pka_reg'],
-     "compressed_features": [],
-     "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/sol-with-pka-class-100-test/training",
-     "train_all_data": True
- }
-
- # Function to check if dataframe is empty
- def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
-     """
-     Check if the provided dataframe is empty and raise an exception if it is.
-
-     Args:
-         df (pd.DataFrame): DataFrame to check
-         df_name (str): Name of the DataFrame
-     """
-     if df.empty:
-         msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
-         print(msg)
-         raise ValueError(msg)
-
-
- def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
-     """
-     Expands a column in a DataFrame containing a list of probabilities into separate columns.
-
-     Args:
-         df (pd.DataFrame): DataFrame containing a "pred_proba" column
-         class_labels (List[str]): List of class labels
-
-     Returns:
-         pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
-     """
-
-     # Sanity check
-     proba_column = "pred_proba"
-     if proba_column not in df.columns:
-         raise ValueError('DataFrame does not contain a "pred_proba" column')
-
-     # Construct new column names with '_proba' suffix
-     proba_splits = [f"{label}_proba" for label in class_labels]
-
-     # Expand the proba_column into separate columns for each probability
-     proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
-
-     # Drop any proba columns and reset the index in prep for the concat
-     df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
-     df = df.reset_index(drop=True)
-
-     # Concatenate the new columns with the original DataFrame
-     df = pd.concat([df, proba_df], axis=1)
-     print(df)
-     return df
-
-
- def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
-     """
-     Matches and renames DataFrame columns to match model feature names (case-insensitive).
-     Prioritizes exact matches, then case-insensitive matches.
-
-     Raises ValueError if any model features cannot be matched.
-     """
-     df_columns_lower = {col.lower(): col for col in df.columns}
-     rename_dict = {}
-     missing = []
-
-     for feature in model_features:
-         if feature in df.columns:
-             continue  # Exact match
-         elif feature.lower() in df_columns_lower:
-             rename_dict[df_columns_lower[feature.lower()]] = feature
-         else:
-             missing.append(feature)
-
-     if missing:
-         raise ValueError(f"Features not found: {missing}")
-
-     return df.rename(columns=rename_dict)
-
-
- def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
-     """
-     Converts appropriate columns to categorical type with consistent mappings.
-
-     Args:
-         df (pd.DataFrame): The DataFrame to process.
-         features (list): List of feature names to consider for conversion.
-         category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
-             training mode. If populated, we're in inference mode.
-
-     Returns:
-         tuple: (processed DataFrame, category mappings dictionary)
-     """
-     # Training mode
-     if category_mappings == {}:
-         for col in df.select_dtypes(include=["object", "string"]):
-             if col in features and df[col].nunique() < 20:
-                 print(f"Training mode: Converting {col} to category")
-                 df[col] = df[col].astype("category")
-                 category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
-
-     # Inference mode
-     else:
-         for col, categories in category_mappings.items():
-             if col in df.columns:
-                 print(f"Inference mode: Applying categorical mapping for {col}")
-                 df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
-
-     return df, category_mappings
-
-
- def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
-     """Prepare features for the XGBoost model
-
-     Args:
-         df (pd.DataFrame): The features DataFrame
-         features (List[str]): Full list of feature names
-         compressed_features (List[str]): List of feature names to decompress (bitstrings)
-
-     Returns:
-         pd.DataFrame: DataFrame with the decompressed features
-         List[str]: Updated list of feature names after decompression
-
-     Raises:
-         ValueError: If any missing values are found in the specified features
-     """
-
-     # Check for any missing values in the required features
-     missing_counts = df[features].isna().sum()
-     if missing_counts.any():
-         missing_features = missing_counts[missing_counts > 0]
-         print(
-             f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
-             "WARNING: You might want to remove/replace all NaN values before processing."
-         )
-
-     # Decompress the specified compressed features
-     decompressed_features = features
-     for feature in compressed_features:
-         if (feature not in df.columns) or (feature not in features):
-             print(f"Feature '{feature}' not in the features list, skipping decompression.")
-             continue
-
-         # Remove the feature from the list of features to avoid duplication
-         decompressed_features.remove(feature)
-
-         # Handle all compressed features as bitstrings
-         bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
-         prefix = feature[:3]
-
-         # Create all new columns at once - avoids fragmentation
-         new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
-         new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
-
-         # Add to features list
-         decompressed_features.extend(new_col_names)
-
-         # Drop original column and concatenate new ones
-         df = df.drop(columns=[feature])
-         df = pd.concat([df, new_df], axis=1)
-
-     return df, decompressed_features
-
-
- if __name__ == "__main__":
-     """The main function is for training the XGBoost model"""
-
-     # Harness Template Parameters
-     target = TEMPLATE_PARAMS["target_column"]
-     features = TEMPLATE_PARAMS["features"]
-     orig_features = features.copy()
-     compressed_features = TEMPLATE_PARAMS["compressed_features"]
-     model_type = TEMPLATE_PARAMS["model_type"]
-     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-     train_all_data = TEMPLATE_PARAMS["train_all_data"]
-     validation_split = 0.2
-
-     # Script arguments for input/output directories
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
-     parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
-     parser.add_argument(
-         "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
-     )
-     args = parser.parse_args()
-
-     # Read the training data into DataFrames
-     training_files = [
-         os.path.join(args.train, file)
-         for file in os.listdir(args.train)
-         if file.endswith(".csv")
-     ]
-     print(f"Training Files: {training_files}")
-
-     # Combine files and read them all into a single pandas dataframe
-     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
-
-     # Check if the dataframe is empty
-     check_dataframe(all_df, "training_df")
-
-     # Features/Target output
-     print(f"Target: {target}")
-     print(f"Features: {str(features)}")
-
-     # Convert any features that might be categorical to 'category' type
-     all_df, category_mappings = convert_categorical_types(all_df, features)
-
-     # If we have compressed features, decompress them
-     if compressed_features:
-         print(f"Decompressing features {compressed_features}...")
-         all_df, features = decompress_features(all_df, features, compressed_features)
-
-     # Do we want to train on all the data?
-     if train_all_data:
-         print("Training on ALL of the data")
-         df_train = all_df.copy()
-         df_val = all_df.copy()
-
-     # Does the dataframe have a training column?
-     elif "training" in all_df.columns:
-         print("Found training column, splitting data based on training column")
-         df_train = all_df[all_df["training"]]
-         df_val = all_df[~all_df["training"]]
-     else:
-         # Just do a random training Split
-         print("WARNING: No training column found, splitting data with random state=42")
-         df_train, df_val = train_test_split(
-             all_df, test_size=validation_split, random_state=42
-         )
-     print(f"FIT/TRAIN: {df_train.shape}")
-     print(f"VALIDATION: {df_val.shape}")
-
-     # Now spin up our XGB Model
-     if model_type == "classifier":
-         xgb_model = xgb.XGBClassifier(enable_categorical=True)
-
-         # Encode the target column
-         label_encoder = LabelEncoder()
-         df_train[target] = label_encoder.fit_transform(df_train[target])
-         df_val[target] = label_encoder.transform(df_val[target])
-
-     else:
-         xgb_model = xgb.XGBRegressor(enable_categorical=True)
-         label_encoder = None  # We don't need this for regression
-
-     # Grab our Features, Target and Train the Model
-     y_train = df_train[target]
-     X_train = df_train[features]
-     xgb_model.fit(X_train, y_train)
-
-     # Make Predictions on the Validation Set
-     print("Making Predictions on Validation Set...")
-     y_validate = df_val[target]
-     X_validate = df_val[features]
-     preds = xgb_model.predict(X_validate)
-     if model_type == "classifier":
-         # Also get the probabilities for each class
-         print("Processing Probabilities...")
-         probs = xgb_model.predict_proba(X_validate)
-         df_val["pred_proba"] = [p.tolist() for p in probs]
-
-         # Expand the pred_proba column into separate columns for each class
-         print(df_val.columns)
-         df_val = expand_proba_column(df_val, label_encoder.classes_)
-         print(df_val.columns)
-
-         # Decode the target and prediction labels
-         y_validate = label_encoder.inverse_transform(y_validate)
-         preds = label_encoder.inverse_transform(preds)
-
-     # Save predictions to S3 (just the target, prediction, and '_proba' columns)
-     df_val["prediction"] = preds
-     output_columns = [target, "prediction"]
-     output_columns += [col for col in df_val.columns if col.endswith("_proba")]
-     wr.s3.to_csv(
-         df_val[output_columns],
-         path=f"{model_metrics_s3_path}/validation_predictions.csv",
-         index=False,
-     )
-
-     # Report Performance Metrics
-     if model_type == "classifier":
-         # Get the label names and their integer mapping
-         label_names = label_encoder.classes_
-
-         # Calculate various model performance metrics
-         scores = precision_recall_fscore_support(
-             y_validate, preds, average=None, labels=label_names
-         )
-
-         # Put the scores into a dataframe
-         score_df = pd.DataFrame(
-             {
-                 target: label_names,
-                 "precision": scores[0],
-                 "recall": scores[1],
-                 "fscore": scores[2],
-                 "support": scores[3],
-             }
-         )
-
-         # We need to get creative with the Classification Metrics
-         metrics = ["precision", "recall", "fscore", "support"]
-         for t in label_names:
-             for m in metrics:
-                 value = score_df.loc[score_df[target] == t, m].iloc[0]
-                 print(f"Metrics:{t}:{m} {value}")
-
-         # Compute and output the confusion matrix
-         conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
-         for i, row_name in enumerate(label_names):
-             for j, col_name in enumerate(label_names):
-                 value = conf_mtx[i, j]
-                 print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
-
-     else:
-         # Calculate various model performance metrics (regression)
-         rmse = root_mean_squared_error(y_validate, preds)
-         mae = mean_absolute_error(y_validate, preds)
-         r2 = r2_score(y_validate, preds)
-         print(f"RMSE: {rmse:.3f}")
-         print(f"MAE: {mae:.3f}")
-         print(f"R2: {r2:.3f}")
-         print(f"NumRows: {len(df_val)}")
-
-     # Now save the model to the standard place/name
-     xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
-     if label_encoder:
-         joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
-
-     # Save the features (this will validate input during predictions)
-     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-         json.dump(orig_features, fp)  # We save the original features, not the decompressed ones
-
-     # Save the category mappings
-     with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
-         json.dump(category_mappings, fp)
-
-
- def model_fn(model_dir):
-     """Deserialize and return fitted XGBoost model"""
-
-     model_path = os.path.join(model_dir, "xgb_model.json")
-
-     with open(model_path, "r") as f:
-         model_json = json.load(f)
-
-     sklearn_data = model_json['learner']['attributes']['scikit_learn']
-     model_type = json.loads(sklearn_data)['_estimator_type']
-
-     model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
-     model = model_class(enable_categorical=True)
-     model.load_model(model_path)
-
-     return model
-
-
- def input_fn(input_data, content_type):
-     """Parse input data and return a DataFrame."""
-     if not input_data:
-         raise ValueError("Empty input data is not supported!")
-
-     # Decode bytes to string if necessary
-     if isinstance(input_data, bytes):
-         input_data = input_data.decode("utf-8")
-
-     if "text/csv" in content_type:
-         return pd.read_csv(StringIO(input_data))
-     elif "application/json" in content_type:
-         return pd.DataFrame(json.loads(input_data))  # Assumes JSON array of records
-     else:
-         raise ValueError(f"{content_type} not supported!")
-
-
- def output_fn(output_df, accept_type):
-     """Supports both CSV and JSON output formats."""
-     if "text/csv" in accept_type:
-         csv_output = output_df.fillna("N/A").to_csv(index=False)  # CSV with N/A for missing values
-         return csv_output, "text/csv"
-     elif "application/json" in accept_type:
-         return output_df.to_json(orient="records"), "application/json"  # JSON array of records (NaNs -> null)
-     else:
-         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
-
-
- def predict_fn(df, model) -> pd.DataFrame:
-     """Make Predictions with our XGB Model
-
-     Args:
-         df (pd.DataFrame): The input DataFrame
-         model: The model to use for predictions
-
-     Returns:
-         pd.DataFrame: The DataFrame with the predictions added
-     """
-     compressed_features = TEMPLATE_PARAMS["compressed_features"]
-
-     # Grab our feature columns (from training)
-     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
-         features = json.load(fp)
-     print(f"Model Features: {features}")
-
-     # Load the category mappings (from training)
-     with open(os.path.join(model_dir, "category_mappings.json")) as fp:
-         category_mappings = json.load(fp)
-
-     # Load our Label Encoder if we have one
-     label_encoder = None
-     if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
-         label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
-
-     # We're going to match features in a case-insensitive manner, accounting for all the permutations
-     # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
-     # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
-     matched_df = match_features_case_insensitive(df, features)
-
-     # Detect categorical types in the incoming DataFrame
-     matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
-
-     # If we have compressed features, decompress them
-     if compressed_features:
-         print("Decompressing features for prediction...")
-         matched_df, features = decompress_features(matched_df, features, compressed_features)
-
-     # Predict the features against our XGB Model
-     X = matched_df[features]
-     predictions = model.predict(X)
-
-     # If we have a label encoder, decode the predictions
-     if label_encoder:
-         predictions = label_encoder.inverse_transform(predictions)
-
-     # Set the predictions on the DataFrame
-     df["prediction"] = predictions
-
-     # Does our model have a 'predict_proba' method? If so we will call it and add the results to the DataFrame
-     if getattr(model, "predict_proba", None):
-         probs = model.predict_proba(matched_df[features])
-         df["pred_proba"] = [p.tolist() for p in probs]
-
-         # Expand the pred_proba column into separate columns for each class
-         df = expand_proba_column(df, label_encoder.classes_)
-
-     # All done, return the DataFrame with new columns for the predictions
-     return df