workbench 0.8.161__py3-none-any.whl → 0.8.192__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. workbench/algorithms/dataframe/proximity.py +143 -102
  2. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  3. workbench/api/compound.py +1 -1
  4. workbench/api/endpoint.py +12 -0
  5. workbench/api/feature_set.py +4 -4
  6. workbench/api/meta.py +5 -2
  7. workbench/api/model.py +16 -12
  8. workbench/api/monitor.py +1 -16
  9. workbench/core/artifacts/artifact.py +11 -3
  10. workbench/core/artifacts/data_capture_core.py +355 -0
  11. workbench/core/artifacts/endpoint_core.py +168 -78
  12. workbench/core/artifacts/feature_set_core.py +72 -13
  13. workbench/core/artifacts/model_core.py +50 -15
  14. workbench/core/artifacts/monitor_core.py +33 -248
  15. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  16. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  17. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  18. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  19. workbench/core/transforms/features_to_model/features_to_model.py +9 -4
  20. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  21. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  22. workbench/core/views/training_view.py +49 -53
  23. workbench/core/views/view.py +51 -1
  24. workbench/core/views/view_utils.py +4 -4
  25. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  26. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  27. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  28. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  29. workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
  30. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  31. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  32. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  33. workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
  34. workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
  35. workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
  36. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  37. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  38. workbench/model_scripts/pytorch_model/pytorch.template +19 -20
  39. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  40. workbench/model_scripts/script_generation.py +7 -2
  41. workbench/model_scripts/uq_models/mapie.template +492 -0
  42. workbench/model_scripts/uq_models/requirements.txt +1 -0
  43. workbench/model_scripts/xgb_model/xgb_model.template +31 -40
  44. workbench/repl/workbench_shell.py +11 -6
  45. workbench/scripts/lambda_launcher.py +63 -0
  46. workbench/scripts/ml_pipeline_batch.py +137 -0
  47. workbench/scripts/ml_pipeline_sqs.py +186 -0
  48. workbench/scripts/monitor_cloud_watch.py +20 -100
  49. workbench/utils/aws_utils.py +4 -3
  50. workbench/utils/chem_utils/__init__.py +0 -0
  51. workbench/utils/chem_utils/fingerprints.py +134 -0
  52. workbench/utils/chem_utils/misc.py +194 -0
  53. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  54. workbench/utils/chem_utils/mol_standardize.py +450 -0
  55. workbench/utils/chem_utils/mol_tagging.py +348 -0
  56. workbench/utils/chem_utils/projections.py +209 -0
  57. workbench/utils/chem_utils/salts.py +256 -0
  58. workbench/utils/chem_utils/sdf.py +292 -0
  59. workbench/utils/chem_utils/toxicity.py +250 -0
  60. workbench/utils/chem_utils/vis.py +253 -0
  61. workbench/utils/cloudwatch_handler.py +1 -1
  62. workbench/utils/cloudwatch_utils.py +137 -0
  63. workbench/utils/config_manager.py +3 -7
  64. workbench/utils/endpoint_utils.py +5 -7
  65. workbench/utils/license_manager.py +2 -6
  66. workbench/utils/model_utils.py +76 -30
  67. workbench/utils/monitor_utils.py +44 -62
  68. workbench/utils/pandas_utils.py +3 -3
  69. workbench/utils/shap_utils.py +10 -2
  70. workbench/utils/workbench_logging.py +0 -3
  71. workbench/utils/workbench_sqs.py +1 -1
  72. workbench/utils/xgboost_model_utils.py +283 -145
  73. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  74. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  75. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  76. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/METADATA +4 -4
  77. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/RECORD +81 -76
  78. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/entry_points.txt +3 -0
  79. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  80. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  81. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  82. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  83. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  84. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  85. workbench/model_scripts/pytorch_model/generated_model_script.py +0 -565
  86. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  87. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  88. workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
  89. workbench/model_scripts/xgb_model/generated_model_script.py +0 -477
  90. workbench/utils/chem_utils.py +0 -1556
  91. workbench/utils/execution_environment.py +0 -211
  92. workbench/utils/fast_inference.py +0 -167
  93. workbench/utils/resource_utils.py +0 -39
  94. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/WHEEL +0 -0
  95. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/licenses/LICENSE +0 -0
  96. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/top_level.txt +0 -0
@@ -1,279 +0,0 @@
- # Imports for XGB Model
- import xgboost as xgb
- import awswrangler as wr
- from sklearn.model_selection import train_test_split
-
- # Model Performance Scores
- from sklearn.metrics import (
-     mean_absolute_error,
-     r2_score,
-     root_mean_squared_error
- )
-
- from io import StringIO
- import json
- import argparse
- import os
- import pandas as pd
-
- # Template Placeholders
- TEMPLATE_PARAMS = {
-     "model_type": "{{model_type}}",
-     "target_column": "{{target_column}}",
-     "features": "{{feature_list}}",
-     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-     "train_all_data": "{{train_all_data}}"
- }
-
- # Function to check if dataframe is empty
- def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
-     """
-     Check if the provided dataframe is empty and raise an exception if it is.
-
-     Args:
-         df (pd.DataFrame): DataFrame to check
-         df_name (str): Name of the DataFrame
-     """
-     if df.empty:
-         msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
-         print(msg)
-         raise ValueError(msg)
-
-
- def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
-     """
-     Matches and renames DataFrame columns to match model feature names (case-insensitive).
-     Prioritizes exact matches, then case-insensitive matches.
-
-     Raises ValueError if any model features cannot be matched.
-     """
-     df_columns_lower = {col.lower(): col for col in df.columns}
-     rename_dict = {}
-     missing = []
-     for feature in model_features:
-         if feature in df.columns:
-             continue # Exact match
-         elif feature.lower() in df_columns_lower:
-             rename_dict[df_columns_lower[feature.lower()]] = feature
-         else:
-             missing.append(feature)
-
-     if missing:
-         raise ValueError(f"Features not found: {missing}")
-
-     # Rename the DataFrame columns to match the model features
-     return df.rename(columns=rename_dict)
-
- if __name__ == "__main__":
-     """The main function is for training the XGBoost Quantile Regression models"""
-
-     # Harness Template Parameters
-     target = TEMPLATE_PARAMS["target_column"]
-     features = TEMPLATE_PARAMS["features"]
-     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-     train_all_data = TEMPLATE_PARAMS["train_all_data"]
-     validation_split = 0.2
-     quantiles = [0.025, 0.25, 0.50, 0.75, 0.975]
-     q_models = {}
-
-     # Script arguments for input/output directories
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
-     parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
-     parser.add_argument(
-         "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
-     )
-     args = parser.parse_args()
-
-     # Load training data from the specified directory
-     training_files = [
-         os.path.join(args.train, file)
-         for file in os.listdir(args.train) if file.endswith(".csv")
-     ]
-     print(f"Training Files: {training_files}")
-
-     # Combine files and read them all into a single pandas dataframe
-     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
-
-     # Check if the DataFrame is empty
-     check_dataframe(df, "training_df")
-
-     # Training data split logic
-     if train_all_data:
-         # Use all data for both training and validation
-         print("Training on all data...")
-         df_train = df.copy()
-         df_val = df.copy()
-     elif "training" in df.columns:
-         # Split data based on a 'training' column if it exists
-         print("Splitting data based on 'training' column...")
-         df_train = df[df["training"]].copy()
-         df_val = df[~df["training"]].copy()
-     else:
-         # Perform a random split if no 'training' column is found
-         print("Splitting data randomly...")
-         df_train, df_val = train_test_split(df, test_size=validation_split, random_state=42)
-
-     # Features/Target output
-     print(f"Target: {target}")
-     print(f"Features: {str(features)}")
-     print(f"Data Shape: {df.shape}")
-
-     # Prepare features and targets for training
-     X_train = df_train[features]
-     X_val = df_val[features]
-     y_train = df_train[target]
-     y_val = df_val[target]
-
-     # Train models for each of the quantiles
-     for q in quantiles:
-         params = {
-             "objective": "reg:quantileerror",
-             "quantile_alpha": q,
-         }
-         model = xgb.XGBRegressor(**params)
-         model.fit(X_train, y_train)
-
-         # Convert quantile to string
-         q_str = f"q_{int(q * 100)}" if (q * 100) == int(q * 100) else f"q_{int(q * 1000):03d}"
-
-         # Store the model
-         q_models[q_str] = model
-
-     # Run predictions for each quantile
-     quantile_predictions = {q: model.predict(X_val) for q, model in q_models.items()}
-
-     # Create a copy of the validation DataFrame and add the new columns
-     result_df = df_val[[target]].copy()
-
-     # Add the quantile predictions to the DataFrame
-     for name, preds in quantile_predictions.items():
-         result_df[name] = preds
-
-     # Add the median as the main prediction
-     result_df["prediction"] = result_df["q_50"]
-
-     # Now compute residuals on the prediction
-     result_df["residual"] = result_df[target] - result_df["prediction"]
-     result_df["residual_abs"] = result_df["residual"].abs()
-
-     # Save the results dataframe to S3
-     wr.s3.to_csv(
-         result_df,
-         path=f"{model_metrics_s3_path}/validation_predictions.csv",
-         index=False,
-     )
-
-     # Report Performance Metrics
-     rmse = root_mean_squared_error(result_df[target], result_df["prediction"])
-     mae = mean_absolute_error(result_df[target], result_df["prediction"])
-     r2 = r2_score(result_df[target], result_df["prediction"])
-     print(f"RMSE: {rmse:.3f}")
-     print(f"MAE: {mae:.3f}")
-     print(f"R2: {r2:.3f}")
-     print(f"NumRows: {len(result_df)}")
-
-     # Now save the quantile models
-     for name, model in q_models.items():
-         model_path = os.path.join(args.model_dir, f"{name}.json")
-         print(f"Saving model: {model_path}")
-         model.save_model(model_path)
-
-     # Also save the features (this will validate input during predictions)
-     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-         json.dump(features, fp)
-
-
- def model_fn(model_dir) -> dict:
-     """Deserialized and return all the fitted models from the model directory.
-
-     Args:
-         model_dir (str): The directory where the models are stored.
-
-     Returns:
-         dict: A dictionary of the models.
-     """
-
-     # Load ALL the Quantile models from the model directory
-     models = {}
-     for file in os.listdir(model_dir):
-         if file.startswith("q") and file.endswith(".json"): # The Quantile models
-             # Load the model
-             model_path = os.path.join(model_dir, file)
-             print(f"Loading model: {model_path}")
-             model = xgb.XGBRegressor()
-             model.load_model(model_path)
-
-             # Store the quantile model
-             q_name = os.path.splitext(file)[0]
-             models[q_name] = model
-
-     # Return all the models
-     return models
-
-
- def input_fn(input_data, content_type):
-     """Parse input data and return a DataFrame."""
-     if not input_data:
-         raise ValueError("Empty input data is not supported!")
-
-     # Decode bytes to string if necessary
-     if isinstance(input_data, bytes):
-         input_data = input_data.decode("utf-8")
-
-     if "text/csv" in content_type:
-         return pd.read_csv(StringIO(input_data))
-     elif "application/json" in content_type:
-         return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
-     else:
-         raise ValueError(f"{content_type} not supported!")
-
-
- def output_fn(output_df, accept_type):
-     """Supports both CSV and JSON output formats."""
-     if "text/csv" in accept_type:
-         csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
-         return csv_output, "text/csv"
-     elif "application/json" in accept_type:
-         return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
-     else:
-         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
-
-
- def predict_fn(df, models) -> pd.DataFrame:
-     """Make Predictions with our XGB Quantile Regression Model
-
-     Args:
-         df (pd.DataFrame): The input DataFrame
-         models (dict): The dictionary of models to use for predictions
-
-     Returns:
-         pd.DataFrame: The DataFrame with the predictions added
-     """
-
-     # Grab our feature columns (from training)
-     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
-         model_features = json.load(fp)
-     print(f"Model Features: {model_features}")
-
-     # We're going match features in a case-insensitive manner, accounting for all the permutations
-     # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
-     # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
-     matched_df = match_features_case_insensitive(df, model_features)
-
-     # Predict the features against all the models
-     for name, model in models.items():
-         df[name] = model.predict(matched_df[model_features])
-
-     # Use the median prediction as the main prediction
-     df["prediction"] = df["q_50"]
-
-     # Estimate the standard deviation of the predictions using the interquartile range
-     df["prediction_std"] = (df["q_75"] - df["q_25"]) / 1.35
-
-     # Reorganize the columns so they are in alphabetical order
-     df = df.reindex(sorted(df.columns), axis=1)
-
-     # All done, return the DataFrame
-     return df
@@ -1 +0,0 @@
- # Note: In general this file should be empty (as the default inference image has all required libraries)
@@ -1,307 +0,0 @@
- # Model Imports (this will be replaced with the imports for the template)
- None
-
- # Template Placeholders
- TEMPLATE_PARAMS = {
-     "model_type": "regressor",
-     "target_column": "solubility",
-     "feature_list": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
-     "model_class": PyTorch,
-     "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-pytorch-reg/training",
-     "train_all_data": False
- }
-
- import awswrangler as wr
- from sklearn.preprocessing import LabelEncoder, StandardScaler
- from sklearn.model_selection import train_test_split
- from sklearn.pipeline import Pipeline
-
- from io import StringIO
- import json
- import argparse
- import joblib
- import os
- import pandas as pd
- from typing import List
-
- # Global model_type for both training and inference
- model_type = TEMPLATE_PARAMS["model_type"]
-
-
- # Function to check if dataframe is empty
- def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
-     """Check if the DataFrame is empty and raise an error if so."""
-     if df.empty:
-         msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
-         print(msg)
-         raise ValueError(msg)
-
-
- # Function to expand probability column into individual class probability columns
- def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
-     """Expand 'pred_proba' column into separate columns for each class label."""
-     proba_column = "pred_proba"
-     if proba_column not in df.columns:
-         raise ValueError('DataFrame does not contain a "pred_proba" column')
-
-     # Create new columns for each class label's probability
-     new_col_names = [f"{label}_proba" for label in class_labels]
-     proba_df = pd.DataFrame(df[proba_column].tolist(), columns=new_col_names)
-
-     # Drop the original 'pred_proba' column and reset the index
-     df = df.drop(columns=[proba_column]).reset_index(drop=True)
-
-     # Concatenate the new probability columns with the original DataFrame
-     df = pd.concat([df, proba_df], axis=1)
-     return df
-
-
- # Function to match DataFrame columns to model features (case-insensitive)
- def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
-     """Match and rename DataFrame columns to match the model's features, case-insensitively."""
-     # Create a set of exact matches from the DataFrame columns
-     exact_match_set = set(df.columns)
-
-     # Create a case-insensitive map of DataFrame columns
-     column_map = {col.lower(): col for col in df.columns}
-     rename_dict = {}
-
-     # Build a dictionary for renaming columns based on case-insensitive matching
-     for feature in model_features:
-         if feature in exact_match_set:
-             rename_dict[feature] = feature
-         elif feature.lower() in column_map:
-             rename_dict[column_map[feature.lower()]] = feature
-
-     # Rename columns in the DataFrame to match model features
-     return df.rename(columns=rename_dict)
-
-
- #
- # Training Section
- #
- if __name__ == "__main__":
-     # Template Parameters
-     target = TEMPLATE_PARAMS["target_column"] # Can be None for unsupervised models
-     feature_list = TEMPLATE_PARAMS["feature_list"]
-     model_class = TEMPLATE_PARAMS["model_class"]
-     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-     train_all_data = TEMPLATE_PARAMS["train_all_data"]
-     validation_split = 0.2
-
-     # Script arguments for input/output directories
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
-     parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
-     parser.add_argument(
-         "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
-     )
-     args = parser.parse_args()
-
-     # Load training data from the specified directory
-     training_files = [
-         os.path.join(args.train, file)
-         for file in os.listdir(args.train) if file.endswith(".csv")
-     ]
-     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
-
-     # Check if the DataFrame is empty
-     check_dataframe(all_df, "training_df")
-
-     # Initialize the model using the specified model class
-     model = model_class()
-
-     # Determine if standardization is needed based on the model type
-     needs_standardization = model_type in ["clusterer", "projection"]
-
-     if needs_standardization:
-         # Create a pipeline with standardization and the model
-         model = Pipeline([
-             ("scaler", StandardScaler()),
-             ("model", model)
-         ])
-
-     # Handle logic based on the model_type
-     if model_type in ["classifier", "regressor"]:
-         # Supervised Models: Prepare for training
-         if train_all_data:
-             # Use all data for both training and validation
-             print("Training on all data...")
-             df_train = all_df.copy()
-             df_val = all_df.copy()
-         elif "training" in all_df.columns:
-             # Split data based on a 'training' column if it exists
-             print("Splitting data based on 'training' column...")
-             df_train = all_df[all_df["training"]].copy()
-             df_val = all_df[~all_df["training"]].copy()
-         else:
-             # Perform a random split if no 'training' column is found
-             print("Splitting data randomly...")
-             df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
-
-         # Encode the target variable if the model is a classifier
-         label_encoder = None
-         if model_type == "classifier" and target:
-             label_encoder = LabelEncoder()
-             df_train[target] = label_encoder.fit_transform(df_train[target])
-             df_val[target] = label_encoder.transform(df_val[target])
-
-         # Prepare features and targets for training
-         X_train = df_train[feature_list]
-         X_val = df_val[feature_list]
-         y_train = df_train[target] if target else None
-         y_val = df_val[target] if target else None
-
-         # Train the model using the training data
-         model.fit(X_train, y_train)
-
-         # Make predictions and handle classification-specific logic
-         preds = model.predict(X_val)
-         if model_type == "classifier" and target:
-             # Get class probabilities and expand them into separate columns
-             probs = model.predict_proba(X_val)
-             df_val["pred_proba"] = [p.tolist() for p in probs]
-             df_val = expand_proba_column(df_val, label_encoder.classes_)
-
-             # Decode the target and prediction labels
-             df_val[target] = label_encoder.inverse_transform(df_val[target])
-             preds = label_encoder.inverse_transform(preds)
-
-         # Add predictions to the validation DataFrame
-         df_val["prediction"] = preds
-
-         # Save the validation predictions to S3
-         output_columns = [target, "prediction"] + [col for col in df_val.columns if col.endswith("_proba")]
-         wr.s3.to_csv(df_val[output_columns], path=f"{model_metrics_s3_path}/validation_predictions.csv", index=False)
-
-     elif model_type == "clusterer":
-         # Unsupervised Clustering Models: Assign cluster labels
-         all_df["cluster"] = model.fit_predict(all_df[feature_list])
-
-     elif model_type == "projection":
-         # Projection Models: Apply transformation and label first three components as x, y, z
-         transformed_data = model.fit_transform(all_df[feature_list])
-         num_components = transformed_data.shape[1]
-
-         # Special labels for the first three components, if they exist
-         special_labels = ["x", "y", "z"]
-         for i in range(num_components):
-             if i < len(special_labels):
-                 all_df[special_labels[i]] = transformed_data[:, i]
-             else:
-                 all_df[f"component_{i + 1}"] = transformed_data[:, i]
-
-     elif model_type == "transformer":
-         # Transformer Models: Apply transformation and use generic component labels
-         transformed_data = model.fit_transform(all_df[feature_list])
-         for i in range(transformed_data.shape[1]):
-             all_df[f"component_{i + 1}"] = transformed_data[:, i]
-
-     # Save the trained model and any necessary assets
-     joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
-     if model_type == "classifier" and label_encoder:
-         joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
-
-     # Save the feature list to validate input during predictions
-     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-         json.dump(feature_list, fp)
-
- #
- # Inference Section
- #
- def model_fn(model_dir):
-     """Load and return the model from the specified directory."""
-     return joblib.load(os.path.join(model_dir, "model.joblib"))
-
-
- def input_fn(input_data, content_type):
-     """Parse input data and return a DataFrame."""
-     if not input_data:
-         raise ValueError("Empty input data is not supported!")
-
-     # Decode bytes to string if necessary
-     if isinstance(input_data, bytes):
-         input_data = input_data.decode("utf-8")
-
-     if "text/csv" in content_type:
-         return pd.read_csv(StringIO(input_data))
-     elif "application/json" in content_type:
-         return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
-     else:
-         raise ValueError(f"{content_type} not supported!")
-
-
- def output_fn(output_df, accept_type):
-     """Supports both CSV and JSON output formats."""
-     if "text/csv" in accept_type:
-         csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
-         return csv_output, "text/csv"
-     elif "application/json" in accept_type:
-         return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
-     else:
-         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
-
-
- def predict_fn(df, model):
-     """Make predictions or apply transformations using the model and return the DataFrame with results."""
-     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-
-     # Load feature columns from the saved file
-     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
-         model_features = json.load(fp)
-
-     # Load label encoder if available (for classification models)
-     label_encoder = None
-     if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
-         label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
-
-     # Match features in a case-insensitive manner
-     matched_df = match_features_case_insensitive(df, model_features)
-
-     # Initialize a dictionary to store the results
-     results = {}
-
-     # Determine how to handle the model based on its available methods
-     if hasattr(model, "predict"):
-         # For supervised models (classifier or regressor)
-         predictions = model.predict(matched_df[model_features])
-         results["prediction"] = predictions
-
-     elif hasattr(model, "fit_predict"):
-         # For clustering models (e.g., DBSCAN)
-         clusters = model.fit_predict(matched_df[model_features])
-         results["cluster"] = clusters
-
-     elif hasattr(model, "fit_transform") and not hasattr(model, "predict"):
-         # For transformation/projection models (e.g., t-SNE, PCA)
-         transformed_data = model.fit_transform(matched_df[model_features])
-
-         # Handle 2D projection models specifically
-         if model_type == "projection" and transformed_data.shape[1] == 2:
-             results["x"] = transformed_data[:, 0]
-             results["y"] = transformed_data[:, 1]
-         else:
-             # General case for any number of components
-             for i in range(transformed_data.shape[1]):
-                 results[f"component_{i + 1}"] = transformed_data[:, i]
-
-     else:
-         # Raise an error if the model does not support the expected methods
-         raise ValueError("Model does not support predict, fit_predict, or fit_transform methods.")
-
-     # Decode predictions if using a label encoder (for classification)
-     if label_encoder and "prediction" in results:
-         results["prediction"] = label_encoder.inverse_transform(results["prediction"])
-
-     # Add the results to the DataFrame
-     for key, value in results.items():
-         df[key] = value
-
-     # Add probability columns if the model supports it (for classification)
-     if hasattr(model, "predict_proba"):
-         probs = model.predict_proba(matched_df[model_features])
-         df["pred_proba"] = [p.tolist() for p in probs]
-         df = expand_proba_column(df, label_encoder.classes_)
-
-     # Return the modified DataFrame
-     return df