workbench 0.8.189__py3-none-any.whl → 0.8.190__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of workbench might be problematic. Click here for more details.

@@ -1,307 +0,0 @@
1
- # Model Imports (this will be replaced with the imports for the template). NOTE(review): in this rendering the placeholder became the bare expression `None` on the next line, so no model class is imported - confirm the template was filled in correctly
2
- None
3
-
4
- # Template Placeholders (in this rendering they already hold concrete values: a "regressor" predicting the "solubility" column)
5
- TEMPLATE_PARAMS = {
6
- "model_type": "regressor",
7
- "target_column": "solubility",
8
- "feature_list": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
9
- "model_class": PyTorch,  # NOTE(review): `PyTorch` is not imported or defined anywhere in the visible file; evaluating this dict would raise NameError - confirm
10
- "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-pytorch-reg/training",
11
- "train_all_data": False  # when True, the training section fits and validates on the same full data set
12
- }
13
-
14
- import awswrangler as wr
15
- from sklearn.preprocessing import LabelEncoder, StandardScaler
16
- from sklearn.model_selection import train_test_split
17
- from sklearn.pipeline import Pipeline
18
-
19
- from io import StringIO
20
- import json
21
- import argparse
22
- import joblib
23
- import os
24
- import pandas as pd
25
- from typing import List
26
-
27
- # Global model_type for both training and inference: read by the training section to pick the supervised/clusterer/projection/transformer branch, and by predict_fn to label 2D projection output
28
- model_type = TEMPLATE_PARAMS["model_type"]
29
-
30
-
31
# Guard that stops the run when a DataFrame has no rows
def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
    """Abort with a ValueError when ``df`` contains no rows.

    Args:
        df: The DataFrame to inspect.
        df_name: Label for the DataFrame, interpolated into the failure message.

    Raises:
        ValueError: If ``df.empty`` is true. The same message is printed
            first so it also shows up in the job's stdout.
    """
    if not df.empty:
        return

    failure = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
    print(failure)
    raise ValueError(failure)
38
-
39
-
40
# Function to split the list-valued probability column into one column per class
def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
    """Replace the 'pred_proba' column with one ``<label>_proba`` column per class.

    Args:
        df: Frame whose 'pred_proba' cells each hold a sequence of probabilities.
        class_labels: Labels assigned positionally to the entries of each
            probability sequence.

    Returns:
        A new frame with a fresh 0..n-1 index: the original columns minus
        'pred_proba', followed by the per-class probability columns. The
        input frame is not modified.

    Raises:
        ValueError: If ``df`` has no 'pred_proba' column.
    """
    source_col = "pred_proba"
    if source_col not in df.columns:
        raise ValueError('DataFrame does not contain a "pred_proba" column')

    # One row per input row, one column per class label
    per_class = pd.DataFrame(
        df[source_col].tolist(),
        columns=[f"{label}_proba" for label in class_labels],
    )

    # The index is reset so the positional concat below lines the rows up
    remaining = df.drop(columns=[source_col]).reset_index(drop=True)
    return pd.concat([remaining, per_class], axis=1)
57
-
58
-
59
# Function to align DataFrame column names with the model's feature names, ignoring case
def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
    """Return ``df`` with columns renamed to the spelling used in ``model_features``.

    A feature that already appears verbatim among the columns is kept as is.
    Otherwise, a column whose lowercase form equals the feature's lowercase
    form is renamed to the feature. Features with no counterpart are ignored,
    and columns that match no feature are left untouched.

    Args:
        df: Input frame (not modified; ``DataFrame.rename`` returns a new frame).
        model_features: Feature names the model expects.

    Returns:
        A renamed copy of ``df``.
    """
    present = set(df.columns)
    # When two columns differ only by case, the later one wins this lookup
    by_lowercase = {name.lower(): name for name in df.columns}

    renames = {}
    for wanted in model_features:
        if wanted in present:
            # Identity entry: may overwrite an earlier case-insensitive mapping
            renames[wanted] = wanted
            continue
        folded = wanted.lower()
        if folded in by_lowercase:
            renames[by_lowercase[folded]] = wanted

    return df.rename(columns=renames)
78
-
79
-
80
#
# Training Section
#
# Runs only when the file is executed as a script. It loads every CSV in the
# training directory, fits the templated model according to `model_type`,
# writes validation predictions to S3 for supervised models, and saves the
# model, optional label encoder, and feature list into the model directory.
if __name__ == "__main__":
    # Template Parameters
    target = TEMPLATE_PARAMS["target_column"]  # Can be None for unsupervised models
    feature_list = TEMPLATE_PARAMS["feature_list"]
    model_class = TEMPLATE_PARAMS["model_class"]
    model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
    train_all_data = TEMPLATE_PARAMS["train_all_data"]
    validation_split = 0.2

    # Script arguments for input/output directories (defaults come from SM_* environment variables)
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
    parser.add_argument(
        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
    )
    args = parser.parse_args()

    # Load training data from the specified directory.
    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # (and therefore the seeded random split below) reproducible across runs.
    training_files = sorted(
        os.path.join(args.train, file)
        for file in os.listdir(args.train) if file.endswith(".csv")
    )

    # pd.concat raises an opaque "No objects to concatenate" on an empty list,
    # so fail early with a message in the same style as check_dataframe.
    if not training_files:
        msg = f"*** No .csv training files found in {args.train}! ***STOPPING***"
        print(msg)
        raise ValueError(msg)

    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

    # Check if the DataFrame is empty
    check_dataframe(all_df, "training_df")

    # Initialize the model using the specified model class
    model = model_class()

    # Determine if standardization is needed based on the model type
    needs_standardization = model_type in ["clusterer", "projection"]

    if needs_standardization:
        # Create a pipeline with standardization and the model
        model = Pipeline([
            ("scaler", StandardScaler()),
            ("model", model)
        ])

    # Handle logic based on the model_type
    if model_type in ["classifier", "regressor"]:
        # Supervised Models: Prepare for training
        if train_all_data:
            # Use all data for both training and validation
            print("Training on all data...")
            df_train = all_df.copy()
            df_val = all_df.copy()
        elif "training" in all_df.columns:
            # Split data based on a 'training' column if it exists
            # (the column is used directly as a boolean mask)
            print("Splitting data based on 'training' column...")
            df_train = all_df[all_df["training"]].copy()
            df_val = all_df[~all_df["training"]].copy()
        else:
            # Perform a random split if no 'training' column is found
            print("Splitting data randomly...")
            df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)

        # Encode the target variable if the model is a classifier
        label_encoder = None
        if model_type == "classifier" and target:
            label_encoder = LabelEncoder()
            df_train[target] = label_encoder.fit_transform(df_train[target])
            df_val[target] = label_encoder.transform(df_val[target])

        # Prepare features and targets for training
        X_train = df_train[feature_list]
        X_val = df_val[feature_list]
        y_train = df_train[target] if target else None
        y_val = df_val[target] if target else None

        # Train the model using the training data
        model.fit(X_train, y_train)

        # Make predictions and handle classification-specific logic
        preds = model.predict(X_val)
        if model_type == "classifier" and target:
            # Get class probabilities and expand them into separate columns
            probs = model.predict_proba(X_val)
            df_val["pred_proba"] = [p.tolist() for p in probs]
            df_val = expand_proba_column(df_val, label_encoder.classes_)

            # Decode the target and prediction labels
            df_val[target] = label_encoder.inverse_transform(df_val[target])
            preds = label_encoder.inverse_transform(preds)

        # Add predictions to the validation DataFrame
        df_val["prediction"] = preds

        # Save the validation predictions to S3
        output_columns = [target, "prediction"] + [col for col in df_val.columns if col.endswith("_proba")]
        wr.s3.to_csv(df_val[output_columns], path=f"{model_metrics_s3_path}/validation_predictions.csv", index=False)

    elif model_type == "clusterer":
        # Unsupervised Clustering Models: Assign cluster labels
        all_df["cluster"] = model.fit_predict(all_df[feature_list])

    elif model_type == "projection":
        # Projection Models: Apply transformation and label first three components as x, y, z
        transformed_data = model.fit_transform(all_df[feature_list])
        num_components = transformed_data.shape[1]

        # Special labels for the first three components, if they exist
        special_labels = ["x", "y", "z"]
        for i in range(num_components):
            if i < len(special_labels):
                all_df[special_labels[i]] = transformed_data[:, i]
            else:
                all_df[f"component_{i + 1}"] = transformed_data[:, i]

    elif model_type == "transformer":
        # Transformer Models: Apply transformation and use generic component labels
        transformed_data = model.fit_transform(all_df[feature_list])
        for i in range(transformed_data.shape[1]):
            all_df[f"component_{i + 1}"] = transformed_data[:, i]

    # Save the trained model and any necessary assets.
    # The short-circuit on model_type keeps label_encoder from being read in
    # the unsupervised branches, where it is never assigned.
    joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
    if model_type == "classifier" and label_encoder:
        joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))

    # Save the feature list to validate input during predictions
    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
        json.dump(feature_list, fp)
208
-
209
- #
210
- # Inference Section
211
- #
212
def model_fn(model_dir):
    """Deserialize 'model.joblib' from ``model_dir`` with joblib and return the loaded object."""
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)
215
-
216
-
217
def input_fn(input_data, content_type):
    """Turn a request payload into a DataFrame.

    Args:
        input_data: The payload, as ``str`` or UTF-8 encoded ``bytes``.
        content_type: The request content type. It is matched by substring,
            so parameters such as ``; charset=utf-8`` are tolerated.
            'text/csv' is checked before 'application/json'.

    Returns:
        A DataFrame parsed from CSV text, or built from the decoded JSON value
        (a JSON array of records is assumed).

    Raises:
        ValueError: If the payload is empty/falsy or the content type is
            neither CSV nor JSON.
    """
    if not input_data:
        raise ValueError("Empty input data is not supported!")

    # Work on text from here on; bytes are assumed to be UTF-8
    payload = input_data.decode("utf-8") if isinstance(input_data, bytes) else input_data

    if "text/csv" in content_type:
        return pd.read_csv(StringIO(payload))

    if "application/json" in content_type:
        records = json.loads(payload)
        return pd.DataFrame(records)

    raise ValueError(f"{content_type} not supported!")
232
-
233
-
234
def output_fn(output_df, accept_type):
    """Serialize the result DataFrame as CSV or JSON.

    Args:
        output_df: The DataFrame to serialize.
        accept_type: The requested response type, matched by substring.
            'text/csv' is checked before 'application/json'.

    Returns:
        A ``(body, content_type)`` tuple. CSV output has no index column and
        shows missing values as 'N/A'. JSON output is an array of records,
        with missing values left to ``DataFrame.to_json``.

    Raises:
        RuntimeError: If ``accept_type`` names neither CSV nor JSON.
    """
    if "text/csv" in accept_type:
        filled = output_df.fillna("N/A")  # missing values become the literal N/A
        return filled.to_csv(index=False), "text/csv"

    if "application/json" in accept_type:
        body = output_df.to_json(orient="records")
        return body, "application/json"

    raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
243
-
244
-
245
def predict_fn(df, model):
    """Run the model on ``df`` and return the DataFrame with result columns added.

    The feature list (and, if present, the label encoder) is read from the
    directory named by the SM_MODEL_DIR environment variable, defaulting to
    '/opt/ml/model'. Input columns are matched to the saved feature names
    case-insensitively before the model is called.

    The model is dispatched on the methods it exposes:
      * ``predict``       -> a 'prediction' column
      * ``fit_predict``   -> a 'cluster' column (re-fits on the incoming data)
      * ``fit_transform`` -> 'x'/'y' for a 2-component projection, otherwise
        'component_<i>' columns (re-fits on the incoming data)

    If the model also exposes ``predict_proba``, one '<label>_proba' column is
    added per class. Labels come from the saved label encoder, or from
    ``model.classes_`` when no encoder was saved. If neither is available, the
    raw 'pred_proba' list column is kept.

    Args:
        df: Input DataFrame. Result columns are written onto it in place.
        model: The object returned by ``model_fn``.

    Returns:
        The DataFrame with results. When probabilities are expanded this is a
        new frame with a reset index.

    Raises:
        ValueError: If the model has none of predict, fit_predict, fit_transform.
    """
    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")

    # Load feature columns from the saved file
    with open(os.path.join(model_dir, "feature_columns.json")) as fp:
        model_features = json.load(fp)

    # Load label encoder if available (for classification models)
    label_encoder = None
    encoder_path = os.path.join(model_dir, "label_encoder.joblib")
    if os.path.exists(encoder_path):
        label_encoder = joblib.load(encoder_path)

    # Match features in a case-insensitive manner
    matched_df = match_features_case_insensitive(df, model_features)

    # Initialize a dictionary to store the results
    results = {}

    # Determine how to handle the model based on its available methods
    if hasattr(model, "predict"):
        # For supervised models (classifier or regressor)
        results["prediction"] = model.predict(matched_df[model_features])

    elif hasattr(model, "fit_predict"):
        # For clustering models (e.g., DBSCAN)
        results["cluster"] = model.fit_predict(matched_df[model_features])

    elif hasattr(model, "fit_transform"):
        # For transformation/projection models (e.g., t-SNE, PCA).
        # Models with `predict` were already handled by the first branch.
        transformed_data = model.fit_transform(matched_df[model_features])

        # Handle 2D projection models specifically
        if model_type == "projection" and transformed_data.shape[1] == 2:
            results["x"] = transformed_data[:, 0]
            results["y"] = transformed_data[:, 1]
        else:
            # General case for any number of components
            for i in range(transformed_data.shape[1]):
                results[f"component_{i + 1}"] = transformed_data[:, i]

    else:
        # Raise an error if the model does not support the expected methods
        raise ValueError("Model does not support predict, fit_predict, or fit_transform methods.")

    # Decode predictions if using a label encoder (for classification)
    if label_encoder is not None and "prediction" in results:
        results["prediction"] = label_encoder.inverse_transform(results["prediction"])

    # Add the results to the DataFrame
    for key, value in results.items():
        df[key] = value

    # Add probability columns if the model supports it (for classification)
    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(matched_df[model_features])
        df["pred_proba"] = [p.tolist() for p in probs]

        # The encoder file exists only when training saved one, so do not
        # dereference it unconditionally; fall back to the model's own labels.
        if label_encoder is not None:
            class_labels = label_encoder.classes_
        else:
            class_labels = getattr(model, "classes_", None)

        if class_labels is not None:
            df = expand_proba_column(df, class_labels)

    # Return the modified DataFrame
    return df