workbench 0.8.201__py3-none-any.whl → 0.8.203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. workbench/api/df_store.py +17 -108
  2. workbench/api/feature_set.py +41 -7
  3. workbench/api/parameter_store.py +3 -52
  4. workbench/core/artifacts/artifact.py +5 -5
  5. workbench/core/artifacts/df_store_core.py +114 -0
  6. workbench/core/artifacts/endpoint_core.py +203 -58
  7. workbench/core/artifacts/model_core.py +11 -7
  8. workbench/core/artifacts/parameter_store_core.py +98 -0
  9. workbench/core/transforms/features_to_model/features_to_model.py +27 -13
  10. workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
  11. workbench/model_scripts/chemprop/chemprop.template +297 -295
  12. workbench/model_scripts/chemprop/generated_model_script.py +300 -298
  13. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +11 -5
  14. workbench/model_scripts/custom_models/uq_models/meta_uq.template +11 -5
  15. workbench/model_scripts/custom_models/uq_models/ngboost.template +11 -5
  16. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +11 -5
  17. workbench/model_scripts/pytorch_model/generated_model_script.py +278 -128
  18. workbench/model_scripts/pytorch_model/pytorch.template +273 -123
  19. workbench/model_scripts/uq_models/generated_model_script.py +19 -10
  20. workbench/model_scripts/uq_models/mapie.template +17 -8
  21. workbench/model_scripts/xgb_model/generated_model_script.py +38 -9
  22. workbench/model_scripts/xgb_model/xgb_model.template +34 -5
  23. workbench/resources/open_source_api.key +1 -1
  24. workbench/utils/chemprop_utils.py +38 -1
  25. workbench/utils/pytorch_utils.py +38 -8
  26. workbench/web_interface/components/model_plot.py +7 -1
  27. {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/METADATA +2 -2
  28. {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/RECORD +32 -32
  29. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  30. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -296
  31. {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/WHEEL +0 -0
  32. {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/entry_points.txt +0 -0
  33. {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/licenses/LICENSE +0 -0
  34. {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/top_level.txt +0 -0
@@ -36,8 +36,8 @@ from workbench.utils.cache import Cache
  from workbench.utils.s3_utils import compute_s3_object_hash
  from workbench.utils.model_utils import uq_metrics
  from workbench.utils.xgboost_model_utils import cross_fold_inference as xgboost_cross_fold
- from workbench.utils.pytorch_utils import cross_fold_inference as pytorch_cross_fold
- from workbench.utils.chemprop_utils import cross_fold_inference as chemprop_cross_fold
+ from workbench.utils.pytorch_utils import pull_cv_results as pytorch_pull_cv
+ from workbench.utils.chemprop_utils import pull_cv_results as chemprop_pull_cv
  from workbench_bridges.endpoints.fast_inference import fast_inference
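The import rename tracks a behavior change that appears in the cross-fold hunk further down: for PyTorch and ChemProp models the endpoint no longer recomputes k-fold cross-validation on demand, it pulls results that were saved at training time. A minimal sketch of the before/after calling contract (stub bodies are hypothetical; the names and the `(metrics, out_of_fold_df)` tuple shape come from the call sites in this diff):

```python
import pandas as pd

# Hypothetical stubs illustrating the contract change; the real helpers live in
# workbench.utils.pytorch_utils / workbench.utils.chemprop_utils.
def cross_fold_inference(model, nfolds: int = 5) -> tuple[pd.DataFrame, pd.DataFrame]:
    """0.8.201 behavior: re-run k-fold CV on demand (expensive)."""
    raise NotImplementedError

def pull_cv_results(model) -> tuple[pd.DataFrame, pd.DataFrame]:
    """0.8.203 behavior: fetch (metrics, out_of_fold_df) saved during training; no nfolds."""
    raise NotImplementedError
```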
@@ -389,7 +389,7 @@ class EndpointCore(Artifact):
  # Grab the model features and target column
  model = ModelCore(self.model_name)
  features = model.features()
- target_column = model.target()
+ targets = model.target()  # Note: We have multi-target models (so this could be a list)

  # Run predictions on the evaluation data
  prediction_df = self._predict(eval_df, features, drop_error_rows)
@@ -397,19 +397,26 @@ class EndpointCore(Artifact):
      self.log.warning("No predictions were made. Returning empty DataFrame.")
      return prediction_df

+ # FIXME: Multi-target support - currently uses first target for metrics
+ # Normalize targets to handle both string and list formats
+ if isinstance(targets, list):
+     primary_target = targets[0] if targets else None
+ else:
+     primary_target = targets
+
  # Sanity Check that the target column is present
- if target_column and (target_column not in prediction_df.columns):
-     self.log.important(f"Target Column {target_column} not found in prediction_df!")
+ if primary_target and (primary_target not in prediction_df.columns):
+     self.log.important(f"Target Column {primary_target} not found in prediction_df!")
      self.log.important("In order to compute metrics, the target column must be present!")
      metrics = pd.DataFrame()

  # Compute the standard performance metrics for this model
  else:
      if model.model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]:
-         prediction_df = self.residuals(target_column, prediction_df)
-         metrics = self.regression_metrics(target_column, prediction_df)
+         prediction_df = self.residuals(primary_target, prediction_df)
+         metrics = self.regression_metrics(primary_target, prediction_df)
      elif model.model_type == ModelType.CLASSIFIER:
-         metrics = self.classification_metrics(target_column, prediction_df)
+         metrics = self.classification_metrics(primary_target, prediction_df)
      else:
          # For other model types, we don't compute metrics
          self.log.info(f"Model Type: {model.model_type} doesn't have metrics...")
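Since `model.target()` now returns either a string or a list, every consumer needs the same normalization. The pattern in isolation, as a self-contained sketch (target names are hypothetical):

```python
from typing import Optional, Union

def primary_target_of(targets: Union[str, list, None]) -> Optional[str]:
    # Same normalization as above: first element of a list, the string itself,
    # or None for an empty list / None
    if isinstance(targets, list):
        return targets[0] if targets else None
    return targets

assert primary_target_of("solubility") == "solubility"
assert primary_target_of(["logS", "logP"]) == "logS"  # hypothetical multi-target model
assert primary_target_of([]) is None
```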
@@ -426,14 +433,47 @@ class EndpointCore(Artifact):
  if id_column is None:
      fs = FeatureSetCore(model.get_input())
      id_column = fs.id_column
- description = capture_name.replace("_", " ").title()
- self._capture_inference_results(
-     capture_name, prediction_df, target_column, model.model_type, metrics, description, features, id_column
- )
+
+ # Normalize targets to a list for iteration
+ target_list = targets if isinstance(targets, list) else [targets]
+
+ # For multi-target models, use target-specific capture names (e.g., auto_target1, auto_target2)
+ # For single-target models, use the original capture name for backward compatibility
+ for target in target_list:
+     # Determine capture name: use prefix for multi-target, original name for single-target
+     if len(target_list) > 1:
+         prefix = "auto" if "auto" in capture_name else capture_name
+         target_capture_name = f"{prefix}_{target}"
+     else:
+         target_capture_name = capture_name
+
+     description = target_capture_name.replace("_", " ").title()
+
+     # Drop rows with NaN target values for metrics/plots
+     target_df = prediction_df.dropna(subset=[target])
+
+     # Compute per-target metrics
+     if model.model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]:
+         target_metrics = self.regression_metrics(target, target_df)
+     elif model.model_type == ModelType.CLASSIFIER:
+         target_metrics = self.classification_metrics(target, target_df)
+     else:
+         target_metrics = pd.DataFrame()
+
+     self._capture_inference_results(
+         target_capture_name,
+         target_df,
+         target,
+         model.model_type,
+         target_metrics,
+         description,
+         features,
+         id_column,
+     )

  # For UQ Models we also capture the uncertainty metrics
  if model.model_type in [ModelType.UQ_REGRESSOR]:
-     metrics = uq_metrics(prediction_df, target_column)
+     metrics = uq_metrics(prediction_df, primary_target)
      self.param_store.upsert(f"/workbench/models/{model.name}/inference/{capture_name}", metrics)

  # Return the prediction DataFrame
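The capture-name rule above means a two-target model run with `capture_name="auto_inference"` produces one capture per target, while single-target models keep the original name. A standalone sketch of the naming logic (target names are hypothetical):

```python
def derive_capture_names(capture_name: str, target_list: list) -> list:
    # Single target: keep the original capture name (backward compatible)
    if len(target_list) <= 1:
        return [capture_name]
    # Multi-target: one capture per target, e.g. auto_logS, auto_logP
    prefix = "auto" if "auto" in capture_name else capture_name
    return [f"{prefix}_{target}" for target in target_list]

print(derive_capture_names("auto_inference", ["logS"]))          # ['auto_inference']
print(derive_capture_names("auto_inference", ["logS", "logP"]))  # ['auto_logS', 'auto_logP']
```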
@@ -453,12 +493,13 @@ class EndpointCore(Artifact):
  model = ModelCore(self.model_name)

  # Compute CrossFold (Metrics and Prediction Dataframe)
+ # For PyTorch and ChemProp, pull pre-computed CV results from training
  if model.model_framework in [ModelFramework.UNKNOWN, ModelFramework.XGBOOST]:
      cross_fold_metrics, out_of_fold_df = xgboost_cross_fold(model, nfolds=nfolds)
  elif model.model_framework == ModelFramework.PYTORCH_TABULAR:
-     cross_fold_metrics, out_of_fold_df = pytorch_cross_fold(model, nfolds=nfolds)
+     cross_fold_metrics, out_of_fold_df = pytorch_pull_cv(model)
  elif model.model_framework == ModelFramework.CHEMPROP:
-     cross_fold_metrics, out_of_fold_df = chemprop_cross_fold(model, nfolds=nfolds)
+     cross_fold_metrics, out_of_fold_df = chemprop_pull_cv(model)
  else:
      self.log.error(f"Cross-Fold Inference not supported for Model Framework: {model.model_framework}.")
      return pd.DataFrame()
@@ -475,9 +516,7 @@ class EndpointCore(Artifact):
      return out_of_fold_df

  # Capture the results
- capture_name = "full_cross_fold"
- description = capture_name.replace("_", " ").title()
- target_column = model.target()
+ targets = model.target()  # Note: We have multi-target models (so this could be a list)
  model_type = model.model_type

  # Get the id_column from the model's FeatureSet
@@ -486,7 +525,7 @@ class EndpointCore(Artifact):

  # Is this a UQ Model? If so, run full inference and merge the results
  additional_columns = []
- if model_type == ModelType.UQ_REGRESSOR:
+ if model.model_framework == ModelFramework.XGBOOST and model_type == ModelType.UQ_REGRESSOR:
      self.log.important("UQ Regressor detected, running full inference to get uncertainty estimates...")

      # Get the training view dataframe for inference
@@ -495,9 +534,11 @@ class EndpointCore(Artifact):
  # Run inference on the endpoint to get UQ outputs
  uq_df = self.inference(training_df)

- # Identify UQ-specific columns (quantiles and prediction_std)
+ # Identify UQ-specific columns (quantiles, prediction_std, *_pred_std)
  uq_columns = [
-     col for col in uq_df.columns if col.startswith("q_") or col == "prediction_std" or col == "confidence"
+     col
+     for col in uq_df.columns
+     if col.startswith("q_") or col == "prediction_std" or col.endswith("_pred_std") or col == "confidence"
  ]

  # Merge UQ columns with out-of-fold predictions
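The widened comprehension adds the `endswith("_pred_std")` clause, which is how per-target uncertainty columns from multi-target models get picked up alongside the existing quantile and std columns. The predicate in isolation (sample column names are hypothetical):

```python
def is_uq_column(col: str) -> bool:
    # Mirrors the comprehension above
    return (
        col.startswith("q_")
        or col == "prediction_std"
        or col.endswith("_pred_std")
        or col == "confidence"
    )

cols = ["id", "logS", "logS_pred", "logS_pred_std", "q_05", "q_95", "confidence"]
print([c for c in cols if is_uq_column(c)])
# ['logS_pred_std', 'q_05', 'q_95', 'confidence']
```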
@@ -513,20 +554,42 @@ class EndpointCore(Artifact):
      additional_columns = uq_columns
      self.log.info(f"Added UQ columns: {', '.join(additional_columns)}")

-     # Also compute UQ metrics
-     metrics = uq_metrics(out_of_fold_df, target_column)
-     self.param_store.upsert(f"/workbench/models/{model.name}/inference/{capture_name}", metrics)
+     # Also compute UQ metrics (use first target for multi-target models)
+     primary_target = targets[0] if isinstance(targets, list) else targets
+     metrics = uq_metrics(out_of_fold_df, primary_target)
+     self.param_store.upsert(f"/workbench/models/{model.name}/inference/full_cross_fold", metrics)
+
+ # Normalize targets to a list for iteration
+ target_list = targets if isinstance(targets, list) else [targets]
+
+ # For multi-target models, use target-specific capture names (e.g., cv_target1, cv_target2)
+ # For single-target models, use "full_cross_fold" for backward compatibility
+ for target in target_list:
+     capture_name = f"cv_{target}"
+     description = capture_name.replace("_", " ").title()
+
+     # Drop rows with NaN target values for metrics/plots
+     target_df = out_of_fold_df.dropna(subset=[target])
+
+     # Compute per-target metrics
+     if model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]:
+         target_metrics = self.regression_metrics(target, target_df)
+     elif model_type == ModelType.CLASSIFIER:
+         target_metrics = self.classification_metrics(target, target_df)
+     else:
+         target_metrics = pd.DataFrame()
+
+     self._capture_inference_results(
+         capture_name,
+         target_df,
+         target,
+         model_type,
+         target_metrics,
+         description,
+         features=additional_columns,
+         id_column=id_column,
+     )

- self._capture_inference_results(
-     capture_name,
-     out_of_fold_df,
-     target_column,
-     model_type,
-     cross_fold_metrics,
-     description,
-     features=additional_columns,
-     id_column=id_column,
- )
  return out_of_fold_df

  def fast_inference(self, eval_df: pd.DataFrame, threads: int = 4) -> pd.DataFrame:
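Cross-fold captures are now written per target under `cv_{target}`; note that, as written, the loop applies the `cv_` prefix to single-target models as well, while the UQ metrics keep the fixed `full_cross_fold` Parameter Store key. A quick sketch of the resulting names (model and target names are hypothetical):

```python
targets = ["logS", "logP"]  # hypothetical multi-target model
capture_names = [f"cv_{t}" for t in targets]
print(capture_names)  # ['cv_logS', 'cv_logP']

# UQ metrics keep a fixed Parameter Store key regardless of target count:
uq_key = "/workbench/models/my_model/inference/full_cross_fold"  # 'my_model' is hypothetical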
@@ -732,23 +795,47 @@ class EndpointCore(Artifact):
      combined = row_hashes.values.tobytes()
      return hashlib.md5(combined).hexdigest()[:hash_length]

+ @staticmethod
+ def _find_prediction_column(df: pd.DataFrame, target_column: str) -> Optional[str]:
+     """Find the prediction column in a DataFrame.
+
+     Looks for 'prediction' column first, then '{target}_pred' pattern.
+
+     Args:
+         df: DataFrame to search
+         target_column: Name of the target column (used for {target}_pred pattern)
+
+     Returns:
+         Name of the prediction column, or None if not found
+     """
+     # Check for 'prediction' column first (legacy/standard format)
+     if "prediction" in df.columns:
+         return "prediction"
+
+     # Check for '{target}_pred' format (multi-target format)
+     target_pred_col = f"{target_column}_pred"
+     if target_pred_col in df.columns:
+         return target_pred_col
+
+     return None
+
  def _capture_inference_results(
      self,
      capture_name: str,
      pred_results_df: pd.DataFrame,
-     target_column: str,
+     target: str,
      model_type: ModelType,
      metrics: pd.DataFrame,
      description: str,
      features: list,
      id_column: str = None,
  ):
-     """Internal: Capture the inference results and metrics to S3
+     """Internal: Capture the inference results and metrics to S3 for a single target

      Args:
          capture_name (str): Name of the inference capture
          pred_results_df (pd.DataFrame): DataFrame with the prediction results
-         target_column (str): Name of the target column
+         target (str): Target column name
          model_type (ModelType): Type of the model (e.g. REGRESSOR, CLASSIFIER)
          metrics (pd.DataFrame): DataFrame with the performance metrics
          description (str): Description of the inference results
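The new `_find_prediction_column` helper centralizes a lookup that previously appeared inline in several methods (and silently fell back to a `predictions` column). A self-contained equivalent, exercised on both naming schemes (the DataFrames below are hypothetical):

```python
from typing import Optional
import pandas as pd

def find_prediction_column(df: pd.DataFrame, target_column: str) -> Optional[str]:
    # Same lookup order as the helper: standard name first, then "{target}_pred"
    if "prediction" in df.columns:
        return "prediction"
    target_pred_col = f"{target_column}_pred"
    return target_pred_col if target_pred_col in df.columns else None

single = pd.DataFrame({"logS": [1.2], "prediction": [1.1]})
multi = pd.DataFrame({"logS": [1.2], "logS_pred": [1.1], "logP": [2.0], "logP_pred": [2.1]})
print(find_prediction_column(single, "logS"))  # prediction
print(find_prediction_column(multi, "logP"))   # logP_pred
print(find_prediction_column(multi, "tox"))    # None
```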
@@ -779,26 +866,12 @@ class EndpointCore(Artifact):
  self.log.info(f"Writing metrics to {inference_capture_path}/inference_metrics.csv")
  wr.s3.to_csv(metrics, f"{inference_capture_path}/inference_metrics.csv", index=False)

- # Grab the ID column and target column if they are present
- output_columns = []
- if id_column and id_column in pred_results_df.columns:
-     output_columns.append(id_column)
- if target_column in pred_results_df.columns:
-     output_columns.append(target_column)
-
- # Grab the prediction column, any _proba columns, and UQ columns
- output_columns += [col for col in pred_results_df.columns if "prediction" in col]
- output_columns += [col for col in pred_results_df.columns if col.endswith("_proba")]
- output_columns += [col for col in pred_results_df.columns if col.startswith("q_") or col == "confidence"]
-
- # Write the predictions to our S3 Model Inference Folder
- self.log.info(f"Writing predictions to {inference_capture_path}/inference_predictions.csv")
- subset_df = pred_results_df[output_columns]
- wr.s3.to_csv(subset_df, f"{inference_capture_path}/inference_predictions.csv", index=False)
+ # Save the inference predictions for this target
+ self._save_target_inference(inference_capture_path, pred_results_df, target, id_column)

  # CLASSIFIER: Write the confusion matrix to our S3 Model Inference Folder
  if model_type == ModelType.CLASSIFIER:
-     conf_mtx = self.generate_confusion_matrix(target_column, pred_results_df)
+     conf_mtx = self.generate_confusion_matrix(target, pred_results_df)
      self.log.info(f"Writing confusion matrix to {inference_capture_path}/inference_cm.csv")
      # Note: Unlike other dataframes here, we want to write the index (labels) to the CSV
      wr.s3.to_csv(conf_mtx, f"{inference_capture_path}/inference_cm.csv", index=True)
@@ -808,6 +881,57 @@ class EndpointCore(Artifact):
      model = ModelCore(self.model_name)
      model._load_inference_metrics(capture_name)

+ def _save_target_inference(
+     self,
+     inference_capture_path: str,
+     pred_results_df: pd.DataFrame,
+     target: str,
+     id_column: str = None,
+ ):
+     """Save inference results for a single target.
+
+     Args:
+         inference_capture_path (str): S3 path for inference capture
+         pred_results_df (pd.DataFrame): DataFrame with prediction results
+         target (str): Target column name
+         id_column (str, optional): Name of the ID column
+     """
+     # Start with ID column if present
+     output_columns = []
+     if id_column and id_column in pred_results_df.columns:
+         output_columns.append(id_column)
+
+     # Add target column if present
+     if target and target in pred_results_df.columns:
+         output_columns.append(target)
+
+     # Build the output DataFrame
+     output_df = pred_results_df[output_columns].copy() if output_columns else pd.DataFrame()
+
+     # For multi-task: map {target}_pred -> prediction, {target}_pred_std -> prediction_std
+     # For single-task: just grab prediction and prediction_std columns directly
+     pred_col = f"{target}_pred"
+     std_col = f"{target}_pred_std"
+     if pred_col in pred_results_df.columns:
+         # Multi-task columns exist
+         output_df["prediction"] = pred_results_df[pred_col]
+         if std_col in pred_results_df.columns:
+             output_df["prediction_std"] = pred_results_df[std_col]
+     else:
+         # Single-task: grab standard prediction columns
+         for col in ["prediction", "prediction_std"]:
+             if col in pred_results_df.columns:
+                 output_df[col] = pred_results_df[col]
+         # Also grab any _proba columns and UQ columns
+         for col in pred_results_df.columns:
+             if col.endswith("_proba") or col.startswith("q_") or col == "confidence":
+                 output_df[col] = pred_results_df[col]
+
+     # Write the predictions to S3
+     output_file = f"{inference_capture_path}/inference_predictions.csv"
+     self.log.info(f"Writing predictions to {output_file}")
+     wr.s3.to_csv(output_df, output_file, index=False)

  def regression_metrics(self, target_column: str, prediction_df: pd.DataFrame) -> pd.DataFrame:
      """Compute the performance metrics for this Endpoint
      Args:
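The column remapping in `_save_target_inference` is what lets downstream consumers keep reading a uniform `prediction`/`prediction_std` schema no matter how many targets the model has. A worked example of the multi-task branch (column names follow the `{target}_pred` convention; the values are hypothetical):

```python
import pandas as pd

df = pd.DataFrame({
    "id": [1, 2],
    "logS": [0.50, 1.10],
    "logS_pred": [0.55, 1.02],
    "logS_pred_std": [0.05, 0.07],
})

# Per-target extraction: rename "{target}_pred[_std]" back to the single-task names
target = "logS"
out = df[["id", target]].copy()
out["prediction"] = df[f"{target}_pred"]
out["prediction_std"] = df[f"{target}_pred_std"]
print(list(out.columns))  # ['id', 'logS', 'prediction', 'prediction_std']
```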
@@ -822,8 +946,13 @@ class EndpointCore(Artifact):
      self.log.warning("No predictions were made. Returning empty DataFrame.")
      return pd.DataFrame()

+ # Find the prediction column: "prediction" or "{target}_pred"
+ prediction_col = self._find_prediction_column(prediction_df, target_column)
+ if prediction_col is None:
+     self.log.warning(f"No prediction column found for target '{target_column}'")
+     return pd.DataFrame()
+
  # Check for NaN values in target or prediction columns
- prediction_col = "prediction" if "prediction" in prediction_df.columns else "predictions"
  if prediction_df[target_column].isnull().any() or prediction_df[prediction_col].isnull().any():
      # Compute the number of NaN values in each column
      num_nan_target = prediction_df[target_column].isnull().sum()
@@ -874,7 +1003,13 @@ class EndpointCore(Artifact):

  # Compute the residuals
  y_true = prediction_df[target_column]
- prediction_col = "prediction" if "prediction" in prediction_df.columns else "predictions"
+
+ # Find the prediction column: "prediction" or "{target}_pred"
+ prediction_col = self._find_prediction_column(prediction_df, target_column)
+ if prediction_col is None:
+     self.log.warning(f"No prediction column found for target '{target_column}'. Cannot compute residuals.")
+     return prediction_df
+
  y_pred = prediction_df[prediction_col]

  # Check for classification scenario
@@ -916,8 +1051,13 @@ class EndpointCore(Artifact):
  Returns:
      pd.DataFrame: DataFrame with the performance metrics
  """
+ # Find the prediction column: "prediction" or "{target}_pred"
+ prediction_col = self._find_prediction_column(prediction_df, target_column)
+ if prediction_col is None:
+     self.log.warning(f"No prediction column found for target '{target_column}'")
+     return pd.DataFrame()
+
  # Drop rows with NaN predictions (can't compute metrics on missing predictions)
- prediction_col = "prediction" if "prediction" in prediction_df.columns else "predictions"
  nan_mask = prediction_df[prediction_col].isna()
  if nan_mask.any():
      n_nan = nan_mask.sum()
@@ -986,8 +1126,13 @@ class EndpointCore(Artifact):
  Returns:
      pd.DataFrame: DataFrame with the confusion matrix
  """
+ # Find the prediction column: "prediction" or "{target}_pred"
+ prediction_col = self._find_prediction_column(prediction_df, target_column)
+ if prediction_col is None:
+     self.log.warning(f"No prediction column found for target '{target_column}'")
+     return pd.DataFrame()
+
  # Drop rows with NaN predictions (can't include in confusion matrix)
- prediction_col = "prediction" if "prediction" in prediction_df.columns else "predictions"
  nan_mask = prediction_df[prediction_col].isna()
  if nan_mask.any():
      n_nan = nan_mask.sum()
@@ -263,21 +263,25 @@ class ModelCore(Artifact):
      else:
          self.log.important(f"No inference data found for {self.model_name}!")

- def get_inference_metrics(self, capture_name: str = "latest") -> Union[pd.DataFrame, None]:
+ def get_inference_metrics(self, capture_name: str = "any") -> Union[pd.DataFrame, None]:
      """Retrieve the inference performance metrics for this model

      Args:
-         capture_name (str, optional): Specific capture_name or "training" (default: "latest")
+         capture_name (str, optional): Specific capture_name (default: "any")
      Returns:
          pd.DataFrame: DataFrame of the Model Metrics

      Note:
-         If a capture_name isn't specified this will try to return something reasonable
+         If a capture_name isn't specified this will try to return the 'first' available metrics
      """
      # Try to get the auto_capture 'training_holdout' or the training
-     if capture_name == "latest":
-         metrics_df = self.get_inference_metrics("auto_inference")
-         return metrics_df if metrics_df is not None else self.get_inference_metrics("model_training")
+     if capture_name == "any":
+         metric_list = self.list_inference_runs()
+         if metric_list:
+             return self.get_inference_metrics(metric_list[0])
+         else:
+             self.log.warning(f"No performance metrics found for {self.model_name}!")
+             return None

      # Grab the metrics captured during model training (could return None)
      if capture_name == "model_training":
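The `"any"` default no longer hard-codes a preferred capture; it simply takes the first run that `list_inference_runs()` reports. A sketch of the equivalent fallback (assuming only that the model object exposes those two methods):

```python
def first_available_metrics(model):
    # Equivalent to get_inference_metrics("any"): first listed run, or None
    runs = model.list_inference_runs()
    return model.get_inference_metrics(runs[0]) if runs else None
```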
@@ -869,7 +873,7 @@ class ModelCore(Artifact):
      return self.df_store.get(f"/workbench/models/{self.name}/shap_data")
  else:
      # Loop over the SHAP data and return a dict of DataFrames
-     shap_dfs = self.df_store.list_subfiles(f"/workbench/models/{self.name}/shap_data")
+     shap_dfs = self.df_store.list(f"/workbench/models/{self.name}/shap_data")
      shap_data = {}
      for df_location in shap_dfs:
          key = df_location.split("/")[-1]
@@ -0,0 +1,98 @@
+ """ParameterStoreCore: Manages Workbench parameters in a Cloud Based Parameter Store."""
+
+ import logging
+
+ # Workbench Imports
+ from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
+
+ # Workbench Bridges Import
+ from workbench_bridges.api import ParameterStore as BridgesParameterStore
+
+
+ class ParameterStoreCore(BridgesParameterStore):
+     """ParameterStoreCore: Manages Workbench parameters in a Cloud Based Parameter Store.
+
+     Common Usage:
+         ```python
+         params = ParameterStoreCore()
+
+         # List Parameters
+         params.list()
+
+         ['/workbench/abalone_info',
+          '/workbench/my_data',
+          '/workbench/test',
+          '/workbench/pipelines/my_pipeline']
+
+         # Add Key
+         params.upsert("key", "value")
+         value = params.get("key")
+
+         # Add any data (lists, dictionaries, etc..)
+         my_data = {"key": "value", "number": 4.2, "list": [1,2,3]}
+         params.upsert("my_data", my_data)
+
+         # Retrieve data
+         return_value = params.get("my_data")
+         pprint(return_value)
+
+         {'key': 'value', 'list': [1, 2, 3], 'number': 4.2}
+
+         # Delete parameters
+         param_store.delete("my_data")
+         ```
+     """
+
+     def __init__(self):
+         """ParameterStoreCore Init Method"""
+         session = AWSAccountClamp().boto3_session
+
+         # Initialize parent with workbench config
+         super().__init__(boto3_session=session)
+         self.log = logging.getLogger("workbench")
+
+
+ if __name__ == "__main__":
+     """Exercise the ParameterStoreCore Class"""
+
+     # Create a ParameterStoreCore manager
+     param_store = ParameterStoreCore()
+
+     # List the parameters
+     print("Listing Parameters...")
+     print(param_store.list())
+
+     # Add a new parameter
+     param_store.upsert("/workbench/test", "value")
+
+     # Get the parameter
+     print(f"Getting parameter 'test': {param_store.get('/workbench/test')}")
+
+     # Add a dictionary as a parameter
+     sample_dict = {"key": "str_value", "awesome_value": 4.2}
+     param_store.upsert("/workbench/my_data", sample_dict)
+
+     # Retrieve the parameter as a dictionary
+     retrieved_value = param_store.get("/workbench/my_data")
+     print("Retrieved value:", retrieved_value)
+
+     # List the parameters
+     print("Listing Parameters...")
+     print(param_store.list())
+
+     # List the parameters with a prefix
+     print("Listing Parameters with prefix '/workbench':")
+     print(param_store.list("/workbench"))
+
+     # Delete the parameters
+     param_store.delete("/workbench/test")
+     param_store.delete("/workbench/my_data")
+
+     # Out of scope tests
+     param_store.upsert("test", "value")
+     param_store.delete("test")
+
+     # Recursive delete test
+     param_store.upsert("/workbench/test/test1", "value1")
+     param_store.upsert("/workbench/test/test2", "value2")
+     param_store.delete_recursive("workbench/test/")
@@ -1,6 +1,7 @@
  """FeaturesToModel: Train/Create a Model from a Feature Set"""

  from pathlib import Path
+ from typing import Union
  from sagemaker.estimator import Estimator
  import awswrangler as wr
  from datetime import datetime, timezone
@@ -83,12 +84,17 @@ class FeaturesToModel(Transform):
      self.inference_arch = inference_arch

  def transform_impl(
-     self, target_column: str, description: str = None, feature_list: list = None, train_all_data=False, **kwargs
+     self,
+     target_column: Union[str, list[str]],
+     description: str = None,
+     feature_list: list = None,
+     train_all_data=False,
+     **kwargs,
  ):
      """Generic Features to Model: Note you should create a new class and inherit from
      this one to include specific logic for your Feature Set/Model
      Args:
-         target_column (str): Column name of the target variable
+         target_column (str or list[str]): Column name(s) of the target variable(s)
          description (str): Description of the model (optional)
          feature_list (list[str]): A list of columns for the features (default None, will try to guess)
          train_all_data (bool): Train on ALL (100%) of the data (default False)
@@ -105,9 +111,11 @@ class FeaturesToModel(Transform):
  s3_training_path = feature_set.create_s3_training_data()
  self.log.info(f"Created new training data {s3_training_path}...")

- # Report the target column
+ # Report the target column(s)
  self.target_column = target_column
- self.log.info(f"Target column: {self.target_column}")
+ # Normalize target_column to a list for internal use
+ target_list = [target_column] if isinstance(target_column, str) else (target_column or [])
+ self.log.info(f"Target column(s): {self.target_column}")

  # Did they specify a feature list?
  if feature_list:
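`transform_impl` now accepts a string or a list for `target_column` and immediately normalizes to a list for internal use. The normalization expression in isolation (target names are hypothetical):

```python
from typing import Union

def normalize_targets(target_column: Union[str, list, None]) -> list:
    # Same expression as above: wrap a string, pass a list through, map None to []
    return [target_column] if isinstance(target_column, str) else (target_column or [])

print(normalize_targets("logS"))            # ['logS']
print(normalize_targets(["logS", "logP"]))  # ['logS', 'logP']
print(normalize_targets(None))              # []
```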
@@ -134,7 +142,7 @@ class FeaturesToModel(Transform):
      "is_deleted",
      "event_time",
      "training",
- ] + [self.target_column]
+ ] + target_list
  feature_list = [c for c in all_columns if c not in filter_list]

  # AWS Feature Store has 3 user column types (String, Integral, Fractional)
@@ -157,12 +165,14 @@ class FeaturesToModel(Transform):
  self.log.important(f"Feature List for Modeling: {self.model_feature_list}")

  # Set up our parameters for the model script
+ # ChemProp expects target_column as a list; other templates expect a string
+ target_for_template = target_list if self.model_framework == ModelFramework.CHEMPROP else self.target_column
  template_params = {
      "model_imports": self.model_import_str,
      "model_type": self.model_type,
      "model_framework": self.model_framework,
      "model_class": self.model_class,
-     "target_column": self.target_column,
+     "target_column": target_for_template,
      "feature_list": self.model_feature_list,
      "compressed_features": feature_set.get_compressed_features(),
      "model_metrics_s3_path": self.model_training_root,
@@ -188,23 +198,27 @@ class FeaturesToModel(Transform):
  # Generate our model script
  script_path = generate_model_script(template_params)

- # Metric Definitions for Regression
+ # Metric Definitions for Regression (matches model script output format)
  if self.model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]:
      metric_definitions = [
-         {"Name": "RMSE", "Regex": "RMSE: ([0-9.]+)"},
-         {"Name": "MAE", "Regex": "MAE: ([0-9.]+)"},
-         {"Name": "R2", "Regex": "R2: ([0-9.]+)"},
-         {"Name": "NumRows", "Regex": "NumRows: ([0-9]+)"},
+         {"Name": "rmse", "Regex": r"rmse: ([0-9.]+)"},
+         {"Name": "mae", "Regex": r"mae: ([0-9.]+)"},
+         {"Name": "medae", "Regex": r"medae: ([0-9.]+)"},
+         {"Name": "r2", "Regex": r"r2: ([0-9.-]+)"},
+         {"Name": "spearmanr", "Regex": r"spearmanr: ([0-9.-]+)"},
+         {"Name": "support", "Regex": r"support: ([0-9]+)"},
      ]

  # Metric Definitions for Classification
  elif self.model_type == ModelType.CLASSIFIER:
      # We need to get creative with the Classification Metrics
+     # Note: Classification only supports single target
+     class_target = target_list[0] if target_list else self.target_column

      # Grab all the target column class values (class labels)
      table = feature_set.data_source.table
-     self.class_labels = feature_set.query(f'select DISTINCT {self.target_column} FROM "{table}"')[
-         self.target_column
+     self.class_labels = feature_set.query(f'select DISTINCT {class_target} FROM "{table}"')[
+         class_target
      ].to_list()

      # Sanity check on the targets
210
224
  # Sanity check on the targets