workbench 0.8.178__py3-none-any.whl → 0.8.180__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


Files changed (26)
  1. workbench/api/endpoint.py +3 -2
  2. workbench/core/artifacts/endpoint_core.py +5 -5
  3. workbench/core/artifacts/feature_set_core.py +32 -2
  4. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  5. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  6. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  7. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  8. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +37 -34
  9. workbench/model_scripts/custom_models/uq_models/mapie.template +35 -32
  10. workbench/model_scripts/custom_models/uq_models/meta_uq.template +7 -22
  11. workbench/model_scripts/custom_models/uq_models/ngboost.template +5 -12
  12. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  13. workbench/model_scripts/pytorch_model/pytorch.template +9 -18
  14. workbench/model_scripts/quant_regression/quant_regression.template +5 -10
  15. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  16. workbench/model_scripts/xgb_model/generated_model_script.py +24 -33
  17. workbench/model_scripts/xgb_model/xgb_model.template +23 -32
  18. workbench/utils/model_utils.py +2 -1
  19. workbench/utils/shap_utils.py +10 -2
  20. workbench/utils/xgboost_model_utils.py +160 -137
  21. {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/METADATA +1 -1
  22. {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/RECORD +26 -26
  23. {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/WHEEL +0 -0
  24. {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/entry_points.txt +0 -0
  25. {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/licenses/LICENSE +0 -0
  26. {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/top_level.txt +0 -0
workbench/api/endpoint.py CHANGED
@@ -4,6 +4,7 @@ Endpoints can be viewed in the AWS Sagemaker interfaces or in the Workbench
  Dashboard UI, which provides additional model details and performance metrics"""

  import pandas as pd
+ from typing import Tuple

  # Workbench Imports
  from workbench.core.artifacts.endpoint_core import EndpointCore
@@ -70,14 +71,14 @@ class Endpoint(EndpointCore):
  """
  return super().fast_inference(eval_df, threads=threads)

- def cross_fold_inference(self, nfolds: int = 5) -> dict:
+ def cross_fold_inference(self, nfolds: int = 5) -> Tuple[dict, pd.DataFrame]:
  """Run cross-fold inference (only works for XGBoost models)

  Args:
  nfolds (int): The number of folds to use for cross-validation (default: 5)

  Returns:
- dict: A dictionary with fold results
+ Tuple(dict, pd.DataFrame): A tuple containing a dictionary of metrics and a DataFrame with predictions
  """
  return super().cross_fold_inference(nfolds)


workbench/core/artifacts/endpoint_core.py CHANGED
@@ -8,7 +8,7 @@ import pandas as pd
  import numpy as np
  from io import StringIO
  import awswrangler as wr
- from typing import Union, Optional
+ from typing import Union, Optional, Tuple
  import hashlib

  # Model Performance Scores
@@ -436,24 +436,24 @@ class EndpointCore(Artifact):
  # Return the prediction DataFrame
  return prediction_df

- def cross_fold_inference(self, nfolds: int = 5) -> dict:
+ def cross_fold_inference(self, nfolds: int = 5) -> Tuple[dict, pd.DataFrame]:
  """Run cross-fold inference (only works for XGBoost models)

  Args:
  nfolds (int): Number of folds to use for cross-fold (default: 5)

  Returns:
- dict: Dictionary with the cross-fold inference results
+ Tuple[dict, pd.DataFrame]: Tuple of (cross_fold_metrics, out_of_fold_df)
  """

  # Grab our model
  model = ModelCore(self.model_name)

  # Compute CrossFold Metrics
- cross_fold_metrics = cross_fold_inference(model, nfolds=nfolds)
+ cross_fold_metrics, out_of_fold_df = cross_fold_inference(model, nfolds=nfolds)
  if cross_fold_metrics:
  self.param_store.upsert(f"/workbench/models/{model.name}/inference/cross_fold", cross_fold_metrics)
- return cross_fold_metrics
+ return cross_fold_metrics, out_of_fold_df

  def fast_inference(self, eval_df: pd.DataFrame, threads: int = 4) -> pd.DataFrame:
  """Run inference on the Endpoint using the provided DataFrame

workbench/core/artifacts/feature_set_core.py CHANGED
@@ -17,7 +17,7 @@ from workbench.core.artifacts.artifact import Artifact
  from workbench.core.artifacts.data_source_factory import DataSourceFactory
  from workbench.core.artifacts.athena_source import AthenaSource

- from typing import TYPE_CHECKING, Optional
+ from typing import TYPE_CHECKING, Optional, List, Union

  from workbench.utils.aws_utils import aws_throttle

@@ -514,7 +514,7 @@ class FeatureSetCore(Artifact):

  Args:
  filter_expression (Optional[str]): A SQL filter expression (e.g., "age > 25 AND status = 'active'")
- If None or empty string, will reset to default training view with no filter
+ If None or empty string, will reset to training view with no filter
  (default: None)
  """
  from workbench.core.views import TrainingView
@@ -528,6 +528,29 @@
  self, id_column=self.id_column, holdout_ids=holdout_ids, filter_expression=filter_expression
  )

+ def exclude_ids_from_training(self, ids: List[Union[str, int]], column_name: Optional[str] = None):
+ """Exclude a list of IDs from the training view
+
+ Args:
+ ids (List[Union[str, int]],): List of IDs to exclude from training
+ column_name (Optional[str]): Column name to filter on.
+ If None, uses self.id_column (default: None)
+ """
+ # Use the default id_column if not specified
+ column = column_name or self.id_column
+
+ # Handle empty list case
+ if not ids:
+ self.log.warning("No IDs provided to exclude")
+ return
+
+ # Build the filter expression with proper SQL quoting
+ quoted_ids = ", ".join([repr(id) for id in ids])
+ filter_expression = f"{column} NOT IN ({quoted_ids})"
+
+ # Apply the filter
+ self.set_training_filter(filter_expression)
+
  @classmethod
  def delete_views(cls, table: str, database: str):
  """Delete any views associated with this FeatureSet
@@ -769,6 +792,13 @@ if __name__ == "__main__":
  print(f"Training Data: {training_data.shape}")
  print(training_data)

+ # Test excluding ids from training
+ print("Excluding ids from training...")
+ my_features.exclude_ids_from_training([1, 2, 3, 4, 5])
+ training_data = my_features.get_training_data()
+ print(f"Training Data: {training_data.shape}")
+ print(training_data)
+
  # Now delete the AWS artifacts associated with this Feature Set
  # print("Deleting Workbench Feature Set...")
  # my_features.delete()
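A usage sketch for the new exclude_ids_from_training helper, which simply builds a "NOT IN (...)" filter and hands it to set_training_filter. The FeatureSet name, constructor call, and IDs below are illustrative assumptions, not taken from the package:

    from workbench.core.artifacts.feature_set_core import FeatureSetCore

    fs = FeatureSetCore("test_features")           # assumes the usual name-based constructor
    fs.exclude_ids_from_training([1, 2, 3, 4, 5])  # filters on fs.id_column by default
    fs.exclude_ids_from_training(["id_42", "id_43"], column_name="compound_id")  # or another column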

workbench/model_scripts/custom_models/proximity/feature_space_proximity.template CHANGED
@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
  "id_column": "{{id_column}}",
  "features": "{{feature_list}}",
  "target": "{{target_column}}",
- "track_columns": "{{track_columns}}"
+ "track_columns": "{{track_columns}}",
  }

  from io import StringIO
@@ -73,10 +73,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

  # Check if the DataFrame is empty
@@ -88,6 +85,7 @@
  # Now serialize the model
  model.serialize(args.model_dir)

+
  # Model loading and prediction functions
  def model_fn(model_dir):


workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template CHANGED
@@ -14,7 +14,7 @@ import pandas as pd
  TEMPLATE_PARAMS = {
  "features": "{{feature_list}}",
  "target": "{{target_column}}",
- "train_all_data": "{{train_all_data}}"
+ "train_all_data": "{{train_all_data}}",
  }


@@ -37,7 +37,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
-
+
  Raises ValueError if any model features cannot be matched.
  """
  df_columns_lower = {col.lower(): col for col in df.columns}
@@ -81,10 +81,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

  # Check if the DataFrame is empty
@@ -109,8 +106,10 @@
  # Create and train the Regression/Confidence model
  # model = BayesianRidge()
  model = BayesianRidge(
- alpha_1=1e-6, alpha_2=1e-6, # Noise precision
- lambda_1=1e-6, lambda_2=1e-6, # Weight precision
+ alpha_1=1e-6,
+ alpha_2=1e-6, # Noise precision
+ lambda_1=1e-6,
+ lambda_2=1e-6, # Weight precision
  fit_intercept=True,
  )


workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template CHANGED
@@ -4,11 +4,7 @@ import awswrangler as wr
  import numpy as np

  # Model Performance Scores
- from sklearn.metrics import (
- mean_absolute_error,
- r2_score,
- root_mean_squared_error
- )
+ from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
  from sklearn.model_selection import KFold
  from scipy.optimize import minimize

@@ -23,7 +19,7 @@ TEMPLATE_PARAMS = {
  "features": "{{feature_list}}",
  "target": "{{target_column}}",
  "train_all_data": "{{train_all_data}}",
- "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+ "model_metrics_s3_path": "{{model_metrics_s3_path}}",
  }


@@ -47,7 +43,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
-
+
  Raises ValueError if any model features cannot be matched.
  """
  df_columns_lower = {col.lower(): col for col in df.columns}
@@ -90,10 +86,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

  # Check if the DataFrame is empty
@@ -172,16 +165,14 @@
  cv_residuals = np.array(cv_residuals)
  cv_uncertainties = np.array(cv_uncertainties)

-
  # Optimize calibration parameters: σ_cal = a * σ_uc + b
  def neg_log_likelihood(params):
  a, b = params
  sigma_cal = a * cv_uncertainties + b
  sigma_cal = np.maximum(sigma_cal, 1e-8) # Prevent division by zero
- return np.sum(0.5 * np.log(2 * np.pi * sigma_cal ** 2) + 0.5 * (cv_residuals ** 2) / (sigma_cal ** 2))
+ return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * (cv_residuals**2) / (sigma_cal**2))

-
- result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method='Nelder-Mead')
+ result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
  cal_a, cal_b = result.x

  print(f"Calibration parameters: a={cal_a:.4f}, b={cal_b:.4f}")
@@ -205,7 +196,9 @@
  result_df["prediction"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].mean(axis=1)

  # Compute uncalibrated uncertainty
- result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(axis=1)
+ result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(
+ axis=1
+ )

  # Apply calibration to uncertainty
  result_df["prediction_std"] = cal_a * result_df["prediction_std_uc"] + cal_b
@@ -352,4 +345,4 @@ def predict_fn(df, models) -> pd.DataFrame:
  df = df.reindex(sorted(df.columns), axis=1)

  # All done, return the DataFrame
- return df
+ return df

workbench/model_scripts/custom_models/uq_models/gaussian_process.template CHANGED
@@ -9,7 +9,7 @@ from sklearn.model_selection import train_test_split
  TEMPLATE_PARAMS = {
  "features": "{{feature_list}}",
  "target": "{{target_column}}",
- "train_all_data": "{{train_all_data}}"
+ "train_all_data": "{{train_all_data}}",
  }

  from io import StringIO
@@ -33,7 +33,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
-
+
  Raises ValueError if any model features cannot be matched.
  """
  df_columns_lower = {col.lower(): col for col in df.columns}
@@ -46,7 +46,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  rename_dict[df_columns_lower[feature.lower()]] = feature
  else:
  missing.append(feature)
-
+
  if missing:
  raise ValueError(f"Features not found: {missing}")

@@ -76,10 +76,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

  # Check if the DataFrame is empty
@@ -112,10 +109,7 @@
  )

  # Create a Pipeline with StandardScaler
- model = Pipeline([
- ("scaler", StandardScaler()),
- ("model", model)
- ])
+ model = Pipeline([("scaler", StandardScaler()), ("model", model)])

  # Prepare features and targets for training
  X_train = df_train[features]

workbench/model_scripts/custom_models/uq_models/generated_model_script.py CHANGED
@@ -5,11 +5,7 @@ from xgboost import XGBRegressor
  from sklearn.model_selection import train_test_split

  # Model Performance Scores
- from sklearn.metrics import (
- mean_absolute_error,
- r2_score,
- root_mean_squared_error
- )
+ from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

  from io import StringIO
  import json
@@ -22,10 +18,11 @@ from typing import List, Tuple

  # Template Placeholders
  TEMPLATE_PARAMS = {
- "target": "logs",
- "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
+ "target": "solubility",
+ "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
  "compressed_features": [],
- "train_all_data": True
+ "train_all_data": False,
+ "hyperparameters": {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05, 'subsample': 0.7, 'colsample_bytree': 0.3, 'colsample_bylevel': 0.5, 'min_child_weight': 5, 'gamma': 0.2, 'reg_alpha': 0.5, 'reg_lambda': 2.0, 'scale_pos_weight': 1},
  }


@@ -101,7 +98,7 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping


  def decompress_features(
- df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ df: pd.DataFrame, features: List[str], compressed_features: List[str]
  ) -> Tuple[pd.DataFrame, List[str]]:
  """Prepare features for the model by decompressing bitstring features

@@ -162,6 +159,7 @@ if __name__ == "__main__":
  orig_features = features.copy()
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
  train_all_data = TEMPLATE_PARAMS["train_all_data"]
+ hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
  validation_split = 0.2

  # Script arguments for input/output directories
@@ -174,11 +172,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Read the training data into DataFrames
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train)
- if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  print(f"Training Files: {training_files}")

  # Combine files and read them all into a single pandas dataframe
@@ -213,9 +207,7 @@ if __name__ == "__main__":
  else:
  # Just do a random training Split
  print("WARNING: No training column found, splitting data with random state=42")
- df_train, df_val = train_test_split(
- all_df, test_size=validation_split, random_state=42
- )
+ df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
  print(f"FIT/TRAIN: {df_train.shape}")
  print(f"VALIDATION: {df_val.shape}")

@@ -227,7 +219,8 @@

  # Train XGBoost for point predictions
  print("\nTraining XGBoost for point predictions...")
- xgb_model = XGBRegressor(enable_categorical=True)
+ print(f" Hyperparameters: {hyperparameters}")
+ xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
  xgb_model.fit(X_train, y_train)

  # Evaluate XGBoost performance
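The hyperparameters now travel through TEMPLATE_PARAMS and are splatted straight into XGBRegressor. A standalone sketch of that pass-through using the values shown above (training data omitted):

    from xgboost import XGBRegressor

    hyperparameters = {
        "n_estimators": 200, "max_depth": 6, "learning_rate": 0.05,
        "subsample": 0.7, "colsample_bytree": 0.3, "colsample_bylevel": 0.5,
        "min_child_weight": 5, "gamma": 0.2, "reg_alpha": 0.5,
        "reg_lambda": 2.0, "scale_pos_weight": 1,
    }
    xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
    # xgb_model.fit(X_train, y_train)  # X_train / y_train are built earlier in the script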
@@ -272,7 +265,7 @@ if __name__ == "__main__":
  colsample_bytree=0.8,
  random_state=42,
  verbose=-1,
- force_col_wise=True
+ force_col_wise=True,
  )
  est.fit(X_train, y_train)
  quantile_estimators.append(est)
@@ -280,9 +273,7 @@
  # Create MAPIE CQR model for this confidence level
  print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
  mapie_model = ConformalizedQuantileRegressor(
- quantile_estimators,
- confidence_level=confidence_level,
- prefit=True
+ quantile_estimators, confidence_level=confidence_level, prefit=True
  )

  # Conformalize the model
@@ -337,8 +328,8 @@
  "xgb_rmse": float(xgb_rmse),
  "xgb_mae": float(xgb_mae),
  "xgb_r2": float(xgb_r2),
- "n_validation": len(df_val)
- }
+ "n_validation": len(df_val),
+ },
  }
  with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
  json.dump(model_config, fp, indent=2)
@@ -379,7 +370,7 @@ def model_fn(model_dir) -> dict:
  "xgb_model": xgb_model,
  "mapie_models": mapie_models,
  "confidence_levels": config["confidence_levels"],
- "category_mappings": category_mappings
+ "category_mappings": category_mappings,
  }


@@ -404,7 +395,7 @@ def output_fn(output_df, accept_type):
  """Supports both CSV and JSON output formats."""
  if "text/csv" in accept_type:
  # Convert categorical columns to string to avoid fillna issues
- for col in output_df.select_dtypes(include=['category']).columns:
+ for col in output_df.select_dtypes(include=["category"]).columns:
  output_df[col] = output_df[col].astype(str)
  csv_output = output_df.fillna("N/A").to_csv(index=False)
  return csv_output, "text/csv"
@@ -425,6 +416,10 @@ def predict_fn(df, models) -> pd.DataFrame:
  pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
  """

+ # Flag for outlier stretch adjustment for the prediction intervals
+ # if the predicted values are outside the intervals
+ outlier_stretch = False
+
  # Grab our feature columns (from training)
  model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
  with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -435,11 +430,7 @@ def predict_fn(df, models) -> pd.DataFrame:

  # Apply categorical mappings if they exist
  if models.get("category_mappings"):
- matched_df, _ = convert_categorical_types(
- matched_df,
- model_features,
- models["category_mappings"]
- )
+ matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])

  # Get features for prediction
  X = matched_df[model_features]
@@ -475,7 +466,7 @@ def predict_fn(df, models) -> pd.DataFrame:
  # Add median (q_50) from XGBoost prediction
  df["q_50"] = df["prediction"]

- # Calculate a psueduo-standard deviation from the 68% interval width
+ # Calculate a pseudo-standard deviation from the 68% interval width
  df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0

  # Reorder the quantile columns for easier reading
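The prediction_std heuristic above works because, for a roughly Gaussian predictive distribution, the 16th to 84th percentile band spans about plus or minus one sigma, so half its width recovers sigma. A quick sanity check (purely illustrative):

    from scipy.stats import norm

    mu, sigma = 0.0, 2.0
    q_16, q_84 = norm.ppf([0.16, 0.84], loc=mu, scale=sigma)
    print((q_84 - q_16) / 2.0)  # ~1.99, close to the true sigma of 2.0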
@@ -484,7 +475,19 @@
  df = df[other_cols + quantile_cols]

  # Adjust the outer quantiles to ensure they encompass the prediction
- df["q_025"] = np.minimum(df["q_025"], df["prediction"])
- df["q_975"] = np.maximum(df["q_975"], df["prediction"])
+ if outlier_stretch:
+ # Lower intervals adjustments
+ df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+ df["q_05"] = np.minimum(df["q_05"], df["prediction"])
+ df["q_10"] = np.minimum(df["q_10"], df["prediction"])
+ df["q_16"] = np.minimum(df["q_16"], df["prediction"])
+ df["q_25"] = np.minimum(df["q_25"], df["prediction"])
+
+ # Upper intervals adjustments
+ df["q_75"] = np.maximum(df["q_75"], df["prediction"])
+ df["q_84"] = np.maximum(df["q_84"], df["prediction"])
+ df["q_90"] = np.maximum(df["q_90"], df["prediction"])
+ df["q_95"] = np.maximum(df["q_95"], df["prediction"])
+ df["q_975"] = np.maximum(df["q_975"], df["prediction"])

  return df