workbench 0.8.162__py3-none-any.whl → 0.8.220__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of workbench might be problematic.

Files changed (147)
  1. workbench/algorithms/dataframe/__init__.py +1 -2
  2. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  3. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  4. workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
  5. workbench/algorithms/dataframe/projection_2d.py +44 -21
  6. workbench/algorithms/dataframe/proximity.py +259 -305
  7. workbench/algorithms/graph/light/proximity_graph.py +14 -12
  8. workbench/algorithms/models/cleanlab_model.py +382 -0
  9. workbench/algorithms/models/noise_model.py +388 -0
  10. workbench/algorithms/sql/outliers.py +3 -3
  11. workbench/api/__init__.py +5 -1
  12. workbench/api/compound.py +1 -1
  13. workbench/api/df_store.py +17 -108
  14. workbench/api/endpoint.py +18 -5
  15. workbench/api/feature_set.py +121 -15
  16. workbench/api/meta.py +5 -2
  17. workbench/api/meta_model.py +289 -0
  18. workbench/api/model.py +55 -21
  19. workbench/api/monitor.py +1 -16
  20. workbench/api/parameter_store.py +3 -52
  21. workbench/cached/cached_model.py +4 -4
  22. workbench/core/artifacts/__init__.py +11 -2
  23. workbench/core/artifacts/artifact.py +16 -8
  24. workbench/core/artifacts/data_capture_core.py +355 -0
  25. workbench/core/artifacts/df_store_core.py +114 -0
  26. workbench/core/artifacts/endpoint_core.py +382 -253
  27. workbench/core/artifacts/feature_set_core.py +249 -45
  28. workbench/core/artifacts/model_core.py +135 -80
  29. workbench/core/artifacts/monitor_core.py +33 -248
  30. workbench/core/artifacts/parameter_store_core.py +98 -0
  31. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  32. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  33. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  34. workbench/core/pipelines/pipeline_executor.py +1 -1
  35. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  36. workbench/core/transforms/features_to_model/features_to_model.py +62 -40
  37. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +76 -15
  38. workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
  39. workbench/core/views/training_view.py +113 -42
  40. workbench/core/views/view.py +53 -3
  41. workbench/core/views/view_utils.py +4 -4
  42. workbench/model_script_utils/model_script_utils.py +339 -0
  43. workbench/model_script_utils/pytorch_utils.py +405 -0
  44. workbench/model_script_utils/uq_harness.py +278 -0
  45. workbench/model_scripts/chemprop/chemprop.template +649 -0
  46. workbench/model_scripts/chemprop/generated_model_script.py +649 -0
  47. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  48. workbench/model_scripts/chemprop/requirements.txt +3 -0
  49. workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
  50. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  51. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  52. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  53. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  54. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  55. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
  56. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  57. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  58. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  59. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  60. workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
  61. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  62. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  63. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  64. workbench/model_scripts/meta_model/meta_model.template +209 -0
  65. workbench/model_scripts/pytorch_model/generated_model_script.py +444 -500
  66. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  67. workbench/model_scripts/pytorch_model/pytorch.template +440 -496
  68. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  69. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  70. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  71. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  72. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  73. workbench/model_scripts/script_generation.py +20 -11
  74. workbench/model_scripts/uq_models/generated_model_script.py +248 -0
  75. workbench/model_scripts/xgb_model/generated_model_script.py +372 -404
  76. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  77. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  78. workbench/model_scripts/xgb_model/xgb_model.template +369 -401
  79. workbench/repl/workbench_shell.py +28 -19
  80. workbench/resources/open_source_api.key +1 -1
  81. workbench/scripts/endpoint_test.py +162 -0
  82. workbench/scripts/lambda_test.py +73 -0
  83. workbench/scripts/meta_model_sim.py +35 -0
  84. workbench/scripts/ml_pipeline_batch.py +137 -0
  85. workbench/scripts/ml_pipeline_sqs.py +186 -0
  86. workbench/scripts/monitor_cloud_watch.py +20 -100
  87. workbench/scripts/training_test.py +85 -0
  88. workbench/utils/aws_utils.py +4 -3
  89. workbench/utils/chem_utils/__init__.py +0 -0
  90. workbench/utils/chem_utils/fingerprints.py +175 -0
  91. workbench/utils/chem_utils/misc.py +194 -0
  92. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  93. workbench/utils/chem_utils/mol_standardize.py +450 -0
  94. workbench/utils/chem_utils/mol_tagging.py +348 -0
  95. workbench/utils/chem_utils/projections.py +219 -0
  96. workbench/utils/chem_utils/salts.py +256 -0
  97. workbench/utils/chem_utils/sdf.py +292 -0
  98. workbench/utils/chem_utils/toxicity.py +250 -0
  99. workbench/utils/chem_utils/vis.py +253 -0
  100. workbench/utils/chemprop_utils.py +141 -0
  101. workbench/utils/cloudwatch_handler.py +1 -1
  102. workbench/utils/cloudwatch_utils.py +137 -0
  103. workbench/utils/config_manager.py +3 -7
  104. workbench/utils/endpoint_utils.py +5 -7
  105. workbench/utils/license_manager.py +2 -6
  106. workbench/utils/meta_model_simulator.py +499 -0
  107. workbench/utils/metrics_utils.py +256 -0
  108. workbench/utils/model_utils.py +278 -79
  109. workbench/utils/monitor_utils.py +44 -62
  110. workbench/utils/pandas_utils.py +3 -3
  111. workbench/utils/pytorch_utils.py +87 -0
  112. workbench/utils/shap_utils.py +11 -57
  113. workbench/utils/workbench_logging.py +0 -3
  114. workbench/utils/workbench_sqs.py +1 -1
  115. workbench/utils/xgboost_local_crossfold.py +267 -0
  116. workbench/utils/xgboost_model_utils.py +127 -219
  117. workbench/web_interface/components/model_plot.py +14 -2
  118. workbench/web_interface/components/plugin_unit_test.py +5 -2
  119. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  120. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  121. workbench/web_interface/components/plugins/model_details.py +38 -74
  122. workbench/web_interface/components/plugins/scatter_plot.py +6 -10
  123. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/METADATA +31 -9
  124. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/RECORD +128 -96
  125. workbench-0.8.220.dist-info/entry_points.txt +11 -0
  126. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +1 -1
  127. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  128. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
  129. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  130. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  131. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  132. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  133. workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
  134. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  135. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  136. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -273
  137. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
  138. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  139. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  140. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  141. workbench/utils/chem_utils.py +0 -1556
  142. workbench/utils/execution_environment.py +0 -211
  143. workbench/utils/fast_inference.py +0 -167
  144. workbench/utils/resource_utils.py +0 -39
  145. workbench-0.8.162.dist-info/entry_points.txt +0 -5
  146. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
  147. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
@@ -1,477 +1,445 @@
- # Imports for XGB Model
- import xgboost as xgb
- import awswrangler as wr
+ # XGBoost Model Template for Workbench
+ #
+ # This template handles both classification and regression models with:
+ # - K-fold cross-validation ensemble training (or single train/val split)
+ # - Out-of-fold predictions for validation metrics
+ # - Uncertainty quantification for regression models
+ # - Sample weights support
+ # - Categorical feature handling
+ # - Compressed feature decompression
+ #
+ # NOTE: Imports are structured to minimize serverless endpoint startup time.
+ # Heavy imports (sklearn, awswrangler) are deferred to training time.
+
+ import json
+ import os
+
+ import joblib
  import numpy as np
+ import pandas as pd
+ import xgboost as xgb

- # Model Performance Scores
- from sklearn.metrics import (
- mean_absolute_error,
- r2_score,
- root_mean_squared_error,
- precision_recall_fscore_support,
- confusion_matrix,
+ from model_script_utils import (
+ convert_categorical_types,
+ decompress_features,
+ expand_proba_column,
+ input_fn,
+ match_features_case_insensitive,
+ output_fn,
+ )
+ from uq_harness import (
+ compute_confidence,
+ load_uq_models,
+ predict_intervals,
  )

- # Classification Encoder
- from sklearn.preprocessing import LabelEncoder
+ # =============================================================================
+ # Default Hyperparameters
+ # =============================================================================
+ DEFAULT_HYPERPARAMETERS = {
+ # Training parameters
+ "n_folds": 5, # Number of CV folds (1 = single train/val split)
+ # Core tree parameters
+ "n_estimators": 300,
+ "max_depth": 7,
+ "learning_rate": 0.05,
+ # Sampling parameters (less aggressive - ensemble provides regularization)
+ "subsample": 0.8,
+ "colsample_bytree": 0.8,
+ # Regularization (lighter - ensemble averaging reduces overfitting)
+ "min_child_weight": 3,
+ "gamma": 0.1,
+ "reg_alpha": 0.1,
+ "reg_lambda": 1.0,
+ # Random seed
+ "seed": 42,
+ }

- # Scikit Learn Imports
- from sklearn.model_selection import train_test_split
+ # Workbench-specific parameters (not passed to XGBoost)
+ WORKBENCH_PARAMS = {"n_folds"}

- from io import StringIO
- import json
- import argparse
- import joblib
- import os
- import pandas as pd
- from typing import List, Tuple
+ # Regression-only parameters (filtered out for classifiers)
+ REGRESSION_ONLY_PARAMS = {"objective"}

- # Template Parameters
+ # Template parameters (filled in by Workbench)
  TEMPLATE_PARAMS = {
- "model_type": "classifier",
- "target_column": "solubility_class",
- "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct', 'fingerprint'],
+ "model_type": "uq_regressor",
+ "target": "udm_asy_res_efflux_ratio",
+ "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 
'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
+ "id_column": "udm_mol_bat_id",
  "compressed_features": ['fingerprint'],
- "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-fingerprints-plus-class/training",
- "train_all_data": True
+ "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-temporal/training",
+ "hyperparameters": {'n_folds': 1},
  }

- # Function to check if dataframe is empty
- def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
- """
- Check if the provided dataframe is empty and raise an exception if it is.
-
- Args:
- df (pd.DataFrame): DataFrame to check
- df_name (str): Name of the DataFrame
- """
- if df.empty:
- msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
- print(msg)
- raise ValueError(msg)

-
- def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
- """
- Expands a column in a DataFrame containing a list of probabilities into separate columns.
-
- Args:
- df (pd.DataFrame): DataFrame containing a "pred_proba" column
- class_labels (List[str]): List of class labels
-
- Returns:
- pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
- """
-
- # Sanity check
- proba_column = "pred_proba"
- if proba_column not in df.columns:
- raise ValueError('DataFrame does not contain a "pred_proba" column')
-
- # Construct new column names with '_proba' suffix
- proba_splits = [f"{label}_proba" for label in class_labels]
-
- # Expand the proba_column into separate columns for each probability
- proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
-
- # Drop any proba columns and reset the index in prep for the concat
- df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
- df = df.reset_index(drop=True)
-
- # Concatenate the new columns with the original DataFrame
- df = pd.concat([df, proba_df], axis=1)
- print(df)
- return df
-
-
- def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
- """
- Matches and renames DataFrame columns to match model feature names (case-insensitive).
- Prioritizes exact matches, then case-insensitive matches.
-
- Raises ValueError if any model features cannot be matched.
- """
- df_columns_lower = {col.lower(): col for col in df.columns}
- rename_dict = {}
- missing = []
-
- for feature in model_features:
- if feature in df.columns:
- continue # Exact match
- elif feature.lower() in df_columns_lower:
- rename_dict[df_columns_lower[feature.lower()]] = feature
- else:
- missing.append(feature)
-
- if missing:
- raise ValueError(f"Features not found: {missing}")
-
- return df.rename(columns=rename_dict)
-
-
- def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
- """
- Converts appropriate columns to categorical type with consistent mappings.
-
- Args:
- df (pd.DataFrame): The DataFrame to process.
- features (list): List of feature names to consider for conversion.
- category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
- training mode. If populated, we're in inference mode.
-
- Returns:
- tuple: (processed DataFrame, category mappings dictionary)
- """
- # Training mode
- if category_mappings == {}:
- for col in df.select_dtypes(include=["object", "string"]):
- if col in features and df[col].nunique() < 20:
- print(f"Training mode: Converting {col} to category")
- df[col] = df[col].astype("category")
- category_mappings[col] = df[col].cat.categories.tolist() # Store category mappings
-
- # Inference mode
+ # =============================================================================
+ # Model Loading (for SageMaker inference)
+ # =============================================================================
+ def model_fn(model_dir: str) -> dict:
+ """Load XGBoost ensemble from the specified directory."""
+ # Load ensemble metadata
+ metadata_path = os.path.join(model_dir, "ensemble_metadata.json")
+ if os.path.exists(metadata_path):
+ with open(metadata_path) as f:
+ metadata = json.load(f)
+ n_ensemble = metadata["n_ensemble"]
  else:
- for col, categories in category_mappings.items():
- if col in df.columns:
- print(f"Inference mode: Applying categorical mapping for {col}")
- df[col] = pd.Categorical(df[col], categories=categories) # Apply consistent categorical mapping
-
- return df, category_mappings
-
+ n_ensemble = 1 # Legacy single model

- def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
- """Prepare features for the XGBoost model
+ # Load ensemble models
+ ensemble_models = []
+ for i in range(n_ensemble):
+ model_path = os.path.join(model_dir, f"xgb_model_{i}.joblib")
+ if not os.path.exists(model_path):
+ model_path = os.path.join(model_dir, "xgb_model.joblib") # Legacy fallback
+ ensemble_models.append(joblib.load(model_path))

- Args:
- df (pd.DataFrame): The features DataFrame
- features (List[str]): Full list of feature names
- compressed_features (List[str]): List of feature names to decompress (bitstrings)
+ print(f"Loaded {len(ensemble_models)} model(s)")

- Returns:
- pd.DataFrame: DataFrame with the decompressed features
- List[str]: Updated list of feature names after decompression
+ # Load label encoder (classifier only)
+ label_encoder = None
+ encoder_path = os.path.join(model_dir, "label_encoder.joblib")
+ if os.path.exists(encoder_path):
+ label_encoder = joblib.load(encoder_path)
+
+ # Load category mappings
+ category_mappings = {}
+ category_path = os.path.join(model_dir, "category_mappings.json")
+ if os.path.exists(category_path):
+ with open(category_path) as f:
+ category_mappings = json.load(f)
+
+ # Load UQ models (regression only)
+ uq_models, uq_metadata = None, None
+ uq_path = os.path.join(model_dir, "uq_metadata.json")
+ if os.path.exists(uq_path):
+ uq_models, uq_metadata = load_uq_models(model_dir)
+
+ return {
+ "ensemble_models": ensemble_models,
+ "n_ensemble": n_ensemble,
+ "label_encoder": label_encoder,
+ "category_mappings": category_mappings,
+ "uq_models": uq_models,
+ "uq_metadata": uq_metadata,
+ }
+
+
+ # =============================================================================
+ # Inference (for SageMaker inference)
+ # =============================================================================
+ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
+ """Make predictions with XGBoost ensemble."""
+ model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+ with open(os.path.join(model_dir, "feature_columns.json")) as f:
+ features = json.load(f)
+ print(f"Model Features: {features}")

- Raises:
- ValueError: If any missing values are found in the specified features
- """
+ # Extract model components
+ ensemble_models = model_dict["ensemble_models"]
+ label_encoder = model_dict.get("label_encoder")
+ category_mappings = model_dict.get("category_mappings", {})
+ uq_models = model_dict.get("uq_models")
+ uq_metadata = model_dict.get("uq_metadata")
+ compressed_features = TEMPLATE_PARAMS["compressed_features"]

- # Check for any missing values in the required features
- missing_counts = df[features].isna().sum()
- if missing_counts.any():
- missing_features = missing_counts[missing_counts > 0]
- print(
- f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
- "WARNING: You might want to remove/replace all NaN values before processing."
- )
+ # Prepare features
+ matched_df = match_features_case_insensitive(df, features)
+ matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)

- # Decompress the specified compressed features
- decompressed_features = features
- for feature in compressed_features:
- if (feature not in df.columns) or (feature not in features):
- print(f"Feature '{feature}' not in the features list, skipping decompression.")
- continue
+ if compressed_features:
+ print("Decompressing features for prediction...")
+ matched_df, features = decompress_features(matched_df, features, compressed_features)

- # Remove the feature from the list of features to avoid duplication
- decompressed_features.remove(feature)
+ X = matched_df[features]

- # Handle all compressed features as bitstrings
- bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
- prefix = feature[:3]
+ # Collect ensemble predictions
+ all_preds = [m.predict(X) for m in ensemble_models]
+ ensemble_preds = np.stack(all_preds, axis=0)

- # Create all new columns at once - avoids fragmentation
- new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
- new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+ if label_encoder is not None:
+ # Classification: average probabilities, then argmax
+ all_probs = [m.predict_proba(X) for m in ensemble_models]
+ avg_probs = np.mean(np.stack(all_probs, axis=0), axis=0)
+ class_preds = np.argmax(avg_probs, axis=1)

- # Add to features list
- decompressed_features.extend(new_col_names)
+ df["prediction"] = label_encoder.inverse_transform(class_preds)
+ df["pred_proba"] = [p.tolist() for p in avg_probs]
+ df = expand_proba_column(df, label_encoder.classes_)
+ else:
+ # Regression: average predictions
+ df["prediction"] = np.mean(ensemble_preds, axis=0)
+ df["prediction_std"] = np.std(ensemble_preds, axis=0)

- # Drop original column and concatenate new ones
- df = df.drop(columns=[feature])
- df = pd.concat([df, new_df], axis=1)
+ # Add UQ intervals if available
+ if uq_models and uq_metadata:
+ df = predict_intervals(df, X, uq_models, uq_metadata)
+ df = compute_confidence(df, uq_metadata["median_interval_width"], "q_10", "q_90")

- return df, decompressed_features
+ print(f"Inference complete: {len(df)} predictions, {len(ensemble_models)} ensemble members")
+ return df


+ # =============================================================================
+ # Training
+ # =============================================================================
  if __name__ == "__main__":
- """The main function is for training the XGBoost model"""
+ # -------------------------------------------------------------------------
+ # Training-only imports (deferred to reduce serverless startup time)
+ # -------------------------------------------------------------------------
+ import argparse
+
+ import awswrangler as wr
+ from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+ from sklearn.preprocessing import LabelEncoder
+
+ from model_script_utils import (
+ check_dataframe,
+ compute_classification_metrics,
+ compute_regression_metrics,
+ print_classification_metrics,
+ print_confusion_matrix,
+ print_regression_metrics,
+ )
+ from uq_harness import (
+ save_uq_models,
+ train_uq_models,
+ )

- # Harness Template Parameters
- target = TEMPLATE_PARAMS["target_column"]
+ # -------------------------------------------------------------------------
+ # Setup: Parse arguments and load data
+ # -------------------------------------------------------------------------
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+ parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+ parser.add_argument("--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"))
+ args = parser.parse_args()
+
+ # Extract template parameters
+ target = TEMPLATE_PARAMS["target"]
  features = TEMPLATE_PARAMS["features"]
  orig_features = features.copy()
+ id_column = TEMPLATE_PARAMS["id_column"]
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
  model_type = TEMPLATE_PARAMS["model_type"]
  model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
- train_all_data = TEMPLATE_PARAMS["train_all_data"]
- validation_split = 0.2
-
- # Script arguments for input/output directories
- parser = argparse.ArgumentParser()
- parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
- parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
- parser.add_argument(
- "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
- )
- args = parser.parse_args()
+ hyperparameters = {**DEFAULT_HYPERPARAMETERS, **(TEMPLATE_PARAMS["hyperparameters"] or {})}

- # Read the training data into DataFrames
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train)
- if file.endswith(".csv")
- ]
+ # Load training data
+ training_files = [os.path.join(args.train, f) for f in os.listdir(args.train) if f.endswith(".csv")]
  print(f"Training Files: {training_files}")
-
- # Combine files and read them all into a single pandas dataframe
- all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
-
- # Check if the dataframe is empty
+ all_df = pd.concat([pd.read_csv(f, engine="python") for f in training_files])
  check_dataframe(all_df, "training_df")

- # Features/Target output
  print(f"Target: {target}")
- print(f"Features: {str(features)}")
+ print(f"Features: {features}")
+ print(f"Hyperparameters: {hyperparameters}")

- # Convert any features that might be categorical to 'category' type
+ # -------------------------------------------------------------------------
+ # Preprocessing: Categorical features and decompression
+ # -------------------------------------------------------------------------
  all_df, category_mappings = convert_categorical_types(all_df, features)

- # If we have compressed features, decompress them
  if compressed_features:
- print(f"Decompressing features {compressed_features}...")
+ print(f"Decompressing features: {compressed_features}")
  all_df, features = decompress_features(all_df, features, compressed_features)

- # Do we want to train on all the data?
- if train_all_data:
- print("Training on ALL of the data")
- df_train = all_df.copy()
- df_val = all_df.copy()
-
- # Does the dataframe have a training column?
- elif "training" in all_df.columns:
- print("Found training column, splitting data based on training column")
- df_train = all_df[all_df["training"]]
- df_val = all_df[~all_df["training"]]
- else:
- # Just do a random training Split
- print("WARNING: No training column found, splitting data with random state=42")
- df_train, df_val = train_test_split(
- all_df, test_size=validation_split, random_state=42
- )
- print(f"FIT/TRAIN: {df_train.shape}")
- print(f"VALIDATION: {df_val.shape}")
-
- # Now spin up our XGB Model
+ # -------------------------------------------------------------------------
+ # Classification setup
+ # -------------------------------------------------------------------------
+ label_encoder = None
  if model_type == "classifier":
- xgb_model = xgb.XGBClassifier(enable_categorical=True)
-
- # Encode the target column
  label_encoder = LabelEncoder()
- df_train[target] = label_encoder.fit_transform(df_train[target])
- df_val[target] = label_encoder.transform(df_val[target])
+ all_df[target] = label_encoder.fit_transform(all_df[target])
+ print(f"Class labels: {label_encoder.classes_.tolist()}")

- else:
- xgb_model = xgb.XGBRegressor(enable_categorical=True)
- label_encoder = None # We don't need this for regression
-
- # Grab our Features, Target and Train the Model
- y_train = df_train[target]
- X_train= df_train[features]
- xgb_model.fit(X_train, y_train)
-
- # Make Predictions on the Validation Set
- print(f"Making Predictions on Validation Set...")
- y_validate = df_val[target]
- X_validate = df_val[features]
- preds = xgb_model.predict(X_validate)
- if model_type == "classifier":
- # Also get the probabilities for each class
- print("Processing Probabilities...")
- probs = xgb_model.predict_proba(X_validate)
- df_val["pred_proba"] = [p.tolist() for p in probs]
-
- # Expand the pred_proba column into separate columns for each class
- print(df_val.columns)
- df_val = expand_proba_column(df_val, label_encoder.classes_)
- print(df_val.columns)
-
- # Decode the target and prediction labels
- y_validate = label_encoder.inverse_transform(y_validate)
- preds = label_encoder.inverse_transform(preds)
-
- # Save predictions to S3 (just the target, prediction, and '_proba' columns)
- df_val["prediction"] = preds
- output_columns = [target, "prediction"]
- output_columns += [col for col in df_val.columns if col.endswith("_proba")]
- wr.s3.to_csv(
- df_val[output_columns],
- path=f"{model_metrics_s3_path}/validation_predictions.csv",
- index=False,
- )
+ # -------------------------------------------------------------------------
+ # Cross-validation setup
+ # -------------------------------------------------------------------------
+ n_folds = hyperparameters["n_folds"]
+ xgb_params = {k: v for k, v in hyperparameters.items() if k not in WORKBENCH_PARAMS}

- # Report Performance Metrics
- if model_type == "classifier":
- # Get the label names and their integer mapping
- label_names = label_encoder.classes_
-
- # Calculate various model performance metrics
- scores = precision_recall_fscore_support(
- y_validate, preds, average=None, labels=label_names
- )
-
- # Put the scores into a dataframe
- score_df = pd.DataFrame(
- {
- target: label_names,
- "precision": scores[0],
- "recall": scores[1],
- "fscore": scores[2],
- "support": scores[3],
- }
- )
-
- # We need to get creative with the Classification Metrics
- metrics = ["precision", "recall", "fscore", "support"]
- for t in label_names:
- for m in metrics:
- value = score_df.loc[score_df[target] == t, m].iloc[0]
- print(f"Metrics:{t}:{m} {value}")
-
- # Compute and output the confusion matrix
- conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
- for i, row_name in enumerate(label_names):
- for j, col_name in enumerate(label_names):
- value = conf_mtx[i, j]
- print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
+ # Map 'seed' to 'random_state' for XGBoost
+ if "seed" in xgb_params:
+ xgb_params["random_state"] = xgb_params.pop("seed")

+ # Handle objective: filter regression-only params for classifiers, set default for regressors
+ if model_type == "classifier":
+ xgb_params = {k: v for k, v in xgb_params.items() if k not in REGRESSION_ONLY_PARAMS}
  else:
- # Calculate various model performance metrics (regression)
- rmse = root_mean_squared_error(y_validate, preds)
- mae = mean_absolute_error(y_validate, preds)
- r2 = r2_score(y_validate, preds)
- print(f"RMSE: {rmse:.3f}")
- print(f"MAE: {mae:.3f}")
- print(f"R2: {r2:.3f}")
- print(f"NumRows: {len(df_val)}")
-
- # Now save the model to the standard place/name
- xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
- if label_encoder:
- joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
-
- # Save the features (this will validate input during predictions)
- with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
- json.dump(orig_features, fp) # We save the original features, not the decompressed ones
-
- # Save the category mappings
- with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
- json.dump(category_mappings, fp)
-
-
- def model_fn(model_dir):
- """Deserialize and return fitted XGBoost model"""
-
- model_path = os.path.join(model_dir, "xgb_model.json")
-
- with open(model_path, "r") as f:
- model_json = json.load(f)
-
- sklearn_data = model_json['learner']['attributes']['scikit_learn']
- model_type = json.loads(sklearn_data)['_estimator_type']
-
- model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
- model = model_class(enable_categorical=True)
- model.load_model(model_path)
+ # Default to MAE (reg:absoluteerror) for regression if not specified
+ xgb_params.setdefault("objective", "reg:absoluteerror")

- return model
+ print(f"XGBoost params: {xgb_params}")

+ if n_folds == 1:
+ # Single train/val split
+ if "training" in all_df.columns:
+ print("Using 'training' column for train/val split")
+ train_idx = np.where(all_df["training"])[0]
+ val_idx = np.where(~all_df["training"])[0]
+ else:
+ print("WARNING: No 'training' column found, using random 80/20 split")
+ indices = np.arange(len(all_df))
+ train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+ folds = [(train_idx, val_idx)]
+ else:
+ # K-fold cross-validation
+ if model_type == "classifier":
+ kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
+ folds = list(kfold.split(all_df, all_df[target]))
+ else:
+ kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
+ folds = list(kfold.split(all_df))

- def input_fn(input_data, content_type):
- """Parse input data and return a DataFrame."""
- if not input_data:
- raise ValueError("Empty input data is not supported!")
-
- # Decode bytes to string if necessary
- if isinstance(input_data, bytes):
- input_data = input_data.decode("utf-8")
+ print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold ensemble'}...")

- if "text/csv" in content_type:
- return pd.read_csv(StringIO(input_data))
- elif "application/json" in content_type:
- return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
+ # -------------------------------------------------------------------------
+ # Training loop
+ # -------------------------------------------------------------------------
+ # Initialize out-of-fold storage
+ oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
+ if model_type == "classifier":
+ num_classes = len(label_encoder.classes_)
+ oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
  else:
- raise ValueError(f"{content_type} not supported!")
-
+ oof_proba = None
+
+ # Check for sample weights
+ has_sample_weights = "sample_weight" in all_df.columns
+ if has_sample_weights:
+ sw = all_df["sample_weight"]
+ print(f"Using sample weights: min={sw.min():.2f}, max={sw.max():.2f}, mean={sw.mean():.2f}")
+
+ # Train ensemble
+ ensemble_models = []
+ for fold_idx, (train_idx, val_idx) in enumerate(folds):
+ print(f"\n{'='*50}")
+ print(f"Fold {fold_idx + 1}/{len(folds)} - Train: {len(train_idx)}, Val: {len(val_idx)}")
+ print(f"{'='*50}")
+
+ # Prepare fold data
+ X_train = all_df.iloc[train_idx][features]
+ y_train = all_df.iloc[train_idx][target]
+ X_val = all_df.iloc[val_idx][features]
+ sample_weights = all_df.iloc[train_idx]["sample_weight"] if has_sample_weights else None
+
+ # Create model with fold-specific random state for diversity
+ fold_params = {**xgb_params, "random_state": xgb_params.get("random_state", 42) + fold_idx}
+ if model_type == "classifier":
+ model = xgb.XGBClassifier(enable_categorical=True, **fold_params)
+ else:
+ model = xgb.XGBRegressor(enable_categorical=True, **fold_params)
+
+ # Train
+ model.fit(X_train, y_train, sample_weight=sample_weights)
+ ensemble_models.append(model)
+
+ # Out-of-fold predictions
+ oof_predictions[val_idx] = model.predict(X_val)
+ if model_type == "classifier":
+ oof_proba[val_idx] = model.predict_proba(X_val)
+
+ print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
+
+ # -------------------------------------------------------------------------
+ # Prepare validation results
+ # -------------------------------------------------------------------------
+ if n_folds == 1:
+ # Single fold: only validation rows
+ val_mask = ~np.isnan(oof_predictions)
+ df_val = all_df[val_mask].copy()
+ predictions = oof_predictions[val_mask]
+ if oof_proba is not None:
+ oof_proba = oof_proba[val_mask]
+ else:
+ # K-fold: all rows have out-of-fold predictions
+ df_val = all_df.copy()
+ predictions = oof_predictions

- def output_fn(output_df, accept_type):
- """Supports both CSV and JSON output formats."""
- if "text/csv" in accept_type:
- csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
- return csv_output, "text/csv"
- elif "application/json" in accept_type:
- return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
+ # Decode labels for classification
+ if model_type == "classifier":
+ df_val[target] = label_encoder.inverse_transform(df_val[target].astype(int))
+ df_val["prediction"] = label_encoder.inverse_transform(predictions.astype(int))
+ if oof_proba is not None:
+ df_val["pred_proba"] = [p.tolist() for p in oof_proba]
+ df_val = expand_proba_column(df_val, label_encoder.classes_)
  else:
- raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
+ df_val["prediction"] = predictions

+ # -------------------------------------------------------------------------
+ # Compute and print metrics
+ # -------------------------------------------------------------------------
+ y_true = df_val[target].values
+ y_pred = df_val["prediction"].values

- def predict_fn(df, model) -> pd.DataFrame:
- """Make Predictions with our XGB Model
+ if model_type == "classifier":
+ label_names = label_encoder.classes_
+ score_df = compute_classification_metrics(y_true, y_pred, label_names, target)
+ print_classification_metrics(score_df, target, label_names)
+ print_confusion_matrix(y_true, y_pred, label_names)
+ else:
+ metrics = compute_regression_metrics(y_true, y_pred)
+ print_regression_metrics(metrics)
+
+ # Compute ensemble prediction_std
+ if n_folds > 1:
+ all_preds = np.stack([m.predict(all_df[features]) for m in ensemble_models])
+ df_val["prediction_std"] = np.std(all_preds, axis=0)
+ print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
+ else:
+ df_val["prediction_std"] = 0.0
+
+ # Train UQ models for uncertainty quantification
+ print("\n" + "=" * 50)
+ print("Training UQ Models")
+ print("=" * 50)
+ uq_models, uq_metadata = train_uq_models(
+ all_df[features], all_df[target], df_val[features], y_true
+ )
+ df_val = predict_intervals(df_val, df_val[features], uq_models, uq_metadata)
+ df_val = compute_confidence(df_val, uq_metadata["median_interval_width"])

- Args:
- df (pd.DataFrame): The input DataFrame
- model: The model use for predictions
+ # -------------------------------------------------------------------------
+ # Save validation predictions to S3
+ # -------------------------------------------------------------------------
+ output_columns = []
+ if id_column in df_val.columns:
+ output_columns.append(id_column)
+ output_columns += [target, "prediction"]

- Returns:
- pd.DataFrame: The DataFrame with the predictions added
- """
- compressed_features = TEMPLATE_PARAMS["compressed_features"]
+ if model_type != "classifier":
+ output_columns.append("prediction_std")
+ output_columns += [c for c in df_val.columns if c.startswith("q_") or c == "confidence"]

- # Grab our feature columns (from training)
- model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
- with open(os.path.join(model_dir, "feature_columns.json")) as fp:
- features = json.load(fp)
- print(f"Model Features: {features}")
+ output_columns += [c for c in df_val.columns if c.endswith("_proba")]

- # Load the category mappings (from training)
- with open(os.path.join(model_dir, "category_mappings.json")) as fp:
- category_mappings = json.load(fp)
+ wr.s3.to_csv(df_val[output_columns], f"{model_metrics_s3_path}/validation_predictions.csv", index=False)

- # Load our Label Encoder if we have one
- label_encoder = None
- if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
- label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
+ # -------------------------------------------------------------------------
+ # Save model artifacts
+ # -------------------------------------------------------------------------
+ for idx, m in enumerate(ensemble_models):
+ joblib.dump(m, os.path.join(args.model_dir, f"xgb_model_{idx}.joblib"))
+ print(f"Saved {len(ensemble_models)} model(s)")

- # We're going match features in a case-insensitive manner, accounting for all the permutations
- # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
- # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
- matched_df = match_features_case_insensitive(df, features)
+ with open(os.path.join(args.model_dir, "ensemble_metadata.json"), "w") as f:
+ json.dump({"n_ensemble": len(ensemble_models), "n_folds": n_folds}, f)

- # Detect categorical types in the incoming DataFrame
- matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
+ with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as f:
+ json.dump(orig_features, f)

- # If we have compressed features, decompress them
- if compressed_features:
- print("Decompressing features for prediction...")
- matched_df, features = decompress_features(matched_df, features, compressed_features)
+ with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as f:
+ json.dump(category_mappings, f)

- # Predict the features against our XGB Model
- X = matched_df[features]
- predictions = model.predict(X)
+ with open(os.path.join(args.model_dir, "hyperparameters.json"), "w") as f:
+ json.dump(hyperparameters, f, indent=2)

- # If we have a label encoder, decode the predictions
  if label_encoder:
- predictions = label_encoder.inverse_transform(predictions)
-
- # Set the predictions on the DataFrame
- df["prediction"] = predictions
-
- # Does our model have a 'predict_proba' method? If so we will call it and add the results to the DataFrame
- if getattr(model, "predict_proba", None):
- probs = model.predict_proba(matched_df[features])
- df["pred_proba"] = [p.tolist() for p in probs]
+ joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))

- # Expand the pred_proba column into separate columns for each class
- df = expand_proba_column(df, label_encoder.classes_)
+ if model_type != "classifier":
+ save_uq_models(uq_models, uq_metadata, args.model_dir)

- # All done, return the DataFrame with new columns for the predictions
- return df
+ print(f"\nModel training complete! Artifacts saved to {args.model_dir}")
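Note on the new inference contract shown in this diff: model_fn() now returns a dict holding the ensemble members plus optional label encoder, category mappings, and UQ artifacts, and predict_fn() consumes that dict instead of a single XGBoost model. The following is a minimal local smoke-test sketch, not part of the package; it assumes the generated script and its model_script_utils/uq_harness helpers are on the path and that trained artifacts have been copied to a local directory (the /tmp path and the input CSV name are hypothetical).

import os
import pandas as pd

# predict_fn() reads feature_columns.json from SM_MODEL_DIR, so point it at the
# local artifact directory (hypothetical path) before importing the script.
os.environ["SM_MODEL_DIR"] = "/tmp/xgb_model_artifacts"

from generated_model_script import model_fn, predict_fn

# Load the ensemble dict (xgb_model_*.joblib, label encoder, category mappings, UQ models).
model_dict = model_fn(os.environ["SM_MODEL_DIR"])

# Hypothetical input: one row per compound with the model's feature columns.
rows = pd.read_csv("sample_compounds.csv")

result = predict_fn(rows, model_dict)
# Regression models also gain prediction_std, q_* interval columns, and confidence.
print(result[["prediction"]].head())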