workbench-0.8.162-py3-none-any.whl → workbench-0.8.220-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (147)
  1. workbench/algorithms/dataframe/__init__.py +1 -2
  2. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  3. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  4. workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
  5. workbench/algorithms/dataframe/projection_2d.py +44 -21
  6. workbench/algorithms/dataframe/proximity.py +259 -305
  7. workbench/algorithms/graph/light/proximity_graph.py +14 -12
  8. workbench/algorithms/models/cleanlab_model.py +382 -0
  9. workbench/algorithms/models/noise_model.py +388 -0
  10. workbench/algorithms/sql/outliers.py +3 -3
  11. workbench/api/__init__.py +5 -1
  12. workbench/api/compound.py +1 -1
  13. workbench/api/df_store.py +17 -108
  14. workbench/api/endpoint.py +18 -5
  15. workbench/api/feature_set.py +121 -15
  16. workbench/api/meta.py +5 -2
  17. workbench/api/meta_model.py +289 -0
  18. workbench/api/model.py +55 -21
  19. workbench/api/monitor.py +1 -16
  20. workbench/api/parameter_store.py +3 -52
  21. workbench/cached/cached_model.py +4 -4
  22. workbench/core/artifacts/__init__.py +11 -2
  23. workbench/core/artifacts/artifact.py +16 -8
  24. workbench/core/artifacts/data_capture_core.py +355 -0
  25. workbench/core/artifacts/df_store_core.py +114 -0
  26. workbench/core/artifacts/endpoint_core.py +382 -253
  27. workbench/core/artifacts/feature_set_core.py +249 -45
  28. workbench/core/artifacts/model_core.py +135 -80
  29. workbench/core/artifacts/monitor_core.py +33 -248
  30. workbench/core/artifacts/parameter_store_core.py +98 -0
  31. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  32. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  33. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  34. workbench/core/pipelines/pipeline_executor.py +1 -1
  35. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  36. workbench/core/transforms/features_to_model/features_to_model.py +62 -40
  37. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +76 -15
  38. workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
  39. workbench/core/views/training_view.py +113 -42
  40. workbench/core/views/view.py +53 -3
  41. workbench/core/views/view_utils.py +4 -4
  42. workbench/model_script_utils/model_script_utils.py +339 -0
  43. workbench/model_script_utils/pytorch_utils.py +405 -0
  44. workbench/model_script_utils/uq_harness.py +278 -0
  45. workbench/model_scripts/chemprop/chemprop.template +649 -0
  46. workbench/model_scripts/chemprop/generated_model_script.py +649 -0
  47. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  48. workbench/model_scripts/chemprop/requirements.txt +3 -0
  49. workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
  50. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  51. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  52. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  53. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  54. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  55. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
  56. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  57. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  58. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  59. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  60. workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
  61. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  62. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  63. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  64. workbench/model_scripts/meta_model/meta_model.template +209 -0
  65. workbench/model_scripts/pytorch_model/generated_model_script.py +444 -500
  66. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  67. workbench/model_scripts/pytorch_model/pytorch.template +440 -496
  68. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  69. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  70. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  71. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  72. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  73. workbench/model_scripts/script_generation.py +20 -11
  74. workbench/model_scripts/uq_models/generated_model_script.py +248 -0
  75. workbench/model_scripts/xgb_model/generated_model_script.py +372 -404
  76. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  77. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  78. workbench/model_scripts/xgb_model/xgb_model.template +369 -401
  79. workbench/repl/workbench_shell.py +28 -19
  80. workbench/resources/open_source_api.key +1 -1
  81. workbench/scripts/endpoint_test.py +162 -0
  82. workbench/scripts/lambda_test.py +73 -0
  83. workbench/scripts/meta_model_sim.py +35 -0
  84. workbench/scripts/ml_pipeline_batch.py +137 -0
  85. workbench/scripts/ml_pipeline_sqs.py +186 -0
  86. workbench/scripts/monitor_cloud_watch.py +20 -100
  87. workbench/scripts/training_test.py +85 -0
  88. workbench/utils/aws_utils.py +4 -3
  89. workbench/utils/chem_utils/__init__.py +0 -0
  90. workbench/utils/chem_utils/fingerprints.py +175 -0
  91. workbench/utils/chem_utils/misc.py +194 -0
  92. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  93. workbench/utils/chem_utils/mol_standardize.py +450 -0
  94. workbench/utils/chem_utils/mol_tagging.py +348 -0
  95. workbench/utils/chem_utils/projections.py +219 -0
  96. workbench/utils/chem_utils/salts.py +256 -0
  97. workbench/utils/chem_utils/sdf.py +292 -0
  98. workbench/utils/chem_utils/toxicity.py +250 -0
  99. workbench/utils/chem_utils/vis.py +253 -0
  100. workbench/utils/chemprop_utils.py +141 -0
  101. workbench/utils/cloudwatch_handler.py +1 -1
  102. workbench/utils/cloudwatch_utils.py +137 -0
  103. workbench/utils/config_manager.py +3 -7
  104. workbench/utils/endpoint_utils.py +5 -7
  105. workbench/utils/license_manager.py +2 -6
  106. workbench/utils/meta_model_simulator.py +499 -0
  107. workbench/utils/metrics_utils.py +256 -0
  108. workbench/utils/model_utils.py +278 -79
  109. workbench/utils/monitor_utils.py +44 -62
  110. workbench/utils/pandas_utils.py +3 -3
  111. workbench/utils/pytorch_utils.py +87 -0
  112. workbench/utils/shap_utils.py +11 -57
  113. workbench/utils/workbench_logging.py +0 -3
  114. workbench/utils/workbench_sqs.py +1 -1
  115. workbench/utils/xgboost_local_crossfold.py +267 -0
  116. workbench/utils/xgboost_model_utils.py +127 -219
  117. workbench/web_interface/components/model_plot.py +14 -2
  118. workbench/web_interface/components/plugin_unit_test.py +5 -2
  119. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  120. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  121. workbench/web_interface/components/plugins/model_details.py +38 -74
  122. workbench/web_interface/components/plugins/scatter_plot.py +6 -10
  123. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/METADATA +31 -9
  124. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/RECORD +128 -96
  125. workbench-0.8.220.dist-info/entry_points.txt +11 -0
  126. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +1 -1
  127. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  128. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
  129. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  130. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  131. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  132. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  133. workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
  134. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  135. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  136. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -273
  137. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
  138. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  139. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  140. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  141. workbench/utils/chem_utils.py +0 -1556
  142. workbench/utils/execution_environment.py +0 -211
  143. workbench/utils/fast_inference.py +0 -167
  144. workbench/utils/resource_utils.py +0 -39
  145. workbench-0.8.162.dist-info/entry_points.txt +0 -5
  146. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
  147. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
workbench/model_scripts/pytorch_model/generated_model_script.py

@@ -4,11 +4,11 @@ None
 # Template Placeholders
 TEMPLATE_PARAMS = {
     "model_type": "regressor",
-    "target_column": "solubility",
-    "feature_list": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
+    "target_column": "udm_asy_res_efflux_ratio",
+    "feature_list": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo', 'tertiary_amine_count', 'type_i_pattern_count', 'type_ii_pattern_count', 'aromatic_interaction_score', 'molecular_axis_length', 'molecular_asymmetry', 'molecular_volume_3d', 'radius_of_gyration', 'asphericity', 'charge_centroid_distance', 'nitrogen_span', 'amide_count', 'hba_hbd_ratio', 'intramolecular_hbond_potential', 'amphiphilic_moment'],
     "model_class": PyTorch,
-    "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-pytorch-reg/training",
-    "train_all_data": False
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-pytorch-test/training",
+    "train_all_data": False,
 }

 import awswrangler as wr
@@ -99,10 +99,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

     # Check if the DataFrame is empty
@@ -116,10 +113,7 @@ if __name__ == "__main__":

     if needs_standardization:
         # Create a pipeline with standardization and the model
-        model = Pipeline([
-            ("scaler", StandardScaler()),
-            ("model", model)
-        ])
+        model = Pipeline([("scaler", StandardScaler()), ("model", model)])

     # Handle logic based on the model_type
     if model_type in ["classifier", "regressor"]:
@@ -206,6 +200,7 @@ if __name__ == "__main__":
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
         json.dump(feature_list, fp)

+
 #
 # Inference Section
 #
workbench/model_scripts/pytorch_model/pytorch.template

@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
     "feature_list": "{{feature_list}}",
     "model_class": "{{model_class}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }

 import awswrangler as wr
@@ -99,10 +99,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

     # Check if the DataFrame is empty
@@ -116,10 +113,7 @@ if __name__ == "__main__":

     if needs_standardization:
         # Create a pipeline with standardization and the model
-        model = Pipeline([
-            ("scaler", StandardScaler()),
-            ("model", model)
-        ])
+        model = Pipeline([("scaler", StandardScaler()), ("model", model)])

     # Handle logic based on the model_type
     if model_type in ["classifier", "regressor"]:
@@ -206,6 +200,7 @@ if __name__ == "__main__":
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
         json.dump(feature_list, fp)

+
 #
 # Inference Section
 #
workbench/model_scripts/script_generation.py

@@ -70,6 +70,11 @@ def fill_template(template_path: str, params: dict, output_script: str) -> str:
     # Sanity check to ensure all placeholders were replaced
     if "{{" in template and "}}" in template:
         msg = "Not all template placeholders were replaced. Please check your params."
+
+        # Show which placeholders are still present
+        start = template.index("{{")
+        end = template.index("}}", start) + 2
+        msg += f" Unreplaced placeholder: {template[start:end]}"
         log.critical(msg)
         raise ValueError(msg)

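Note that the new sanity check above reports only the first unreplaced placeholder it finds. A minimal standalone sketch of the same idea (our own illustration, not code from the package; the regex variant collects every leftover placeholder rather than just the first):

import re

def report_unreplaced(template: str) -> None:
    # Collect every surviving {{placeholder}} instead of just the first one
    leftovers = re.findall(r"\{\{.*?\}\}", template)
    if leftovers:
        raise ValueError(
            "Not all template placeholders were replaced. "
            f"Unreplaced placeholders: {', '.join(leftovers)}"
        )

# Example: a params dict that never supplied target_column
report_unreplaced('"target_column": "{{target_column}}",')
# -> ValueError: ... Unreplaced placeholders: {{target_column}}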
@@ -88,32 +93,36 @@ def generate_model_script(template_params: dict) -> str:
         template_params (dict): Dictionary containing the parameters:
             - model_imports (str): Import string for the model class
             - model_type (ModelType): The enumerated type of model to generate
+            - model_framework (str): The enumerated model framework to use
             - model_class (str): The model class to use (e.g., "RandomForestRegressor")
             - target_column (str): Column name of the target variable
             - feature_list (list[str]): A list of columns for the features
             - model_metrics_s3_path (str): The S3 path to store the model metrics
             - train_all_data (bool): Whether to train on all (100%) of the data
             - hyperparameters (dict, optional): Hyperparameters for the model (default: None)
+            - child_endpoints (list[str], optional): For META models, list of child endpoint names

     Returns:
         str: The name of the generated model script
     """
-    from workbench.api import ModelType  # Avoid circular import
+    from workbench.api import ModelType, ModelFramework  # Avoid circular import

     # Determine which template to use based on model type
     if template_params.get("model_class"):
-        if template_params["model_class"].lower() == "pytorch":
-            template_name = "pytorch.template"
-            model_script_dir = "pytorch_model"
-        else:
-            template_name = "scikit_learn.template"
-            model_script_dir = "scikit_learn"
-    elif template_params["model_type"] in [ModelType.REGRESSOR, ModelType.CLASSIFIER]:
+        template_name = "scikit_learn.template"
+        model_script_dir = "scikit_learn"
+    elif template_params["model_framework"] == ModelFramework.PYTORCH:
+        template_name = "pytorch.template"
+        model_script_dir = "pytorch_model"
+    elif template_params["model_framework"] == ModelFramework.CHEMPROP:
+        template_name = "chemprop.template"
+        model_script_dir = "chemprop"
+    elif template_params["model_framework"] == ModelFramework.META:
+        template_name = "meta_model.template"
+        model_script_dir = "meta_model"
+    elif template_params["model_type"] in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.CLASSIFIER]:
         template_name = "xgb_model.template"
         model_script_dir = "xgb_model"
-    elif template_params["model_type"] == ModelType.UQ_REGRESSOR:
-        template_name = "quant_regression.template"
-        model_script_dir = "quant_regression"
    elif template_params["model_type"] == ModelType.ENSEMBLE_REGRESSOR:
        template_name = "ensemble_xgb.template"
        model_script_dir = "ensemble_xgb"
workbench/model_scripts/uq_models/generated_model_script.py (new file)

@@ -0,0 +1,248 @@
+# Model: XGBoost for point predictions + MAPIE UQ Harness for conformalized intervals
+from xgboost import XGBRegressor
+from sklearn.model_selection import train_test_split
+
+import json
+import argparse
+import joblib
+import os
+import numpy as np
+import pandas as pd
+
+# Shared model script utilities
+from model_script_utils import (
+    check_dataframe,
+    match_features_case_insensitive,
+    convert_categorical_types,
+    decompress_features,
+    input_fn,
+    output_fn,
+    compute_regression_metrics,
+    print_regression_metrics,
+)
+
+# UQ Harness for uncertainty quantification
+from uq_harness import (
+    train_uq_models,
+    save_uq_models,
+    load_uq_models,
+    predict_intervals,
+    compute_confidence,
+)
+
+# Template Placeholders
+TEMPLATE_PARAMS = {
+    "target": "solubility",
+    "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
+    "compressed_features": [],
+    "train_all_data": False,
+    "hyperparameters": {'training_config': {'max_epochs': 150}, 'model_config': {'layers': '128-64-32'}},
+}
+
+
+if __name__ == "__main__":
+    # Template Parameters
+    target = TEMPLATE_PARAMS["target"]
+    features = TEMPLATE_PARAMS["features"]
+    orig_features = features.copy()
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
+    train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"] or {}
+    validation_split = 0.2
+
+    # Script arguments for input/output directories
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+    parser.add_argument(
+        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
+    )
+    args = parser.parse_args()
+
+    # Read the training data into DataFrames
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
+    print(f"Training Files: {training_files}")
+
+    # Combine files and read them all into a single pandas dataframe
+    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+    # Check if the dataframe is empty
+    check_dataframe(all_df, "training_df")
+
+    # Features/Target output
+    print(f"Target: {target}")
+    print(f"Features: {str(features)}")
+
+    # Convert any features that might be categorical to 'category' type
+    all_df, category_mappings = convert_categorical_types(all_df, features)
+
+    # If we have compressed features, decompress them
+    if compressed_features:
+        print(f"Decompressing features {compressed_features}...")
+        all_df, features = decompress_features(all_df, features, compressed_features)
+
+    # Do we want to train on all the data?
+    if train_all_data:
+        print("Training on ALL of the data")
+        df_train = all_df.copy()
+        df_val = all_df.copy()
+
+    # Does the dataframe have a training column?
+    elif "training" in all_df.columns:
+        print("Found training column, splitting data based on training column")
+        df_train = all_df[all_df["training"]]
+        df_val = all_df[~all_df["training"]]
+    else:
+        # Just do a random training Split
+        print("WARNING: No training column found, splitting data with random state=42")
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
+    print(f"FIT/TRAIN: {df_train.shape}")
+    print(f"VALIDATION: {df_val.shape}")
+
+    # Extract sample weights if present
+    if "sample_weight" in df_train.columns:
+        sample_weights = df_train["sample_weight"]
+        print(f"Using sample weights: min={sample_weights.min():.2f}, max={sample_weights.max():.2f}, mean={sample_weights.mean():.2f}")
+    else:
+        sample_weights = None
+        print("No sample weights found, training with equal weights")
+
+    # Prepare features and targets for training
+    X_train = df_train[features]
+    X_validate = df_val[features]
+    y_train = df_train[target]
+    y_validate = df_val[target]
+
+    # ==========================================
+    # Train XGBoost for point predictions
+    # ==========================================
+    print("\nTraining XGBoost for point predictions...")
+    print(f" Hyperparameters: {hyperparameters}")
+    xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
+    xgb_model.fit(X_train, y_train, sample_weight=sample_weights)
+
+    # Evaluate XGBoost performance
+    y_pred_xgb = xgb_model.predict(X_validate)
+    xgb_metrics = compute_regression_metrics(y_validate, y_pred_xgb)
+
+    print(f"\nXGBoost Point Prediction Performance:")
+    print_regression_metrics(xgb_metrics)
+
+    # ==========================================
+    # Train UQ models using the harness
+    # ==========================================
+    uq_models, uq_metadata = train_uq_models(X_train, y_train, X_validate, y_validate)
+
+    print(f"\nOverall Model Performance Summary:")
+    print_regression_metrics(xgb_metrics)
+
+    # ==========================================
+    # Save all models
+    # ==========================================
+    # Save the trained XGBoost model
+    joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+    # Save UQ models using the harness
+    save_uq_models(uq_models, uq_metadata, args.model_dir)
+
+    # Save the feature list
+    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
+        json.dump(features, fp)
+
+    # Save category mappings if any
+    if category_mappings:
+        with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
+            json.dump(category_mappings, fp)
+
+    # Save model configuration
+    model_config = {
+        "model_type": "XGBoost_MAPIE_UQ",
+        "confidence_levels": uq_metadata["confidence_levels"],
+        "n_features": len(features),
+        "target": target,
+        "validation_metrics": {
+            "xgb_rmse": float(xgb_metrics["rmse"]),
+            "xgb_mae": float(xgb_metrics["mae"]),
+            "xgb_r2": float(xgb_metrics["r2"]),
+            "n_validation": len(df_val),
+        },
+    }
+    with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
+        json.dump(model_config, fp, indent=2)
+
+    print(f"\nModel training complete!")
+    print(f"Saved XGBoost model and {len(uq_models)} UQ models to {args.model_dir}")
+
+
+#
+# Inference Section
+#
+def model_fn(model_dir) -> dict:
+    """Load XGBoost and all UQ models from the specified directory."""
+
+    # Load model configuration
+    with open(os.path.join(model_dir, "model_config.json")) as fp:
+        config = json.load(fp)
+
+    # Load XGBoost regressor
+    xgb_path = os.path.join(model_dir, "xgb_model.joblib")
+    xgb_model = joblib.load(xgb_path)
+
+    # Load UQ models using the harness
+    uq_models, uq_metadata = load_uq_models(model_dir)
+
+    # Load category mappings if they exist
+    category_mappings = {}
+    category_path = os.path.join(model_dir, "category_mappings.json")
+    if os.path.exists(category_path):
+        with open(category_path) as fp:
+            category_mappings = json.load(fp)
+
+    return {
+        "xgb_model": xgb_model,
+        "uq_models": uq_models,
+        "uq_metadata": uq_metadata,
+        "category_mappings": category_mappings,
+    }
+
+
+def predict_fn(df, models) -> pd.DataFrame:
+    """Make predictions using XGBoost for point estimates and UQ harness for intervals.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame
+        models (dict): Dictionary containing XGBoost and UQ models
+
+    Returns:
+        pd.DataFrame: DataFrame with predictions and conformalized intervals
+    """
+    # Grab our feature columns (from training)
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+    with open(os.path.join(model_dir, "feature_columns.json")) as fp:
+        model_features = json.load(fp)
+
+    # Match features in a case-insensitive manner
+    matched_df = match_features_case_insensitive(df, model_features)
+
+    # Apply categorical mappings if they exist
+    if models.get("category_mappings"):
+        matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])
+
+    # Get features for prediction
+    X = matched_df[model_features]
+
+    # Get XGBoost point predictions
+    df["prediction"] = models["xgb_model"].predict(X)
+
+    # Get prediction intervals using UQ harness
+    df = predict_intervals(df, X, models["uq_models"], models["uq_metadata"])
+
+    # Compute confidence scores
+    df = compute_confidence(
+        df,
+        median_interval_width=models["uq_metadata"]["median_interval_width"],
+        lower_q="q_10",
+        upper_q="q_90",
+    )
+
+    return df
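To tie the new inference section together: at serving time SageMaker calls model_fn once per container start and predict_fn per request. A rough local smoke test of that flow, assuming a model directory already produced by the training section above and a model trained on a small three-feature list (the path and feature rows are invented; the exact interval/confidence column names compute_confidence adds are an assumption beyond the q_10/q_90 arguments shown):

import os
import pandas as pd

# Hypothetical path to artifacts written by the training section above
os.environ["SM_MODEL_DIR"] = "/tmp/uq_model"
models = model_fn("/tmp/uq_model")

# Invented feature rows; columns must match feature_columns.json
df = pd.DataFrame({"molwt": [180.2, 46.1], "mollogp": [1.2, -0.3], "tpsa": [63.6, 20.2]})
out = predict_fn(df, models)

# Point predictions plus the q_10/q_90 conformalized interval columns
print(out[["prediction", "q_10", "q_90"]])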