workbench 0.8.177__py3-none-any.whl → 0.8.179__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of workbench might be problematic.
- workbench/api/endpoint.py +3 -2
- workbench/core/artifacts/endpoint_core.py +5 -5
- workbench/core/artifacts/feature_set_core.py +67 -8
- workbench/core/views/training_view.py +38 -48
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +44 -45
- workbench/model_scripts/custom_models/uq_models/mapie.template +42 -43
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +7 -22
- workbench/model_scripts/custom_models/uq_models/ngboost.template +5 -12
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
- workbench/model_scripts/pytorch_model/pytorch.template +9 -18
- workbench/model_scripts/quant_regression/quant_regression.template +5 -10
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/xgb_model/generated_model_script.py +24 -33
- workbench/model_scripts/xgb_model/xgb_model.template +23 -32
- workbench/scripts/ml_pipeline_sqs.py +14 -2
- workbench/utils/model_utils.py +12 -2
- workbench/utils/xgboost_model_utils.py +161 -138
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/METADATA +1 -1
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/RECORD +27 -27
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/WHEEL +0 -0
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/top_level.txt +0 -0
workbench/model_scripts/pytorch_model/pytorch.template
CHANGED

@@ -36,12 +36,12 @@ from typing import List, Tuple
 # Template Parameters
 TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
-    "
+    "target": "{{target_column}}",
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
     "train_all_data": "{{train_all_data}}",
-    "hyperparameters": "{{hyperparameters}}"
+    "hyperparameters": "{{hyperparameters}}",
 }


@@ -103,7 +103,6 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     df_columns_lower = {col.lower(): col for col in df.columns}
     rename_dict = {}
     missing = []
-
     for feature in model_features:
         if feature in df.columns:
             continue  # Exact match
@@ -115,6 +114,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     if missing:
         raise ValueError(f"Features not found: {missing}")

+    # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)


@@ -210,7 +210,7 @@ def model_fn(model_dir):
     original_cwd = os.getcwd()
     try:
         # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
-        os.chdir(
+        os.chdir("/tmp")

         # Load the model
         model_path = os.path.join(model_dir, "tabular_model")
@@ -328,7 +328,7 @@ if __name__ == "__main__":
     """The main function is for training the PyTorch Tabular model"""

     # Harness Template Parameters
-    target = TEMPLATE_PARAMS["
+    target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
@@ -348,11 +348,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -433,8 +429,7 @@ if __name__ == "__main__":
     }

     # Override defaults with training_config if present
-    training_overrides = {k: v for k, v in hyperparameters.get(
-        if k in trainer_defaults}
+    training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
     # Print overwrites
     for key, value in training_overrides.items():
         print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
@@ -451,8 +446,7 @@ if __name__ == "__main__":
         "initialization": "kaiming",
     }
     # Override defaults with model_config if present
-    model_overrides = {k: v for k, v in hyperparameters.get(
-        if k in model_defaults}
+    model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
     # Print overwrites
     for key, value in model_overrides.items():
         print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
@@ -461,10 +455,7 @@ if __name__ == "__main__":
     # Use CategoryEmbedding model configuration for general-purpose tabular modeling.
     # Works effectively for both regression and classification as the foundational
     # architecture in PyTorch Tabular
-    model_config = CategoryEmbeddingModelConfig(
-        task=task,
-        **model_params
-    )
+    model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
     optimizer_config = OptimizerConfig()

     #####################################
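For readers skimming the pytorch.template changes above, a minimal sketch of the override pattern those collapsed dict comprehensions implement: optional "training_config" and "model_config" sub-dicts inside the hyperparameters are filtered so that only keys already present in the defaults are kept. The default values below are hypothetical placeholders (only "initialization": "kaiming" appears in the diff); the filtering and the print loop mirror the diff.

    # Sketch of the hyperparameters override pattern; default values are placeholders.
    trainer_defaults = {"max_epochs": 100, "batch_size": 128}           # hypothetical defaults
    model_defaults = {"layers": "128-64", "initialization": "kaiming"}  # "kaiming" is from the diff

    hyperparameters = {
        "training_config": {"max_epochs": 50, "not_a_trainer_key": 1},  # unknown keys are dropped
        "model_config": {"layers": "256-128"},
    }

    # Keep only overrides whose keys already exist in the defaults
    training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
    model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}

    for key, value in training_overrides.items():
        print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
    for key, value in model_overrides.items():
        print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")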
workbench/model_scripts/quant_regression/quant_regression.template
CHANGED

@@ -4,11 +4,7 @@ import awswrangler as wr
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -22,9 +18,10 @@ TEMPLATE_PARAMS = {
     "target_column": "{{target_column}}",
     "features": "{{feature_list}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }

+
 # Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """
@@ -64,6 +61,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)

+
 if __name__ == "__main__":
     """The main function is for training the XGBoost Quantile Regression models"""

@@ -86,10 +84,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
workbench/model_scripts/scikit_learn/scikit_learn.template
CHANGED

@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
     "feature_list": "{{feature_list}}",
     "model_class": "{{model_class}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }

 import awswrangler as wr
@@ -99,10 +99,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

     # Check if the DataFrame is empty
@@ -116,10 +113,7 @@ if __name__ == "__main__":

     if needs_standardization:
         # Create a pipeline with standardization and the model
-        model = Pipeline([
-            ("scaler", StandardScaler()),
-            ("model", model)
-        ])
+        model = Pipeline([("scaler", StandardScaler()), ("model", model)])

     # Handle logic based on the model_type
     if model_type in ["classifier", "regressor"]:
@@ -206,6 +200,7 @@ if __name__ == "__main__":
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
         json.dump(feature_list, fp)

+
     #
     # Inference Section
     #
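As a rough illustration of the one-line Pipeline wrapping in the scikit_learn.template hunk above, here is a sketch; the Ridge estimator and the random data are placeholders, not part of the template.

    # Sketch: conditionally wrap an estimator in a scaling pipeline, as the template
    # does when needs_standardization is true. Estimator and data are placeholders.
    import numpy as np
    from sklearn.linear_model import Ridge
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    model = Ridge()
    needs_standardization = True
    if needs_standardization:
        model = Pipeline([("scaler", StandardScaler()), ("model", model)])

    X, y = np.random.rand(20, 3), np.random.rand(20)
    model.fit(X, y)  # the scaler is fit first, then the wrapped model sees scaled features
    print(model.predict(X[:2]))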
workbench/model_scripts/xgb_model/generated_model_script.py
CHANGED

@@ -32,10 +32,12 @@ TEMPLATE_PARAMS = {
     "target": "udm_asy_res_value",
     "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
     "compressed_features": [],
-    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/
-    "train_all_data":
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/logd-hyper-80/training",
+    "train_all_data": False,
+    "hyperparameters": {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05, 'subsample': 0.7, 'colsample_bytree': 0.3, 'colsample_bylevel': 0.5, 'min_child_weight': 5, 'gamma': 0.2, 'reg_alpha': 0.5, 'reg_lambda': 2.0, 'scale_pos_weight': 1},
 }

+
 # Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """
@@ -75,7 +77,7 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
     proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)

     # Drop any proba columns and reset the index in prep for the concat
-    df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
+    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
     df = df.reset_index(drop=True)

     # Concatenate the new columns with the original DataFrame
@@ -140,8 +142,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
     return df, category_mappings


-def decompress_features(
-
+def decompress_features(
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the model

     Args:
         df (pd.DataFrame): The features DataFrame
@@ -204,6 +208,7 @@ if __name__ == "__main__":
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2

     # Script arguments for input/output directories
@@ -216,11 +221,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -255,15 +256,16 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")

+    # Use any hyperparameters to set up both the trainer and model configurations
+    print(f"Hyperparameters: {hyperparameters}")
+
     # Now spin up our XGB Model
     if model_type == "classifier":
-        xgb_model = xgb.XGBClassifier(enable_categorical=True)
+        xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)

         # Encode the target column
         label_encoder = LabelEncoder()
@@ -271,12 +273,12 @@ if __name__ == "__main__":
         df_val[target] = label_encoder.transform(df_val[target])

     else:
-        xgb_model = xgb.XGBRegressor(enable_categorical=True)
+        xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
         label_encoder = None  # We don't need this for regression

     # Grab our Features, Target and Train the Model
     y_train = df_train[target]
-    X_train= df_train[features]
+    X_train = df_train[features]
     xgb_model.fit(X_train, y_train)

     # Make Predictions on the Validation Set
@@ -315,9 +317,7 @@ if __name__ == "__main__":
         label_names = label_encoder.classes_

         # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(
-            y_validate, preds, average=None, labels=label_names
-        )
+        scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)

         # Put the scores into a dataframe
         score_df = pd.DataFrame(
@@ -355,7 +355,9 @@ if __name__ == "__main__":
     print(f"NumRows: {len(df_val)}")

     # Now save the model to the standard place/name
-
+    joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+    # Save the label encoder if we have one
     if label_encoder:
         joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))

@@ -370,19 +372,8 @@ if __name__ == "__main__":

 def model_fn(model_dir):
     """Deserialize and return fitted XGBoost model"""
-
-
-
-    with open(model_path, "r") as f:
-        model_json = json.load(f)
-
-    sklearn_data = model_json['learner']['attributes']['scikit_learn']
-    model_type = json.loads(sklearn_data)['_estimator_type']
-
-    model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
-    model = model_class(enable_categorical=True)
-    model.load_model(model_path)
-
+    model_path = os.path.join(model_dir, "xgb_model.joblib")
+    model = joblib.load(model_path)
    return model

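A hedged sketch of what the new hyperparameters plumbing above amounts to at fit time: the dict from TEMPLATE_PARAMS is unpacked as keyword arguments into the XGBoost estimator. The training data below is synthetic, only a subset of the parameter values from the diff is shown, and a reasonably recent xgboost is assumed.

    # Sketch: unpack a hyperparameters dict into XGBRegressor, as the generated
    # script now does. Data is synthetic; parameter values are copied from the diff.
    import numpy as np
    import xgboost as xgb

    hyperparameters = {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.05,
                       "subsample": 0.7, "reg_alpha": 0.5, "reg_lambda": 2.0}

    xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)

    X_train = np.random.rand(64, 4)
    y_train = np.random.rand(64)
    xgb_model.fit(X_train, y_train)
    print(xgb_model.get_params()["n_estimators"])  # 200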
workbench/model_scripts/xgb_model/xgb_model.template
CHANGED

@@ -33,9 +33,11 @@ TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
+    "hyperparameters": "{{hyperparameters}}",
 }

+
 # Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """
@@ -75,7 +77,7 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
     proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)

     # Drop any proba columns and reset the index in prep for the concat
-    df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
+    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
     df = df.reset_index(drop=True)

     # Concatenate the new columns with the original DataFrame
@@ -140,8 +142,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
     return df, category_mappings


-def decompress_features(
-
+def decompress_features(
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the model

     Args:
         df (pd.DataFrame): The features DataFrame
@@ -204,6 +208,7 @@ if __name__ == "__main__":
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2

     # Script arguments for input/output directories
@@ -216,11 +221,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -255,15 +256,16 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")

+    # Use any hyperparameters to set up both the trainer and model configurations
+    print(f"Hyperparameters: {hyperparameters}")
+
     # Now spin up our XGB Model
     if model_type == "classifier":
-        xgb_model = xgb.XGBClassifier(enable_categorical=True)
+        xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)

         # Encode the target column
         label_encoder = LabelEncoder()
@@ -271,12 +273,12 @@ if __name__ == "__main__":
         df_val[target] = label_encoder.transform(df_val[target])

     else:
-        xgb_model = xgb.XGBRegressor(enable_categorical=True)
+        xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
         label_encoder = None  # We don't need this for regression

     # Grab our Features, Target and Train the Model
     y_train = df_train[target]
-    X_train= df_train[features]
+    X_train = df_train[features]
     xgb_model.fit(X_train, y_train)

     # Make Predictions on the Validation Set
@@ -315,9 +317,7 @@ if __name__ == "__main__":
         label_names = label_encoder.classes_

         # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(
-            y_validate, preds, average=None, labels=label_names
-        )
+        scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)

         # Put the scores into a dataframe
         score_df = pd.DataFrame(
@@ -355,7 +355,9 @@ if __name__ == "__main__":
     print(f"NumRows: {len(df_val)}")

     # Now save the model to the standard place/name
-
+    joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+    # Save the label encoder if we have one
     if label_encoder:
         joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))

@@ -370,19 +372,8 @@ if __name__ == "__main__":

 def model_fn(model_dir):
     """Deserialize and return fitted XGBoost model"""
-
-
-
-    with open(model_path, "r") as f:
-        model_json = json.load(f)
-
-    sklearn_data = model_json['learner']['attributes']['scikit_learn']
-    model_type = json.loads(sklearn_data)['_estimator_type']
-
-    model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
-    model = model_class(enable_categorical=True)
-    model.load_model(model_path)
-
+    model_path = os.path.join(model_dir, "xgb_model.joblib")
+    model = joblib.load(model_path)
     return model

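Both XGBoost scripts above switch model persistence from XGBoost's native JSON loader to a joblib round trip. A minimal sketch of that save/load cycle, where the temporary directory stands in for SageMaker's model_dir and everything else follows the diff:

    # Sketch: dump the fitted estimator at train time, reload it in model_fn.
    import os
    import tempfile

    import joblib
    import numpy as np
    import xgboost as xgb

    model_dir = tempfile.mkdtemp()  # placeholder for SageMaker's model_dir
    xgb_model = xgb.XGBRegressor(n_estimators=10)
    xgb_model.fit(np.random.rand(30, 3), np.random.rand(30))

    # Training side: save under the standard name used in the diff
    joblib.dump(xgb_model, os.path.join(model_dir, "xgb_model.joblib"))

    # Inference side: mirrors the rewritten model_fn
    def model_fn(model_dir):
        """Deserialize and return fitted XGBoost model"""
        model_path = os.path.join(model_dir, "xgb_model.joblib")
        return joblib.load(model_path)

    print(model_fn(model_dir).predict(np.random.rand(2, 3)))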
workbench/scripts/ml_pipeline_sqs.py
CHANGED

@@ -13,12 +13,13 @@ cm = ConfigManager()
 workbench_bucket = cm.get_config("WORKBENCH_BUCKET")


-def submit_to_sqs(script_path: str, size: str = "small") -> None:
+def submit_to_sqs(script_path: str, size: str = "small", realtime: bool = False) -> None:
     """
     Upload script to S3 and submit message to SQS queue for processing.
     Args:
         script_path: Local path to the ML pipeline script
         size: Job size tier - "small" (default), "medium", or "large"
+        realtime: If True, sets serverless=False for real-time processing (default: False, meaning serverless=True)
     """
     print(f"\n{'=' * 60}")
     print("🚀 SUBMITTING ML PIPELINE JOB")
@@ -33,6 +34,7 @@ def submit_to_sqs(script_path: str, size: str = "small") -> None:

     print(f"📄 Script: {script_file.name}")
     print(f"📏 Size tier: {size}")
+    print(f"⚡ Mode: {'Real-time' if realtime else 'Serverless'} (serverless={'False' if realtime else 'True'})")
     print(f"🪣 Bucket: {workbench_bucket}")
     sqs = AWSAccountClamp().boto3_session.client("sqs")
     script_name = script_file.name
@@ -88,6 +90,10 @@ def submit_to_sqs(script_path: str, size: str = "small") -> None:

     # Prepare message
     message = {"script_path": s3_path, "size": size}
+
+    # Set serverless environment variable (defaults to True, False if --realtime)
+    message["environment"] = {"SERVERLESS": "False" if realtime else "True"}
+
     print("\n📨 Sending message to SQS...")

     # Send the message to SQS
@@ -110,6 +116,7 @@ def submit_to_sqs(script_path: str, size: str = "small") -> None:
     print(f"{'=' * 60}")
     print(f"📄 Script: {script_name}")
     print(f"📏 Size: {size}")
+    print(f"⚡ Mode: {'Real-time' if realtime else 'Serverless'} (SERVERLESS={'False' if realtime else 'True'})")
     print(f"🆔 Message ID: {message_id}")
     print("\n🔍 MONITORING LOCATIONS:")
     print(f"  • SQS Queue: AWS Console → SQS → {queue_name}")
@@ -126,9 +133,14 @@ def main():
     parser.add_argument(
         "--size", default="small", choices=["small", "medium", "large"], help="Job size tier (default: small)"
     )
+    parser.add_argument(
+        "--realtime",
+        action="store_true",
+        help="Run in real-time mode (sets serverless=False). Default is serverless mode (serverless=True)",
+    )
     args = parser.parse_args()
     try:
-        submit_to_sqs(args.script_file, args.size)
+        submit_to_sqs(args.script_file, args.size, realtime=args.realtime)
     except Exception as e:
         print(f"\n❌ ERROR: {e}")
         log.error(f"Error: {e}")
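To make the new --realtime flag concrete, here is a sketch of the message body the script would now place on the queue; build_message is a hypothetical helper and the S3 path is a placeholder, but the keys follow the diff above.

    # Sketch of the SQS message payload with the new "environment" key.
    import json

    def build_message(s3_path: str, size: str = "small", realtime: bool = False) -> dict:
        message = {"script_path": s3_path, "size": size}
        # Serverless by default; --realtime flips SERVERLESS to "False"
        message["environment"] = {"SERVERLESS": "False" if realtime else "True"}
        return message

    print(json.dumps(build_message("s3://example-bucket/scripts/pipeline.py", "medium", realtime=True), indent=2))
    # -> {"script_path": "...", "size": "medium", "environment": {"SERVERLESS": "False"}}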
workbench/utils/model_utils.py
CHANGED
@@ -222,6 +222,8 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
         lower_95, upper_95 = df["q_025"], df["q_975"]
         lower_90, upper_90 = df["q_05"], df["q_95"]
         lower_80, upper_80 = df["q_10"], df["q_90"]
+        lower_68 = df.get("q_16", 0)
+        upper_68 = df.get("q_84", 0)
         lower_50, upper_50 = df["q_25"], df["q_75"]
     elif "prediction_std" in df.columns:
         lower_95 = df["prediction"] - 1.96 * df["prediction_std"]
@@ -230,6 +232,8 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
         upper_90 = df["prediction"] + 1.645 * df["prediction_std"]
         lower_80 = df["prediction"] - 1.282 * df["prediction_std"]
         upper_80 = df["prediction"] + 1.282 * df["prediction_std"]
+        lower_68 = df["prediction"] - 1.0 * df["prediction_std"]
+        upper_68 = df["prediction"] + 1.0 * df["prediction_std"]
         lower_50 = df["prediction"] - 0.674 * df["prediction_std"]
         upper_50 = df["prediction"] + 0.674 * df["prediction_std"]
     else:
@@ -241,11 +245,13 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
     coverage_95 = np.mean((df[target_col] >= lower_95) & (df[target_col] <= upper_95))
     coverage_90 = np.mean((df[target_col] >= lower_90) & (df[target_col] <= upper_90))
     coverage_80 = np.mean((df[target_col] >= lower_80) & (df[target_col] <= upper_80))
+    coverage_68 = np.mean((df[target_col] >= lower_68) & (df[target_col] <= upper_68))
     coverage_50 = np.mean((df[target_col] >= lower_50) & (df[target_col] <= upper_50))
     avg_width_95 = np.mean(upper_95 - lower_95)
     avg_width_90 = np.mean(upper_90 - lower_90)
     avg_width_80 = np.mean(upper_80 - lower_80)
     avg_width_50 = np.mean(upper_50 - lower_50)
+    avg_width_68 = np.mean(upper_68 - lower_68)

     # --- CRPS (measures calibration + sharpness) ---
     z = (df[target_col] - df["prediction"]) / df["prediction_std"]
@@ -269,12 +275,14 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
     # Collect results
     results = {
         "coverage_50": coverage_50,
+        "coverage_68": coverage_68,
         "coverage_80": coverage_80,
         "coverage_90": coverage_90,
         "coverage_95": coverage_95,
-        "avg_std": avg_std,
         "median_std": median_std,
+        "avg_std": avg_std,
         "avg_width_50": avg_width_50,
+        "avg_width_68": avg_width_68,
         "avg_width_80": avg_width_80,
         "avg_width_90": avg_width_90,
         "avg_width_95": avg_width_95,
@@ -286,12 +294,14 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:

     print("\n=== UQ Metrics ===")
     print(f"Coverage @ 50%: {coverage_50:.3f} (target: 0.50)")
+    print(f"Coverage @ 68%: {coverage_68:.3f} (target: 0.68)")
     print(f"Coverage @ 80%: {coverage_80:.3f} (target: 0.80)")
     print(f"Coverage @ 90%: {coverage_90:.3f} (target: 0.90)")
     print(f"Coverage @ 95%: {coverage_95:.3f} (target: 0.95)")
-    print(f"Avg Prediction StdDev: {avg_std:.3f}")
     print(f"Median Prediction StdDev: {median_std:.3f}")
+    print(f"Avg Prediction StdDev: {avg_std:.3f}")
     print(f"Average 50% Width: {avg_width_50:.3f}")
+    print(f"Average 68% Width: {avg_width_68:.3f}")
     print(f"Average 80% Width: {avg_width_80:.3f}")
     print(f"Average 90% Width: {avg_width_90:.3f}")
     print(f"Average 95% Width: {avg_width_95:.3f}")