workbench 0.8.224__py3-none-any.whl → 0.8.231__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. workbench/__init__.py +1 -0
  2. workbench/algorithms/dataframe/__init__.py +2 -0
  3. workbench/algorithms/dataframe/smart_aggregator.py +161 -0
  4. workbench/algorithms/sql/column_stats.py +0 -1
  5. workbench/algorithms/sql/correlations.py +0 -1
  6. workbench/algorithms/sql/descriptive_stats.py +0 -1
  7. workbench/api/meta.py +0 -1
  8. workbench/cached/cached_meta.py +0 -1
  9. workbench/cached/cached_model.py +37 -7
  10. workbench/core/artifacts/endpoint_core.py +12 -2
  11. workbench/core/artifacts/feature_set_core.py +66 -8
  12. workbench/core/cloud_platform/cloud_meta.py +0 -1
  13. workbench/model_script_utils/model_script_utils.py +30 -0
  14. workbench/model_script_utils/uq_harness.py +0 -1
  15. workbench/model_scripts/chemprop/chemprop.template +3 -0
  16. workbench/model_scripts/chemprop/generated_model_script.py +3 -3
  17. workbench/model_scripts/chemprop/model_script_utils.py +30 -0
  18. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +0 -1
  19. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
  20. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +0 -1
  21. workbench/model_scripts/pytorch_model/generated_model_script.py +50 -32
  22. workbench/model_scripts/pytorch_model/model_script_utils.py +30 -0
  23. workbench/model_scripts/pytorch_model/pytorch.template +47 -29
  24. workbench/model_scripts/pytorch_model/uq_harness.py +0 -1
  25. workbench/model_scripts/script_generation.py +0 -1
  26. workbench/model_scripts/xgb_model/model_script_utils.py +30 -0
  27. workbench/model_scripts/xgb_model/uq_harness.py +0 -1
  28. workbench/themes/dark/custom.css +85 -8
  29. workbench/themes/dark/plotly.json +6 -6
  30. workbench/themes/light/custom.css +172 -70
  31. workbench/themes/light/plotly.json +9 -9
  32. workbench/themes/midnight_blue/custom.css +48 -29
  33. workbench/themes/midnight_blue/plotly.json +1 -1
  34. workbench/utils/aws_utils.py +0 -1
  35. workbench/utils/chem_utils/mol_descriptors.py +0 -1
  36. workbench/utils/chem_utils/vis.py +137 -27
  37. workbench/utils/clientside_callbacks.py +41 -0
  38. workbench/utils/markdown_utils.py +57 -0
  39. workbench/utils/pipeline_utils.py +0 -1
  40. workbench/utils/plot_utils.py +8 -110
  41. workbench/web_interface/components/experiments/outlier_plot.py +0 -1
  42. workbench/web_interface/components/model_plot.py +2 -0
  43. workbench/web_interface/components/plugin_unit_test.py +0 -1
  44. workbench/web_interface/components/plugins/ag_table.py +2 -4
  45. workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
  46. workbench/web_interface/components/plugins/model_details.py +10 -6
  47. workbench/web_interface/components/plugins/scatter_plot.py +56 -43
  48. workbench/web_interface/components/settings_menu.py +2 -1
  49. workbench/web_interface/page_views/main_page.py +0 -1
  50. {workbench-0.8.224.dist-info → workbench-0.8.231.dist-info}/METADATA +31 -29
  51. {workbench-0.8.224.dist-info → workbench-0.8.231.dist-info}/RECORD +55 -59
  52. {workbench-0.8.224.dist-info → workbench-0.8.231.dist-info}/WHEEL +1 -1
  53. workbench/themes/quartz/base_css.url +0 -1
  54. workbench/themes/quartz/custom.css +0 -117
  55. workbench/themes/quartz/plotly.json +0 -642
  56. workbench/themes/quartz_dark/base_css.url +0 -1
  57. workbench/themes/quartz_dark/custom.css +0 -131
  58. workbench/themes/quartz_dark/plotly.json +0 -642
  59. {workbench-0.8.224.dist-info → workbench-0.8.231.dist-info}/entry_points.txt +0 -0
  60. {workbench-0.8.224.dist-info → workbench-0.8.231.dist-info}/licenses/LICENSE +0 -0
  61. {workbench-0.8.224.dist-info → workbench-0.8.231.dist-info}/top_level.txt +0 -0
@@ -59,12 +59,12 @@ DEFAULT_HYPERPARAMETERS = {
 
  # Template parameters (filled in by Workbench)
  TEMPLATE_PARAMS = {
- "model_type": "uq_regressor",
- "target": "udm_asy_res_efflux_ratio",
+ "model_type": "classifier",
+ "target": "class",
  "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 
'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
  "id_column": "udm_mol_bat_id",
  "compressed_features": [],
- "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-pytorch-260113/training",
+ "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-class-pytorch-1-fr/training",
  "hyperparameters": {},
  }
 
@@ -152,24 +152,30 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
  print("Decompressing features for prediction...")
  matched_df, features = decompress_features(matched_df, features, compressed_features)
 
- # Track missing features
- missing_mask = matched_df[features].isna().any(axis=1)
- if missing_mask.any():
- print(f"Warning: {missing_mask.sum()} rows have missing features")
+ # Impute missing values (categorical with mode, continuous handled by scaler)
+ missing_counts = matched_df[features].isna().sum()
+ if missing_counts.any():
+ missing_features = missing_counts[missing_counts > 0]
+ print(f"Imputing missing values: {missing_features.to_dict()}")
+
+ # Load categorical imputation values if available
+ impute_path = os.path.join(model_dir, "categorical_impute.json")
+ if os.path.exists(impute_path):
+ with open(impute_path) as f:
+ cat_impute_values = json.load(f)
+ for col in categorical_cols:
+ if col in cat_impute_values and matched_df[col].isna().any():
+ matched_df[col] = matched_df[col].fillna(cat_impute_values[col])
+ # Continuous features are imputed by FeatureScaler.transform() using column means
 
  # Initialize output columns
  df["prediction"] = np.nan
  if model_type in ["regressor", "uq_regressor"]:
  df["prediction_std"] = np.nan
 
- complete_df = matched_df[~missing_mask].copy()
- if len(complete_df) == 0:
- print("Warning: No complete rows to predict on")
- return df
-
- # Prepare data for inference (with standardization)
+ # Prepare data for inference (with standardization and continuous imputation)
  x_cont, x_cat, _, _, _ = prepare_data(
- complete_df, continuous_cols, categorical_cols, category_mappings=category_mappings, scaler=scaler
+ matched_df, continuous_cols, categorical_cols, category_mappings=category_mappings, scaler=scaler
  )
 
  # Collect ensemble predictions
@@ -191,28 +197,20 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
  class_preds = np.argmax(avg_probs, axis=1)
  predictions = label_encoder.inverse_transform(class_preds)
 
- all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
- all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
- df["pred_proba"] = all_proba
+ df["pred_proba"] = [p.tolist() for p in avg_probs]
  df = expand_proba_column(df, label_encoder.classes_)
  else:
  # Regression
  predictions = preds.flatten()
- df.loc[~missing_mask, "prediction_std"] = preds_std.flatten()
+ df["prediction_std"] = preds_std.flatten()
 
  # Add UQ intervals if available
  if uq_models and uq_metadata:
- X_complete = complete_df[features]
- df_complete = df.loc[~missing_mask].copy()
- df_complete["prediction"] = predictions # Set prediction before compute_confidence
- df_complete = predict_intervals(df_complete, X_complete, uq_models, uq_metadata)
- df_complete = compute_confidence(df_complete, uq_metadata["median_interval_width"], "q_10", "q_90")
- # Copy UQ columns back to main dataframe
- for col in df_complete.columns:
- if col.startswith("q_") or col == "confidence":
- df.loc[~missing_mask, col] = df_complete[col].values
-
- df.loc[~missing_mask, "prediction"] = predictions
+ df["prediction"] = predictions # Set prediction before compute_confidence
+ df = predict_intervals(df, matched_df[features], uq_models, uq_metadata)
+ df = compute_confidence(df, uq_metadata["median_interval_width"], "q_10", "q_90")
+
+ df["prediction"] = predictions
  return df
 
 
@@ -275,11 +273,11 @@ if __name__ == "__main__":
  all_df = pd.concat([pd.read_csv(f, engine="python") for f in training_files])
  check_dataframe(all_df, "training_df")
 
- # Drop rows with missing features
+ # Drop rows with missing target (required for training)
  initial_count = len(all_df)
- all_df = all_df.dropna(subset=features)
+ all_df = all_df.dropna(subset=[target])
  if len(all_df) < initial_count:
- print(f"Dropped {initial_count - len(all_df)} rows with missing features")
+ print(f"Dropped {initial_count - len(all_df)} rows with missing target")
 
  print(f"Target: {target}")
  print(f"Features: {features}")
@@ -301,6 +299,23 @@ if __name__ == "__main__":
  print(f"Categorical: {categorical_cols}")
  print(f"Continuous: {len(continuous_cols)} columns")
 
+ # Report and handle missing values in features
+ # Compute categorical imputation values (mode) for use at inference time
+ cat_impute_values = {}
+ for col in categorical_cols:
+ mode_val = all_df[col].mode().iloc[0] if not all_df[col].mode().empty else all_df[col].cat.categories[0]
+ cat_impute_values[col] = str(mode_val) # Convert to string for JSON serialization
+
+ missing_counts = all_df[features].isna().sum()
+ if missing_counts.any():
+ missing_features = missing_counts[missing_counts > 0]
+ print(f"Missing values in features (will be imputed): {missing_features.to_dict()}")
+ # Impute categorical features with mode (most frequent value)
+ for col in categorical_cols:
+ if all_df[col].isna().any():
+ all_df[col] = all_df[col].fillna(cat_impute_values[col])
+ # Continuous features are imputed by FeatureScaler.transform() using column means
+
  # -------------------------------------------------------------------------
  # Classification setup
  # -------------------------------------------------------------------------
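
The hunk above computes per-column modes for the categorical features and imputes the training frame with them; the next hunk persists those modes as categorical_impute.json, and predict_fn reloads them at inference time. A minimal sketch of that round trip, not part of the diff, with an illustrative column name and a local file path:

import json
import pandas as pd

# Training side: record the mode of an illustrative categorical column ("assay")
train_df = pd.DataFrame({"assay": ["A", "A", "B", None]})
cat_impute_values = {"assay": str(train_df["assay"].mode().iloc[0])}  # mode is "A"
with open("categorical_impute.json", "w") as f:
    json.dump(cat_impute_values, f)

# Inference side: reload the saved modes and fill gaps before encoding
infer_df = pd.DataFrame({"assay": ["B", None]})
with open("categorical_impute.json") as f:
    saved = json.load(f)
infer_df["assay"] = infer_df["assay"].fillna(saved["assay"])
print(infer_df["assay"].tolist())  # ['B', 'A']
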
@@ -506,6 +521,9 @@ if __name__ == "__main__":
  with open(os.path.join(args.model_dir, "feature_metadata.json"), "w") as f:
  json.dump({"continuous_cols": continuous_cols, "categorical_cols": categorical_cols}, f)
 
+ with open(os.path.join(args.model_dir, "categorical_impute.json"), "w") as f:
+ json.dump(cat_impute_values, f)
+
  with open(os.path.join(args.model_dir, "hyperparameters.json"), "w") as f:
  json.dump(hyperparameters, f, indent=2)
 
@@ -249,6 +249,36 @@ def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
  raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
 
 
+ def cap_std_outliers(std_array: np.ndarray) -> np.ndarray:
+ """Cap extreme outliers in prediction_std using IQR method.
+
+ Uses the standard IQR fence (Q3 + 1.5*IQR) to cap extreme values.
+ This prevents unreasonably large std values while preserving the
+ relative ordering and keeping meaningful high-uncertainty signals.
+
+ Args:
+ std_array: Array of standard deviations (n_samples,) or (n_samples, n_targets)
+
+ Returns:
+ Array with outliers capped at the upper fence
+ """
+ if std_array.ndim == 1:
+ std_array = std_array.reshape(-1, 1)
+ squeeze = True
+ else:
+ squeeze = False
+
+ capped = std_array.copy()
+ for col in range(capped.shape[1]):
+ col_data = capped[:, col]
+ q1, q3 = np.percentile(col_data, [25, 75])
+ iqr = q3 - q1
+ upper_bound = q3 + 1.5 * iqr
+ capped[:, col] = np.minimum(col_data, upper_bound)
+
+ return capped.squeeze() if squeeze else capped
+
+
  def compute_regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict[str, float]:
  """Compute standard regression metrics.
 
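For context, the IQR fence that cap_std_outliers applies, as a small standalone numpy sketch with toy values (not part of the diff):

import numpy as np

std = np.array([0.10, 0.12, 0.15, 0.20, 5.00])  # illustrative prediction_std values with one outlier
q1, q3 = np.percentile(std, [25, 75])
upper_bound = q3 + 1.5 * (q3 - q1)  # Q3 + 1.5*IQR = 0.32 for these values
print(np.minimum(std, upper_bound))  # [0.1  0.12 0.15 0.2  0.32] -- outlier capped, ordering preserved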
 
@@ -152,24 +152,30 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
  print("Decompressing features for prediction...")
  matched_df, features = decompress_features(matched_df, features, compressed_features)
 
- # Track missing features
- missing_mask = matched_df[features].isna().any(axis=1)
- if missing_mask.any():
- print(f"Warning: {missing_mask.sum()} rows have missing features")
+ # Impute missing values (categorical with mode, continuous handled by scaler)
+ missing_counts = matched_df[features].isna().sum()
+ if missing_counts.any():
+ missing_features = missing_counts[missing_counts > 0]
+ print(f"Imputing missing values: {missing_features.to_dict()}")
+
+ # Load categorical imputation values if available
+ impute_path = os.path.join(model_dir, "categorical_impute.json")
+ if os.path.exists(impute_path):
+ with open(impute_path) as f:
+ cat_impute_values = json.load(f)
+ for col in categorical_cols:
+ if col in cat_impute_values and matched_df[col].isna().any():
+ matched_df[col] = matched_df[col].fillna(cat_impute_values[col])
+ # Continuous features are imputed by FeatureScaler.transform() using column means
 
  # Initialize output columns
  df["prediction"] = np.nan
  if model_type in ["regressor", "uq_regressor"]:
  df["prediction_std"] = np.nan
 
- complete_df = matched_df[~missing_mask].copy()
- if len(complete_df) == 0:
- print("Warning: No complete rows to predict on")
- return df
-
- # Prepare data for inference (with standardization)
+ # Prepare data for inference (with standardization and continuous imputation)
  x_cont, x_cat, _, _, _ = prepare_data(
- complete_df, continuous_cols, categorical_cols, category_mappings=category_mappings, scaler=scaler
+ matched_df, continuous_cols, categorical_cols, category_mappings=category_mappings, scaler=scaler
  )
 
  # Collect ensemble predictions
@@ -191,28 +197,20 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
  class_preds = np.argmax(avg_probs, axis=1)
  predictions = label_encoder.inverse_transform(class_preds)
 
- all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
- all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
- df["pred_proba"] = all_proba
+ df["pred_proba"] = [p.tolist() for p in avg_probs]
  df = expand_proba_column(df, label_encoder.classes_)
  else:
  # Regression
  predictions = preds.flatten()
- df.loc[~missing_mask, "prediction_std"] = preds_std.flatten()
+ df["prediction_std"] = preds_std.flatten()
 
  # Add UQ intervals if available
  if uq_models and uq_metadata:
- X_complete = complete_df[features]
- df_complete = df.loc[~missing_mask].copy()
- df_complete["prediction"] = predictions # Set prediction before compute_confidence
- df_complete = predict_intervals(df_complete, X_complete, uq_models, uq_metadata)
- df_complete = compute_confidence(df_complete, uq_metadata["median_interval_width"], "q_10", "q_90")
- # Copy UQ columns back to main dataframe
- for col in df_complete.columns:
- if col.startswith("q_") or col == "confidence":
- df.loc[~missing_mask, col] = df_complete[col].values
-
- df.loc[~missing_mask, "prediction"] = predictions
+ df["prediction"] = predictions # Set prediction before compute_confidence
+ df = predict_intervals(df, matched_df[features], uq_models, uq_metadata)
+ df = compute_confidence(df, uq_metadata["median_interval_width"], "q_10", "q_90")
+
+ df["prediction"] = predictions
  return df
 
 
@@ -275,11 +273,11 @@ if __name__ == "__main__":
  all_df = pd.concat([pd.read_csv(f, engine="python") for f in training_files])
  check_dataframe(all_df, "training_df")
 
- # Drop rows with missing features
+ # Drop rows with missing target (required for training)
  initial_count = len(all_df)
- all_df = all_df.dropna(subset=features)
+ all_df = all_df.dropna(subset=[target])
  if len(all_df) < initial_count:
- print(f"Dropped {initial_count - len(all_df)} rows with missing features")
+ print(f"Dropped {initial_count - len(all_df)} rows with missing target")
 
  print(f"Target: {target}")
  print(f"Features: {features}")
@@ -301,6 +299,23 @@ if __name__ == "__main__":
  print(f"Categorical: {categorical_cols}")
  print(f"Continuous: {len(continuous_cols)} columns")
 
+ # Report and handle missing values in features
+ # Compute categorical imputation values (mode) for use at inference time
+ cat_impute_values = {}
+ for col in categorical_cols:
+ mode_val = all_df[col].mode().iloc[0] if not all_df[col].mode().empty else all_df[col].cat.categories[0]
+ cat_impute_values[col] = str(mode_val) # Convert to string for JSON serialization
+
+ missing_counts = all_df[features].isna().sum()
+ if missing_counts.any():
+ missing_features = missing_counts[missing_counts > 0]
+ print(f"Missing values in features (will be imputed): {missing_features.to_dict()}")
+ # Impute categorical features with mode (most frequent value)
+ for col in categorical_cols:
+ if all_df[col].isna().any():
+ all_df[col] = all_df[col].fillna(cat_impute_values[col])
+ # Continuous features are imputed by FeatureScaler.transform() using column means
+
  # -------------------------------------------------------------------------
  # Classification setup
  # -------------------------------------------------------------------------
@@ -506,6 +521,9 @@ if __name__ == "__main__":
  with open(os.path.join(args.model_dir, "feature_metadata.json"), "w") as f:
  json.dump({"continuous_cols": continuous_cols, "categorical_cols": categorical_cols}, f)
 
+ with open(os.path.join(args.model_dir, "categorical_impute.json"), "w") as f:
+ json.dump(cat_impute_values, f)
+
  with open(os.path.join(args.model_dir, "hyperparameters.json"), "w") as f:
  json.dump(hyperparameters, f, indent=2)
 
@@ -22,7 +22,6 @@ import joblib
  from lightgbm import LGBMRegressor
  from mapie.regression import ConformalizedQuantileRegressor
 
-
  # Default confidence levels for prediction intervals
  DEFAULT_CONFIDENCE_LEVELS = [0.50, 0.68, 0.80, 0.90, 0.95]
 
@@ -6,7 +6,6 @@ import logging
  from pathlib import Path
  import importlib.util
 
-
  # Setup the logger
  log = logging.getLogger("workbench")
 
@@ -249,6 +249,36 @@ def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
  raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
 
 
+ def cap_std_outliers(std_array: np.ndarray) -> np.ndarray:
+ """Cap extreme outliers in prediction_std using IQR method.
+
+ Uses the standard IQR fence (Q3 + 1.5*IQR) to cap extreme values.
+ This prevents unreasonably large std values while preserving the
+ relative ordering and keeping meaningful high-uncertainty signals.
+
+ Args:
+ std_array: Array of standard deviations (n_samples,) or (n_samples, n_targets)
+
+ Returns:
+ Array with outliers capped at the upper fence
+ """
+ if std_array.ndim == 1:
+ std_array = std_array.reshape(-1, 1)
+ squeeze = True
+ else:
+ squeeze = False
+
+ capped = std_array.copy()
+ for col in range(capped.shape[1]):
+ col_data = capped[:, col]
+ q1, q3 = np.percentile(col_data, [25, 75])
+ iqr = q3 - q1
+ upper_bound = q3 + 1.5 * iqr
+ capped[:, col] = np.minimum(col_data, upper_bound)
+
+ return capped.squeeze() if squeeze else capped
+
+
  def compute_regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict[str, float]:
  """Compute standard regression metrics.
 
@@ -22,7 +22,6 @@ import joblib
  from lightgbm import LGBMRegressor
  from mapie.regression import ConformalizedQuantileRegressor
 
-
  # Default confidence levels for prediction intervals
  DEFAULT_CONFIDENCE_LEVELS = [0.50, 0.68, 0.80, 0.90, 0.95]
 
@@ -3,6 +3,7 @@ h1, h2, h3, h4 {
  }
  body {
  color: rgb(180, 180, 180); /* We want the text dim white */
+ background: linear-gradient(90deg, rgb(45, 45, 45) 0%, rgb(35, 35, 35) 100%);
  }
 
  /* Custom CSS to style bold text */
@@ -36,21 +37,38 @@ a:hover {
 
  /* AgGrid custom CSS */
 
- /* There's a one pixel border around the grid that we want to remove */
- .ag-root-wrapper {
- border: none !important; /* Force removal with !important */
- }
-
-
- /* Box shadow and rounded corners for all AgGrid themes */
+ /* AG Grid 33+ uses CSS variables for theming - set them at the theme level */
  [class*="ag-theme-"] {
+ --ag-background-color: rgb(40, 40, 40);
+ --ag-odd-row-background-color: rgb(40, 40, 40);
+ --ag-row-background-color: rgb(50, 50, 50);
+ --ag-selected-row-background-color: rgb(60, 70, 90);
+ --ag-row-hover-color: rgb(55, 55, 65);
+ --ag-header-background-color: rgb(35, 35, 35);
+ --ag-border-color: rgba(80, 80, 80, 0.5);
+ --ag-foreground-color: rgb(180, 180, 180);
+ --ag-header-foreground-color: rgb(220, 220, 220);
+ --ag-wrapper-border-radius: 12px;
+
+ /* Box shadow and rounded corners */
  box-shadow: 2px 2px 6px 5px rgba(0, 0, 0, 0.25);
- border-radius: 12px; /* Rounded corners */
+ border-radius: 12px;
  border: 0.5px solid rgba(0, 0, 0, 0.5);
  margin: 0;
  padding: 0;
  }
 
+ /* Remove border from the grid wrapper */
+ .ag-root-wrapper {
+ border: none !important;
+ }
+
+ /* AG Grid container - remove padding but allow shadow overflow */
+ div:has(> [class*="ag-theme-"]) {
+ padding: 0 !important;
+ overflow: visible !important;
+ }
+
  /* Apply styling to Workbench containers */
  .workbench-container {
  box-shadow: 2px 2px 6px 5px rgba(0, 0, 0, 0.25);
@@ -110,6 +128,40 @@ a:hover {
  color: rgb(100, 255, 100);
  }
 
+ /* Dropdown styling (dcc.Dropdown) - override Bootstrap's variables */
+ .dash-dropdown {
+ --bs-body-bg: rgb(35, 35, 35);
+ --bs-body-color: rgb(210, 210, 210);
+ --bs-border-color: rgb(60, 60, 60);
+ }
+
+ /* Bootstrap form controls (dbc components) */
+ .form-select, .form-control {
+ background-color: rgb(35, 35, 35) !important;
+ border: 1px solid rgb(60, 60, 60) !important;
+ color: rgb(210, 210, 210) !important;
+ }
+
+ .form-select:focus, .form-control:focus {
+ background-color: rgb(45, 45, 45) !important;
+ border-color: rgb(80, 80, 80) !important;
+ box-shadow: 0 0 0 0.2rem rgba(80, 80, 80, 0.25) !important;
+ }
+
+ .dropdown-menu {
+ background-color: rgb(35, 35, 35) !important;
+ border: 1px solid rgb(60, 60, 60) !important;
+ }
+
+ .dropdown-item {
+ color: rgb(210, 210, 210) !important;
+ }
+
+ .dropdown-item:hover, .dropdown-item:focus {
+ background-color: rgb(50, 50, 50) !important;
+ color: rgb(230, 230, 230) !important;
+ }
+
  /* Table styling */
  table {
  width: 100%;
@@ -128,4 +180,29 @@ td {
  padding: 5px;
  border: 0.5px solid #444;
  text-align: center !important;
+ }
+
+ /* AG Grid table header colors - gradient theme */
+ /* Data Sources tables - red gradient */
+ #main_data_sources .ag-header,
+ #data_sources_table .ag-header {
+ background: linear-gradient(180deg, rgb(140, 60, 60) 0%, rgb(80, 35, 35) 100%) !important;
+ }
+
+ /* Feature Sets tables - yellow/olive gradient */
+ #main_feature_sets .ag-header,
+ #feature_sets_table .ag-header {
+ background: linear-gradient(180deg, rgb(120, 115, 55) 0%, rgb(70, 65, 30) 100%) !important;
+ }
+
+ /* Models tables - green gradient */
+ #main_models .ag-header,
+ #models_table .ag-header {
+ background: linear-gradient(180deg, rgb(55, 110, 55) 0%, rgb(30, 60, 30) 100%) !important;
+ }
+
+ /* Endpoints tables - purple gradient */
+ #main_endpoints .ag-header,
+ #endpoints_table .ag-header {
+ background: linear-gradient(180deg, rgb(100, 60, 120) 0%, rgb(55, 30, 70) 100%) !important;
  }
@@ -483,11 +483,11 @@
  [1.0, "rgb(200, 100, 100)"]
  ],
  "sequential": [
- [0.0, "rgb(100, 100, 200)"],
- [0.4, "rgb(100, 200, 100)"],
- [0.65, "rgb(180, 180, 50)"],
- [0.85, "rgb(200, 100, 100)"],
- [1.0, "rgb(200, 100, 100)"]
+ [0.0, "rgba(80, 100, 255, 1.0)"],
+ [0.25, "rgba(70, 145, 220, 1.0)"],
+ [0.5, "rgba(70, 220, 100, 1.0)"],
+ [0.75, "rgba(255, 181, 80, 1.0)"],
+ [1.0, "rgba(232, 50, 131, 1.0)"]
  ],
  "sequentialminus": [
  [0.0, "rgb(255, 100, 100)"],
@@ -527,7 +527,7 @@
  "style": "dark"
  },
  "paper_bgcolor": "rgba(0, 0, 0, 0.0)",
- "plot_bgcolor": "rgba(0, 0, 0, 0.0)",
+ "plot_bgcolor": "rgb(40, 40, 40)",
  "polar": {
  "angularaxis": {
  "gridcolor": "#506784"