workbench 0.8.234__py3-none-any.whl → 0.8.236__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. workbench/algorithms/dataframe/smart_aggregator.py +17 -12
  2. workbench/api/endpoint.py +13 -4
  3. workbench/cached/cached_model.py +2 -2
  4. workbench/core/artifacts/endpoint_core.py +30 -5
  5. workbench/model_script_utils/model_script_utils.py +225 -0
  6. workbench/model_script_utils/uq_harness.py +39 -21
  7. workbench/model_scripts/chemprop/chemprop.template +29 -14
  8. workbench/model_scripts/chemprop/generated_model_script.py +35 -18
  9. workbench/model_scripts/chemprop/model_script_utils.py +225 -0
  10. workbench/model_scripts/pytorch_model/generated_model_script.py +34 -20
  11. workbench/model_scripts/pytorch_model/model_script_utils.py +225 -0
  12. workbench/model_scripts/pytorch_model/pytorch.template +28 -14
  13. workbench/model_scripts/pytorch_model/uq_harness.py +39 -21
  14. workbench/model_scripts/xgb_model/generated_model_script.py +35 -22
  15. workbench/model_scripts/xgb_model/model_script_utils.py +225 -0
  16. workbench/model_scripts/xgb_model/uq_harness.py +39 -21
  17. workbench/model_scripts/xgb_model/xgb_model.template +29 -18
  18. workbench/themes/dark/custom.css +29 -0
  19. workbench/themes/light/custom.css +29 -0
  20. workbench/themes/midnight_blue/custom.css +28 -0
  21. workbench/utils/model_utils.py +9 -0
  22. workbench/utils/theme_manager.py +95 -0
  23. workbench/web_interface/components/component_interface.py +3 -0
  24. workbench/web_interface/components/plugin_interface.py +26 -0
  25. workbench/web_interface/components/plugins/confusion_matrix.py +14 -8
  26. workbench/web_interface/components/plugins/model_plot.py +156 -0
  27. workbench/web_interface/components/plugins/scatter_plot.py +9 -2
  28. workbench/web_interface/components/plugins/shap_summary_plot.py +12 -4
  29. workbench/web_interface/components/settings_menu.py +10 -49
  30. {workbench-0.8.234.dist-info → workbench-0.8.236.dist-info}/METADATA +1 -1
  31. {workbench-0.8.234.dist-info → workbench-0.8.236.dist-info}/RECORD +35 -35
  32. workbench/web_interface/components/model_plot.py +0 -75
  33. {workbench-0.8.234.dist-info → workbench-0.8.236.dist-info}/WHEEL +0 -0
  34. {workbench-0.8.234.dist-info → workbench-0.8.236.dist-info}/entry_points.txt +0 -0
  35. {workbench-0.8.234.dist-info → workbench-0.8.236.dist-info}/licenses/LICENSE +0 -0
  36. {workbench-0.8.234.dist-info → workbench-0.8.236.dist-info}/top_level.txt +0 -0
workbench/algorithms/dataframe/smart_aggregator.py CHANGED
@@ -55,25 +55,30 @@ def smart_aggregator(df: pd.DataFrame, target_rows: int = 1000, outlier_column:
  result["aggregation_count"] = 1
  return result.reset_index(drop=True)

- # Handle NaN values - fill with column median
- df_for_clustering = df[numeric_cols].fillna(df[numeric_cols].median())
-
- # Normalize and cluster
- X = StandardScaler().fit_transform(df_for_clustering)
+ # Handle NaN values - fill with column median (use numpy for speed)
+ clustering_data = df[numeric_cols].values
+ col_medians = np.nanmedian(clustering_data, axis=0)
+ nan_mask = np.isnan(clustering_data)
+ clustering_data = np.where(nan_mask, col_medians, clustering_data)
+
+ # Normalize and cluster (n_init=1 since MiniBatchKMeans is already approximate)
+ X = StandardScaler().fit_transform(clustering_data)
  df["_cluster"] = MiniBatchKMeans(
- n_clusters=min(target_rows, n_rows), random_state=42, batch_size=min(1024, n_rows), n_init=3
+ n_clusters=min(target_rows, n_rows), random_state=42, batch_size=min(1024, n_rows), n_init=1
  ).fit_predict(X)

  # Post-process: give high-outlier rows their own unique clusters so they don't get aggregated
  if outlier_column and outlier_column in df.columns:
- # Top 10% of outlier values get their own clusters, capped at 200
- n_to_isolate = min(int(n_rows * 0.1), 200)
- threshold = df[outlier_column].nlargest(n_to_isolate).min()
- high_outlier_mask = df[outlier_column] >= threshold
+ # Top 10% of outlier values get their own clusters, capped at 20% of target_rows
+ n_to_isolate = min(int(n_rows * 0.1), int(target_rows * 0.2))
+ outlier_values = df[outlier_column].values
+ threshold = np.partition(outlier_values, -n_to_isolate)[-n_to_isolate]
+ high_outlier_mask = outlier_values >= threshold
  n_high_outliers = high_outlier_mask.sum()
- # Assign unique cluster IDs starting after the max existing cluster
+ # Assign unique cluster IDs starting after the max existing cluster (match dtype to avoid warning)
  max_cluster = df["_cluster"].max()
- df.loc[high_outlier_mask, "_cluster"] = range(max_cluster + 1, max_cluster + 1 + n_high_outliers)
+ new_cluster_ids = np.arange(max_cluster + 1, max_cluster + 1 + n_high_outliers, dtype=df["_cluster"].dtype)
+ df.loc[high_outlier_mask, "_cluster"] = new_cluster_ids
  log.info(f"smart_aggregator: Isolated {n_high_outliers} high-outlier rows (>= {threshold:.3f})")
  elif outlier_column:
  log.warning(f"smart_aggregator: outlier_column '{outlier_column}' not found in columns")
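For reference, the new threshold selection replaces nlargest(...).min() with np.partition, which finds the same top-k cutoff without a full sort. A minimal sketch of that selection step, using hypothetical outlier scores rather than anything from the package:

    import numpy as np

    outlier_values = np.array([5.0, 1.0, 9.0, 3.0, 7.0])  # hypothetical scores
    n_to_isolate = 2

    # np.partition places the k-th largest value at index -k in O(n) time
    threshold = np.partition(outlier_values, -n_to_isolate)[-n_to_isolate]
    high_outlier_mask = outlier_values >= threshold

    print(threshold)          # 7.0 -> smallest of the top 2 scores
    print(high_outlier_mask)  # [False False  True False  True]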
workbench/api/endpoint.py CHANGED
@@ -29,7 +29,12 @@ class Endpoint(EndpointCore):
  return super().details(**kwargs)

  def inference(
- self, eval_df: pd.DataFrame, capture_name: str = None, id_column: str = None, drop_error_rows: bool = False
+ self,
+ eval_df: pd.DataFrame,
+ capture_name: str = None,
+ id_column: str = None,
+ drop_error_rows: bool = False,
+ include_quantiles: bool = False,
  ) -> pd.DataFrame:
  """Run inference on the Endpoint using the provided DataFrame

@@ -38,11 +43,12 @@ class Endpoint(EndpointCore):
  capture_name (str, optional): The Name of the capture to use (default: None)
  id_column (str, optional): The name of the column to use as the ID (default: None)
  drop_error_rows (bool): Whether to drop rows with errors (default: False)
+ include_quantiles (bool): Include q_* quantile columns in saved output (default: False)

  Returns:
  pd.DataFrame: The DataFrame with predictions
  """
- return super().inference(eval_df, capture_name, id_column, drop_error_rows)
+ return super().inference(eval_df, capture_name, id_column, drop_error_rows, include_quantiles)

  def auto_inference(self) -> pd.DataFrame:
  """Run inference on the Endpoint using the test data from the model training view
@@ -75,13 +81,16 @@ class Endpoint(EndpointCore):
  """
  return super().fast_inference(eval_df, threads=threads)

- def cross_fold_inference(self) -> pd.DataFrame:
+ def cross_fold_inference(self, include_quantiles: bool = False) -> pd.DataFrame:
  """Pull cross-fold inference from model associated with this Endpoint

+ Args:
+ include_quantiles (bool): Include q_* quantile columns in saved output (default: False)
+
  Returns:
  pd.DataFrame: A DataFrame with cross fold predictions
  """
- return super().cross_fold_inference()
+ return super().cross_fold_inference(include_quantiles)


  if __name__ == "__main__":
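A usage sketch of the new flag from the caller's side; the endpoint name and evaluation DataFrame below are hypothetical, and by default the q_* quantile columns are still dropped from the saved capture:

    import pandas as pd
    from workbench.api.endpoint import Endpoint

    end = Endpoint("my-regression-end")  # hypothetical endpoint name
    eval_df = pd.DataFrame({"feature_1": [0.1, 0.2], "feature_2": [1.0, 2.0]})

    # Keep the q_* quantile columns in the saved inference capture
    preds = end.inference(eval_df, capture_name="my_eval", include_quantiles=True)

    # Cross-fold results can carry the quantile columns as well
    cv_preds = end.cross_fold_inference(include_quantiles=True)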
workbench/cached/cached_model.py CHANGED
@@ -86,13 +86,13 @@ class CachedModel(CachedArtifactMixin, ModelCore):

  @CachedArtifactMixin.cache_result
  def get_inference_predictions(
- self, capture_name: str = "full_cross_fold", target_rows: int = 1000
+ self, capture_name: str = "full_cross_fold", target_rows: int = 2000
  ) -> Union[pd.DataFrame, None]:
  """Retrieve the captured prediction results for this model

  Args:
  capture_name (str, optional): Specific capture_name (default: full_cross_fold)
- target_rows (int, optional): Target number of rows to return (default: 1000)
+ target_rows (int, optional): Target number of rows to return (default: 2000)

  Returns:
  pd.DataFrame: DataFrame of the Captured Predictions (might be None)
workbench/core/artifacts/endpoint_core.py CHANGED
@@ -370,7 +370,12 @@ class EndpointCore(Artifact):
  return self.inference(eval_df, "full_inference")

  def inference(
- self, eval_df: pd.DataFrame, capture_name: str = None, id_column: str = None, drop_error_rows: bool = False
+ self,
+ eval_df: pd.DataFrame,
+ capture_name: str = None,
+ id_column: str = None,
+ drop_error_rows: bool = False,
+ include_quantiles: bool = False,
  ) -> pd.DataFrame:
  """Run inference on the Endpoint using the provided DataFrame

@@ -379,6 +384,7 @@ class EndpointCore(Artifact):
  capture_name (str, optional): Name of the inference capture (default=None)
  id_column (str, optional): Name of the ID column (default=None)
  drop_error_rows (bool, optional): If True, drop rows that had endpoint errors/issues (default=False)
+ include_quantiles (bool): Include q_* quantile columns in saved output (default: False)

  Returns:
  pd.DataFrame: DataFrame with the inference results
@@ -478,6 +484,7 @@ class EndpointCore(Artifact):
  description,
  features,
  id_column,
+ include_quantiles,
  )

  # Save primary target (or single target) with original capture_name
@@ -491,6 +498,7 @@ class EndpointCore(Artifact):
  capture_name.replace("_", " ").title(),
  features,
  id_column,
+ include_quantiles,
  )

  # Capture uncertainty metrics if prediction_std is available (UQ, ChemProp, etc.)
@@ -501,9 +509,12 @@ class EndpointCore(Artifact):
  # Return the prediction DataFrame
  return prediction_df

- def cross_fold_inference(self) -> pd.DataFrame:
+ def cross_fold_inference(self, include_quantiles: bool = False) -> pd.DataFrame:
  """Pull cross-fold inference training results for this Endpoint's model

+ Args:
+ include_quantiles (bool): Include q_* quantile columns in saved output (default: False)
+
  Returns:
  pd.DataFrame: A DataFrame with cross fold predictions
  """
@@ -594,6 +605,7 @@ class EndpointCore(Artifact):
  description,
  features=additional_columns,
  id_column=id_column,
+ include_quantiles=include_quantiles,
  )

  # Save primary target (or single target) as "full_cross_fold"
@@ -607,6 +619,7 @@ class EndpointCore(Artifact):
  "Full Cross Fold",
  features=additional_columns,
  id_column=id_column,
+ include_quantiles=include_quantiles,
  )

  return out_of_fold_df
@@ -824,6 +837,7 @@ class EndpointCore(Artifact):
  description: str,
  features: list,
  id_column: str = None,
+ include_quantiles: bool = False,
  ):
  """Internal: Capture the inference results and metrics to S3 for a single target

@@ -836,6 +850,7 @@ class EndpointCore(Artifact):
  description (str): Description of the inference results
  features (list): List of features to include in the inference results
  id_column (str, optional): Name of the ID column (default=None)
+ include_quantiles (bool): Include q_* quantile columns in output (default: False)
  """

  # Compute a dataframe hash (just use the last 8)
@@ -862,7 +877,7 @@ class EndpointCore(Artifact):
  wr.s3.to_csv(metrics, f"{inference_capture_path}/inference_metrics.csv", index=False)

  # Save the inference predictions for this target
- self._save_target_inference(inference_capture_path, pred_results_df, target, id_column)
+ self._save_target_inference(inference_capture_path, pred_results_df, target, id_column, include_quantiles)

  # CLASSIFIER: Write the confusion matrix to our S3 Model Inference Folder
  if model_type == ModelType.CLASSIFIER:
@@ -882,6 +897,7 @@ class EndpointCore(Artifact):
  pred_results_df: pd.DataFrame,
  target: str,
  id_column: str = None,
+ include_quantiles: bool = False,
  ):
  """Save inference results for a single target.

@@ -890,6 +906,7 @@ class EndpointCore(Artifact):
  pred_results_df (pd.DataFrame): DataFrame with prediction results
  target (str): Target column name
  id_column (str, optional): Name of the ID column
+ include_quantiles (bool): Include q_* quantile columns in output (default: False)
  """
  cols = pred_results_df.columns

@@ -902,8 +919,16 @@ class EndpointCore(Artifact):

  output_columns += [c for c in ["prediction", "prediction_std"] if c in cols]

- # Add UQ columns (q_*, confidence) and proba columns
- output_columns += [c for c in cols if c.startswith("q_") or c == "confidence" or c.endswith("_proba")]
+ # Add confidence column (always include if present)
+ if "confidence" in cols:
+ output_columns.append("confidence")
+
+ # Add quantile columns (q_*) only if requested
+ if include_quantiles:
+ output_columns += [c for c in cols if c.startswith("q_")]
+
+ # Add proba columns for classifiers
+ output_columns += [c for c in cols if c.endswith("_proba")]

  # Add smiles column if present
  if "smiles" in cols:
@@ -16,6 +16,7 @@ from sklearn.metrics import (
  r2_score,
  root_mean_squared_error,
  )
+ from sklearn.model_selection import GroupKFold, GroupShuffleSplit
  from scipy.stats import spearmanr


@@ -367,3 +368,227 @@ def print_confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray, label_names:
  for j, col_name in enumerate(label_names):
  value = conf_mtx[i, j]
  print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
+
+
+ # =============================================================================
+ # Dataset Splitting Utilities for Molecular Data
+ # =============================================================================
+ def get_scaffold(smiles: str) -> str:
+ """Extract Bemis-Murcko scaffold from a SMILES string.
+
+ Args:
+ smiles: SMILES string of the molecule
+
+ Returns:
+ SMILES string of the scaffold, or empty string if molecule is invalid
+ """
+ from rdkit import Chem
+ from rdkit.Chem.Scaffolds import MurckoScaffold
+
+ mol = Chem.MolFromSmiles(smiles)
+ if mol is None:
+ return ""
+ try:
+ scaffold = MurckoScaffold.GetScaffoldForMol(mol)
+ return Chem.MolToSmiles(scaffold)
+ except Exception:
+ return ""
+
+
+ def get_scaffold_groups(smiles_list: list[str]) -> np.ndarray:
+ """Assign each molecule to a scaffold group.
+
+ Args:
+ smiles_list: List of SMILES strings
+
+ Returns:
+ Array of group indices (same scaffold = same group)
+ """
+ scaffold_to_group = {}
+ groups = []
+
+ for smi in smiles_list:
+ scaffold = get_scaffold(smi)
+ if scaffold not in scaffold_to_group:
+ scaffold_to_group[scaffold] = len(scaffold_to_group)
+ groups.append(scaffold_to_group[scaffold])
+
+ n_scaffolds = len(scaffold_to_group)
+ print(f"Found {n_scaffolds} unique scaffolds from {len(smiles_list)} molecules")
+ return np.array(groups)
+
+
+ def get_butina_clusters(smiles_list: list[str], cutoff: float = 0.4) -> np.ndarray:
+ """Cluster molecules using Butina algorithm on Morgan fingerprints.
+
+ Uses RDKit's Butina clustering with Tanimoto distance on Morgan fingerprints.
+ This is Pat Walters' recommended approach for creating diverse train/test splits.
+
+ Args:
+ smiles_list: List of SMILES strings
+ cutoff: Tanimoto distance cutoff for clustering (default 0.4)
+ Lower values = more clusters = more similar molecules per cluster
+
+ Returns:
+ Array of cluster indices
+ """
+ from rdkit import Chem, DataStructs
+ from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
+ from rdkit.ML.Cluster import Butina
+
+ # Create Morgan fingerprint generator
+ fp_gen = GetMorganGenerator(radius=2, fpSize=2048)
+
+ # Generate Morgan fingerprints
+ fps = []
+ valid_indices = []
+ for i, smi in enumerate(smiles_list):
+ mol = Chem.MolFromSmiles(smi)
+ if mol is not None:
+ fp = fp_gen.GetFingerprint(mol)
+ fps.append(fp)
+ valid_indices.append(i)
+
+ if len(fps) == 0:
+ raise ValueError("No valid molecules found for clustering")
+
+ # Compute distance matrix (upper triangle only for efficiency)
+ n = len(fps)
+ dists = []
+ for i in range(1, n):
+ sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
+ dists.extend([1 - s for s in sims])
+
+ # Butina clustering
+ clusters = Butina.ClusterData(dists, n, cutoff, isDistData=True)
+
+ # Map back to original indices
+ cluster_labels = np.zeros(len(smiles_list), dtype=int)
+ for cluster_idx, cluster in enumerate(clusters):
+ for mol_idx in cluster:
+ original_idx = valid_indices[mol_idx]
+ cluster_labels[original_idx] = cluster_idx
+
+ # Assign invalid molecules to their own clusters
+ next_cluster = len(clusters)
+ for i in range(len(smiles_list)):
+ if i not in valid_indices:
+ cluster_labels[i] = next_cluster
+ next_cluster += 1
+
+ n_clusters = len(set(cluster_labels))
+ print(f"Butina clustering: {n_clusters} clusters from {len(smiles_list)} molecules (cutoff={cutoff})")
+ return cluster_labels
+
+
+ def _find_smiles_column(columns: list[str]) -> str | None:
+ """Find SMILES column (case-insensitive match for 'smiles').
+
+ Args:
+ columns: List of column names
+
+ Returns:
+ The matching column name, or None if not found
+ """
+ return next((c for c in columns if c.lower() == "smiles"), None)
+
+
+ def get_split_indices(
+ df: pd.DataFrame,
+ n_splits: int = 5,
+ strategy: str = "random",
+ smiles_column: str | None = None,
+ target_column: str | None = None,
+ test_size: float = 0.2,
+ random_state: int = 42,
+ butina_cutoff: float = 0.4,
+ ) -> list[tuple[np.ndarray, np.ndarray]]:
+ """Get train/validation split indices using various strategies.
+
+ This is a unified interface for generating splits that can be used across
+ all model templates (XGBoost, PyTorch, ChemProp).
+
+ Args:
+ df: DataFrame containing the data
+ n_splits: Number of CV folds (1 = single train/val split)
+ strategy: Split strategy - one of:
+ - "random": Standard random split (default sklearn behavior)
+ - "scaffold": Bemis-Murcko scaffold-based grouping
+ - "butina": Morgan fingerprint clustering (recommended for ADMET)
+ smiles_column: Column containing SMILES. If None, auto-detects 'smiles' (case-insensitive)
+ target_column: Column containing target values (for stratification, optional)
+ test_size: Fraction for validation set when n_splits=1 (default 0.2)
+ random_state: Random seed for reproducibility
+ butina_cutoff: Tanimoto distance cutoff for Butina clustering (default 0.4)
+
+ Returns:
+ List of (train_indices, val_indices) tuples
+
+ Note:
+ If scaffold/butina strategy is requested but no SMILES column is found,
+ automatically falls back to random split with a warning message.
+
+ Example:
+ >>> folds = get_split_indices(df, n_splits=5, strategy="scaffold")
+ >>> for train_idx, val_idx in folds:
+ ... X_train, X_val = df.iloc[train_idx], df.iloc[val_idx]
+ """
+ from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+
+ n_samples = len(df)
+
+ # Random split (original behavior)
+ if strategy == "random":
+ if n_splits == 1:
+ indices = np.arange(n_samples)
+ train_idx, val_idx = train_test_split(indices, test_size=test_size, random_state=random_state)
+ return [(train_idx, val_idx)]
+ else:
+ if target_column and df[target_column].dtype in ["object", "category", "bool"]:
+ kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
+ return list(kfold.split(df, df[target_column]))
+ else:
+ kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
+ return list(kfold.split(df))
+
+ # Scaffold or Butina split requires SMILES - auto-detect if not provided
+ if smiles_column is None:
+ smiles_column = _find_smiles_column(df.columns.tolist())
+
+ # Fall back to random split if no SMILES column available
+ if smiles_column is None or smiles_column not in df.columns:
+ print(f"No 'smiles' column found for strategy='{strategy}', falling back to random split")
+ return get_split_indices(
+ df,
+ n_splits=n_splits,
+ strategy="random",
+ target_column=target_column,
+ test_size=test_size,
+ random_state=random_state,
+ )
+
+ smiles_list = df[smiles_column].tolist()
+
+ # Get group assignments
+ if strategy == "scaffold":
+ groups = get_scaffold_groups(smiles_list)
+ elif strategy == "butina":
+ groups = get_butina_clusters(smiles_list, cutoff=butina_cutoff)
+ else:
+ raise ValueError(f"Unknown strategy: {strategy}. Use 'random', 'scaffold', or 'butina'")
+
+ # Generate splits using GroupKFold or GroupShuffleSplit
+ if n_splits == 1:
+ # Single split: use GroupShuffleSplit
+ splitter = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
+ return list(splitter.split(df, groups=groups))
+ else:
+ # K-fold: use GroupKFold (ensures no group appears in both train and val)
+ # Note: GroupKFold doesn't shuffle, so we shuffle group order first
+ unique_groups = np.unique(groups)
+ rng = np.random.default_rng(random_state)
+ shuffled_group_map = {g: i for i, g in enumerate(rng.permutation(unique_groups))}
+ shuffled_groups = np.array([shuffled_group_map[g] for g in groups])
+
+ gkf = GroupKFold(n_splits=n_splits)
+ return list(gkf.split(df, groups=shuffled_groups))
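As a usage sketch of the new helper (toy SMILES, assuming RDKit is installed and that model_script_utils.py sits alongside the script, as it does for the generated model scripts):

    import pandas as pd
    from model_script_utils import get_split_indices

    df = pd.DataFrame(
        {
            "smiles": ["CCO", "CCCO", "c1ccccc1", "c1ccccc1O", "CCN", "CCCN"],
            "solubility": [0.1, 0.2, 0.5, 0.7, 0.3, 0.4],
        }
    )

    # Group-aware 3-fold split: molecules in the same Butina cluster never straddle train/val
    folds = get_split_indices(df, n_splits=3, strategy="butina", butina_cutoff=0.4)
    for train_idx, val_idx in folds:
        train_df, val_df = df.iloc[train_idx], df.iloc[val_idx]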
@@ -3,6 +3,25 @@
  This module provides a reusable UQ harness that can wrap any point predictor model
  (XGBoost, PyTorch, ChemProp, etc.) to provide calibrated prediction intervals.

+ Features:
+ - Conformalized Quantile Regression (CQR) for distribution-free coverage guarantees
+ - Multiple confidence levels (50%, 68%, 80%, 90%, 95%)
+ - Confidence scoring based on interval width
+
+ Why CQR without additional Z-scaling:
+ MAPIE's conformalization step already guarantees that prediction intervals achieve
+ their target coverage on the calibration set. For example, an 80% CI will contain
+ ~80% of true values. This is the core promise of conformal prediction.
+
+ Z-scaling (post-hoc interval adjustment) would only help if there's a distribution
+ shift between calibration and test data. However:
+ 1. We'd compute Z-scale on the same calibration set MAPIE uses, making it redundant
+ 2. Our cross-fold validation metrics confirm coverage is already well-calibrated
+ 3. Adding Z-scaling would "second-guess" MAPIE's principled conformalization
+
+ Empirically, our models achieve excellent coverage (e.g., 80% CI → 80.1% coverage),
+ validating that MAPIE's approach is sufficient without additional calibration.
+
  Usage:
  # Training
  uq_models, uq_metadata = train_uq_models(X_train, y_train, X_val, y_val)
@@ -240,38 +259,37 @@ def compute_confidence(
  median_interval_width: float,
  lower_q: str = "q_10",
  upper_q: str = "q_90",
- alpha: float = 1.0,
- beta: float = 1.0,
  ) -> pd.DataFrame:
  """Compute confidence scores (0.0 to 1.0) based on prediction interval width.

- Uses exponential decay based on:
- 1. Interval width relative to median (alpha weight)
- 2. Distance from median prediction (beta weight)
+ Confidence is derived from the 80% prediction interval (q_10 to q_90) width:
+ - Narrower intervals → higher confidence (model is more certain)
+ - Wider intervals → lower confidence (model is less certain)
+
+ Why 80% CI (q_10/q_90)?
+ - 68% CI is too narrow and sensitive to noise
+ - 95% CI is too wide and less discriminating between samples
+ - 80% provides a good balance for ranking prediction reliability
+
+ Formula: confidence = exp(-width / median_width)
+ - When width equals median, confidence ≈ 0.37
+ - When width is half median, confidence ≈ 0.61
+ - When width is double median, confidence ≈ 0.14
+
+ This exponential decay is a common choice for converting uncertainty to
+ confidence scores, providing a smooth mapping that appropriately penalizes
+ high-uncertainty predictions.

  Args:
- df: DataFrame with 'prediction', 'q_50', and quantile columns
+ df: DataFrame with quantile columns from predict_intervals()
  median_interval_width: Pre-computed median interval width from training data
  lower_q: Lower quantile column name (default: 'q_10')
  upper_q: Upper quantile column name (default: 'q_90')
- alpha: Weight for interval width term (default: 1.0)
- beta: Weight for distance from median term (default: 1.0)

  Returns:
- DataFrame with added 'confidence' column
+ DataFrame with added 'confidence' column (values between 0 and 1)
  """
- # Interval width
  interval_width = (df[upper_q] - df[lower_q]).abs()
-
- # Distance from median, normalized by interval width
- distance_from_median = (df["prediction"] - df["q_50"]).abs()
- normalized_distance = distance_from_median / (interval_width + 1e-6)
-
- # Cap the distance penalty at 1.0
- normalized_distance = np.minimum(normalized_distance, 1.0)
-
- # Confidence using exponential decay
- interval_term = interval_width / median_interval_width
- df["confidence"] = np.exp(-(alpha * interval_term + beta * normalized_distance))
+ df["confidence"] = np.exp(-interval_width / median_interval_width)

  return df
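A quick numeric check of the simplified formula, using toy q_10/q_90 values and an assumed median training-interval width of 2.0, reproduces the reference points quoted in the docstring:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"q_10": [4.0, 3.0, 4.5], "q_90": [6.0, 7.0, 5.5]})  # toy intervals
    median_interval_width = 2.0  # assumed value computed from training data

    interval_width = (df["q_90"] - df["q_10"]).abs()  # 2.0, 4.0, 1.0
    df["confidence"] = np.exp(-interval_width / median_interval_width)
    print(df["confidence"].round(2).tolist())  # [0.37, 0.14, 0.61]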
@@ -44,6 +44,12 @@ DEFAULT_HYPERPARAMETERS = {
  "ffn_num_layers": 2,
  # Loss function for regression (mae, mse)
  "criterion": "mae",
+ # Split strategy: "random", "scaffold", or "butina"
+ # - random: Standard random split (default)
+ # - scaffold: Bemis-Murcko scaffold-based grouping
+ # - butina: Morgan fingerprint clustering (recommended for ADMET)
+ "split_strategy": "random",
+ "butina_cutoff": 0.4, # Tanimoto distance cutoff for Butina clustering
  # Random seed
  "seed": 42,
  # Foundation model support
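The generated scripts read these settings with hyperparameters.get(...), as shown in the next hunk; a hypothetical override that switches a model to Butina-based splits might look like:

    # Hypothetical user overrides layered on top of DEFAULT_HYPERPARAMETERS
    hyperparameters = {
        "split_strategy": "butina",  # "random", "scaffold", or "butina"
        "butina_cutoff": 0.4,        # Tanimoto distance cutoff
    }

    split_strategy = hyperparameters.get("split_strategy", "random")
    butina_cutoff = hyperparameters.get("butina_cutoff", 0.4)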
@@ -305,6 +311,7 @@ if __name__ == "__main__":
  check_dataframe,
  compute_classification_metrics,
  compute_regression_metrics,
+ get_split_indices,
  print_classification_metrics,
  print_confusion_matrix,
  print_regression_metrics,
@@ -518,22 +525,29 @@ if __name__ == "__main__":
  n_folds = hyperparameters["n_folds"]
  batch_size = hyperparameters["batch_size"]

- if n_folds == 1:
- if "training" in all_df.columns:
- print("Using 'training' column for train/val split")
- train_idx = np.where(all_df["training"])[0]
- val_idx = np.where(~all_df["training"])[0]
- else:
- print("WARNING: No 'training' column, using random 80/20 split")
- train_idx, val_idx = train_test_split(np.arange(len(all_df)), test_size=0.2, random_state=42)
+ # Get split strategy parameters
+ split_strategy = hyperparameters.get("split_strategy", "random")
+ butina_cutoff = hyperparameters.get("butina_cutoff", 0.4)
+
+ # Check for pre-defined training column (overrides split strategy)
+ if n_folds == 1 and "training" in all_df.columns:
+ print("Using 'training' column for train/val split")
+ train_idx = np.where(all_df["training"])[0]
+ val_idx = np.where(~all_df["training"])[0]
  folds = [(train_idx, val_idx)]
  else:
- if model_type == "classifier":
- kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
- folds = list(kfold.split(all_df, all_df[target_columns[0]]))
- else:
- kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
- folds = list(kfold.split(all_df))
+ # Use unified split interface (auto-detects 'smiles' column for scaffold/butina)
+ target_col = target_columns[0] if model_type == "classifier" else None
+ folds = get_split_indices(
+ all_df,
+ n_splits=n_folds,
+ strategy=split_strategy,
+ target_column=target_col,
+ test_size=0.2,
+ random_state=42,
+ butina_cutoff=butina_cutoff,
+ )
+ print(f"Split strategy: {split_strategy}")

  print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold ensemble'}...")

@@ -738,6 +752,7 @@ if __name__ == "__main__":
  output_columns += [f"{t}_pred" for t in target_columns] + [f"{t}_pred_std" for t in target_columns]
  output_columns += ["prediction", "prediction_std", "confidence"]
  output_columns += [c for c in df_val.columns if c.endswith("_proba")]
+
  output_columns = [c for c in output_columns if c in df_val.columns]

  wr.s3.to_csv(df_val[output_columns], f"{model_metrics_s3_path}/validation_predictions.csv", index=False)