workbench 0.8.219-py3-none-any.whl → 0.8.231-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/__init__.py +1 -0
- workbench/algorithms/dataframe/__init__.py +2 -0
- workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
- workbench/algorithms/dataframe/fingerprint_proximity.py +190 -31
- workbench/algorithms/dataframe/projection_2d.py +8 -2
- workbench/algorithms/dataframe/proximity.py +3 -0
- workbench/algorithms/dataframe/smart_aggregator.py +161 -0
- workbench/algorithms/sql/column_stats.py +0 -1
- workbench/algorithms/sql/correlations.py +0 -1
- workbench/algorithms/sql/descriptive_stats.py +0 -1
- workbench/api/feature_set.py +0 -1
- workbench/api/meta.py +0 -1
- workbench/cached/cached_meta.py +0 -1
- workbench/cached/cached_model.py +37 -7
- workbench/core/artifacts/endpoint_core.py +12 -2
- workbench/core/artifacts/feature_set_core.py +238 -225
- workbench/core/cloud_platform/cloud_meta.py +0 -1
- workbench/core/transforms/features_to_model/features_to_model.py +2 -8
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +2 -0
- workbench/model_script_utils/model_script_utils.py +30 -0
- workbench/model_script_utils/uq_harness.py +0 -1
- workbench/model_scripts/chemprop/chemprop.template +196 -68
- workbench/model_scripts/chemprop/generated_model_script.py +197 -72
- workbench/model_scripts/chemprop/model_script_utils.py +30 -0
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +0 -1
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +0 -1
- workbench/model_scripts/pytorch_model/generated_model_script.py +52 -34
- workbench/model_scripts/pytorch_model/model_script_utils.py +30 -0
- workbench/model_scripts/pytorch_model/pytorch.template +47 -29
- workbench/model_scripts/pytorch_model/uq_harness.py +0 -1
- workbench/model_scripts/script_generation.py +0 -1
- workbench/model_scripts/xgb_model/generated_model_script.py +3 -3
- workbench/model_scripts/xgb_model/model_script_utils.py +30 -0
- workbench/model_scripts/xgb_model/uq_harness.py +0 -1
- workbench/scripts/ml_pipeline_sqs.py +71 -2
- workbench/themes/dark/custom.css +85 -8
- workbench/themes/dark/plotly.json +6 -6
- workbench/themes/light/custom.css +172 -64
- workbench/themes/light/plotly.json +9 -9
- workbench/themes/midnight_blue/custom.css +82 -29
- workbench/themes/midnight_blue/plotly.json +1 -1
- workbench/utils/aws_utils.py +0 -1
- workbench/utils/chem_utils/mol_descriptors.py +0 -1
- workbench/utils/chem_utils/projections.py +16 -6
- workbench/utils/chem_utils/vis.py +137 -27
- workbench/utils/clientside_callbacks.py +41 -0
- workbench/utils/markdown_utils.py +57 -0
- workbench/utils/model_utils.py +0 -1
- workbench/utils/pipeline_utils.py +0 -1
- workbench/utils/plot_utils.py +52 -36
- workbench/utils/theme_manager.py +95 -30
- workbench/web_interface/components/experiments/outlier_plot.py +0 -1
- workbench/web_interface/components/model_plot.py +2 -0
- workbench/web_interface/components/plugin_unit_test.py +0 -1
- workbench/web_interface/components/plugins/ag_table.py +2 -4
- workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
- workbench/web_interface/components/plugins/model_details.py +10 -6
- workbench/web_interface/components/plugins/scatter_plot.py +184 -85
- workbench/web_interface/components/settings_menu.py +185 -0
- workbench/web_interface/page_views/main_page.py +0 -1
- {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/METADATA +34 -41
- {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/RECORD +67 -69
- {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/WHEEL +1 -1
- workbench/themes/quartz/base_css.url +0 -1
- workbench/themes/quartz/custom.css +0 -117
- workbench/themes/quartz/plotly.json +0 -642
- workbench/themes/quartz_dark/base_css.url +0 -1
- workbench/themes/quartz_dark/custom.css +0 -131
- workbench/themes/quartz_dark/plotly.json +0 -642
- {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/top_level.txt +0 -0
workbench/algorithms/dataframe/smart_aggregator.py ADDED

```diff
@@ -0,0 +1,161 @@
+"""SmartSample: Intelligently reduce DataFrame rows by aggregating similar rows together."""
+
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from sklearn.cluster import MiniBatchKMeans
+import logging
+
+# Set up logging
+log = logging.getLogger("workbench")
+
+
+def smart_aggregator(df: pd.DataFrame, target_rows: int = 1000, outlier_column: str = "residual") -> pd.DataFrame:
+    """
+    Reduce DataFrame rows by aggregating similar rows based on numeric column similarity.
+
+    This is a performant (2-pass) algorithm:
+    1. Pass 1: Normalize numeric columns and cluster similar rows using MiniBatchKMeans
+    2. Pass 2: Aggregate each cluster (mean for numeric, first for non-numeric)
+
+    Args:
+        df: Input DataFrame.
+        target_rows: Target number of rows in output (default: 1000).
+        outlier_column: Column where high values should resist aggregation (default: "residual").
+            Rows with high values in this column will be kept separate while rows
+            with low values cluster together. Set to None to disable.
+
+    Returns:
+        Reduced DataFrame with 'aggregation_count' column showing how many rows were combined.
+    """
+    if df is None or df.empty:
+        return df
+
+    n_rows = len(df)
+
+    # Preserve original column order
+    original_columns = df.columns.tolist()
+
+    # If already at or below target, just add the count column and return
+    if n_rows <= target_rows:
+        result = df.copy()
+        result["aggregation_count"] = 1
+        return result
+
+    log.info(f"smart_aggregator: Reducing {n_rows} rows to ~{target_rows} rows")
+
+    # Identify columns by type
+    df = df.copy()
+    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
+    non_numeric_cols = [c for c in df.columns if c not in numeric_cols]
+
+    if not numeric_cols:
+        log.warning("smart_aggregator: No numeric columns for clustering, falling back to random sample")
+        result = df.sample(n=target_rows)
+        result["aggregation_count"] = 1
+        return result.reset_index(drop=True)
+
+    # Handle NaN values - fill with column median
+    df_for_clustering = df[numeric_cols].fillna(df[numeric_cols].median())
+
+    # Normalize and cluster
+    X = StandardScaler().fit_transform(df_for_clustering)
+    df["_cluster"] = MiniBatchKMeans(
+        n_clusters=min(target_rows, n_rows), random_state=42, batch_size=min(1024, n_rows), n_init=3
+    ).fit_predict(X)
+
+    # Post-process: give high-outlier rows their own unique clusters so they don't get aggregated
+    if outlier_column and outlier_column in df.columns:
+        # Top 10% of outlier values get their own clusters, capped at 200
+        n_to_isolate = min(int(n_rows * 0.1), 200)
+        threshold = df[outlier_column].nlargest(n_to_isolate).min()
+        high_outlier_mask = df[outlier_column] >= threshold
+        n_high_outliers = high_outlier_mask.sum()
+        # Assign unique cluster IDs starting after the max existing cluster
+        max_cluster = df["_cluster"].max()
+        df.loc[high_outlier_mask, "_cluster"] = range(max_cluster + 1, max_cluster + 1 + n_high_outliers)
+        log.info(f"smart_aggregator: Isolated {n_high_outliers} high-outlier rows (>= {threshold:.3f})")
+    elif outlier_column:
+        log.warning(f"smart_aggregator: outlier_column '{outlier_column}' not found in columns")
+
+    # Aggregate each cluster (mean for numeric, first for non-numeric)
+    agg_dict = {col: "mean" for col in numeric_cols} | {col: "first" for col in non_numeric_cols}
+    grouped = df.groupby("_cluster")
+    result = grouped.agg(agg_dict).reset_index(drop=True)
+    result["aggregation_count"] = grouped.size().values
+
+    # Restore original column order, with aggregation_count at the end
+    result = result[original_columns + ["aggregation_count"]]
+
+    log.info(f"smart_aggregator: Reduced to {len(result)} rows")
+    return result
+
+
+# Testing
+if __name__ == "__main__":
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 1000)
+
+    # Create test data with clusters
+    np.random.seed(42)
+    n_samples = 10000
+
+    # Create 3 distinct clusters
+    cluster_1 = np.random.randn(n_samples // 3, 3) + np.array([0, 0, 0])
+    cluster_2 = np.random.randn(n_samples // 3, 3) + np.array([5, 5, 5])
+    cluster_3 = np.random.randn(n_samples // 3, 3) + np.array([10, 0, 5])
+
+    features = np.vstack([cluster_1, cluster_2, cluster_3])
+
+    # Create target and prediction columns, then compute residuals
+    target = features[:, 0] + features[:, 1] * 0.5 + np.random.randn(len(features)) * 0.1
+    prediction = target + np.random.randn(len(features)) * 0.5  # Add noise for residuals
+    residuals = np.abs(target - prediction)
+
+    data = {
+        "id": [f"id_{i}" for i in range(len(features))],
+        "A": features[:, 0],
+        "B": features[:, 1],
+        "C": features[:, 2],
+        "category": np.random.choice(["cat1", "cat2", "cat3"], len(features)),
+        "target": target,
+        "prediction": prediction,
+        "residual": residuals,
+    }
+    df = pd.DataFrame(data)
+
+    print(f"Original DataFrame: {len(df)} rows")
+    print(df.head())
+    print()
+
+    # Test smart_aggregator with residuals preservation
+    result = smart_aggregator(df, target_rows=500)
+    print(f"smart_aggregator result: {len(result)} rows")
+    print(result.head(20))
+    print()
+    print("Aggregation count stats:")
+    print(result["aggregation_count"].describe())
+    print()
+    # Show that high-residual points have lower aggregation counts
+    print("Aggregation count by residual quartile:")
+    result["residual_quartile"] = pd.qcut(result["residual"], 4, labels=["Q1 (low)", "Q2", "Q3", "Q4 (high)"])
+    print(result.groupby("residual_quartile")["aggregation_count"].mean())
+
+    # Test with real Workbench data
+    print("\n" + "=" * 80)
+    print("Testing with Workbench data...")
+    print("=" * 80)
+
+    from workbench.api import Model
+
+    model = Model("abalone-regression")
+    df = model.get_inference_predictions()
+    if df is not None:
+        print(f"\nOriginal DataFrame: {len(df)} rows")
+        print(df.head())
+
+        result = smart_aggregator(df, target_rows=500)
+        print(f"\nsmart_aggregator result: {len(result)} rows")
+        print(result.head())
+        print("\nAggregation count stats:")
+        print(result["aggregation_count"].describe())
```
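In short, `smart_aggregator` clusters the standardized numeric columns with MiniBatchKMeans, collapses each cluster to one aggregated row, and gives the top ~10% of `outlier_column` values (capped at 200 rows) singleton clusters so they survive the reduction intact. A minimal, self-contained sketch of calling it; the synthetic data and column names here are invented for illustration and are not part of the package:

```python
import numpy as np
import pandas as pd

# Import path confirmed by the cached_model.py diff below
from workbench.algorithms.dataframe import smart_aggregator

rng = np.random.default_rng(0)
n = 5000

# Two numeric features plus a mostly-small residual column with a long tail
df = pd.DataFrame(
    {
        "x": rng.normal(size=n),
        "y": rng.normal(size=n),
        "residual": rng.exponential(scale=0.1, size=n),
    }
)

reduced = smart_aggregator(df, target_rows=200, outlier_column="residual")

# Expect roughly 200 cluster rows plus up to min(10% of n, 200) isolated outlier rows
print(len(reduced))
print(reduced["aggregation_count"].describe())  # isolated high-residual rows have count == 1
```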
workbench/api/feature_set.py CHANGED
workbench/api/meta.py CHANGED
workbench/cached/cached_meta.py CHANGED
workbench/cached/cached_model.py CHANGED
```diff
@@ -4,8 +4,9 @@ from typing import Union
 import pandas as pd
 
 # Workbench Imports
-from workbench.core.artifacts.model_core import ModelCore
+from workbench.core.artifacts.model_core import ModelCore, ModelType
 from workbench.core.artifacts.cached_artifact_mixin import CachedArtifactMixin
+from workbench.algorithms.dataframe import smart_aggregator
 
 
 class CachedModel(CachedArtifactMixin, ModelCore):
```
```diff
@@ -84,20 +85,49 @@ class CachedModel(CachedArtifactMixin, ModelCore):
         return super().get_inference_metrics(capture_name=capture_name)
 
     @CachedArtifactMixin.cache_result
-    def get_inference_predictions(
+    def get_inference_predictions(
+        self, capture_name: str = "full_cross_fold", target_rows: int = 1000
+    ) -> Union[pd.DataFrame, None]:
         """Retrieve the captured prediction results for this model
 
         Args:
-            capture_name (str, optional): Specific capture_name (default:
+            capture_name (str, optional): Specific capture_name (default: full_cross_fold)
+            target_rows (int, optional): Target number of rows to return (default: 1000)
 
         Returns:
             pd.DataFrame: DataFrame of the Captured Predictions (might be None)
         """
-        # Note: This method can generate larger dataframes, so we'll sample if needed
         df = super().get_inference_predictions(capture_name=capture_name)
-        if df is
-
-
+        if df is None:
+            return None
+
+        # Compute residual based on model type
+        is_regressor = self.model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]
+        is_classifier = self.model_type == ModelType.CLASSIFIER
+
+        if is_regressor:
+            target = self.target()
+            if target and "prediction" in df.columns and target in df.columns:
+                df["residual"] = abs(df["prediction"] - df[target])
+
+        elif is_classifier:
+            target = self.target()
+            class_labels = self.class_labels()
+            if target and "prediction" in df.columns and target in df.columns and class_labels:
+                # Create a mapping from label to ordinal index
+                label_to_idx = {label: idx for idx, label in enumerate(class_labels)}
+                # Compute residual as distance between predicted and actual class
+                df["residual"] = abs(
+                    df["prediction"].map(label_to_idx).fillna(-1) - df[target].map(label_to_idx).fillna(-1)
+                )
+
+        # Use smart_aggregator to aggregate similar rows if we have too many
+        if len(df) > target_rows:
+            self.log.info(
+                f"{self.name}:{capture_name} Using smart_aggregator to reduce {len(df)} rows to ~{target_rows}"
+            )
+            df = smart_aggregator(df, target_rows=target_rows)
+
         return df
 
     @CachedArtifactMixin.cache_result
```
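The classifier branch above converts class labels to ordinal indices and uses the index distance as a pseudo-residual (labels missing from `class_labels` fall back to -1 via `fillna`). A tiny illustrative example of that computation; the labels and the `activity` target column are hypothetical, not from the package:

```python
import pandas as pd

class_labels = ["low", "medium", "high"]  # hypothetical ordered labels
label_to_idx = {label: idx for idx, label in enumerate(class_labels)}

df = pd.DataFrame({"activity": ["low", "high", "medium"], "prediction": ["low", "low", "high"]})

# Residual = |predicted class index - actual class index|
df["residual"] = abs(
    df["prediction"].map(label_to_idx).fillna(-1) - df["activity"].map(label_to_idx).fillna(-1)
)
print(df["residual"].tolist())  # [0.0, 2.0, 1.0]
```

Note this metric treats the class list as ordered, so it is most meaningful for ordinal categories; for unordered classes it still distinguishes correct (0) from incorrect (nonzero) predictions.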
workbench/core/artifacts/endpoint_core.py CHANGED

```diff
@@ -546,7 +546,14 @@ class EndpointCore(Artifact):
         target_list = targets if isinstance(targets, list) else [targets]
         primary_target = target_list[0]
 
-        #
+        # If we don't have a smiles column, try to merge it from the FeatureSet
+        if "smiles" not in out_of_fold_df.columns:
+            fs_df = fs.query(f'SELECT {fs.id_column}, "smiles" FROM "{fs.athena_table}"')
+            if "smiles" in fs_df.columns:
+                self.log.info("Merging 'smiles' column from FeatureSet into out-of-fold predictions.")
+                out_of_fold_df = out_of_fold_df.merge(fs_df, on=fs.id_column, how="left")
+
+        # Collect UQ columns (q_*, confidence) for additional tracking (used for hashing)
         additional_columns = [col for col in out_of_fold_df.columns if col.startswith("q_") or col == "confidence"]
         if additional_columns:
             self.log.info(f"UQ columns from training: {', '.join(additional_columns)}")
```
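The merge above is keyed on the FeatureSet's id column with `how="left"`, so every out-of-fold prediction row is kept and compounds missing from the FeatureSet simply get a NaN `smiles`. A toy equivalent, with invented DataFrames and an invented `compound_id` key, just to show the join semantics:

```python
import pandas as pd

# Stand-ins for out_of_fold_df and the FeatureSet query result (names are illustrative)
out_of_fold_df = pd.DataFrame({"compound_id": [1, 2, 3], "prediction": [0.5, 1.2, 0.9]})
fs_df = pd.DataFrame({"compound_id": [1, 2], "smiles": ["CCO", "c1ccccc1"]})

# Left merge: all prediction rows survive; compound 3 gets NaN smiles
out_of_fold_df = out_of_fold_df.merge(fs_df, on="compound_id", how="left")
print(out_of_fold_df)
```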
```diff
@@ -559,7 +566,6 @@ class EndpointCore(Artifact):
         # For single-target models (99% of cases), just save as "full_cross_fold"
         # For multi-target models, save each as cv_{target} plus primary as "full_cross_fold"
         is_multi_target = len(target_list) > 1
-
         for target in target_list:
             # Drop rows with NaN target values for metrics/plots
             target_df = out_of_fold_df.dropna(subset=[target])
```
```diff
@@ -899,6 +905,10 @@ class EndpointCore(Artifact):
         # Add UQ columns (q_*, confidence) and proba columns
         output_columns += [c for c in cols if c.startswith("q_") or c == "confidence" or c.endswith("_proba")]
 
+        # Add smiles column if present
+        if "smiles" in cols:
+            output_columns.append("smiles")
+
         # Write the predictions to S3
         output_file = f"{inference_capture_path}/inference_predictions.csv"
         self.log.info(f"Writing predictions to {output_file}")
```