workbench 0.8.219-py3-none-any.whl → 0.8.231-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/__init__.py +1 -0
- workbench/algorithms/dataframe/__init__.py +2 -0
- workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
- workbench/algorithms/dataframe/fingerprint_proximity.py +190 -31
- workbench/algorithms/dataframe/projection_2d.py +8 -2
- workbench/algorithms/dataframe/proximity.py +3 -0
- workbench/algorithms/dataframe/smart_aggregator.py +161 -0
- workbench/algorithms/sql/column_stats.py +0 -1
- workbench/algorithms/sql/correlations.py +0 -1
- workbench/algorithms/sql/descriptive_stats.py +0 -1
- workbench/api/feature_set.py +0 -1
- workbench/api/meta.py +0 -1
- workbench/cached/cached_meta.py +0 -1
- workbench/cached/cached_model.py +37 -7
- workbench/core/artifacts/endpoint_core.py +12 -2
- workbench/core/artifacts/feature_set_core.py +238 -225
- workbench/core/cloud_platform/cloud_meta.py +0 -1
- workbench/core/transforms/features_to_model/features_to_model.py +2 -8
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +2 -0
- workbench/model_script_utils/model_script_utils.py +30 -0
- workbench/model_script_utils/uq_harness.py +0 -1
- workbench/model_scripts/chemprop/chemprop.template +196 -68
- workbench/model_scripts/chemprop/generated_model_script.py +197 -72
- workbench/model_scripts/chemprop/model_script_utils.py +30 -0
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +0 -1
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +0 -1
- workbench/model_scripts/pytorch_model/generated_model_script.py +52 -34
- workbench/model_scripts/pytorch_model/model_script_utils.py +30 -0
- workbench/model_scripts/pytorch_model/pytorch.template +47 -29
- workbench/model_scripts/pytorch_model/uq_harness.py +0 -1
- workbench/model_scripts/script_generation.py +0 -1
- workbench/model_scripts/xgb_model/generated_model_script.py +3 -3
- workbench/model_scripts/xgb_model/model_script_utils.py +30 -0
- workbench/model_scripts/xgb_model/uq_harness.py +0 -1
- workbench/scripts/ml_pipeline_sqs.py +71 -2
- workbench/themes/dark/custom.css +85 -8
- workbench/themes/dark/plotly.json +6 -6
- workbench/themes/light/custom.css +172 -64
- workbench/themes/light/plotly.json +9 -9
- workbench/themes/midnight_blue/custom.css +82 -29
- workbench/themes/midnight_blue/plotly.json +1 -1
- workbench/utils/aws_utils.py +0 -1
- workbench/utils/chem_utils/mol_descriptors.py +0 -1
- workbench/utils/chem_utils/projections.py +16 -6
- workbench/utils/chem_utils/vis.py +137 -27
- workbench/utils/clientside_callbacks.py +41 -0
- workbench/utils/markdown_utils.py +57 -0
- workbench/utils/model_utils.py +0 -1
- workbench/utils/pipeline_utils.py +0 -1
- workbench/utils/plot_utils.py +52 -36
- workbench/utils/theme_manager.py +95 -30
- workbench/web_interface/components/experiments/outlier_plot.py +0 -1
- workbench/web_interface/components/model_plot.py +2 -0
- workbench/web_interface/components/plugin_unit_test.py +0 -1
- workbench/web_interface/components/plugins/ag_table.py +2 -4
- workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
- workbench/web_interface/components/plugins/model_details.py +10 -6
- workbench/web_interface/components/plugins/scatter_plot.py +184 -85
- workbench/web_interface/components/settings_menu.py +185 -0
- workbench/web_interface/page_views/main_page.py +0 -1
- {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/METADATA +34 -41
- {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/RECORD +67 -69
- {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/WHEEL +1 -1
- workbench/themes/quartz/base_css.url +0 -1
- workbench/themes/quartz/custom.css +0 -117
- workbench/themes/quartz/plotly.json +0 -642
- workbench/themes/quartz_dark/base_css.url +0 -1
- workbench/themes/quartz_dark/custom.css +0 -131
- workbench/themes/quartz_dark/plotly.json +0 -642
- {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/top_level.txt +0 -0
workbench/algorithms/dataframe/smart_aggregator.py ADDED

```diff
@@ -0,0 +1,161 @@
+"""SmartSample: Intelligently reduce DataFrame rows by aggregating similar rows together."""
+
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from sklearn.cluster import MiniBatchKMeans
+import logging
+
+# Set up logging
+log = logging.getLogger("workbench")
+
+
+def smart_aggregator(df: pd.DataFrame, target_rows: int = 1000, outlier_column: str = "residual") -> pd.DataFrame:
+    """
+    Reduce DataFrame rows by aggregating similar rows based on numeric column similarity.
+
+    This is a performant (2-pass) algorithm:
+    1. Pass 1: Normalize numeric columns and cluster similar rows using MiniBatchKMeans
+    2. Pass 2: Aggregate each cluster (mean for numeric, first for non-numeric)
+
+    Args:
+        df: Input DataFrame.
+        target_rows: Target number of rows in output (default: 1000).
+        outlier_column: Column where high values should resist aggregation (default: "residual").
+            Rows with high values in this column will be kept separate while rows
+            with low values cluster together. Set to None to disable.
+
+    Returns:
+        Reduced DataFrame with 'aggregation_count' column showing how many rows were combined.
+    """
+    if df is None or df.empty:
+        return df
+
+    n_rows = len(df)
+
+    # Preserve original column order
+    original_columns = df.columns.tolist()
+
+    # If already at or below target, just add the count column and return
+    if n_rows <= target_rows:
+        result = df.copy()
+        result["aggregation_count"] = 1
+        return result
+
+    log.info(f"smart_aggregator: Reducing {n_rows} rows to ~{target_rows} rows")
+
+    # Identify columns by type
+    df = df.copy()
+    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
+    non_numeric_cols = [c for c in df.columns if c not in numeric_cols]
+
+    if not numeric_cols:
+        log.warning("smart_aggregator: No numeric columns for clustering, falling back to random sample")
+        result = df.sample(n=target_rows)
+        result["aggregation_count"] = 1
+        return result.reset_index(drop=True)
+
+    # Handle NaN values - fill with column median
+    df_for_clustering = df[numeric_cols].fillna(df[numeric_cols].median())
+
+    # Normalize and cluster
+    X = StandardScaler().fit_transform(df_for_clustering)
+    df["_cluster"] = MiniBatchKMeans(
+        n_clusters=min(target_rows, n_rows), random_state=42, batch_size=min(1024, n_rows), n_init=3
+    ).fit_predict(X)
+
+    # Post-process: give high-outlier rows their own unique clusters so they don't get aggregated
+    if outlier_column and outlier_column in df.columns:
+        # Top 10% of outlier values get their own clusters, capped at 200
+        n_to_isolate = min(int(n_rows * 0.1), 200)
+        threshold = df[outlier_column].nlargest(n_to_isolate).min()
+        high_outlier_mask = df[outlier_column] >= threshold
+        n_high_outliers = high_outlier_mask.sum()
+        # Assign unique cluster IDs starting after the max existing cluster
+        max_cluster = df["_cluster"].max()
+        df.loc[high_outlier_mask, "_cluster"] = range(max_cluster + 1, max_cluster + 1 + n_high_outliers)
+        log.info(f"smart_aggregator: Isolated {n_high_outliers} high-outlier rows (>= {threshold:.3f})")
+    elif outlier_column:
+        log.warning(f"smart_aggregator: outlier_column '{outlier_column}' not found in columns")
+
+    # Aggregate each cluster (mean for numeric, first for non-numeric)
+    agg_dict = {col: "mean" for col in numeric_cols} | {col: "first" for col in non_numeric_cols}
+    grouped = df.groupby("_cluster")
+    result = grouped.agg(agg_dict).reset_index(drop=True)
+    result["aggregation_count"] = grouped.size().values
+
+    # Restore original column order, with aggregation_count at the end
+    result = result[original_columns + ["aggregation_count"]]
+
+    log.info(f"smart_aggregator: Reduced to {len(result)} rows")
+    return result
+
+
+# Testing
+if __name__ == "__main__":
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 1000)
+
+    # Create test data with clusters
+    np.random.seed(42)
+    n_samples = 10000
+
+    # Create 3 distinct clusters
+    cluster_1 = np.random.randn(n_samples // 3, 3) + np.array([0, 0, 0])
+    cluster_2 = np.random.randn(n_samples // 3, 3) + np.array([5, 5, 5])
+    cluster_3 = np.random.randn(n_samples // 3, 3) + np.array([10, 0, 5])
+
+    features = np.vstack([cluster_1, cluster_2, cluster_3])
+
+    # Create target and prediction columns, then compute residuals
+    target = features[:, 0] + features[:, 1] * 0.5 + np.random.randn(len(features)) * 0.1
+    prediction = target + np.random.randn(len(features)) * 0.5  # Add noise for residuals
+    residuals = np.abs(target - prediction)
+
+    data = {
+        "id": [f"id_{i}" for i in range(len(features))],
+        "A": features[:, 0],
+        "B": features[:, 1],
+        "C": features[:, 2],
+        "category": np.random.choice(["cat1", "cat2", "cat3"], len(features)),
+        "target": target,
+        "prediction": prediction,
+        "residual": residuals,
+    }
+    df = pd.DataFrame(data)
+
+    print(f"Original DataFrame: {len(df)} rows")
+    print(df.head())
+    print()
+
+    # Test smart_aggregator with residuals preservation
+    result = smart_aggregator(df, target_rows=500)
+    print(f"smart_aggregator result: {len(result)} rows")
+    print(result.head(20))
+    print()
+    print("Aggregation count stats:")
+    print(result["aggregation_count"].describe())
+    print()
+    # Show that high-residual points have lower aggregation counts
+    print("Aggregation count by residual quartile:")
+    result["residual_quartile"] = pd.qcut(result["residual"], 4, labels=["Q1 (low)", "Q2", "Q3", "Q4 (high)"])
+    print(result.groupby("residual_quartile")["aggregation_count"].mean())
+
+    # Test with real Workbench data
+    print("\n" + "=" * 80)
+    print("Testing with Workbench data...")
+    print("=" * 80)
+
+    from workbench.api import Model
+
+    model = Model("abalone-regression")
+    df = model.get_inference_predictions()
+    if df is not None:
+        print(f"\nOriginal DataFrame: {len(df)} rows")
+        print(df.head())
+
+        result = smart_aggregator(df, target_rows=500)
+        print(f"\nsmart_aggregator result: {len(result)} rows")
+        print(result.head())
+        print("\nAggregation count stats:")
+        print(result["aggregation_count"].describe())
```
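In short, `smart_aggregator` clusters the standardized numeric columns with MiniBatchKMeans, collapses each cluster to one aggregated row, and gives the top ~10% of `outlier_column` values (capped at 200 rows) singleton clusters so they survive the reduction intact. A minimal, self-contained sketch of calling it; the synthetic data and column names here are invented for illustration and are not part of the package:

```python
import numpy as np
import pandas as pd

# Import path confirmed by the cached_model.py diff below
from workbench.algorithms.dataframe import smart_aggregator

rng = np.random.default_rng(0)
n = 5000

# Two numeric features plus a mostly-small residual column with a long tail
df = pd.DataFrame(
    {
        "x": rng.normal(size=n),
        "y": rng.normal(size=n),
        "residual": rng.exponential(scale=0.1, size=n),
    }
)

reduced = smart_aggregator(df, target_rows=200, outlier_column="residual")

# Expect roughly 200 cluster rows plus up to min(10% of n, 200) isolated outlier rows
print(len(reduced))
print(reduced["aggregation_count"].describe())  # isolated high-residual rows have count == 1
```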
workbench/api/feature_set.py CHANGED
workbench/api/meta.py CHANGED
workbench/cached/cached_meta.py CHANGED
workbench/cached/cached_model.py CHANGED
```diff
@@ -4,8 +4,9 @@ from typing import Union
 import pandas as pd
 
 # Workbench Imports
-from workbench.core.artifacts.model_core import ModelCore
+from workbench.core.artifacts.model_core import ModelCore, ModelType
 from workbench.core.artifacts.cached_artifact_mixin import CachedArtifactMixin
+from workbench.algorithms.dataframe import smart_aggregator
 
 
 class CachedModel(CachedArtifactMixin, ModelCore):
```
```diff
@@ -84,20 +85,49 @@ class CachedModel(CachedArtifactMixin, ModelCore):
         return super().get_inference_metrics(capture_name=capture_name)
 
     @CachedArtifactMixin.cache_result
-    def get_inference_predictions(
+    def get_inference_predictions(
+        self, capture_name: str = "full_cross_fold", target_rows: int = 1000
+    ) -> Union[pd.DataFrame, None]:
         """Retrieve the captured prediction results for this model
 
         Args:
-            capture_name (str, optional): Specific capture_name (default:
+            capture_name (str, optional): Specific capture_name (default: full_cross_fold)
+            target_rows (int, optional): Target number of rows to return (default: 1000)
 
         Returns:
             pd.DataFrame: DataFrame of the Captured Predictions (might be None)
         """
-        # Note: This method can generate larger dataframes, so we'll sample if needed
         df = super().get_inference_predictions(capture_name=capture_name)
-        if df is
-
-
+        if df is None:
+            return None
+
+        # Compute residual based on model type
+        is_regressor = self.model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]
+        is_classifier = self.model_type == ModelType.CLASSIFIER
+
+        if is_regressor:
+            target = self.target()
+            if target and "prediction" in df.columns and target in df.columns:
+                df["residual"] = abs(df["prediction"] - df[target])
+
+        elif is_classifier:
+            target = self.target()
+            class_labels = self.class_labels()
+            if target and "prediction" in df.columns and target in df.columns and class_labels:
+                # Create a mapping from label to ordinal index
+                label_to_idx = {label: idx for idx, label in enumerate(class_labels)}
+                # Compute residual as distance between predicted and actual class
+                df["residual"] = abs(
+                    df["prediction"].map(label_to_idx).fillna(-1) - df[target].map(label_to_idx).fillna(-1)
+                )
+
+        # Use smart_aggregator to aggregate similar rows if we have too many
+        if len(df) > target_rows:
+            self.log.info(
+                f"{self.name}:{capture_name} Using smart_aggregator to reduce {len(df)} rows to ~{target_rows}"
+            )
+            df = smart_aggregator(df, target_rows=target_rows)
+
         return df
 
     @CachedArtifactMixin.cache_result
```
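The classifier branch above converts class labels to ordinal indices and uses the index distance as a pseudo-residual (labels missing from `class_labels` fall back to -1 via `fillna`). A tiny illustrative example of that computation; the labels and the `activity` target column are hypothetical, not from the package:

```python
import pandas as pd

class_labels = ["low", "medium", "high"]  # hypothetical ordered labels
label_to_idx = {label: idx for idx, label in enumerate(class_labels)}

df = pd.DataFrame({"activity": ["low", "high", "medium"], "prediction": ["low", "low", "high"]})

# Residual = |predicted class index - actual class index|
df["residual"] = abs(
    df["prediction"].map(label_to_idx).fillna(-1) - df["activity"].map(label_to_idx).fillna(-1)
)
print(df["residual"].tolist())  # [0.0, 2.0, 1.0]
```

Note this metric treats the class list as ordered, so it is most meaningful for ordinal categories; for unordered classes it still distinguishes correct (0) from incorrect (nonzero) predictions.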
workbench/core/artifacts/endpoint_core.py CHANGED

```diff
@@ -546,7 +546,14 @@ class EndpointCore(Artifact):
         target_list = targets if isinstance(targets, list) else [targets]
         primary_target = target_list[0]
 
-        #
+        # If we don't have a smiles column, try to merge it from the FeatureSet
+        if "smiles" not in out_of_fold_df.columns:
+            fs_df = fs.query(f'SELECT {fs.id_column}, "smiles" FROM "{fs.athena_table}"')
+            if "smiles" in fs_df.columns:
+                self.log.info("Merging 'smiles' column from FeatureSet into out-of-fold predictions.")
+                out_of_fold_df = out_of_fold_df.merge(fs_df, on=fs.id_column, how="left")
+
+        # Collect UQ columns (q_*, confidence) for additional tracking (used for hashing)
         additional_columns = [col for col in out_of_fold_df.columns if col.startswith("q_") or col == "confidence"]
         if additional_columns:
             self.log.info(f"UQ columns from training: {', '.join(additional_columns)}")
```
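The merge above is keyed on the FeatureSet's id column with `how="left"`, so every out-of-fold prediction row is kept and compounds missing from the FeatureSet simply get a NaN `smiles`. A toy equivalent, with invented DataFrames and an invented `compound_id` key, just to show the join semantics:

```python
import pandas as pd

# Stand-ins for out_of_fold_df and the FeatureSet query result (names are illustrative)
out_of_fold_df = pd.DataFrame({"compound_id": [1, 2, 3], "prediction": [0.5, 1.2, 0.9]})
fs_df = pd.DataFrame({"compound_id": [1, 2], "smiles": ["CCO", "c1ccccc1"]})

# Left merge: all prediction rows survive; compound 3 gets NaN smiles
out_of_fold_df = out_of_fold_df.merge(fs_df, on="compound_id", how="left")
print(out_of_fold_df)
```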
```diff
@@ -559,7 +566,6 @@ class EndpointCore(Artifact):
         # For single-target models (99% of cases), just save as "full_cross_fold"
         # For multi-target models, save each as cv_{target} plus primary as "full_cross_fold"
         is_multi_target = len(target_list) > 1
-
         for target in target_list:
             # Drop rows with NaN target values for metrics/plots
             target_df = out_of_fold_df.dropna(subset=[target])
```
```diff
@@ -899,6 +905,10 @@ class EndpointCore(Artifact):
         # Add UQ columns (q_*, confidence) and proba columns
         output_columns += [c for c in cols if c.startswith("q_") or c == "confidence" or c.endswith("_proba")]
 
+        # Add smiles column if present
+        if "smiles" in cols:
+            output_columns.append("smiles")
+
         # Write the predictions to S3
         output_file = f"{inference_capture_path}/inference_predictions.csv"
         self.log.info(f"Writing predictions to {output_file}")
```