workbench 0.8.219__py3-none-any.whl → 0.8.231__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. workbench/__init__.py +1 -0
  2. workbench/algorithms/dataframe/__init__.py +2 -0
  3. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  4. workbench/algorithms/dataframe/fingerprint_proximity.py +190 -31
  5. workbench/algorithms/dataframe/projection_2d.py +8 -2
  6. workbench/algorithms/dataframe/proximity.py +3 -0
  7. workbench/algorithms/dataframe/smart_aggregator.py +161 -0
  8. workbench/algorithms/sql/column_stats.py +0 -1
  9. workbench/algorithms/sql/correlations.py +0 -1
  10. workbench/algorithms/sql/descriptive_stats.py +0 -1
  11. workbench/api/feature_set.py +0 -1
  12. workbench/api/meta.py +0 -1
  13. workbench/cached/cached_meta.py +0 -1
  14. workbench/cached/cached_model.py +37 -7
  15. workbench/core/artifacts/endpoint_core.py +12 -2
  16. workbench/core/artifacts/feature_set_core.py +238 -225
  17. workbench/core/cloud_platform/cloud_meta.py +0 -1
  18. workbench/core/transforms/features_to_model/features_to_model.py +2 -8
  19. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +2 -0
  20. workbench/model_script_utils/model_script_utils.py +30 -0
  21. workbench/model_script_utils/uq_harness.py +0 -1
  22. workbench/model_scripts/chemprop/chemprop.template +196 -68
  23. workbench/model_scripts/chemprop/generated_model_script.py +197 -72
  24. workbench/model_scripts/chemprop/model_script_utils.py +30 -0
  25. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +0 -1
  26. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
  27. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +0 -1
  28. workbench/model_scripts/pytorch_model/generated_model_script.py +52 -34
  29. workbench/model_scripts/pytorch_model/model_script_utils.py +30 -0
  30. workbench/model_scripts/pytorch_model/pytorch.template +47 -29
  31. workbench/model_scripts/pytorch_model/uq_harness.py +0 -1
  32. workbench/model_scripts/script_generation.py +0 -1
  33. workbench/model_scripts/xgb_model/generated_model_script.py +3 -3
  34. workbench/model_scripts/xgb_model/model_script_utils.py +30 -0
  35. workbench/model_scripts/xgb_model/uq_harness.py +0 -1
  36. workbench/scripts/ml_pipeline_sqs.py +71 -2
  37. workbench/themes/dark/custom.css +85 -8
  38. workbench/themes/dark/plotly.json +6 -6
  39. workbench/themes/light/custom.css +172 -64
  40. workbench/themes/light/plotly.json +9 -9
  41. workbench/themes/midnight_blue/custom.css +82 -29
  42. workbench/themes/midnight_blue/plotly.json +1 -1
  43. workbench/utils/aws_utils.py +0 -1
  44. workbench/utils/chem_utils/mol_descriptors.py +0 -1
  45. workbench/utils/chem_utils/projections.py +16 -6
  46. workbench/utils/chem_utils/vis.py +137 -27
  47. workbench/utils/clientside_callbacks.py +41 -0
  48. workbench/utils/markdown_utils.py +57 -0
  49. workbench/utils/model_utils.py +0 -1
  50. workbench/utils/pipeline_utils.py +0 -1
  51. workbench/utils/plot_utils.py +52 -36
  52. workbench/utils/theme_manager.py +95 -30
  53. workbench/web_interface/components/experiments/outlier_plot.py +0 -1
  54. workbench/web_interface/components/model_plot.py +2 -0
  55. workbench/web_interface/components/plugin_unit_test.py +0 -1
  56. workbench/web_interface/components/plugins/ag_table.py +2 -4
  57. workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
  58. workbench/web_interface/components/plugins/model_details.py +10 -6
  59. workbench/web_interface/components/plugins/scatter_plot.py +184 -85
  60. workbench/web_interface/components/settings_menu.py +185 -0
  61. workbench/web_interface/page_views/main_page.py +0 -1
  62. {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/METADATA +34 -41
  63. {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/RECORD +67 -69
  64. {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/WHEEL +1 -1
  65. workbench/themes/quartz/base_css.url +0 -1
  66. workbench/themes/quartz/custom.css +0 -117
  67. workbench/themes/quartz/plotly.json +0 -642
  68. workbench/themes/quartz_dark/base_css.url +0 -1
  69. workbench/themes/quartz_dark/custom.css +0 -131
  70. workbench/themes/quartz_dark/plotly.json +0 -642
  71. {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/entry_points.txt +0 -0
  72. {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/licenses/LICENSE +0 -0
  73. {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/top_level.txt +0 -0
workbench/algorithms/dataframe/smart_aggregator.py ADDED
@@ -0,0 +1,161 @@
+"""SmartSample: Intelligently reduce DataFrame rows by aggregating similar rows together."""
+
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from sklearn.cluster import MiniBatchKMeans
+import logging
+
+# Set up logging
+log = logging.getLogger("workbench")
+
+
+def smart_aggregator(df: pd.DataFrame, target_rows: int = 1000, outlier_column: str = "residual") -> pd.DataFrame:
+    """
+    Reduce DataFrame rows by aggregating similar rows based on numeric column similarity.
+
+    This is a performant (2-pass) algorithm:
+    1. Pass 1: Normalize numeric columns and cluster similar rows using MiniBatchKMeans
+    2. Pass 2: Aggregate each cluster (mean for numeric, first for non-numeric)
+
+    Args:
+        df: Input DataFrame.
+        target_rows: Target number of rows in output (default: 1000).
+        outlier_column: Column where high values should resist aggregation (default: "residual").
+            Rows with high values in this column will be kept separate while rows
+            with low values cluster together. Set to None to disable.
+
+    Returns:
+        Reduced DataFrame with 'aggregation_count' column showing how many rows were combined.
+    """
+    if df is None or df.empty:
+        return df
+
+    n_rows = len(df)
+
+    # Preserve original column order
+    original_columns = df.columns.tolist()
+
+    # If already at or below target, just add the count column and return
+    if n_rows <= target_rows:
+        result = df.copy()
+        result["aggregation_count"] = 1
+        return result
+
+    log.info(f"smart_aggregator: Reducing {n_rows} rows to ~{target_rows} rows")
+
+    # Identify columns by type
+    df = df.copy()
+    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
+    non_numeric_cols = [c for c in df.columns if c not in numeric_cols]
+
+    if not numeric_cols:
+        log.warning("smart_aggregator: No numeric columns for clustering, falling back to random sample")
+        result = df.sample(n=target_rows)
+        result["aggregation_count"] = 1
+        return result.reset_index(drop=True)
+
+    # Handle NaN values - fill with column median
+    df_for_clustering = df[numeric_cols].fillna(df[numeric_cols].median())
+
+    # Normalize and cluster
+    X = StandardScaler().fit_transform(df_for_clustering)
+    df["_cluster"] = MiniBatchKMeans(
+        n_clusters=min(target_rows, n_rows), random_state=42, batch_size=min(1024, n_rows), n_init=3
+    ).fit_predict(X)
+
+    # Post-process: give high-outlier rows their own unique clusters so they don't get aggregated
+    if outlier_column and outlier_column in df.columns:
+        # Top 10% of outlier values get their own clusters, capped at 200
+        n_to_isolate = min(int(n_rows * 0.1), 200)
+        threshold = df[outlier_column].nlargest(n_to_isolate).min()
+        high_outlier_mask = df[outlier_column] >= threshold
+        n_high_outliers = high_outlier_mask.sum()
+        # Assign unique cluster IDs starting after the max existing cluster
+        max_cluster = df["_cluster"].max()
+        df.loc[high_outlier_mask, "_cluster"] = range(max_cluster + 1, max_cluster + 1 + n_high_outliers)
+        log.info(f"smart_aggregator: Isolated {n_high_outliers} high-outlier rows (>= {threshold:.3f})")
+    elif outlier_column:
+        log.warning(f"smart_aggregator: outlier_column '{outlier_column}' not found in columns")
+
+    # Aggregate each cluster (mean for numeric, first for non-numeric)
+    agg_dict = {col: "mean" for col in numeric_cols} | {col: "first" for col in non_numeric_cols}
+    grouped = df.groupby("_cluster")
+    result = grouped.agg(agg_dict).reset_index(drop=True)
+    result["aggregation_count"] = grouped.size().values
+
+    # Restore original column order, with aggregation_count at the end
+    result = result[original_columns + ["aggregation_count"]]
+
+    log.info(f"smart_aggregator: Reduced to {len(result)} rows")
+    return result
+
+
+# Testing
+if __name__ == "__main__":
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 1000)
+
+    # Create test data with clusters
+    np.random.seed(42)
+    n_samples = 10000
+
+    # Create 3 distinct clusters
+    cluster_1 = np.random.randn(n_samples // 3, 3) + np.array([0, 0, 0])
+    cluster_2 = np.random.randn(n_samples // 3, 3) + np.array([5, 5, 5])
+    cluster_3 = np.random.randn(n_samples // 3, 3) + np.array([10, 0, 5])
+
+    features = np.vstack([cluster_1, cluster_2, cluster_3])
+
+    # Create target and prediction columns, then compute residuals
+    target = features[:, 0] + features[:, 1] * 0.5 + np.random.randn(len(features)) * 0.1
+    prediction = target + np.random.randn(len(features)) * 0.5  # Add noise for residuals
+    residuals = np.abs(target - prediction)
+
+    data = {
+        "id": [f"id_{i}" for i in range(len(features))],
+        "A": features[:, 0],
+        "B": features[:, 1],
+        "C": features[:, 2],
+        "category": np.random.choice(["cat1", "cat2", "cat3"], len(features)),
+        "target": target,
+        "prediction": prediction,
+        "residual": residuals,
+    }
+    df = pd.DataFrame(data)
+
+    print(f"Original DataFrame: {len(df)} rows")
+    print(df.head())
+    print()
+
+    # Test smart_aggregator with residuals preservation
+    result = smart_aggregator(df, target_rows=500)
+    print(f"smart_aggregator result: {len(result)} rows")
+    print(result.head(20))
+    print()
+    print("Aggregation count stats:")
+    print(result["aggregation_count"].describe())
+    print()
+    # Show that high-residual points have lower aggregation counts
+    print("Aggregation count by residual quartile:")
+    result["residual_quartile"] = pd.qcut(result["residual"], 4, labels=["Q1 (low)", "Q2", "Q3", "Q4 (high)"])
+    print(result.groupby("residual_quartile")["aggregation_count"].mean())
+
+    # Test with real Workbench data
+    print("\n" + "=" * 80)
+    print("Testing with Workbench data...")
+    print("=" * 80)
+
+    from workbench.api import Model
+
+    model = Model("abalone-regression")
+    df = model.get_inference_predictions()
+    if df is not None:
+        print(f"\nOriginal DataFrame: {len(df)} rows")
+        print(df.head())
+
+        result = smart_aggregator(df, target_rows=500)
+        print(f"\nsmart_aggregator result: {len(result)} rows")
+        print(result.head())
+        print("\nAggregation count stats:")
+        print(result["aggregation_count"].describe())
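For orientation, a minimal usage sketch of the new smart_aggregator helper (the import path is taken from the cached_model.py change below; the column names and sizes are made up for illustration):

    import numpy as np
    import pandas as pd
    from workbench.algorithms.dataframe import smart_aggregator

    # Hypothetical prediction frame with a residual column
    rng = np.random.default_rng(0)
    df = pd.DataFrame({"prediction": rng.normal(size=5000)})
    df["target"] = df["prediction"] + rng.normal(scale=0.2, size=5000)
    df["residual"] = (df["prediction"] - df["target"]).abs()

    # Collapse ~5000 rows into roughly 500 clusters; the highest-residual rows (capped at 200) resist aggregation
    reduced = smart_aggregator(df, target_rows=500, outlier_column="residual")
    print(len(reduced))                        # at most ~700 rows (500 clusters + up to 200 isolated outliers)
    print(reduced["aggregation_count"].sum())  # aggregation counts still sum to 5000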
@@ -6,7 +6,6 @@ import pandas as pd
 # Workbench Imports
 from workbench.core.artifacts.data_source_abstract import DataSourceAbstract

-
 # Workbench Logger
 log = logging.getLogger("workbench")

@@ -7,7 +7,6 @@ from collections import defaultdict
 # Workbench Imports
 from workbench.core.artifacts.data_source_abstract import DataSourceAbstract

-
 # Workbench Logger
 log = logging.getLogger("workbench")

@@ -7,7 +7,6 @@ from collections import defaultdict
 # Workbench Imports
 from workbench.core.artifacts.data_source_abstract import DataSourceAbstract

-
 # Workbench Logger
 log = logging.getLogger("workbench")

workbench/api/feature_set.py CHANGED
@@ -214,7 +214,6 @@ class FeatureSet(FeatureSetCore):
             include_all_columns=include_all_columns,
             radius=radius,
             n_bits=n_bits,
-            counts=counts,
         )

     def cleanlab_model(
workbench/api/meta.py CHANGED
@@ -6,7 +6,6 @@ such as Data Sources, Feature Sets, Models, and Endpoints.
 from typing import Union
 import pandas as pd

-
 # Workbench Imports
 from workbench.core.cloud_platform.cloud_meta import CloudMeta

workbench/cached/cached_meta.py CHANGED
@@ -6,7 +6,6 @@ import pandas as pd
 from functools import wraps
 from concurrent.futures import ThreadPoolExecutor

-
 # Workbench Imports
 from workbench.core.cloud_platform.cloud_meta import CloudMeta
 from workbench.utils.workbench_cache import WorkbenchCache
workbench/cached/cached_model.py CHANGED
@@ -4,8 +4,9 @@ from typing import Union
 import pandas as pd

 # Workbench Imports
-from workbench.core.artifacts.model_core import ModelCore
+from workbench.core.artifacts.model_core import ModelCore, ModelType
 from workbench.core.artifacts.cached_artifact_mixin import CachedArtifactMixin
+from workbench.algorithms.dataframe import smart_aggregator


 class CachedModel(CachedArtifactMixin, ModelCore):
@@ -84,20 +85,49 @@ class CachedModel(CachedArtifactMixin, ModelCore):
         return super().get_inference_metrics(capture_name=capture_name)

     @CachedArtifactMixin.cache_result
-    def get_inference_predictions(self, capture_name: str = "auto_inference") -> Union[pd.DataFrame, None]:
+    def get_inference_predictions(
+        self, capture_name: str = "full_cross_fold", target_rows: int = 1000
+    ) -> Union[pd.DataFrame, None]:
         """Retrieve the captured prediction results for this model

         Args:
-            capture_name (str, optional): Specific capture_name (default: training_holdout)
+            capture_name (str, optional): Specific capture_name (default: full_cross_fold)
+            target_rows (int, optional): Target number of rows to return (default: 1000)

         Returns:
             pd.DataFrame: DataFrame of the Captured Predictions (might be None)
         """
-        # Note: This method can generate larger dataframes, so we'll sample if needed
         df = super().get_inference_predictions(capture_name=capture_name)
-        if df is not None and len(df) > 5000:
-            self.log.warning(f"{self.name}:{capture_name} Sampling Inference Predictions to 5000 rows")
-            return df.sample(5000)
+        if df is None:
+            return None
+
+        # Compute residual based on model type
+        is_regressor = self.model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]
+        is_classifier = self.model_type == ModelType.CLASSIFIER
+
+        if is_regressor:
+            target = self.target()
+            if target and "prediction" in df.columns and target in df.columns:
+                df["residual"] = abs(df["prediction"] - df[target])
+
+        elif is_classifier:
+            target = self.target()
+            class_labels = self.class_labels()
+            if target and "prediction" in df.columns and target in df.columns and class_labels:
+                # Create a mapping from label to ordinal index
+                label_to_idx = {label: idx for idx, label in enumerate(class_labels)}
+                # Compute residual as distance between predicted and actual class
+                df["residual"] = abs(
+                    df["prediction"].map(label_to_idx).fillna(-1) - df[target].map(label_to_idx).fillna(-1)
+                )
+
+        # Use smart_aggregator to aggregate similar rows if we have too many
+        if len(df) > target_rows:
+            self.log.info(
+                f"{self.name}:{capture_name} Using smart_aggregator to reduce {len(df)} rows to ~{target_rows}"
+            )
+            df = smart_aggregator(df, target_rows=target_rows)
+
         return df

     @CachedArtifactMixin.cache_result
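The classifier branch above measures the residual as an ordinal distance between the predicted and actual class labels; a small illustrative check (the labels and the "activity" target column are hypothetical):

    import pandas as pd

    class_labels = ["low", "medium", "high"]  # hypothetical ordered class labels
    label_to_idx = {label: idx for idx, label in enumerate(class_labels)}

    df = pd.DataFrame({"prediction": ["high", "low"], "activity": ["low", "low"]})
    # Same mapping as in get_inference_predictions(): |predicted index - actual index|
    df["residual"] = abs(
        df["prediction"].map(label_to_idx).fillna(-1) - df["activity"].map(label_to_idx).fillna(-1)
    )
    print(df["residual"].tolist())  # [2, 0] -- predicting "high" for an actual "low" gives residual 2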
workbench/core/artifacts/endpoint_core.py CHANGED
@@ -546,7 +546,14 @@ class EndpointCore(Artifact):
         target_list = targets if isinstance(targets, list) else [targets]
         primary_target = target_list[0]

-        # Collect UQ columns (q_*, confidence) for additional tracking
+        # If we don't have a smiles column, try to merge it from the FeatureSet
+        if "smiles" not in out_of_fold_df.columns:
+            fs_df = fs.query(f'SELECT {fs.id_column}, "smiles" FROM "{fs.athena_table}"')
+            if "smiles" in fs_df.columns:
+                self.log.info("Merging 'smiles' column from FeatureSet into out-of-fold predictions.")
+                out_of_fold_df = out_of_fold_df.merge(fs_df, on=fs.id_column, how="left")
+
+        # Collect UQ columns (q_*, confidence) for additional tracking (used for hashing)
         additional_columns = [col for col in out_of_fold_df.columns if col.startswith("q_") or col == "confidence"]
         if additional_columns:
             self.log.info(f"UQ columns from training: {', '.join(additional_columns)}")
@@ -559,7 +566,6 @@
         # For single-target models (99% of cases), just save as "full_cross_fold"
         # For multi-target models, save each as cv_{target} plus primary as "full_cross_fold"
         is_multi_target = len(target_list) > 1
-
         for target in target_list:
             # Drop rows with NaN target values for metrics/plots
             target_df = out_of_fold_df.dropna(subset=[target])
@@ -899,6 +905,10 @@ class EndpointCore(Artifact):
         # Add UQ columns (q_*, confidence) and proba columns
         output_columns += [c for c in cols if c.startswith("q_") or c == "confidence" or c.endswith("_proba")]

+        # Add smiles column if present
+        if "smiles" in cols:
+            output_columns.append("smiles")
+
         # Write the predictions to S3
         output_file = f"{inference_capture_path}/inference_predictions.csv"
         self.log.info(f"Writing predictions to {output_file}")