workbench 0.8.162__py3-none-any.whl → 0.8.220__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of workbench might be problematic. Click here for more details.

Files changed (147)
  1. workbench/algorithms/dataframe/__init__.py +1 -2
  2. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  3. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  4. workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
  5. workbench/algorithms/dataframe/projection_2d.py +44 -21
  6. workbench/algorithms/dataframe/proximity.py +259 -305
  7. workbench/algorithms/graph/light/proximity_graph.py +14 -12
  8. workbench/algorithms/models/cleanlab_model.py +382 -0
  9. workbench/algorithms/models/noise_model.py +388 -0
  10. workbench/algorithms/sql/outliers.py +3 -3
  11. workbench/api/__init__.py +5 -1
  12. workbench/api/compound.py +1 -1
  13. workbench/api/df_store.py +17 -108
  14. workbench/api/endpoint.py +18 -5
  15. workbench/api/feature_set.py +121 -15
  16. workbench/api/meta.py +5 -2
  17. workbench/api/meta_model.py +289 -0
  18. workbench/api/model.py +55 -21
  19. workbench/api/monitor.py +1 -16
  20. workbench/api/parameter_store.py +3 -52
  21. workbench/cached/cached_model.py +4 -4
  22. workbench/core/artifacts/__init__.py +11 -2
  23. workbench/core/artifacts/artifact.py +16 -8
  24. workbench/core/artifacts/data_capture_core.py +355 -0
  25. workbench/core/artifacts/df_store_core.py +114 -0
  26. workbench/core/artifacts/endpoint_core.py +382 -253
  27. workbench/core/artifacts/feature_set_core.py +249 -45
  28. workbench/core/artifacts/model_core.py +135 -80
  29. workbench/core/artifacts/monitor_core.py +33 -248
  30. workbench/core/artifacts/parameter_store_core.py +98 -0
  31. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  32. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  33. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  34. workbench/core/pipelines/pipeline_executor.py +1 -1
  35. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  36. workbench/core/transforms/features_to_model/features_to_model.py +62 -40
  37. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +76 -15
  38. workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
  39. workbench/core/views/training_view.py +113 -42
  40. workbench/core/views/view.py +53 -3
  41. workbench/core/views/view_utils.py +4 -4
  42. workbench/model_script_utils/model_script_utils.py +339 -0
  43. workbench/model_script_utils/pytorch_utils.py +405 -0
  44. workbench/model_script_utils/uq_harness.py +278 -0
  45. workbench/model_scripts/chemprop/chemprop.template +649 -0
  46. workbench/model_scripts/chemprop/generated_model_script.py +649 -0
  47. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  48. workbench/model_scripts/chemprop/requirements.txt +3 -0
  49. workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
  50. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  51. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  52. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  53. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  54. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  55. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
  56. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  57. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  58. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  59. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  60. workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
  61. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  62. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  63. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  64. workbench/model_scripts/meta_model/meta_model.template +209 -0
  65. workbench/model_scripts/pytorch_model/generated_model_script.py +444 -500
  66. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  67. workbench/model_scripts/pytorch_model/pytorch.template +440 -496
  68. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  69. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  70. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  71. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  72. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  73. workbench/model_scripts/script_generation.py +20 -11
  74. workbench/model_scripts/uq_models/generated_model_script.py +248 -0
  75. workbench/model_scripts/xgb_model/generated_model_script.py +372 -404
  76. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  77. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  78. workbench/model_scripts/xgb_model/xgb_model.template +369 -401
  79. workbench/repl/workbench_shell.py +28 -19
  80. workbench/resources/open_source_api.key +1 -1
  81. workbench/scripts/endpoint_test.py +162 -0
  82. workbench/scripts/lambda_test.py +73 -0
  83. workbench/scripts/meta_model_sim.py +35 -0
  84. workbench/scripts/ml_pipeline_batch.py +137 -0
  85. workbench/scripts/ml_pipeline_sqs.py +186 -0
  86. workbench/scripts/monitor_cloud_watch.py +20 -100
  87. workbench/scripts/training_test.py +85 -0
  88. workbench/utils/aws_utils.py +4 -3
  89. workbench/utils/chem_utils/__init__.py +0 -0
  90. workbench/utils/chem_utils/fingerprints.py +175 -0
  91. workbench/utils/chem_utils/misc.py +194 -0
  92. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  93. workbench/utils/chem_utils/mol_standardize.py +450 -0
  94. workbench/utils/chem_utils/mol_tagging.py +348 -0
  95. workbench/utils/chem_utils/projections.py +219 -0
  96. workbench/utils/chem_utils/salts.py +256 -0
  97. workbench/utils/chem_utils/sdf.py +292 -0
  98. workbench/utils/chem_utils/toxicity.py +250 -0
  99. workbench/utils/chem_utils/vis.py +253 -0
  100. workbench/utils/chemprop_utils.py +141 -0
  101. workbench/utils/cloudwatch_handler.py +1 -1
  102. workbench/utils/cloudwatch_utils.py +137 -0
  103. workbench/utils/config_manager.py +3 -7
  104. workbench/utils/endpoint_utils.py +5 -7
  105. workbench/utils/license_manager.py +2 -6
  106. workbench/utils/meta_model_simulator.py +499 -0
  107. workbench/utils/metrics_utils.py +256 -0
  108. workbench/utils/model_utils.py +278 -79
  109. workbench/utils/monitor_utils.py +44 -62
  110. workbench/utils/pandas_utils.py +3 -3
  111. workbench/utils/pytorch_utils.py +87 -0
  112. workbench/utils/shap_utils.py +11 -57
  113. workbench/utils/workbench_logging.py +0 -3
  114. workbench/utils/workbench_sqs.py +1 -1
  115. workbench/utils/xgboost_local_crossfold.py +267 -0
  116. workbench/utils/xgboost_model_utils.py +127 -219
  117. workbench/web_interface/components/model_plot.py +14 -2
  118. workbench/web_interface/components/plugin_unit_test.py +5 -2
  119. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  120. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  121. workbench/web_interface/components/plugins/model_details.py +38 -74
  122. workbench/web_interface/components/plugins/scatter_plot.py +6 -10
  123. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/METADATA +31 -9
  124. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/RECORD +128 -96
  125. workbench-0.8.220.dist-info/entry_points.txt +11 -0
  126. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +1 -1
  127. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  128. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
  129. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  130. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  131. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  132. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  133. workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
  134. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  135. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  136. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -273
  137. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
  138. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  139. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  140. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  141. workbench/utils/chem_utils.py +0 -1556
  142. workbench/utils/execution_environment.py +0 -211
  143. workbench/utils/fast_inference.py +0 -167
  144. workbench/utils/resource_utils.py +0 -39
  145. workbench-0.8.162.dist-info/entry_points.txt +0 -5
  146. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
  147. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
@@ -22,7 +22,14 @@ class Projection2D:
22
22
  self.log = logging.getLogger("workbench")
23
23
  self.projection_model = None
24
24
 
25
- def fit_transform(self, input_df: pd.DataFrame, features: list = None, projection: str = "UMAP") -> pd.DataFrame:
25
+ def fit_transform(
26
+ self,
27
+ input_df: pd.DataFrame,
28
+ features: list = None,
29
+ feature_matrix: np.ndarray = None,
30
+ metric: str = "euclidean",
31
+ projection: str = "UMAP",
32
+ ) -> pd.DataFrame:
26
33
  """Fit and transform a DataFrame using the selected dimensionality reduction method.
27
34
 
28
35
  This method creates a copy of the input DataFrame, processes the specified features
@@ -32,6 +39,9 @@ class Projection2D:
32
39
  Args:
33
40
  input_df (pd.DataFrame): The DataFrame containing features to project.
34
41
  features (list, optional): List of feature column names. If None, numeric columns are auto-selected.
42
+ feature_matrix (np.ndarray, optional): Pre-computed feature matrix. If provided, features is ignored
43
+ and no scaling is applied (caller is responsible for appropriate preprocessing).
44
+ metric (str, optional): Distance metric for UMAP (e.g., 'euclidean', 'jaccard'). Default 'euclidean'.
35
45
  projection (str, optional): The projection to use ('UMAP', 'TSNE', 'MDS' or 'PCA'). Default 'UMAP'.
36
46
 
37
47
  Returns:
@@ -40,36 +50,44 @@ class Projection2D:
40
50
  # Create a copy of the input DataFrame
41
51
  df = input_df.copy()
42
52
 
43
- # Auto-identify numeric features if none are provided
44
- if features is None:
45
- features = [col for col in df.select_dtypes(include="number").columns if not col.endswith("id")]
46
- self.log.info(f"Auto-identified numeric features: {features}")
47
-
48
- if len(features) < 2 or df.empty:
49
- self.log.critical("At least two numeric features are required, and DataFrame must not be empty.")
50
- return df
51
-
52
- # Process a copy of the feature data for projection
53
- X = df[features]
54
- X = X.apply(lambda col: col.fillna(col.mean()))
55
- X_scaled = StandardScaler().fit_transform(X)
53
+ # If a feature matrix is provided, use it directly (no scaling)
54
+ if feature_matrix is not None:
55
+ if len(feature_matrix) != len(df):
56
+ self.log.critical("feature_matrix length must match DataFrame length.")
57
+ return df
58
+ X_processed = feature_matrix
59
+ else:
60
+ # Auto-identify numeric features if none are provided
61
+ if features is None:
62
+ features = [col for col in df.select_dtypes(include="number").columns if not col.endswith("id")]
63
+ self.log.info(f"Auto-identified numeric features: {features}")
64
+
65
+ if len(features) < 2 or df.empty:
66
+ self.log.critical("At least two numeric features are required, and DataFrame must not be empty.")
67
+ return df
68
+
69
+ # Process a copy of the feature data for projection
70
+ X = df[features]
71
+ X = X.apply(lambda col: col.fillna(col.mean()))
72
+ X_processed = StandardScaler().fit_transform(X)
56
73
 
57
74
  # Select the projection method (using df for perplexity calculation)
58
- self.projection_model = self._get_projection_model(projection, df)
75
+ self.projection_model = self._get_projection_model(projection, df, metric=metric)
59
76
 
60
- # Apply the projection on the normalized data
61
- projection_result = self.projection_model.fit_transform(X_scaled)
77
+ # Apply the projection on the processed data
78
+ projection_result = self.projection_model.fit_transform(X_processed)
62
79
  df[["x", "y"]] = projection_result
63
80
 
64
81
  # Resolve coincident points and return the new DataFrame
65
82
  return self.resolve_coincident_points(df)
66
83
 
67
- def _get_projection_model(self, projection: str, df: pd.DataFrame):
84
+ def _get_projection_model(self, projection: str, df: pd.DataFrame, metric: str = "euclidean"):
68
85
  """Select and return the appropriate projection model.
69
86
 
70
87
  Args:
71
88
  projection (str): The projection method ('TSNE', 'MDS', 'PCA', or 'UMAP').
72
89
  df (pd.DataFrame): The DataFrame being transformed (used for computing perplexity).
90
+ metric (str): Distance metric for UMAP (default 'euclidean').
73
91
 
74
92
  Returns:
75
93
  A dimensionality reduction model instance.
@@ -88,8 +106,14 @@ class Projection2D:
88
106
  return PCA(n_components=2)
89
107
 
90
108
  if projection == "UMAP" and UMAP_AVAILABLE:
91
- self.log.info("Projection: UMAP")
92
- return umap.UMAP(n_components=2)
109
+ # UMAP default n_neighbors=15, adjust if dataset is smaller
110
+ n_neighbors = min(15, len(df) - 1)
111
+ if n_neighbors < 15:
112
+ self.log.warning(
113
+ f"Dataset size ({len(df)}) smaller than default n_neighbors, using n_neighbors={n_neighbors}"
114
+ )
115
+ self.log.info(f"Projection: UMAP with metric={metric}, n_neighbors={n_neighbors}")
116
+ return umap.UMAP(n_components=2, metric=metric, n_neighbors=n_neighbors)
93
117
 
94
118
  self.log.warning(
95
119
  f"Projection method '{projection}' not recognized or UMAP not available. Falling back to TSNE."
@@ -118,7 +142,6 @@ class Projection2D:
118
142
 
119
143
  # Find duplicates
120
144
  duplicated = rounded.duplicated(subset=["x_round", "y_round"], keep=False)
121
- print("Coincident Points found:", duplicated.sum())
122
145
  if not duplicated.any():
123
146
  return df
124
147