workbench 0.8.177__py3-none-any.whl → 0.8.227__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of workbench might be problematic; see the registry's advisory page for more details.

Files changed (140):
  1. workbench/__init__.py +1 -0
  2. workbench/algorithms/dataframe/__init__.py +1 -2
  3. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  4. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  5. workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
  6. workbench/algorithms/dataframe/projection_2d.py +44 -21
  7. workbench/algorithms/dataframe/proximity.py +259 -305
  8. workbench/algorithms/graph/light/proximity_graph.py +12 -11
  9. workbench/algorithms/models/cleanlab_model.py +382 -0
  10. workbench/algorithms/models/noise_model.py +388 -0
  11. workbench/algorithms/sql/column_stats.py +0 -1
  12. workbench/algorithms/sql/correlations.py +0 -1
  13. workbench/algorithms/sql/descriptive_stats.py +0 -1
  14. workbench/algorithms/sql/outliers.py +3 -3
  15. workbench/api/__init__.py +5 -1
  16. workbench/api/df_store.py +17 -108
  17. workbench/api/endpoint.py +14 -12
  18. workbench/api/feature_set.py +117 -11
  19. workbench/api/meta.py +0 -1
  20. workbench/api/meta_model.py +289 -0
  21. workbench/api/model.py +52 -21
  22. workbench/api/parameter_store.py +3 -52
  23. workbench/cached/cached_meta.py +0 -1
  24. workbench/cached/cached_model.py +49 -11
  25. workbench/core/artifacts/__init__.py +11 -2
  26. workbench/core/artifacts/artifact.py +5 -5
  27. workbench/core/artifacts/df_store_core.py +114 -0
  28. workbench/core/artifacts/endpoint_core.py +319 -204
  29. workbench/core/artifacts/feature_set_core.py +249 -45
  30. workbench/core/artifacts/model_core.py +135 -82
  31. workbench/core/artifacts/parameter_store_core.py +98 -0
  32. workbench/core/cloud_platform/cloud_meta.py +0 -1
  33. workbench/core/pipelines/pipeline_executor.py +1 -1
  34. workbench/core/transforms/features_to_model/features_to_model.py +60 -44
  35. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +43 -10
  36. workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
  37. workbench/core/views/training_view.py +113 -42
  38. workbench/core/views/view.py +53 -3
  39. workbench/core/views/view_utils.py +4 -4
  40. workbench/model_script_utils/model_script_utils.py +339 -0
  41. workbench/model_script_utils/pytorch_utils.py +405 -0
  42. workbench/model_script_utils/uq_harness.py +277 -0
  43. workbench/model_scripts/chemprop/chemprop.template +774 -0
  44. workbench/model_scripts/chemprop/generated_model_script.py +774 -0
  45. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  46. workbench/model_scripts/chemprop/requirements.txt +3 -0
  47. workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
  48. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +0 -1
  49. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
  50. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -2
  51. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  52. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
  53. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  54. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  55. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  56. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  57. workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -16
  58. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  59. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  60. workbench/model_scripts/meta_model/meta_model.template +209 -0
  61. workbench/model_scripts/pytorch_model/generated_model_script.py +443 -499
  62. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  63. workbench/model_scripts/pytorch_model/pytorch.template +440 -496
  64. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  65. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  66. workbench/model_scripts/pytorch_model/uq_harness.py +277 -0
  67. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  68. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  69. workbench/model_scripts/script_generation.py +15 -12
  70. workbench/model_scripts/uq_models/generated_model_script.py +248 -0
  71. workbench/model_scripts/xgb_model/generated_model_script.py +371 -403
  72. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  73. workbench/model_scripts/xgb_model/uq_harness.py +277 -0
  74. workbench/model_scripts/xgb_model/xgb_model.template +367 -399
  75. workbench/repl/workbench_shell.py +18 -14
  76. workbench/resources/open_source_api.key +1 -1
  77. workbench/scripts/endpoint_test.py +162 -0
  78. workbench/scripts/lambda_test.py +73 -0
  79. workbench/scripts/meta_model_sim.py +35 -0
  80. workbench/scripts/ml_pipeline_sqs.py +122 -6
  81. workbench/scripts/training_test.py +85 -0
  82. workbench/themes/dark/custom.css +59 -0
  83. workbench/themes/dark/plotly.json +5 -5
  84. workbench/themes/light/custom.css +153 -40
  85. workbench/themes/light/plotly.json +9 -9
  86. workbench/themes/midnight_blue/custom.css +59 -0
  87. workbench/utils/aws_utils.py +0 -1
  88. workbench/utils/chem_utils/fingerprints.py +87 -46
  89. workbench/utils/chem_utils/mol_descriptors.py +0 -1
  90. workbench/utils/chem_utils/projections.py +16 -6
  91. workbench/utils/chem_utils/vis.py +25 -27
  92. workbench/utils/chemprop_utils.py +141 -0
  93. workbench/utils/config_manager.py +2 -6
  94. workbench/utils/endpoint_utils.py +5 -7
  95. workbench/utils/license_manager.py +2 -6
  96. workbench/utils/markdown_utils.py +57 -0
  97. workbench/utils/meta_model_simulator.py +499 -0
  98. workbench/utils/metrics_utils.py +256 -0
  99. workbench/utils/model_utils.py +260 -76
  100. workbench/utils/pipeline_utils.py +0 -1
  101. workbench/utils/plot_utils.py +159 -34
  102. workbench/utils/pytorch_utils.py +87 -0
  103. workbench/utils/shap_utils.py +11 -57
  104. workbench/utils/theme_manager.py +95 -30
  105. workbench/utils/xgboost_local_crossfold.py +267 -0
  106. workbench/utils/xgboost_model_utils.py +127 -220
  107. workbench/web_interface/components/experiments/outlier_plot.py +0 -1
  108. workbench/web_interface/components/model_plot.py +16 -2
  109. workbench/web_interface/components/plugin_unit_test.py +5 -3
  110. workbench/web_interface/components/plugins/ag_table.py +2 -4
  111. workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
  112. workbench/web_interface/components/plugins/model_details.py +48 -80
  113. workbench/web_interface/components/plugins/scatter_plot.py +192 -92
  114. workbench/web_interface/components/settings_menu.py +184 -0
  115. workbench/web_interface/page_views/main_page.py +0 -1
  116. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/METADATA +31 -17
  117. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/RECORD +121 -106
  118. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/entry_points.txt +4 -0
  119. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/licenses/LICENSE +1 -1
  120. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  121. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
  122. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  123. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  124. workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
  125. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -494
  126. workbench/model_scripts/custom_models/uq_models/mapie.template +0 -494
  127. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -386
  128. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
  129. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  130. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  131. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  132. workbench/themes/quartz/base_css.url +0 -1
  133. workbench/themes/quartz/custom.css +0 -117
  134. workbench/themes/quartz/plotly.json +0 -642
  135. workbench/themes/quartz_dark/base_css.url +0 -1
  136. workbench/themes/quartz_dark/custom.css +0 -131
  137. workbench/themes/quartz_dark/plotly.json +0 -642
  138. workbench/utils/resource_utils.py +0 -39
  139. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/WHEEL +0 -0
  140. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/top_level.txt +0 -0
@@ -4,7 +4,7 @@ from typing import Union
4
4
  import pandas as pd
5
5
 
6
6
  # Workbench Imports
7
- from workbench.core.artifacts.model_core import ModelCore
7
+ from workbench.core.artifacts.model_core import ModelCore, ModelType
8
8
  from workbench.core.artifacts.cached_artifact_mixin import CachedArtifactMixin
9
9
 
10
10
 
@@ -72,11 +72,11 @@ class CachedModel(CachedArtifactMixin, ModelCore):
72
72
  return super().list_inference_runs()
73
73
 
74
74
    @CachedArtifactMixin.cache_result
    def get_inference_metrics(self, capture_name: str = "auto") -> Union[pd.DataFrame, None]:
        """Retrieve the captured inference metrics for this model.

        Thin caching wrapper: delegates to ModelCore.get_inference_metrics() and
        memoizes the result via the CachedArtifactMixin decorator.

        Args:
            capture_name (str, optional): Specific capture_name (default: auto)

        Returns:
            pd.DataFrame: DataFrame of the Captured Metrics (might be None)
        """
        return super().get_inference_metrics(capture_name=capture_name)
85
85
 
86
86
  @CachedArtifactMixin.cache_result
87
- def get_inference_predictions(self, capture_name: str = "auto_inference") -> Union[pd.DataFrame, None]:
87
+ def get_inference_predictions(
88
+ self, capture_name: str = "auto_inference", limit: int = 1000
89
+ ) -> Union[pd.DataFrame, None]:
88
90
  """Retrieve the captured prediction results for this model
89
91
 
90
92
  Args:
91
- capture_name (str, optional): Specific capture_name (default: training_holdout)
93
+ capture_name (str, optional): Specific capture_name (default: auto_inference)
94
+ limit (int, optional): Maximum rows to return (default: 1000)
92
95
 
93
96
  Returns:
94
97
  pd.DataFrame: DataFrame of the Captured Predictions (might be None)
95
98
  """
96
- # Note: This method can generate larger dataframes, so we'll sample if needed
97
99
  df = super().get_inference_predictions(capture_name=capture_name)
98
- if df is not None and len(df) > 5000:
99
- self.log.warning(f"{self.name}:{capture_name} Sampling Inference Predictions to 5000 rows")
100
- return df.sample(5000)
100
+ if df is None:
101
+ return None
102
+
103
+ # Compute residual and do smart sampling based on model type
104
+ is_regressor = self.model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]
105
+ is_classifier = self.model_type == ModelType.CLASSIFIER
106
+
107
+ if is_regressor:
108
+ target = self.target()
109
+ if target and "prediction" in df.columns and target in df.columns:
110
+ df["residual"] = abs(df["prediction"] - df[target])
111
+
112
+ elif is_classifier:
113
+ target = self.target()
114
+ class_labels = self.class_labels()
115
+ if target and "prediction" in df.columns and target in df.columns and class_labels:
116
+ # Create a mapping from label to ordinal index
117
+ label_to_idx = {label: idx for idx, label in enumerate(class_labels)}
118
+ # Compute residual as distance between predicted and actual class
119
+ df["residual"] = abs(
120
+ df["prediction"].map(label_to_idx).fillna(-1) - df[target].map(label_to_idx).fillna(-1)
121
+ )
122
+
123
+ # Smart sampling: half high-residual rows, half random from the rest
124
+ if "residual" in df.columns and len(df) > limit:
125
+ half_limit = limit // 2
126
+ self.log.warning(
127
+ f"{self.name}:{capture_name} Sampling {limit} rows (top {half_limit} residuals + {half_limit} random)"
128
+ )
129
+ top_residuals = df.nlargest(half_limit, "residual")
130
+ remaining = df.drop(top_residuals.index)
131
+ random_sample = remaining.sample(min(half_limit, len(remaining)))
132
+ return pd.concat([top_residuals, random_sample]).reset_index(drop=True)
133
+
134
+ # Fallback: just limit rows if no residual computed
135
+ if len(df) > limit:
136
+ self.log.warning(f"{self.name}:{capture_name} Sampling to {limit} rows")
137
+ return df.sample(limit)
138
+
101
139
  return df
102
140
 
103
141
  @CachedArtifactMixin.cache_result
104
- def confusion_matrix(self, capture_name: str = "latest") -> Union[pd.DataFrame, None]:
142
+ def confusion_matrix(self, capture_name: str = "auto") -> Union[pd.DataFrame, None]:
105
143
  """Retrieve the confusion matrix for the model
106
144
 
107
145
  Args:
108
- capture_name (str, optional): Specific capture_name (default: latest)
146
+ capture_name (str, optional): Specific capture_name (default: auto)
109
147
 
110
148
  Returns:
111
149
  pd.DataFrame: DataFrame of the Confusion Matrix (might be None)
@@ -15,7 +15,16 @@ from .artifact import Artifact
15
15
  from .athena_source import AthenaSource
16
16
  from .data_source_abstract import DataSourceAbstract
17
17
  from .feature_set_core import FeatureSetCore
18
- from .model_core import ModelCore, ModelType
18
+ from .model_core import ModelCore, ModelType, ModelFramework
19
19
  from .endpoint_core import EndpointCore
20
20
 
21
- __all__ = ["Artifact", "AthenaSource", "DataSourceAbstract", "FeatureSetCore", "ModelCore", "ModelType", "EndpointCore"]
21
+ __all__ = [
22
+ "Artifact",
23
+ "AthenaSource",
24
+ "DataSourceAbstract",
25
+ "FeatureSetCore",
26
+ "ModelCore",
27
+ "ModelType",
28
+ "ModelFramework",
29
+ "EndpointCore",
30
+ ]
@@ -8,8 +8,8 @@ from typing import Union
8
8
 
9
9
  # Workbench Imports
10
10
  from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
11
- from workbench.core.cloud_platform.aws.aws_parameter_store import AWSParameterStore as ParameterStore
12
- from workbench.core.cloud_platform.aws.aws_df_store import AWSDFStore as DFStore
11
+ from workbench.core.artifacts.parameter_store_core import ParameterStoreCore
12
+ from workbench.core.artifacts.df_store_core import DFStoreCore
13
13
  from workbench.utils.aws_utils import dict_to_aws_tags
14
14
  from workbench.utils.config_manager import ConfigManager, FatalConfigError
15
15
  from workbench.core.cloud_platform.cloud_meta import CloudMeta
@@ -48,11 +48,11 @@ class Artifact(ABC):
48
48
  tag_delimiter = "::"
49
49
 
50
50
  # Grab our Dataframe Cache Storage
51
- df_cache = DFStore(path_prefix="/workbench/dataframe_cache")
51
+ df_cache = DFStoreCore(path_prefix="/workbench/dataframe_cache")
52
52
 
53
53
  # Artifact may want to use the Parameter Store or Dataframe Store
54
- param_store = ParameterStore()
55
- df_store = DFStore()
54
+ param_store = ParameterStoreCore()
55
+ df_store = DFStoreCore()
56
56
 
57
57
  def __init__(self, name: str, use_cached_meta: bool = False):
58
58
  """Initialize the Artifact Base Class
@@ -0,0 +1,114 @@
1
+ """DFStoreCore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy"""
2
+
3
+ import logging
4
+ from typing import Union
5
+
6
+ # Workbench Imports
7
+ from workbench.utils.config_manager import ConfigManager
8
+ from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
9
+
10
+ # Workbench Bridges Import
11
+ from workbench_bridges.api import DFStore as BridgesDFStore
12
+
13
+
14
class DFStoreCore(BridgesDFStore):
    """DFStoreCore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy

    Common Usage:
        ```python
        df_store = DFStoreCore()

        # List Data
        df_store.list()

        # Add DataFrame
        df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
        df_store.upsert("/test/my_data", df)

        # Retrieve DataFrame
        df = df_store.get("/test/my_data")
        print(df)

        # Delete Data
        df_store.delete("/test/my_data")
        ```
    """

    def __init__(self, path_prefix: Union[str, None] = None):
        """DFStoreCore Init Method

        Args:
            path_prefix (Union[str, None], optional): Add a path prefix to storage locations (Defaults to None)
        """
        # Wire the workbench-bridges DFStore up to workbench's own config/session
        # (bucket from ConfigManager, boto3 session from the account clamp)
        super().__init__(
            path_prefix=path_prefix,
            s3_bucket=ConfigManager().get_config("WORKBENCH_BUCKET"),
            boto3_session=AWSAccountClamp().boto3_session,
        )
        self.log = logging.getLogger("workbench")
50
+
51
+
52
if __name__ == "__main__":
    """Exercise the DFStoreCore Class"""
    import time
    import pandas as pd

    # Create a DFStoreCore manager
    df_store = DFStoreCore()

    # Details of the Dataframe Store
    print("Detailed Data...")
    print(df_store.details())

    # Add a new DataFrame and read it back
    demo_df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    df_store.upsert("/testing/test_data", demo_df)
    print(f"Getting data 'test_data':\n{df_store.get('/testing/test_data')}")

    # A Series should round-trip as well
    demo_series = pd.Series([1, 2, 3, 4], name="Series")
    df_store.upsert("/testing/test_series", demo_series)
    print(f"Getting data 'test_series':\n{df_store.get('/testing/test_series')}")

    # Summary of the data
    print("Summary Data...")
    print(df_store.summary())

    # Repr of the DFStoreCore object
    print("DFStoreCore Object:")
    print(df_store)

    # Existence checks for both entries
    print("Check if data exists...")
    print(df_store.check("/testing/test_data"))
    print(df_store.check("/testing/test_series"))

    # Time a single existence check
    start_time = time.time()
    print(df_store.check("/testing/test_data"))
    print(f"--- Check {time.time() - start_time} seconds ---")

    # Remove the test entries
    df_store.delete("/testing/test_data")
    df_store.delete("/testing/test_series")

    # Existence checks should now come back negative
    print("Check if data exists...")
    print(df_store.check("/testing/test_data"))
    print(df_store.check("/testing/test_series"))

    # Add a bunch of dataframes and then test recursive delete
    for idx in range(10):
        df_store.upsert(f"/testing/data_{idx}", pd.DataFrame({"A": [1, 2], "B": [3, 4]}))
    print("Before Recursive Delete:")
    print(df_store.summary())
    df_store.delete_recursive("/testing")
    print("After Recursive Delete:")
    print(df_store.summary())

    # Fetching a missing entry should not raise
    print("Getting non-existent data...")
    print(df_store.get("/testing/no_where"))