PyPI - workbench - Versions diffs - 0.8.174__py3-none-any.whl → 0.8.227__py3-none-any.whl - Mend

workbench 0.8.174__py3-none-any.whl → 0.8.227__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of workbench might be problematic. Click here for more details.

Files changed (145) hide show

workbench/__init__.py +1 -0
workbench/algorithms/dataframe/__init__.py +1 -2
workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
workbench/algorithms/dataframe/projection_2d.py +44 -21
workbench/algorithms/dataframe/proximity.py +259 -305
workbench/algorithms/graph/light/proximity_graph.py +12 -11
workbench/algorithms/models/cleanlab_model.py +382 -0
workbench/algorithms/models/noise_model.py +388 -0
workbench/algorithms/sql/column_stats.py +0 -1
workbench/algorithms/sql/correlations.py +0 -1
workbench/algorithms/sql/descriptive_stats.py +0 -1
workbench/algorithms/sql/outliers.py +3 -3
workbench/api/__init__.py +5 -1
workbench/api/df_store.py +17 -108
workbench/api/endpoint.py +14 -12
workbench/api/feature_set.py +117 -11
workbench/api/meta.py +0 -1
workbench/api/meta_model.py +289 -0
workbench/api/model.py +52 -21
workbench/api/parameter_store.py +3 -52
workbench/cached/cached_meta.py +0 -1
workbench/cached/cached_model.py +49 -11
workbench/core/artifacts/__init__.py +11 -2
workbench/core/artifacts/artifact.py +7 -7
workbench/core/artifacts/data_capture_core.py +8 -1
workbench/core/artifacts/df_store_core.py +114 -0
workbench/core/artifacts/endpoint_core.py +323 -205
workbench/core/artifacts/feature_set_core.py +249 -45
workbench/core/artifacts/model_core.py +133 -101
workbench/core/artifacts/parameter_store_core.py +98 -0
workbench/core/cloud_platform/aws/aws_account_clamp.py +48 -2
workbench/core/cloud_platform/cloud_meta.py +0 -1
workbench/core/pipelines/pipeline_executor.py +1 -1
workbench/core/transforms/features_to_model/features_to_model.py +60 -44
workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +43 -10
workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
workbench/core/views/training_view.py +113 -42
workbench/core/views/view.py +53 -3
workbench/core/views/view_utils.py +4 -4
workbench/model_script_utils/model_script_utils.py +339 -0
workbench/model_script_utils/pytorch_utils.py +405 -0
workbench/model_script_utils/uq_harness.py +277 -0
workbench/model_scripts/chemprop/chemprop.template +774 -0
workbench/model_scripts/chemprop/generated_model_script.py +774 -0
workbench/model_scripts/chemprop/model_script_utils.py +339 -0
workbench/model_scripts/chemprop/requirements.txt +3 -0
workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +18 -7
workbench/model_scripts/custom_models/chem_info/mol_standardize.py +80 -58
workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -2
workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -16
workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
workbench/model_scripts/meta_model/generated_model_script.py +209 -0
workbench/model_scripts/meta_model/meta_model.template +209 -0
workbench/model_scripts/pytorch_model/generated_model_script.py +443 -499
workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
workbench/model_scripts/pytorch_model/pytorch.template +440 -496
workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
workbench/model_scripts/pytorch_model/requirements.txt +1 -1
workbench/model_scripts/pytorch_model/uq_harness.py +277 -0
workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
workbench/model_scripts/script_generation.py +15 -12
workbench/model_scripts/uq_models/generated_model_script.py +248 -0
workbench/model_scripts/xgb_model/generated_model_script.py +371 -403
workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
workbench/model_scripts/xgb_model/uq_harness.py +277 -0
workbench/model_scripts/xgb_model/xgb_model.template +367 -399
workbench/repl/workbench_shell.py +18 -14
workbench/resources/open_source_api.key +1 -1
workbench/scripts/endpoint_test.py +162 -0
workbench/scripts/lambda_test.py +73 -0
workbench/scripts/meta_model_sim.py +35 -0
workbench/scripts/ml_pipeline_sqs.py +122 -6
workbench/scripts/training_test.py +85 -0
workbench/themes/dark/custom.css +59 -0
workbench/themes/dark/plotly.json +5 -5
workbench/themes/light/custom.css +153 -40
workbench/themes/light/plotly.json +9 -9
workbench/themes/midnight_blue/custom.css +59 -0
workbench/utils/aws_utils.py +0 -1
workbench/utils/chem_utils/fingerprints.py +87 -46
workbench/utils/chem_utils/mol_descriptors.py +18 -7
workbench/utils/chem_utils/mol_standardize.py +80 -58
workbench/utils/chem_utils/projections.py +16 -6
workbench/utils/chem_utils/vis.py +25 -27
workbench/utils/chemprop_utils.py +141 -0
workbench/utils/config_manager.py +2 -6
workbench/utils/endpoint_utils.py +5 -7
workbench/utils/license_manager.py +2 -6
workbench/utils/markdown_utils.py +57 -0
workbench/utils/meta_model_simulator.py +499 -0
workbench/utils/metrics_utils.py +256 -0
workbench/utils/model_utils.py +274 -87
workbench/utils/pipeline_utils.py +0 -1
workbench/utils/plot_utils.py +159 -34
workbench/utils/pytorch_utils.py +87 -0
workbench/utils/shap_utils.py +11 -57
workbench/utils/theme_manager.py +95 -30
workbench/utils/xgboost_local_crossfold.py +267 -0
workbench/utils/xgboost_model_utils.py +127 -220
workbench/web_interface/components/experiments/outlier_plot.py +0 -1
workbench/web_interface/components/model_plot.py +16 -2
workbench/web_interface/components/plugin_unit_test.py +5 -3
workbench/web_interface/components/plugins/ag_table.py +2 -4
workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
workbench/web_interface/components/plugins/model_details.py +48 -80
workbench/web_interface/components/plugins/scatter_plot.py +192 -92
workbench/web_interface/components/settings_menu.py +184 -0
workbench/web_interface/page_views/main_page.py +0 -1
{workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/METADATA +31 -17
{workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/RECORD +125 -111
{workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/entry_points.txt +4 -0
{workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/licenses/LICENSE +1 -1
workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
workbench/model_scripts/custom_models/uq_models/mapie.template +0 -502
workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -386
workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
workbench/model_scripts/quant_regression/quant_regression.template +0 -279
workbench/model_scripts/quant_regression/requirements.txt +0 -1
workbench/themes/quartz/base_css.url +0 -1
workbench/themes/quartz/custom.css +0 -117
workbench/themes/quartz/plotly.json +0 -642
workbench/themes/quartz_dark/base_css.url +0 -1
workbench/themes/quartz_dark/custom.css +0 -131
workbench/themes/quartz_dark/plotly.json +0 -642
workbench/utils/fast_inference.py +0 -167
workbench/utils/resource_utils.py +0 -39
{workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/WHEEL +0 -0
{workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/top_level.txt +0 -0

workbench/cached/cached_model.py CHANGED Viewed

@@ -4,7 +4,7 @@ from typing import Union
 import pandas as pd
 # Workbench Imports
-from workbench.core.artifacts.model_core import ModelCore
+from workbench.core.artifacts.model_core import ModelCore, ModelType
 from workbench.core.artifacts.cached_artifact_mixin import CachedArtifactMixin
@@ -72,11 +72,11 @@ class CachedModel(CachedArtifactMixin, ModelCore):
         return super().list_inference_runs()
     @CachedArtifactMixin.cache_result
-    def get_inference_metrics(self, capture_name: str = "latest") -> Union[pd.DataFrame, None]:
+    def get_inference_metrics(self, capture_name: str = "auto") -> Union[pd.DataFrame, None]:
         """Retrieve the captured prediction results for this model
         Args:
-            capture_name (str, optional): Specific capture_name (default: latest)
+            capture_name (str, optional): Specific capture_name (default: auto)
         Returns:
             pd.DataFrame: DataFrame of the Captured Metrics (might be None)
@@ -84,28 +84,66 @@ class CachedModel(CachedArtifactMixin, ModelCore):
         return super().get_inference_metrics(capture_name=capture_name)
     @CachedArtifactMixin.cache_result
-    def get_inference_predictions(self, capture_name: str = "auto_inference") -> Union[pd.DataFrame, None]:
+    def get_inference_predictions(
+        self, capture_name: str = "auto_inference", limit: int = 1000
+    ) -> Union[pd.DataFrame, None]:
         """Retrieve the captured prediction results for this model
         Args:
-            capture_name (str, optional): Specific capture_name (default: training_holdout)
+            capture_name (str, optional): Specific capture_name (default: auto_inference)
+            limit (int, optional): Maximum rows to return (default: 1000)
         Returns:
             pd.DataFrame: DataFrame of the Captured Predictions (might be None)
         """
-        # Note: This method can generate larger dataframes, so we'll sample if needed
         df = super().get_inference_predictions(capture_name=capture_name)
-        if df is not None and len(df) > 5000:
-            self.log.warning(f"{self.name}:{capture_name} Sampling Inference Predictions to 5000 rows")
-            return df.sample(5000)
+        if df is None:
+            return None
+        # Compute residual and do smart sampling based on model type
+        is_regressor = self.model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]
+        is_classifier = self.model_type == ModelType.CLASSIFIER
+        if is_regressor:
+            target = self.target()
+            if target and "prediction" in df.columns and target in df.columns:
+                df["residual"] = abs(df["prediction"] - df[target])
+        elif is_classifier:
+            target = self.target()
+            class_labels = self.class_labels()
+            if target and "prediction" in df.columns and target in df.columns and class_labels:
+                # Create a mapping from label to ordinal index
+                label_to_idx = {label: idx for idx, label in enumerate(class_labels)}
+                # Compute residual as distance between predicted and actual class
+                df["residual"] = abs(
+                    df["prediction"].map(label_to_idx).fillna(-1) - df[target].map(label_to_idx).fillna(-1)
+                )
+        # Smart sampling: half high-residual rows, half random from the rest
+        if "residual" in df.columns and len(df) > limit:
+            half_limit = limit // 2
+            self.log.warning(
+                f"{self.name}:{capture_name} Sampling {limit} rows (top {half_limit} residuals + {half_limit} random)"
+            )
+            top_residuals = df.nlargest(half_limit, "residual")
+            remaining = df.drop(top_residuals.index)
+            random_sample = remaining.sample(min(half_limit, len(remaining)))
+            return pd.concat([top_residuals, random_sample]).reset_index(drop=True)
+        # Fallback: just limit rows if no residual computed
+        if len(df) > limit:
+            self.log.warning(f"{self.name}:{capture_name} Sampling to {limit} rows")
+            return df.sample(limit)
         return df
     @CachedArtifactMixin.cache_result
-    def confusion_matrix(self, capture_name: str = "latest") -> Union[pd.DataFrame, None]:
+    def confusion_matrix(self, capture_name: str = "auto") -> Union[pd.DataFrame, None]:
         """Retrieve the confusion matrix for the model
         Args:
-            capture_name (str, optional): Specific capture_name (default: latest)
+            capture_name (str, optional): Specific capture_name (default: auto)
         Returns:
             pd.DataFrame: DataFrame of the Confusion Matrix (might be None)

workbench/core/artifacts/__init__.py CHANGED Viewed

@@ -15,7 +15,16 @@ from .artifact import Artifact
 from .athena_source import AthenaSource
 from .data_source_abstract import DataSourceAbstract
 from .feature_set_core import FeatureSetCore
-from .model_core import ModelCore, ModelType
+from .model_core import ModelCore, ModelType, ModelFramework
 from .endpoint_core import EndpointCore
-__all__ = ["Artifact", "AthenaSource", "DataSourceAbstract", "FeatureSetCore", "ModelCore", "ModelType", "EndpointCore"]
+__all__ = [
+    "Artifact",
+    "AthenaSource",
+    "DataSourceAbstract",
+    "FeatureSetCore",
+    "ModelCore",
+    "ModelType",
+    "ModelFramework",
+    "EndpointCore",
+]

workbench/core/artifacts/artifact.py CHANGED Viewed

@@ -8,8 +8,8 @@ from typing import Union
 # Workbench Imports
 from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
-from workbench.core.cloud_platform.aws.aws_parameter_store import AWSParameterStore as ParameterStore
-from workbench.core.cloud_platform.aws.aws_df_store import AWSDFStore as DFStore
+from workbench.core.artifacts.parameter_store_core import ParameterStoreCore
+from workbench.core.artifacts.df_store_core import DFStoreCore
 from workbench.utils.aws_utils import dict_to_aws_tags
 from workbench.utils.config_manager import ConfigManager, FatalConfigError
 from workbench.core.cloud_platform.cloud_meta import CloudMeta
@@ -48,11 +48,11 @@ class Artifact(ABC):
     tag_delimiter = "::"
     # Grab our Dataframe Cache Storage
-    df_cache = DFStore(path_prefix="/workbench/dataframe_cache")
+    df_cache = DFStoreCore(path_prefix="/workbench/dataframe_cache")
     # Artifact may want to use the Parameter Store or Dataframe Store
-    param_store = ParameterStore()
-    df_store = DFStore()
+    param_store = ParameterStoreCore()
+    df_store = DFStoreCore()
     def __init__(self, name: str, use_cached_meta: bool = False):
         """Initialize the Artifact Base Class
@@ -238,8 +238,8 @@ class Artifact(ABC):
         """
         # Check for ReadOnly Role
-        if self.aws_account_clamp.read_only_role:
-            self.log.info("Cannot add metadata with a ReadOnly Role...")
+        if self.aws_account_clamp.read_only:
+            self.log.info("Cannot add metadata with a ReadOnly Permissions...")
             return
         # Sanity check

workbench/core/artifacts/data_capture_core.py CHANGED Viewed

@@ -231,6 +231,13 @@ class DataCaptureCore:
             self.log.info(f"Processing {len(files)} files from {from_date} onwards.")
         else:
             self.log.info(f"Processing all {len(files)} files...")
+        # Check if any files remain after filtering
+        if not files:
+            self.log.info("No files to process after date filtering.")
+            return pd.DataFrame(), pd.DataFrame()
+        # Sort files by name (assumed to include timestamp)
         files.sort()
         # Get all timestamps in one batch if needed
@@ -337,7 +344,7 @@ if __name__ == "__main__":
     # print(pred_df.head())
     # Check that data capture is working
-    input_df, output_df = dc.get_captured_data()
+    input_df, output_df = dc.get_captured_data(from_date="2025-09-01")
     if input_df.empty and output_df.empty:
         print("No data capture files found, for a new endpoint it may take a few minutes to start capturing data")
     else:

workbench/core/artifacts/df_store_core.py ADDED Viewed

@@ -0,0 +1,114 @@
+"""DFStoreCore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy"""
+import logging
+from typing import Union
+# Workbench Imports
+from workbench.utils.config_manager import ConfigManager
+from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
+# Workbench Bridges Import
+from workbench_bridges.api import DFStore as BridgesDFStore
+class DFStoreCore(BridgesDFStore):
+    """DFStoreCore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy
+        Common Usage:
+    ```python
+            df_store = DFStoreCore()
+            # List Data
+            df_store.list()
+            # Add DataFrame
+            df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+            df_store.upsert("/test/my_data", df)
+            # Retrieve DataFrame
+            df = df_store.get("/test/my_data")
+            print(df)
+            # Delete Data
+            df_store.delete("/test/my_data")
+    ```
+    """
+    def __init__(self, path_prefix: Union[str, None] = None):
+        """DFStoreCore Init Method
+        Args:
+            path_prefix (Union[str, None], optional): Add a path prefix to storage locations (Defaults to None)
+        """
+        # Get config from workbench's systems
+        bucket = ConfigManager().get_config("WORKBENCH_BUCKET")
+        session = AWSAccountClamp().boto3_session
+        # Initialize parent with workbench config
+        super().__init__(path_prefix=path_prefix, s3_bucket=bucket, boto3_session=session)
+        self.log = logging.getLogger("workbench")
+if __name__ == "__main__":
+    """Exercise the DFStoreCore Class"""
+    import time
+    import pandas as pd
+    # Create a DFStoreCore manager
+    df_store = DFStoreCore()
+    # Details of the Dataframe Store
+    print("Detailed Data...")
+    print(df_store.details())
+    # Add a new DataFrame
+    my_df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+    df_store.upsert("/testing/test_data", my_df)
+    # Get the DataFrame
+    print(f"Getting data 'test_data':\n{df_store.get('/testing/test_data')}")
+    # Now let's test adding a Series
+    series = pd.Series([1, 2, 3, 4], name="Series")
+    df_store.upsert("/testing/test_series", series)
+    print(f"Getting data 'test_series':\n{df_store.get('/testing/test_series')}")
+    # Summary of the data
+    print("Summary Data...")
+    print(df_store.summary())
+    # Repr of the DFStoreCore object
+    print("DFStoreCore Object:")
+    print(df_store)
+    # Check if the data exists
+    print("Check if data exists...")
+    print(df_store.check("/testing/test_data"))
+    print(df_store.check("/testing/test_series"))
+    # Time the check
+    start_time = time.time()
+    print(df_store.check("/testing/test_data"))
+    print("--- Check %s seconds ---" % (time.time() - start_time))
+    # Now delete the test data
+    df_store.delete("/testing/test_data")
+    df_store.delete("/testing/test_series")
+    # Check if the data exists
+    print("Check if data exists...")
+    print(df_store.check("/testing/test_data"))
+    print(df_store.check("/testing/test_series"))
+    # Add a bunch of dataframes and then test recursive delete
+    for i in range(10):
+        df_store.upsert(f"/testing/data_{i}", pd.DataFrame({"A": [1, 2], "B": [3, 4]}))
+    print("Before Recursive Delete:")
+    print(df_store.summary())
+    df_store.delete_recursive("/testing")
+    print("After Recursive Delete:")
+    print(df_store.summary())
+    # Get a non-existent DataFrame
+    print("Getting non-existent data...")
+    print(df_store.get("/testing/no_where"))

workbench 0.8.174__py3-none-any.whl → 0.8.227__py3-none-any.whl

Potentially problematic release.

workbench 0.8.174__py3-none-any.whl → 0.8.227__py3-none-any.whl