workbench 0.8.174__py3-none-any.whl → 0.8.227__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/__init__.py +1 -0
- workbench/algorithms/dataframe/__init__.py +1 -2
- workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
- workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
- workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
- workbench/algorithms/dataframe/projection_2d.py +44 -21
- workbench/algorithms/dataframe/proximity.py +259 -305
- workbench/algorithms/graph/light/proximity_graph.py +12 -11
- workbench/algorithms/models/cleanlab_model.py +382 -0
- workbench/algorithms/models/noise_model.py +388 -0
- workbench/algorithms/sql/column_stats.py +0 -1
- workbench/algorithms/sql/correlations.py +0 -1
- workbench/algorithms/sql/descriptive_stats.py +0 -1
- workbench/algorithms/sql/outliers.py +3 -3
- workbench/api/__init__.py +5 -1
- workbench/api/df_store.py +17 -108
- workbench/api/endpoint.py +14 -12
- workbench/api/feature_set.py +117 -11
- workbench/api/meta.py +0 -1
- workbench/api/meta_model.py +289 -0
- workbench/api/model.py +52 -21
- workbench/api/parameter_store.py +3 -52
- workbench/cached/cached_meta.py +0 -1
- workbench/cached/cached_model.py +49 -11
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +7 -7
- workbench/core/artifacts/data_capture_core.py +8 -1
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +323 -205
- workbench/core/artifacts/feature_set_core.py +249 -45
- workbench/core/artifacts/model_core.py +133 -101
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/cloud_platform/aws/aws_account_clamp.py +48 -2
- workbench/core/cloud_platform/cloud_meta.py +0 -1
- workbench/core/pipelines/pipeline_executor.py +1 -1
- workbench/core/transforms/features_to_model/features_to_model.py +60 -44
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +43 -10
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
- workbench/core/views/training_view.py +113 -42
- workbench/core/views/view.py +53 -3
- workbench/core/views/view_utils.py +4 -4
- workbench/model_script_utils/model_script_utils.py +339 -0
- workbench/model_script_utils/pytorch_utils.py +405 -0
- workbench/model_script_utils/uq_harness.py +277 -0
- workbench/model_scripts/chemprop/chemprop.template +774 -0
- workbench/model_scripts/chemprop/generated_model_script.py +774 -0
- workbench/model_scripts/chemprop/model_script_utils.py +339 -0
- workbench/model_scripts/chemprop/requirements.txt +3 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +18 -7
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +80 -58
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -2
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
- workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -16
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
- workbench/model_scripts/meta_model/generated_model_script.py +209 -0
- workbench/model_scripts/meta_model/meta_model.template +209 -0
- workbench/model_scripts/pytorch_model/generated_model_script.py +443 -499
- workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
- workbench/model_scripts/pytorch_model/pytorch.template +440 -496
- workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
- workbench/model_scripts/pytorch_model/requirements.txt +1 -1
- workbench/model_scripts/pytorch_model/uq_harness.py +277 -0
- workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +15 -12
- workbench/model_scripts/uq_models/generated_model_script.py +248 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +371 -403
- workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
- workbench/model_scripts/xgb_model/uq_harness.py +277 -0
- workbench/model_scripts/xgb_model/xgb_model.template +367 -399
- workbench/repl/workbench_shell.py +18 -14
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/lambda_test.py +73 -0
- workbench/scripts/meta_model_sim.py +35 -0
- workbench/scripts/ml_pipeline_sqs.py +122 -6
- workbench/scripts/training_test.py +85 -0
- workbench/themes/dark/custom.css +59 -0
- workbench/themes/dark/plotly.json +5 -5
- workbench/themes/light/custom.css +153 -40
- workbench/themes/light/plotly.json +9 -9
- workbench/themes/midnight_blue/custom.css +59 -0
- workbench/utils/aws_utils.py +0 -1
- workbench/utils/chem_utils/fingerprints.py +87 -46
- workbench/utils/chem_utils/mol_descriptors.py +18 -7
- workbench/utils/chem_utils/mol_standardize.py +80 -58
- workbench/utils/chem_utils/projections.py +16 -6
- workbench/utils/chem_utils/vis.py +25 -27
- workbench/utils/chemprop_utils.py +141 -0
- workbench/utils/config_manager.py +2 -6
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/markdown_utils.py +57 -0
- workbench/utils/meta_model_simulator.py +499 -0
- workbench/utils/metrics_utils.py +256 -0
- workbench/utils/model_utils.py +274 -87
- workbench/utils/pipeline_utils.py +0 -1
- workbench/utils/plot_utils.py +159 -34
- workbench/utils/pytorch_utils.py +87 -0
- workbench/utils/shap_utils.py +11 -57
- workbench/utils/theme_manager.py +95 -30
- workbench/utils/xgboost_local_crossfold.py +267 -0
- workbench/utils/xgboost_model_utils.py +127 -220
- workbench/web_interface/components/experiments/outlier_plot.py +0 -1
- workbench/web_interface/components/model_plot.py +16 -2
- workbench/web_interface/components/plugin_unit_test.py +5 -3
- workbench/web_interface/components/plugins/ag_table.py +2 -4
- workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
- workbench/web_interface/components/plugins/model_details.py +48 -80
- workbench/web_interface/components/plugins/scatter_plot.py +192 -92
- workbench/web_interface/components/settings_menu.py +184 -0
- workbench/web_interface/page_views/main_page.py +0 -1
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/METADATA +31 -17
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/RECORD +125 -111
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/entry_points.txt +4 -0
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/licenses/LICENSE +1 -1
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
- workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie.template +0 -502
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -386
- workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/themes/quartz/base_css.url +0 -1
- workbench/themes/quartz/custom.css +0 -117
- workbench/themes/quartz/plotly.json +0 -642
- workbench/themes/quartz_dark/base_css.url +0 -1
- workbench/themes/quartz_dark/custom.css +0 -131
- workbench/themes/quartz_dark/plotly.json +0 -642
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/WHEEL +0 -0
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/top_level.txt +0 -0
workbench/algorithms/models/noise_model.py
ADDED

@@ -0,0 +1,388 @@
+import pandas as pd
+import numpy as np
+from xgboost import XGBRegressor
+from typing import List
+import logging
+
+from workbench.algorithms.dataframe.feature_space_proximity import FeatureSpaceProximity
+
+# Set up logging
+log = logging.getLogger("workbench")
+
+
+class NoiseModel:
+    """Composite noise detection for regression data using multiple complementary signals.
+
+    The NoiseModel identifies potentially noisy or problematic samples in regression datasets
+    by combining three independent signals:
+
+    1. **Underfit Model Residuals**: A deliberately simple XGBoost model (low depth, few trees)
+       that captures only the main trends. High residuals indicate samples in complex regions
+       or unusual areas of the feature space.
+
+    2. **Overfit Model Residuals**: A deliberately complex XGBoost model (deep trees, many
+       iterations, no regularization) that attempts to memorize the training data. High residuals
+       here indicate samples the model *cannot* fit even when trying to memorize - a strong
+       signal of label noise. This is the "training error" approach validated in:
+       "Denoising Drug Discovery Data for Improved ADMET Property Prediction" (Merck, JCIM 2024)
+
+    3. **High Target Gradient (HTG)**: Using the Proximity class, measures disagreement between
+       a sample's target value and its neighbors in feature space. High gradients indicate
+       activity cliffs or potential measurement errors where similar compounds have very
+       different target values.
+
+    The combined noise score weights the overfit residual signal more heavily (2x) based on
+    the paper's finding that training error is the most reliable noise detector for regression.
+
+    Example:
+        ```python
+        from workbench.algorithms.models.noise_model import NoiseModel
+
+        # Create noise model
+        noise_model = NoiseModel(df, id_column="id", features=feature_list, target="target")
+
+        # Get noise scores for all samples
+        scores_df = noise_model.get_scores()
+
+        # Get sample weights for training (lower weight for noisy samples)
+        weights = noise_model.get_sample_weights(strategy="inverse")
+
+        # Get clean subset (bottom 90% by noise score)
+        clean_df = noise_model.get_clean_subset(percentile=90)
+
+        # Find samples with same features but different targets (definite noise)
+        conflicts = noise_model.coincident_conflicts()
+        ```
+
+    References:
+        Adrian, M., Chung, Y., & Cheng, A. C. (2024). Denoising Drug Discovery Data for
+        Improved ADMET Property Prediction. J. Chem. Inf. Model., 64(16), 6324-6337.
+    """
+
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        id_column: str,
+        features: List[str],
+        target: str,
+    ):
+        """
+        Initialize the NoiseModel class.
+
+        Args:
+            df: DataFrame containing data for noise detection.
+            id_column: Name of the column used as the identifier.
+            features: List of feature column names.
+            target: Name of the target column.
+        """
+        self.id_column = id_column
+        self.target = target
+
+        # Filter out non-numeric features
+        self.features = self._validate_features(df, features)
+
+        # Drop NaN rows in features and target
+        self.df = df.dropna(subset=self.features + [self.target]).copy()
+
+        # Compute target stats for normalization
+        self.target_std = self.df[self.target].std()
+        self.target_range = self.df[self.target].max() - self.df[self.target].min()
+
+        # Build all component models
+        self._build_models()
+
+        # Precompute all noise signals
+        self._precompute_signals()
+
+    def get_scores(self) -> pd.DataFrame:
+        """
+        Get noise scores for all samples.
+
+        Returns:
+            DataFrame with id, individual signal columns, and combined noise_score
+        """
+        result = self.df[[self.id_column, self.target]].copy()
+        result["underfit_residual"] = self.df["underfit_residual"]
+        result["overfit_residual"] = self.df["overfit_residual"]
+        result["htg_score"] = self.df["htg_score"]
+        result["noise_score"] = self.df["noise_score"]
+        return result.sort_values("noise_score", ascending=False).reset_index(drop=True)
+
+    def get_sample_weights(self, strategy: str = "inverse") -> pd.Series:
+        """
+        Get sample weights for training, indexed by id_column.
+
+        Args:
+            strategy: Weighting strategy
+                - "inverse": 1 / (1 + noise_score)
+                - "soft": 1 - noise_score (clipped to [0.1, 1.0])
+                - "threshold": 1.0 if noise_score < median, else 0.5
+
+        Returns:
+            Series of weights indexed by id_column
+        """
+        scores = self.df.set_index(self.id_column)["noise_score"]
+
+        if strategy == "inverse":
+            weights = 1.0 / (1.0 + scores)
+        elif strategy == "soft":
+            weights = (1.0 - scores).clip(lower=0.1, upper=1.0)
+        elif strategy == "threshold":
+            median_score = scores.median()
+            weights = (scores < median_score).apply(lambda x: 1.0 if x else 0.5)
+        else:
+            raise ValueError(f"Unknown strategy: {strategy}")
+
+        return weights
+
+    def get_clean_subset(self, percentile: float = 90.0) -> pd.DataFrame:
+        """
+        Get a subset of data with lowest noise scores.
+
+        Args:
+            percentile: Keep samples below this percentile of noise score (default: 90 = bottom 90%)
+
+        Returns:
+            DataFrame of "clean" samples
+        """
+        threshold = np.percentile(self.df["noise_score"], percentile)
+        return self.df[self.df["noise_score"] <= threshold].copy()
+
+    def get_noisy_samples(self, top_percent: float = 10.0) -> pd.DataFrame:
+        """
+        Get samples with highest noise scores.
+
+        Args:
+            top_percent: Percentage of noisiest samples to return (default: 10%)
+
+        Returns:
+            DataFrame of noisy samples, sorted by noise_score descending
+        """
+        percentile = 100 - top_percent
+        threshold = np.percentile(self.df["noise_score"], percentile)
+        noisy = self.df[self.df["noise_score"] >= threshold].copy()
+        return noisy.sort_values("noise_score", ascending=False).reset_index(drop=True)
+
+    def coincident_conflicts(self, distance_threshold: float = 1e-5) -> pd.DataFrame:
+        """
+        Find samples that map to the same point in feature space but have different targets.
+
+        These are definitive noise - same features, different target values.
+
+        Args:
+            distance_threshold: Maximum distance to consider "coincident" (default: 1e-5)
+
+        Returns:
+            DataFrame of coincident conflicts with their target differences
+        """
+        # Use proximity to find coincident points
+        coincident = self.df[self.df["nn_distance"] < distance_threshold].copy()
+
+        if len(coincident) == 0:
+            return pd.DataFrame(columns=[self.id_column, self.target, "nn_id", "nn_target", "nn_target_diff"])
+
+        return (
+            coincident[[self.id_column, self.target, "nn_id", "nn_target", "nn_target_diff", "noise_score"]]
+            .sort_values("nn_target_diff", ascending=False)
+            .reset_index(drop=True)
+        )
+
+    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+        """Remove non-numeric features and log warnings."""
+        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
+        if non_numeric:
+            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
+        return [f for f in features if f not in non_numeric]
+
+    def _build_models(self) -> None:
+        """Build the underfit, overfit, and proximity models."""
+        log.info("Building noise detection models...")
+
+        X = self.df[self.features]
+        y = self.df[self.target]
+
+        # Underfit model: intentionally simple (high bias)
+        log.info(" Fitting underfit model...")
+        self.underfit_model = XGBRegressor(
+            max_depth=2,
+            n_estimators=20,
+            learning_rate=0.1,
+            random_state=42,
+            verbosity=0,
+        )
+        self.underfit_model.fit(X, y)
+
+        # Overfit model: intentionally complex (high variance, low regularization)
+        log.info(" Fitting overfit model...")
+        self.overfit_model = XGBRegressor(
+            max_depth=12,
+            n_estimators=500,
+            learning_rate=0.1,
+            reg_lambda=0.0,
+            reg_alpha=0.0,
+            min_child_weight=1,
+            random_state=42,
+            verbosity=0,
+        )
+        self.overfit_model.fit(X, y)
+
+        # Proximity model for feature space analysis
+        log.info(" Building proximity model...")
+        self.proximity = FeatureSpaceProximity(
+            self.df,
+            id_column=self.id_column,
+            features=self.features,
+            target=self.target,
+        )
+
+        # Copy proximity metrics to our df
+        self.df["nn_distance"] = self.proximity.df["nn_distance"].values
+        self.df["nn_id"] = self.proximity.df["nn_id"].values
+        self.df["nn_target"] = self.proximity.df["nn_target"].values
+        self.df["nn_target_diff"] = self.proximity.df["nn_target_diff"].values
+
+        log.info("Noise detection models built successfully")
+
+    def _precompute_signals(self) -> None:
+        """Precompute all noise signals for every sample."""
+        log.info("Precomputing noise signals...")
+
+        X = self.df[self.features]
+        y = self.df[self.target].values
+
+        # Underfit residuals (normalized by target std)
+        underfit_pred = self.underfit_model.predict(X)
+        self.df["underfit_residual"] = np.abs(y - underfit_pred) / self.target_std
+
+        # Overfit residuals (normalized by target std)
+        # This is the key "training error" signal from the paper
+        overfit_pred = self.overfit_model.predict(X)
+        self.df["overfit_residual"] = np.abs(y - overfit_pred) / self.target_std
+
+        # HTG score: neighbor disagreement (normalized by target std)
+        # Using nn_target_diff directly, normalized
+        self.df["htg_score"] = self.df["nn_target_diff"] / self.target_std
+
+        # Combine into overall noise score
+        # Scale each component to [0, 1] using percentile ranks, then average
+        self.df["noise_score"] = self._compute_combined_score()
+
+        log.info("Noise signals precomputed successfully")
+
+    def _compute_combined_score(self) -> np.ndarray:
+        """
+        Combine individual signals into a single noise score.
+
+        Uses percentile ranks to normalize each signal to [0, 1], then averages.
+        Overfit residual gets higher weight as it's the most validated signal (per the paper).
+        """
+        # Convert to percentile ranks (0-1 scale)
+        overfit_rank = self.df["overfit_residual"].rank(pct=True)
+        htg_rank = self.df["htg_score"].rank(pct=True)
+
+        # Weighted average: overfit gets 2x weight based on paper's findings
+        # that training error is the best noise detector
+        combined = (2.0 * overfit_rank + 1.0 * htg_rank) / 3.0
+
+        return combined.values
+
+
+# Testing the NoiseModel class
+if __name__ == "__main__":
+
+    from workbench.api import FeatureSet, Model
+
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 1000)
+
+    # Create a sample DataFrame with some noisy points
+    np.random.seed(42)
+    n_samples = 100
+
+    # Generate clean data: y = 2*x1 + 3*x2 + noise
+    x1 = np.random.randn(n_samples)
+    x2 = np.random.randn(n_samples)
+    y_clean = 2 * x1 + 3 * x2 + np.random.randn(n_samples) * 0.1
+
+    # Add some noisy points (last 10 samples)
+    y_noisy = y_clean.copy()
+    y_noisy[-10:] += np.random.randn(10) * 5  # Large noise
+
+    data = {
+        "ID": [f"sample_{i}" for i in range(n_samples)],
+        "Feature1": x1,
+        "Feature2": x2,
+        "target": y_noisy,
+    }
+    df = pd.DataFrame(data)
+
+    print("=" * 80)
+    print("Testing NoiseModel...")
+    print("=" * 80)
+
+    # Create noise model
+    noise_model = NoiseModel(
+        df,
+        id_column="ID",
+        features=["Feature1", "Feature2"],
+        target="target",
+    )
+
+    # Get noise scores
+    print("\nTop 10 noisiest samples:")
+    scores = noise_model.get_scores()
+    print(scores.head(10))
+
+    # Check if our artificially noisy samples are detected
+    noisy_ids = [f"sample_{i}" for i in range(90, 100)]
+    detected = scores[scores["ID"].isin(noisy_ids)]
+    median_score = scores["noise_score"].median()
+    print(f"\nOf 10 noisy samples, {len(detected[detected['noise_score'] > median_score])} above median noise score")
+
+    # Get sample weights
+    print("\nSample weights (inverse strategy):")
+    weights = noise_model.get_sample_weights(strategy="inverse")
+    print(f" Min weight: {weights.min():.3f}")
+    print(f" Max weight: {weights.max():.3f}")
+    print(f" Mean weight: {weights.mean():.3f}")
+
+    # Get clean subset
+    clean = noise_model.get_clean_subset(percentile=90)
+    print(f"\nClean subset (bottom 90%): {len(clean)} samples")
+
+    # Get noisy samples
+    noisy = noise_model.get_noisy_samples(top_percent=10)
+    print(f"\nNoisy samples (top 10%): {len(noisy)} samples")
+    print(noisy[["ID", "target", "overfit_residual", "htg_score", "noise_score"]].head())
+
+    # Test with real data
+    print("\n" + "=" * 80)
+    print("Testing with AQSol data...")
+    print("=" * 80)
+    fs = FeatureSet("aqsol_features")
+    model = Model("aqsol-regression")
+
+    if fs.exists():
+        features = model.features()
+        target = model.target()
+        df = fs.pull_dataframe()
+
+        noise_model = NoiseModel(
+            df,
+            id_column=fs.id_column,
+            features=features,
+            target=target,
+        )

+        print("\nTop 10 noisiest compounds:")
+        scores = noise_model.get_scores()
+        print(scores.head(10))
+
+        print("\nCoincident conflicts:")
+        conflicts = noise_model.coincident_conflicts()
+        print(f"Found {len(conflicts)} coincident conflicts")
+        if len(conflicts) > 0:
+            print(conflicts.head())
+
+        print("\nNoise score distribution:")
+        print(scores["noise_score"].describe())
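For readers skimming the diff: `_compute_combined_score` above boils down to a weighted average of percentile ranks. A minimal standalone sketch with made-up residual values (the numbers are illustrative, not from the package):

```python
import pandas as pd

# Two of the precomputed signals, with invented values for four samples
signals = pd.DataFrame({
    "overfit_residual": [0.05, 0.10, 1.50, 0.20],  # normalized training error
    "htg_score": [0.30, 0.10, 0.90, 0.40],         # neighbor target disagreement
})

# rank(pct=True) maps each signal onto a comparable [0, 1] scale
overfit_rank = signals["overfit_residual"].rank(pct=True)
htg_rank = signals["htg_score"].rank(pct=True)

# Training error gets 2x weight, mirroring the combination in _compute_combined_score
signals["noise_score"] = (2.0 * overfit_rank + 1.0 * htg_rank) / 3.0
print(signals.sort_values("noise_score", ascending=False))  # row 2 ranks noisiest
```

One observation grounded in the code above: the underfit residual is computed and reported by get_scores(), but as written it does not enter the combined score.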
workbench/algorithms/sql/outliers.py
CHANGED

@@ -209,9 +209,9 @@ class Outliers:
             else:
                 return group.nlargest(n, col)
 
-        # Group by 'outlier_group' and apply the helper function, explicitly selecting columns
-        top_outliers = outlier_df.groupby("outlier_group", group_keys=False).apply(
-            get_extreme_values
+        # Group by 'outlier_group' and apply the helper function, explicitly selecting columns to silence warning
+        top_outliers = outlier_df.groupby("outlier_group", group_keys=False)[outlier_df.columns].apply(
+            get_extreme_values
         )
         return top_outliers.reset_index(drop=True)
 
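Context for the Outliers change: on recent pandas releases, `DataFrameGroupBy.apply` emits a deprecation warning when the applied function also receives the grouping column, because a future version will exclude it. Selecting the columns explicitly opts in to the current behavior and silences the warning. A small sketch with toy data (the column names mirror the diff; the values are invented):

```python
import pandas as pd

df = pd.DataFrame({"outlier_group": ["a", "a", "b", "b"], "value": [1.0, 9.0, 5.0, 2.0]})

def get_extreme_values(group):
    # Keep the single largest row per group (stand-in for the real helper)
    return group.nlargest(1, "value")

# Old form: warns on pandas >= 2.2 because apply() also operates on "outlier_group"
# top = df.groupby("outlier_group", group_keys=False).apply(get_extreme_values)

# New form: explicit column selection keeps "outlier_group" in the result, warning-free
top = df.groupby("outlier_group", group_keys=False)[df.columns].apply(get_extreme_values)
print(top.reset_index(drop=True))
```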
workbench/api/__init__.py
CHANGED

@@ -5,6 +5,7 @@ These class provide high-level APIs for the Workbench package, offering easy acc
 - DataSource: Manages AWS Data Catalog and Athena
 - FeatureSet: Manages AWS Feature Store and Feature Groups
 - Model: Manages the training and deployment of AWS Model Groups and Packages
+- MetaModel: A Model that aggregates predictions from multiple child endpoints
 - ModelType: Enum for the different model types supported by Workbench
 - Endpoint: Manages the deployment and invocations/inference on AWS Endpoints
 - Meta: Provides an API to retrieve AWS Metadata for the above classes

@@ -14,7 +15,8 @@ These class provide high-level APIs for the Workbench package, offering easy acc
 
 from .data_source import DataSource
 from .feature_set import FeatureSet
-from .model import Model, ModelType
+from .model import Model, ModelType, ModelFramework
+from .meta_model import MetaModel
 from .endpoint import Endpoint
 from .meta import Meta
 from .parameter_store import ParameterStore

@@ -24,7 +26,9 @@ __all__ = [
     "DataSource",
     "FeatureSet",
     "Model",
+    "MetaModel",
     "ModelType",
+    "ModelFramework",
     "Endpoint",
     "Meta",
     "ParameterStore",
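A trivial smoke test of the new root-level exports (assumes a workbench 0.8.227 install):

```python
# MetaModel and ModelFramework are now importable from the package root
from workbench.api import Model, ModelType, ModelFramework, MetaModel

print(Model, ModelType, ModelFramework, MetaModel)
```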
workbench/api/df_store.py
CHANGED

@@ -1,35 +1,32 @@
 """DFStore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy"""
 
-from datetime import datetime
 from typing import Union
-import logging
-import pandas as pd
 
 # Workbench Imports
-from workbench.core.cloud_platform.aws.aws_df_store import AWSDFStore
+from workbench.core.artifacts.df_store_core import DFStoreCore
 
 
-class DFStore(AWSDFStore):
+class DFStore(DFStoreCore):
     """DFStore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy
 
-
-
-
+    Common Usage:
+        ```python
+        df_store = DFStore()
 
-
-
+        # List Data
+        df_store.list()
 
-
-
-
+        # Add DataFrame
+        df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+        df_store.upsert("/test/my_data", df)
 
-
-
-
+        # Retrieve DataFrame
+        df = df_store.get("/test/my_data")
+        print(df)
 
-
-
-
+        # Delete Data
+        df_store.delete("/test/my_data")
+        ```
     """
 
     def __init__(self, path_prefix: Union[str, None] = None):

@@ -38,101 +35,13 @@ class DFStore(AWSDFStore):
         Args:
             path_prefix (Union[str, None], optional): Add a path prefix to storage locations (Defaults to None)
         """
-        self.log = logging.getLogger("workbench")
-
-        # Initialize the SuperClass
         super().__init__(path_prefix=path_prefix)
 
-    def list(self, include_cache: bool = False) -> list:
-        """List all the objects in the data_store prefix.
-
-        Args:
-            include_cache (bool, optional): Include cache objects in the list (Defaults to False).
-
-        Returns:
-            list: A list of all the objects in the data_store prefix.
-        """
-        return super().list(include_cache=include_cache)
-
-    def summary(self, include_cache: bool = False) -> pd.DataFrame:
-        """Return a nicely formatted summary of object locations, sizes (in MB), and modified dates.
-
-        Args:
-            include_cache (bool, optional): Include cache objects in the summary (Defaults to False).
-
-        Returns:
-            pd.DataFrame: A formatted DataFrame with the summary details.
-        """
-        return super().summary(include_cache=include_cache)
-
-    def details(self, include_cache: bool = False) -> pd.DataFrame:
-        """Return a DataFrame with detailed metadata for all objects in the data_store prefix.
-
-        Args:
-            include_cache (bool, optional): Include cache objects in the details (Defaults to False).
-
-        Returns:
-            pd.DataFrame: A DataFrame with detailed metadata for all objects in the data_store prefix.
-        """
-        return super().details(include_cache=include_cache)
-
-    def check(self, location: str) -> bool:
-        """Check if a DataFrame exists at the specified location
-
-        Args:
-            location (str): The location of the data to check.
-
-        Returns:
-            bool: True if the data exists, False otherwise.
-        """
-        return super().check(location)
-
-    def get(self, location: str) -> Union[pd.DataFrame, None]:
-        """Retrieve a DataFrame from AWS S3.
-
-        Args:
-            location (str): The location of the data to retrieve.
-
-        Returns:
-            pd.DataFrame: The retrieved DataFrame or None if not found.
-        """
-        _df = super().get(location)
-        if _df is None:
-            self.log.error(f"Dataframe not found at location: {location}")
-        return _df
-
-    def upsert(self, location: str, data: Union[pd.DataFrame, pd.Series]):
-        """Insert or update a DataFrame or Series in the AWS S3.
-
-        Args:
-            location (str): The location of the data.
-            data (Union[pd.DataFrame, pd.Series]): The data to be stored.
-        """
-        super().upsert(location, data)
-
-    def last_modified(self, location: str) -> Union[datetime, None]:
-        """Get the last modified date of the DataFrame at the specified location.
-
-        Args:
-            location (str): The location of the data to check.
-
-        Returns:
-            Union[datetime, None]: The last modified date of the DataFrame or None if not found.
-        """
-        return super().last_modified(location)
-
-    def delete(self, location: str):
-        """Delete a DataFrame from the AWS S3.
-
-        Args:
-            location (str): The location of the data to delete.
-        """
-        super().delete(location)
-
 
 if __name__ == "__main__":
     """Exercise the DFStore Class"""
     import time
+    import pandas as pd
 
     # Create a DFStore manager
     df_store = DFStore()
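The DFStore rewrite replaces roughly ninety lines of hand-written delegating wrappers with direct inheritance from DFStoreCore, so the public surface is unchanged. A usage sketch mirroring the new docstring (assumes AWS credentials and a configured Workbench account):

```python
import pandas as pd
from workbench.api.df_store import DFStore  # module path as shown in this diff

df_store = DFStore()
df_store.upsert("/test/my_data", pd.DataFrame({"A": [1, 2], "B": [3, 4]}))
print(df_store.get("/test/my_data"))  # round-trips the DataFrame from S3/Parquet
print(df_store.list())                # inherited from DFStoreCore, no wrapper needed
df_store.delete("/test/my_data")
```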
workbench/api/endpoint.py
CHANGED

@@ -44,16 +44,21 @@ class Endpoint(EndpointCore):
         """
         return super().inference(eval_df, capture_name, id_column, drop_error_rows)
 
-    def auto_inference(self
-        """Run inference on the Endpoint using the
+    def auto_inference(self) -> pd.DataFrame:
+        """Run inference on the Endpoint using the test data from the model training view
 
-
-
+        Returns:
+            pd.DataFrame: The DataFrame with predictions
+        """
+        return super().auto_inference()
+
+    def full_inference(self) -> pd.DataFrame:
+        """Run inference on the Endpoint using the full data from the model training view
 
         Returns:
             pd.DataFrame: The DataFrame with predictions
         """
-        return super().
+        return super().full_inference()
 
     def fast_inference(self, eval_df: pd.DataFrame, threads: int = 4) -> pd.DataFrame:
         """Run inference on the Endpoint using the provided DataFrame

@@ -70,16 +75,13 @@ class Endpoint(EndpointCore):
         """
         return super().fast_inference(eval_df, threads=threads)
 
-    def cross_fold_inference(self
-        """
-
-        Args:
-            nfolds (int): The number of folds to use for cross-validation (default: 5)
+    def cross_fold_inference(self) -> pd.DataFrame:
+        """Pull cross-fold inference from model associated with this Endpoint
 
         Returns:
-
+            pd.DataFrame: A DataFrame with cross fold predictions
         """
-        return super().cross_fold_inference(
+        return super().cross_fold_inference()
 
 
 if __name__ == "__main__":
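With the docstring repairs above, the argument-free inference surface on Endpoint is auto_inference(), full_inference(), and cross_fold_inference(), alongside the existing inference() and fast_inference(). A hedged usage sketch (the endpoint name is illustrative, and a deployed endpoint plus AWS credentials are assumed):

```python
from workbench.api import Endpoint

end = Endpoint("aqsol-regression-end")  # illustrative name, not from this diff

test_preds = end.auto_inference()       # test split of the model's training view
all_preds = end.full_inference()        # every row of the training view
cv_preds = end.cross_fold_inference()   # cross-fold predictions from the model
print(test_preds.head())
```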