workbench-0.8.168-py3-none-any.whl → workbench-0.8.193-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. workbench/algorithms/dataframe/proximity.py +143 -102
  2. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  3. workbench/api/compound.py +1 -1
  4. workbench/api/endpoint.py +3 -2
  5. workbench/api/feature_set.py +4 -4
  6. workbench/api/model.py +16 -12
  7. workbench/api/monitor.py +1 -16
  8. workbench/core/artifacts/artifact.py +11 -3
  9. workbench/core/artifacts/data_capture_core.py +355 -0
  10. workbench/core/artifacts/endpoint_core.py +113 -27
  11. workbench/core/artifacts/feature_set_core.py +72 -13
  12. workbench/core/artifacts/model_core.py +71 -49
  13. workbench/core/artifacts/monitor_core.py +33 -249
  14. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  15. workbench/core/cloud_platform/aws/aws_meta.py +11 -4
  16. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  17. workbench/core/transforms/features_to_model/features_to_model.py +11 -6
  18. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  19. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  20. workbench/core/views/training_view.py +49 -53
  21. workbench/core/views/view.py +51 -1
  22. workbench/core/views/view_utils.py +4 -4
  23. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  24. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  25. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  26. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  27. workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
  28. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  29. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  30. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  31. workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
  32. workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
  33. workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
  34. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  35. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  36. workbench/model_scripts/pytorch_model/pytorch.template +9 -18
  37. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  38. workbench/model_scripts/script_generation.py +7 -2
  39. workbench/model_scripts/uq_models/mapie.template +492 -0
  40. workbench/model_scripts/uq_models/requirements.txt +1 -0
  41. workbench/model_scripts/xgb_model/generated_model_script.py +34 -43
  42. workbench/model_scripts/xgb_model/xgb_model.template +31 -40
  43. workbench/repl/workbench_shell.py +4 -4
  44. workbench/scripts/lambda_launcher.py +63 -0
  45. workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +49 -51
  46. workbench/scripts/ml_pipeline_sqs.py +186 -0
  47. workbench/utils/chem_utils/__init__.py +0 -0
  48. workbench/utils/chem_utils/fingerprints.py +134 -0
  49. workbench/utils/chem_utils/misc.py +194 -0
  50. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  51. workbench/utils/chem_utils/mol_standardize.py +450 -0
  52. workbench/utils/chem_utils/mol_tagging.py +348 -0
  53. workbench/utils/chem_utils/projections.py +209 -0
  54. workbench/utils/chem_utils/salts.py +256 -0
  55. workbench/utils/chem_utils/sdf.py +292 -0
  56. workbench/utils/chem_utils/toxicity.py +250 -0
  57. workbench/utils/chem_utils/vis.py +253 -0
  58. workbench/utils/config_manager.py +2 -6
  59. workbench/utils/endpoint_utils.py +5 -7
  60. workbench/utils/license_manager.py +2 -6
  61. workbench/utils/model_utils.py +89 -31
  62. workbench/utils/monitor_utils.py +44 -62
  63. workbench/utils/pandas_utils.py +3 -3
  64. workbench/utils/shap_utils.py +10 -2
  65. workbench/utils/workbench_sqs.py +1 -1
  66. workbench/utils/xgboost_model_utils.py +300 -151
  67. workbench/web_interface/components/model_plot.py +7 -1
  68. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  69. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  70. workbench/web_interface/components/plugins/model_details.py +7 -2
  71. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  72. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/METADATA +24 -2
  73. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/RECORD +77 -72
  74. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/entry_points.txt +3 -1
  75. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/licenses/LICENSE +1 -1
  76. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  77. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  78. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  79. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  80. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  81. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  82. workbench/model_scripts/pytorch_model/generated_model_script.py +0 -576
  83. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  84. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  85. workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
  86. workbench/utils/chem_utils.py +0 -1556
  87. workbench/utils/fast_inference.py +0 -167
  88. workbench/utils/resource_utils.py +0 -39
  89. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/WHEEL +0 -0
  90. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/top_level.txt +0 -0
@@ -2,10 +2,9 @@ import pandas as pd
  import numpy as np
  from sklearn.preprocessing import StandardScaler
  from sklearn.neighbors import NearestNeighbors
- from typing import List, Dict
+ from typing import List, Dict, Optional
  import logging
  import pickle
- import os
  import json
  from pathlib import Path
  from enum import Enum
@@ -14,7 +13,6 @@ from enum import Enum
  log = logging.getLogger("workbench")


- # ^Enumerated^ Proximity Types (distance or similarity)
  class ProximityType(Enum):
  DISTANCE = "distance"
  SIMILARITY = "similarity"
@@ -26,44 +24,49 @@ class Proximity:
  df: pd.DataFrame,
  id_column: str,
  features: List[str],
- target: str = None,
- track_columns: List[str] = None,
+ target: Optional[str] = None,
+ track_columns: Optional[List[str]] = None,
  n_neighbors: int = 10,
  ):
  """
  Initialize the Proximity class.

  Args:
- df (pd.DataFrame): DataFrame containing data for neighbor computations.
- id_column (str): Name of the column used as the identifier.
- features (List[str]): List of feature column names to be used for neighbor computations.
- target (str, optional): Name of the target column. Defaults to None.
- track_columns (List[str], optional): Additional columns to track in results. Defaults to None.
- n_neighbors (int): Number of neighbors to compute. Defaults to 10.
+ df: DataFrame containing data for neighbor computations.
+ id_column: Name of the column used as the identifier.
+ features: List of feature column names to be used for neighbor computations.
+ target: Name of the target column. Defaults to None.
+ track_columns: Additional columns to track in results. Defaults to None.
+ n_neighbors: Number of neighbors to compute. Defaults to 10.
  """
- self.df = df.dropna(subset=features).copy()
  self.id_column = id_column
- self.n_neighbors = min(n_neighbors, len(self.df) - 1)
  self.target = target
- self.features = features
+ self.track_columns = track_columns or []
+ self.proximity_type = None
  self.scaler = None
  self.X = None
  self.nn = None
- self.proximity_type = None
- self.track_columns = track_columns or []

- # Right now we only support numeric features, so remove any columns that are not numeric
- non_numeric_features = self.df[self.features].select_dtypes(exclude=["number"]).columns.tolist()
- if non_numeric_features:
- log.warning(f"Non-numeric features {non_numeric_features} aren't currently supported...")
- self.features = [f for f in self.features if f not in non_numeric_features]
+ # Filter out non-numeric features
+ self.features = self._validate_features(df, features)
+
+ # Drop NaN rows and set up DataFrame
+ self.df = df.dropna(subset=self.features).copy()
+ self.n_neighbors = min(n_neighbors, len(self.df) - 1)

  # Build the proximity model
  self.build_proximity_model()

+ def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+ """Remove non-numeric features and log warnings."""
+ non_numeric = df[features].select_dtypes(exclude=["number"]).columns.tolist()
+ if non_numeric:
+ log.warning(f"Non-numeric features {non_numeric} aren't currently supported...")
+ return [f for f in features if f not in non_numeric]
+ return features
+
  def build_proximity_model(self) -> None:
- """Standardize features and fit Nearest Neighbors model.
- Note: This method can be overridden in subclasses for custom behavior."""
+ """Standardize features and fit Nearest Neighbors model."""
  self.proximity_type = ProximityType.DISTANCE
  self.scaler = StandardScaler()
  self.X = self.scaler.fit_transform(self.df[self.features])
@@ -74,27 +77,60 @@ class Proximity:
  Compute nearest neighbors for all rows in the dataset.

  Returns:
- pd.DataFrame: A DataFrame of neighbors and their distances.
+ DataFrame of neighbors and their distances.
  """
  distances, indices = self.nn.kneighbors(self.X)
- results = []

- for i, (dists, nbrs) in enumerate(zip(distances, indices)):
- query_id = self.df.iloc[i][self.id_column]
-
- # Process neighbors
- for neighbor_idx, dist in zip(nbrs, dists):
- # Skip self (neighbor index == current row index)
- if neighbor_idx == i:
- continue
- results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
+ results = [
+ self._build_neighbor_result(
+ query_id=self.df.iloc[i][self.id_column], neighbor_idx=neighbor_idx, distance=dist
+ )
+ for i, (dists, nbrs) in enumerate(zip(distances, indices))
+ for neighbor_idx, dist in zip(nbrs, dists)
+ if neighbor_idx != i  # Skip self
+ ]

  return pd.DataFrame(results)

  def neighbors(
+ self,
+ id_or_ids,
+ n_neighbors: Optional[int] = 5,
+ radius: Optional[float] = None,
+ include_self: bool = True,
+ ) -> pd.DataFrame:
+ """
+ Return neighbors for ID(s) from the existing dataset.
+
+ Args:
+ id_or_ids: Single ID or list of IDs to look up
+ n_neighbors: Number of neighbors to return (default: 5)
+ radius: If provided, find all neighbors within this radius
+ include_self: Whether to include self in results (if present)
+
+ Returns:
+ DataFrame containing neighbors and distances
+ """
+ # Normalize to list
+ ids = [id_or_ids] if not isinstance(id_or_ids, list) else id_or_ids
+
+ # Validate IDs exist
+ missing_ids = set(ids) - set(self.df[self.id_column])
+ if missing_ids:
+ raise ValueError(f"IDs not found in dataset: {missing_ids}")
+
+ # Filter to requested IDs and preserve order
+ query_df = self.df[self.df[self.id_column].isin(ids)]
+ query_df = query_df.set_index(self.id_column).loc[ids].reset_index()
+
+ # Use the core implementation
+ return self.find_neighbors(query_df, n_neighbors=n_neighbors, radius=radius, include_self=include_self)
+
+ def find_neighbors(
  self,
  query_df: pd.DataFrame,
- radius: float = None,
+ n_neighbors: Optional[int] = 5,
+ radius: Optional[float] = None,
  include_self: bool = True,
  ) -> pd.DataFrame:
  """
@@ -102,63 +138,63 @@ class Proximity:

  Args:
  query_df: DataFrame containing query points
+ n_neighbors: Number of neighbors to return (default: 5)
  radius: If provided, find all neighbors within this radius
  include_self: Whether to include self in results (if present)

  Returns:
  DataFrame containing neighbors and distances
-
- Note: The query DataFrame must include the feature columns. The id_column is optional.
  """
- # Check if all required features are present
+ # Validate features
  missing = set(self.features) - set(query_df.columns)
  if missing:
  raise ValueError(f"Query DataFrame is missing required feature columns: {missing}")

- # Check if id_column is present
  id_column_present = self.id_column in query_df.columns

- # None of the features can be NaNs, so report rows with NaNs and then drop them
- rows_with_nan = query_df[self.features].isna().any(axis=1)
-
- # Print the ID column for rows with NaNs
- if rows_with_nan.any():
- log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
- log.warning(query_df.loc[rows_with_nan, self.id_column])
-
- # Drop rows with NaNs in feature columns and reassign to query_df
- query_df = query_df.dropna(subset=self.features)
+ # Handle NaN rows
+ query_df = self._handle_nan_rows(query_df, id_column_present)

- # Transform the query features using the model's scaler
+ # Transform query features
  X_query = self.scaler.transform(query_df[self.features])

- # Get neighbors using either radius or k-nearest neighbors
+ # Get neighbors
  if radius is not None:
  distances, indices = self.nn.radius_neighbors(X_query, radius=radius)
  else:
- distances, indices = self.nn.kneighbors(X_query)
+ distances, indices = self.nn.kneighbors(X_query, n_neighbors=n_neighbors)

  # Build results
- all_results = []
+ results = []
  for i, (dists, nbrs) in enumerate(zip(distances, indices)):
- # Use the ID from the query DataFrame if available, otherwise use the row index
  query_id = query_df.iloc[i][self.id_column] if id_column_present else f"query_{i}"

  for neighbor_idx, dist in zip(nbrs, dists):
- # Skip if the neighbor is the query itself and include_self is False
  neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
+
+ # Skip if neighbor is self and include_self is False
  if not include_self and neighbor_id == query_id:
  continue

- all_results.append(
- self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist)
- )
+ results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
+
+ results_df = pd.DataFrame(results).sort_values([self.id_column, "distance"]).reset_index(drop=True)
+ return results_df
+
+ def _handle_nan_rows(self, query_df: pd.DataFrame, id_column_present: bool) -> pd.DataFrame:
+ """Drop rows with NaN values in feature columns and log warnings."""
+ rows_with_nan = query_df[self.features].isna().any(axis=1)
+
+ if rows_with_nan.any():
+ log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
+ if id_column_present:
+ log.warning(query_df.loc[rows_with_nan, self.id_column])

- return pd.DataFrame(all_results)
+ return query_df.dropna(subset=self.features)

  def _build_neighbor_result(self, query_id, neighbor_idx: int, distance: float) -> Dict:
  """
- Internal: Build a result dictionary for a single neighbor.
+ Build a result dictionary for a single neighbor.

  Args:
  query_id: ID of the query point
@@ -169,27 +205,30 @@ class Proximity:
  Dictionary containing neighbor information
  """
  neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
+ neighbor_row = self.df.iloc[neighbor_idx]

- # Basic neighbor info
- neighbor_info = {
+ # Start with basic info
+ result = {
  self.id_column: query_id,
  "neighbor_id": neighbor_id,
  "distance": distance,
  }

- # Determine which additional columns to include
- relevant_cols = [self.target, "prediction"] if self.target else []
- relevant_cols += [c for c in self.df.columns if "_proba" in c or "residual" in c]
- relevant_cols += ["outlier"]
+ # Columns to automatically include if they exist
+ auto_include = (
+ ([self.target, "prediction"] if self.target else [])
+ + self.track_columns
+ + [col for col in self.df.columns if "_proba" in col or "residual" in col or col == "outlier"]
+ )

- # Add user-specified columns
- relevant_cols += self.track_columns
+ # Add values for existing columns
+ for col in auto_include:
+ if col in self.df.columns:
+ result[col] = neighbor_row[col]

- # Add values for each relevant column that exists in the dataframe
- for col in filter(lambda c: c in self.df.columns, relevant_cols):
- neighbor_info[col] = self.df.iloc[neighbor_idx][col]
-
- return neighbor_info
+ # Truncate very small distances to zero
+ result["distance"] = 0.0 if distance < 1e-7 else distance
+ return result

  def serialize(self, directory: str) -> None:
  """
@@ -198,8 +237,8 @@ class Proximity:
  Args:
  directory: Directory path to save the model components
  """
- # Create directory if it doesn't exist
- os.makedirs(directory, exist_ok=True)
+ dir_path = Path(directory)
+ dir_path.mkdir(parents=True, exist_ok=True)

  # Save metadata
  metadata = {
@@ -210,17 +249,16 @@
  "n_neighbors": self.n_neighbors,
  }

- with open(os.path.join(directory, "metadata.json"), "w") as f:
- json.dump(metadata, f)
+ (dir_path / "metadata.json").write_text(json.dumps(metadata))

- # Save the DataFrame
- self.df.to_pickle(os.path.join(directory, "df.pkl"))
+ # Save DataFrame
+ self.df.to_pickle(dir_path / "df.pkl")

- # Save the scaler and nearest neighbors model
- with open(os.path.join(directory, "scaler.pkl"), "wb") as f:
+ # Save models
+ with open(dir_path / "scaler.pkl", "wb") as f:
  pickle.dump(self.scaler, f)

- with open(os.path.join(directory, "nn_model.pkl"), "wb") as f:
+ with open(dir_path / "nn_model.pkl", "wb") as f:
  pickle.dump(self.nn, f)

  log.info(f"Proximity model serialized to {directory}")
@@ -234,23 +272,22 @@
  directory: Directory path containing the serialized model components

  Returns:
- Proximity: A new Proximity instance
+ A new Proximity instance
  """
- directory_path = Path(directory)
- if not directory_path.exists() or not directory_path.is_dir():
+ dir_path = Path(directory)
+ if not dir_path.is_dir():
  raise ValueError(f"Directory {directory} does not exist or is not a directory")

  # Load metadata
- with open(os.path.join(directory, "metadata.json"), "r") as f:
- metadata = json.load(f)
+ metadata = json.loads((dir_path / "metadata.json").read_text())

  # Load DataFrame
- df_path = os.path.join(directory, "df.pkl")
- if not os.path.exists(df_path):
+ df_path = dir_path / "df.pkl"
+ if not df_path.exists():
  raise FileNotFoundError(f"DataFrame file not found at {df_path}")
  df = pd.read_pickle(df_path)

- # Create instance but skip _prepare_data
+ # Create instance without calling __init__
  instance = cls.__new__(cls)
  instance.df = df
  instance.id_column = metadata["id_column"]
@@ -259,15 +296,16 @@
  instance.track_columns = metadata["track_columns"]
  instance.n_neighbors = metadata["n_neighbors"]

- # Load scaler and nn model
- with open(os.path.join(directory, "scaler.pkl"), "rb") as f:
+ # Load models
+ with open(dir_path / "scaler.pkl", "rb") as f:
  instance.scaler = pickle.load(f)

- with open(os.path.join(directory, "nn_model.pkl"), "rb") as f:
+ with open(dir_path / "nn_model.pkl", "rb") as f:
  instance.nn = pickle.load(f)

- # Load X from scaler transform
+ # Restore X
  instance.X = instance.scaler.transform(instance.df[instance.features])
+ instance.proximity_type = ProximityType.DISTANCE

  log.info(f"Proximity model deserialized from {directory}")
  return instance
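The serialization changes above move everything to pathlib: metadata.json, df.pkl, scaler.pkl, and nn_model.pkl all land in one directory, and the classmethod that rebuilds the instance re-derives X by re-applying the saved scaler to the saved DataFrame. A minimal round-trip sketch, assuming the rebuilding classmethod is named `deserialize` (its `def` line falls outside the hunks shown), the import path from entry 1 of the file list, and illustrative data plus a hypothetical output path:

```python
import pandas as pd

# Import path assumed from the file list above
from workbench.algorithms.dataframe.proximity import Proximity

# Illustrative data (not from the package)
df = pd.DataFrame({"ID": [1, 2, 3, 4], "f1": [0.1, 0.2, 0.4, 0.8], "f2": [1.0, 0.9, 0.7, 0.3]})
prox = Proximity(df, id_column="ID", features=["f1", "f2"], n_neighbors=2)

# serialize() creates the directory and writes metadata.json, df.pkl, scaler.pkl, nn_model.pkl
prox.serialize("/tmp/proximity_model")

# Rebuild without re-fitting; the restored instance recomputes X from the saved scaler + DataFrame
restored = Proximity.deserialize("/tmp/proximity_model")
print(restored.all_neighbors())
```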
@@ -294,10 +332,10 @@ if __name__ == "__main__":
  print(prox.all_neighbors())

  # Test the neighbors method
- print(prox.neighbors(query_df=df.iloc[[0]]))
+ print(prox.neighbors(1))

  # Test the neighbors method with radius
- print(prox.neighbors(query_df=df.iloc[0:2], radius=2.0))
+ print(prox.neighbors(1, radius=2.0))

  # Test with data that isn't in the 'train' dataframe
  query_data = {
@@ -307,7 +345,7 @@ if __name__ == "__main__":
  "Feature3": [2.31],
  }
  query_df = pd.DataFrame(query_data)
- print(prox.neighbors(query_df=query_df))
+ print(prox.find_neighbors(query_df=query_df))  # For new data we use find_neighbors()

  # Test with Features list
  prox = Proximity(df, id_column="ID", features=["Feature1"], n_neighbors=2)
@@ -334,13 +372,13 @@ if __name__ == "__main__":
  print(prox.all_neighbors())

  # Test the neighbors method
- print(prox.neighbors(query_df=df.iloc[0:2]))
+ print(prox.neighbors(["a", "b"]))

  # Time neighbors with all IDs versus calling all_neighbors
  import time

  start_time = time.time()
- prox_df = prox.neighbors(query_df=df, include_self=False)
+ prox_df = prox.find_neighbors(query_df=df, include_self=False)
  end_time = time.time()
  print(f"Time taken for neighbors: {end_time - start_time:.4f} seconds")
  start_time = time.time()
@@ -361,7 +399,7 @@ if __name__ == "__main__":

  # Test querying without the id_column
  df_no_id = df.drop(columns=["foo_id"])
- print(prox.neighbors(query_df=df_no_id, include_self=False))
+ print(prox.find_neighbors(query_df=df_no_id, include_self=False))

  # Test duplicate IDs
  data = {
@@ -379,6 +417,9 @@

  fs = FeatureSet("abalone_features")
  model = Model("abalone-regression")
+ features = model.features()
  df = fs.pull_dataframe()
- prox = Proximity(df, id_column=fs.id_column, features=model.features(), target=model.target())
- print(prox.neighbors(query_df=df[0:2]))
+ prox = Proximity(
+ df, id_column=fs.id_column, features=model.features(), target=model.target(), track_columns=features
+ )
+ print(prox.find_neighbors(query_df=df[0:2]))
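Taken together, the proximity.py changes split neighbor lookups into two entry points: `neighbors()` now takes an ID (or list of IDs) already present in the fitted dataset, while `find_neighbors()` handles arbitrary query DataFrames (the old `neighbors(query_df=...)` call). A small sketch of the new calling convention, assuming the import path from the file list above and made-up data:

```python
import pandas as pd

from workbench.algorithms.dataframe.proximity import Proximity  # path assumed from the file list

# Illustrative data; six rows so default neighbor counts stay valid
df = pd.DataFrame(
    {
        "ID": ["a", "b", "c", "d", "e", "f"],
        "Feature1": [0.1, 0.3, 0.35, 0.9, 0.5, 0.7],
        "Feature2": [1.2, 1.0, 0.95, 0.2, 0.6, 0.4],
    }
)
prox = Proximity(df, id_column="ID", features=["Feature1", "Feature2"], n_neighbors=3)

# Rows already in the fitted dataset: look them up by ID (single ID or a list)
print(prox.neighbors("a", n_neighbors=2))
print(prox.neighbors(["a", "c"], radius=2.0))

# Rows NOT in the fitted dataset: pass a query DataFrame instead
query_df = pd.DataFrame({"Feature1": [0.33], "Feature2": [0.97]})
print(prox.find_neighbors(query_df, n_neighbors=3))
```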
@@ -14,7 +14,7 @@ import pandas as pd
  TEMPLATE_PARAMS = {
  "features": "{{feature_list}}",
  "target": "{{target_column}}",
- "train_all_data": "{{train_all_data}}"
+ "train_all_data": "{{train_all_data}}",
  }

@@ -37,7 +37,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
-
+
  Raises ValueError if any model features cannot be matched.
  """
  df_columns_lower = {col.lower(): col for col in df.columns}
@@ -81,10 +81,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

  # Check if the DataFrame is empty
@@ -109,8 +106,10 @@ if __name__ == "__main__":
  # Create and train the Regression/Confidence model
  # model = BayesianRidge()
  model = BayesianRidge(
- alpha_1=1e-6, alpha_2=1e-6,  # Noise precision
- lambda_1=1e-6, lambda_2=1e-6,  # Weight precision
+ alpha_1=1e-6,
+ alpha_2=1e-6,  # Noise precision
+ lambda_1=1e-6,
+ lambda_2=1e-6,  # Weight precision
  fit_intercept=True,
  )

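For context on the BayesianRidge block reformatted above: alpha_1/alpha_2 set the Gamma prior on the noise precision and lambda_1/lambda_2 the prior on the weight precision, and the fitted estimator can return a per-row predictive standard deviation, which is what makes it usable as a regression-plus-confidence model. A standalone sketch on synthetic data (values and shapes are illustrative, not from the template):

```python
import numpy as np
from sklearn.linear_model import BayesianRidge

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = X @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.3, size=200)

model = BayesianRidge(
    alpha_1=1e-6,
    alpha_2=1e-6,  # Noise precision prior
    lambda_1=1e-6,
    lambda_2=1e-6,  # Weight precision prior
    fit_intercept=True,
)
model.fit(X, y)

# return_std=True yields the predictive standard deviation alongside the point prediction
y_pred, y_std = model.predict(X[:5], return_std=True)
print(y_pred, y_std)
```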
@@ -4,11 +4,7 @@ import awswrangler as wr
  import numpy as np

  # Model Performance Scores
- from sklearn.metrics import (
- mean_absolute_error,
- r2_score,
- root_mean_squared_error
- )
+ from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
  from sklearn.model_selection import KFold
  from scipy.optimize import minimize

@@ -23,7 +19,7 @@ TEMPLATE_PARAMS = {
  "features": "{{feature_list}}",
  "target": "{{target_column}}",
  "train_all_data": "{{train_all_data}}",
- "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+ "model_metrics_s3_path": "{{model_metrics_s3_path}}",
  }

@@ -47,7 +43,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
-
+
  Raises ValueError if any model features cannot be matched.
  """
  df_columns_lower = {col.lower(): col for col in df.columns}
@@ -90,10 +86,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

  # Check if the DataFrame is empty
@@ -172,16 +165,14 @@ if __name__ == "__main__":
  cv_residuals = np.array(cv_residuals)
  cv_uncertainties = np.array(cv_uncertainties)

-
  # Optimize calibration parameters: σ_cal = a * σ_uc + b
  def neg_log_likelihood(params):
  a, b = params
  sigma_cal = a * cv_uncertainties + b
  sigma_cal = np.maximum(sigma_cal, 1e-8)  # Prevent division by zero
- return np.sum(0.5 * np.log(2 * np.pi * sigma_cal ** 2) + 0.5 * (cv_residuals ** 2) / (sigma_cal ** 2))
+ return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * (cv_residuals**2) / (sigma_cal**2))

-
- result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method='Nelder-Mead')
+ result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
  cal_a, cal_b = result.x

  print(f"Calibration parameters: a={cal_a:.4f}, b={cal_b:.4f}")
@@ -205,7 +196,9 @@ if __name__ == "__main__":
  result_df["prediction"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].mean(axis=1)

  # Compute uncalibrated uncertainty
- result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(axis=1)
+ result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(
+ axis=1
+ )

  # Apply calibration to uncertainty
  result_df["prediction_std"] = cal_a * result_df["prediction_std_uc"] + cal_b
@@ -352,4 +345,4 @@ def predict_fn(df, models) -> pd.DataFrame:
  df = df.reindex(sorted(df.columns), axis=1)

  # All done, return the DataFrame
- return df
+ return df
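The calibration step in the hunks above fits sigma_cal = a * sigma_uc + b by minimizing the Gaussian negative log-likelihood of the cross-validated residuals, then applies the same affine map at inference time (prediction_std = cal_a * prediction_std_uc + cal_b). A self-contained sketch of that optimization with made-up residuals and uncertainties (illustrative values only):

```python
import numpy as np
from scipy.optimize import minimize

# Hypothetical cross-validation outputs: residuals and uncalibrated per-row uncertainties
cv_residuals = np.array([0.5, -1.2, 0.3, 0.8, -0.4, 1.1])
cv_uncertainties = np.array([0.4, 0.9, 0.35, 0.7, 0.5, 0.8])


def neg_log_likelihood(params):
    a, b = params
    sigma_cal = np.maximum(a * cv_uncertainties + b, 1e-8)  # Prevent division by zero
    return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * (cv_residuals**2) / (sigma_cal**2))


result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
cal_a, cal_b = result.x
print(f"Calibration parameters: a={cal_a:.4f}, b={cal_b:.4f}")

# At inference time the calibrated uncertainty would be: sigma = cal_a * sigma_uc + cal_b
```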
@@ -9,7 +9,7 @@ from sklearn.model_selection import train_test_split
  TEMPLATE_PARAMS = {
  "features": "{{feature_list}}",
  "target": "{{target_column}}",
- "train_all_data": "{{train_all_data}}"
+ "train_all_data": "{{train_all_data}}",
  }

  from io import StringIO
@@ -33,7 +33,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
-
+
  Raises ValueError if any model features cannot be matched.
  """
  df_columns_lower = {col.lower(): col for col in df.columns}
@@ -46,7 +46,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  rename_dict[df_columns_lower[feature.lower()]] = feature
  else:
  missing.append(feature)
-
+
  if missing:
  raise ValueError(f"Features not found: {missing}")

@@ -76,10 +76,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

  # Check if the DataFrame is empty
@@ -112,10 +109,7 @@ if __name__ == "__main__":
  )

  # Create a Pipeline with StandardScaler
- model = Pipeline([
- ("scaler", StandardScaler()),
- ("model", model)
- ])
+ model = Pipeline([("scaler", StandardScaler()), ("model", model)])

  # Prepare features and targets for training
  X_train = df_train[features]
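The one-line Pipeline in the final hunk is the standard scikit-learn pattern for keeping scaling inside the estimator, so the StandardScaler is fit only on the training split and applied consistently at predict time. A minimal sketch with a stand-in regressor (the hunk does not show which estimator this template actually wraps):

```python
import numpy as np
from sklearn.linear_model import Ridge  # stand-in; not necessarily the template's estimator
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Illustrative data
rng = np.random.default_rng(0)
X = rng.normal(size=(120, 2))
y = 2.0 * X[:, 0] - 0.5 * X[:, 1] + rng.normal(scale=0.1, size=120)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# The scaler is fit on X_train only when the whole pipeline is fit (no test-set leakage)
model = Pipeline([("scaler", StandardScaler()), ("model", Ridge())])
model.fit(X_train, y_train)
print(model.score(X_test, y_test))
```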