workbench 0.8.169__py3-none-any.whl → 0.8.171__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of workbench has been flagged as possibly problematic.

workbench/api/model.py CHANGED
@@ -40,6 +40,7 @@ class Model(ModelCore):
          mem_size: int = 2048,
          max_concurrency: int = 5,
          instance: str = "ml.t2.medium",
+         data_capture: bool = False,
      ) -> Endpoint:
          """Create an Endpoint from the Model.

@@ -50,6 +51,7 @@ class Model(ModelCore):
              mem_size (int): The memory size for the Endpoint in MB (default: 2048)
              max_concurrency (int): The maximum concurrency for the Endpoint (default: 5)
              instance (str): The instance type to use for Realtime(serverless=False) Endpoints (default: "ml.t2.medium")
+             data_capture (bool): Enable data capture for the Endpoint (default: False)

          Returns:
              Endpoint: The Endpoint created from the Model
@@ -73,6 +75,7 @@ class Model(ModelCore):
          model_to_endpoint.transform(
              mem_size=mem_size,
              max_concurrency=max_concurrency,
+             data_capture=data_capture,
          )

          # Set the Endpoint Owner and Return the Endpoint
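
The net effect of this change is a new data_capture flag on the endpoint-creation call. A minimal usage sketch, not taken from the diff; the method name and model name below are assumptions for illustration:

from workbench.api.model import Model

model = Model("my-model")                  # hypothetical model name
endpoint = model.to_endpoint(              # assumed method name; the diff only shows its signature
    instance="ml.t2.medium",               # realtime instance type (default shown above)
    data_capture=True,                     # new in 0.8.171; defaults to False
)
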
@@ -972,12 +972,23 @@ class EndpointCore(Artifact):
              cls.log.info(f"Deleting Monitoring Schedule {schedule['MonitoringScheduleName']}...")
              cls.sm_client.delete_monitoring_schedule(MonitoringScheduleName=schedule["MonitoringScheduleName"])

-         # Recursively delete all endpoint S3 artifacts (inference, data capture, monitoring, etc)
+         # Recursively delete all endpoint S3 artifacts (inference, etc)
+         # Note: We do not want to delete the data_capture/ files since these
+         #       might be used for collection and data drift analysis
          base_endpoint_path = f"{cls.endpoints_s3_path}/{endpoint_name}"
-         s3_objects = wr.s3.list_objects(base_endpoint_path, boto3_session=cls.boto3_session)
-         cls.log.info(f"Deleting S3 Objects at {base_endpoint_path}...")
-         cls.log.info(f"{s3_objects}")
-         wr.s3.delete_objects(s3_objects, boto3_session=cls.boto3_session)
+         all_s3_objects = wr.s3.list_objects(base_endpoint_path, boto3_session=cls.boto3_session)
+
+         # Filter out objects that contain 'data_capture/' in their path
+         s3_objects_to_delete = [obj for obj in all_s3_objects if "/data_capture/" not in obj]
+         cls.log.info(f"Found {len(all_s3_objects)} total objects at {base_endpoint_path}")
+         cls.log.info(f"Filtering out data_capture files, will delete {len(s3_objects_to_delete)} objects...")
+         cls.log.info(f"Objects to delete: {s3_objects_to_delete}")
+
+         if s3_objects_to_delete:
+             wr.s3.delete_objects(s3_objects_to_delete, boto3_session=cls.boto3_session)
+             cls.log.info(f"Successfully deleted {len(s3_objects_to_delete)} objects")
+         else:
+             cls.log.info("No objects to delete (only data_capture files found)")

          # Delete any dataframes that were stored in the Dataframe Cache
          cls.log.info("Deleting Dataframe Cache...")
@@ -186,11 +186,11 @@ class MonitorCore:

          # Log the data capture operation
          self.log.important(f"Enabling Data Capture for {self.endpoint_name} --> {self.data_capture_path}")
-         self.log.important("This normally redeploys the endpoint...")
+         self.log.important("This will redeploy the endpoint...")

          # Create and apply the data capture configuration
          data_capture_config = DataCaptureConfig(
-             enable_capture=True,  # Required parameter
+             enable_capture=True,
              sampling_percentage=capture_percentage,
              destination_s3_uri=self.data_capture_path,
          )
@@ -196,7 +196,9 @@ class AWSMeta:

          # Return the summary as a DataFrame
          df = pd.DataFrame(data_summary).convert_dtypes()
-         return df.sort_values(by="Created", ascending=False)
+         if not df.empty:
+             df.sort_values(by="Created", ascending=False, inplace=True)
+         return df

      def models(self, details: bool = False) -> pd.DataFrame:
          """Get a summary of the Models in AWS.
@@ -256,7 +258,9 @@ class AWSMeta:

          # Return the summary as a DataFrame
          df = pd.DataFrame(model_summary).convert_dtypes()
-         return df.sort_values(by="Created", ascending=False)
+         if not df.empty:
+             df.sort_values(by="Created", ascending=False, inplace=True)
+         return df

      def endpoints(self, details: bool = False) -> pd.DataFrame:
          """Get a summary of the Endpoints in AWS.
@@ -317,7 +321,9 @@ class AWSMeta:

          # Return the summary as a DataFrame
          df = pd.DataFrame(data_summary).convert_dtypes()
-         return df.sort_values(by="Created", ascending=False)
+         if not df.empty:
+             df.sort_values(by="Created", ascending=False, inplace=True)
+         return df

      def _endpoint_config_info(self, endpoint_config_name: str) -> dict:
          """Internal: Get the Endpoint Configuration information for the given endpoint config name.
@@ -657,7 +663,8 @@ class AWSMeta:
          df = pd.DataFrame(data_summary).convert_dtypes()

          # Sort by the Modified column
-         df = df.sort_values(by="Modified", ascending=False)
+         if not df.empty:
+             df = df.sort_values(by="Modified", ascending=False)
          return df

      def _aws_pipelines(self) -> pd.DataFrame:
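
These four changes guard against the same failure mode: an empty summary has no "Created" (or "Modified") column, so an unconditional sort raises. A minimal sketch of why the guard is needed, assuming only pandas:

import pandas as pd

df = pd.DataFrame([]).convert_dtypes()   # empty summary: no rows, no "Created" column
try:
    df.sort_values(by="Created", ascending=False)
except KeyError as err:
    print(f"sort_values fails on an empty summary: {err}")

# The patched pattern: only sort when there is something to sort
if not df.empty:
    df = df.sort_values(by="Created", ascending=False)
print(df)   # empty DataFrame is returned unsorted
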
@@ -5,6 +5,7 @@ from sagemaker import ModelPackage
  from sagemaker.serializers import CSVSerializer
  from sagemaker.deserializers import CSVDeserializer
  from sagemaker.serverless import ServerlessInferenceConfig
+ from sagemaker.model_monitor import DataCaptureConfig

  # Local Imports
  from workbench.core.transforms.transform import Transform, TransformInput, TransformOutput
@@ -51,27 +52,38 @@ class ModelToEndpoint(Transform):
          EndpointCore.managed_delete(self.output_name)

          # Get the Model Package ARN for our input model
-         input_model = ModelCore(self.input_name)
-         model_package_arn = input_model.model_package_arn()
+         workbench_model = ModelCore(self.input_name)

          # Deploy the model
-         self._deploy_model(model_package_arn, **kwargs)
+         self._deploy_model(workbench_model, **kwargs)

          # Add this endpoint to the set of registered endpoints for the model
-         input_model.register_endpoint(self.output_name)
+         workbench_model.register_endpoint(self.output_name)

          # This ensures that the endpoint is ready for use
          time.sleep(5)  # We wait for AWS Lag
          end = EndpointCore(self.output_name)
          self.log.important(f"Endpoint {end.name} is ready for use")

-     def _deploy_model(self, model_package_arn: str, mem_size: int = 2048, max_concurrency: int = 5):
+     def _deploy_model(
+         self,
+         workbench_model: ModelCore,
+         mem_size: int = 2048,
+         max_concurrency: int = 5,
+         data_capture: bool = False,
+         capture_percentage: int = 100,
+     ):
          """Internal Method: Deploy the Model

          Args:
-             model_package_arn(str): The Model Package ARN used to deploy the Endpoint
+             workbench_model(ModelCore): The Workbench ModelCore object to deploy
+             mem_size(int): Memory size for serverless deployment
+             max_concurrency(int): Max concurrency for serverless deployment
+             data_capture(bool): Enable data capture during deployment
+             capture_percentage(int): Percentage of data to capture. Defaults to 100.
          """
          # Grab the specified Model Package
+         model_package_arn = workbench_model.model_package_arn()
          model_package = ModelPackage(
              role=self.workbench_role_arn,
              model_package_arn=model_package_arn,
@@ -95,6 +107,23 @@ class ModelToEndpoint(Transform):
              max_concurrency=max_concurrency,
          )

+         # Configure data capture if requested (and not serverless)
+         data_capture_config = None
+         if data_capture and not self.serverless:
+             # Set up the S3 path for data capture
+             base_endpoint_path = f"{workbench_model.endpoints_s3_path}/{self.output_name}"
+             data_capture_path = f"{base_endpoint_path}/data_capture"
+             self.log.important(f"Configuring Data Capture --> {data_capture_path}")
+             data_capture_config = DataCaptureConfig(
+                 enable_capture=True,
+                 sampling_percentage=capture_percentage,
+                 destination_s3_uri=data_capture_path,
+             )
+         elif data_capture and self.serverless:
+             self.log.warning(
+                 "Data capture is not supported for serverless endpoints. Skipping data capture configuration."
+             )
+
          # Deploy the Endpoint
          self.log.important(f"Deploying the Endpoint {self.output_name}...")
          model_package.deploy(
@@ -104,6 +133,7 @@ class ModelToEndpoint(Transform):
              endpoint_name=self.output_name,
              serializer=CSVSerializer(),
              deserializer=CSVDeserializer(),
+             data_capture_config=data_capture_config,
              tags=aws_tags,
          )
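
For reference, the SageMaker pieces involved here: a DataCaptureConfig is only built for realtime deployments and is handed to ModelPackage.deploy() via data_capture_config; serverless deployments log a warning and pass None. A minimal sketch of the config object on its own (the S3 URI is illustrative):

from sagemaker.model_monitor import DataCaptureConfig

capture_config = DataCaptureConfig(
    enable_capture=True,
    sampling_percentage=100,   # matches the capture_percentage default above
    destination_s3_uri="s3://example-bucket/endpoints/my-endpoint/data_capture",
)
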
@@ -2,7 +2,6 @@
  from ngboost import NGBRegressor
  from xgboost import XGBRegressor  # Base Estimator
  from sklearn.model_selection import train_test_split
- import numpy as np

  # Model Performance Scores
  from sklearn.metrics import (
@@ -16,7 +15,9 @@ import json
  import argparse
  import joblib
  import os
+ import numpy as np
  import pandas as pd
+ from typing import List, Tuple

  # Local Imports
  from proximity import Proximity
@@ -25,11 +26,12 @@ from proximity import Proximity

  # Template Placeholders
  TEMPLATE_PARAMS = {
- "id_column": "id",
- "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
- "target": "solubility",
- "train_all_data": True,
- "track_columns": ['solubility']
+ "id_column": "udm_mol_bat_id",
+ "target": "udm_asy_res_intrinsic_clearance_ul_per_min_per_mg_protein",
+ "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v'],
+ "compressed_features": [],
+ "train_all_data": False,
+ "track_columns": ['udm_asy_res_intrinsic_clearance_ul_per_min_per_mg_protein']
  }

@@ -73,136 +75,97 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
      return df.rename(columns=rename_dict)


- def distance_weighted_calibrated_intervals(
-     df_pred: pd.DataFrame,
-     prox_df: pd.DataFrame,
-     calibration_strength: float = 0.7,
-     distance_decay: float = 3.0,
- ) -> pd.DataFrame:
+ def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
      """
-     Calibrate intervals using distance-weighted neighbor quantiles.
-     Uses all 10 neighbors with distance-based weighting.
+     Converts appropriate columns to categorical type with consistent mappings.
+
+     Args:
+         df (pd.DataFrame): The DataFrame to process.
+         features (list): List of feature names to consider for conversion.
+         category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+             training mode. If populated, we're in inference mode.
+
+     Returns:
+         tuple: (processed DataFrame, category mappings dictionary)
      """
-     id_column = TEMPLATE_PARAMS["id_column"]
-     target_column = TEMPLATE_PARAMS["target"]
-
-     # Distance-weighted neighbor statistics
-     def weighted_quantile(values, weights, q):
-         """Calculate weighted quantile"""
-         if len(values) == 0:
-             return np.nan
-         sorted_indices = np.argsort(values)
-         sorted_values = values[sorted_indices]
-         sorted_weights = weights[sorted_indices]
-         cumsum = np.cumsum(sorted_weights)
-         cutoff = q * cumsum[-1]
-         return np.interp(cutoff, cumsum, sorted_values)
-
-     # Calculate distance weights (closer neighbors get more weight)
-     prox_df = prox_df.copy()
-     prox_df['weight'] = 1 / (1 + prox_df['distance'] ** distance_decay)
-
-     # Get weighted quantiles and statistics for each ID
-     neighbor_stats = []
-     for id_val, group in prox_df.groupby(id_column):
-         values = group[target_column].values
-         weights = group['weight'].values
-
-         # Normalize weights
-         weights = weights / weights.sum()
-
-         stats = {
-             id_column: id_val,
-             'local_q025': weighted_quantile(values, weights, 0.025),
-             'local_q25': weighted_quantile(values, weights, 0.25),
-             'local_q75': weighted_quantile(values, weights, 0.75),
-             'local_q975': weighted_quantile(values, weights, 0.975),
-             'local_median': weighted_quantile(values, weights, 0.5),
-             'local_std': np.sqrt(np.average((values - np.average(values, weights=weights)) ** 2, weights=weights)),
-             'avg_distance': group['distance'].mean(),
-             'min_distance': group['distance'].min(),
-             'max_distance': group['distance'].max(),
-         }
-         neighbor_stats.append(stats)
-
-     neighbor_df = pd.DataFrame(neighbor_stats)
-     out = df_pred.merge(neighbor_df, on=id_column, how='left')
-
-     # Model disagreement score (normalized by prediction std)
-     model_disagreement = (out["prediction"] - out["prediction_uq"]).abs()
-     disagreement_score = (model_disagreement / out["prediction_std"]).clip(0, 2)
-
-     # Local confidence based on:
-     # 1. How close the neighbors are (closer = more confident)
-     # 2. How much local variance there is (less variance = more confident)
-     max_reasonable_distance = out['max_distance'].quantile(0.8)  # 80th percentile as reference
-     distance_confidence = (1 - (out['avg_distance'] / max_reasonable_distance)).clip(0.1, 1.0)
-
-     variance_confidence = (out["prediction_std"] / out["local_std"]).clip(0.5, 2.0)
-     local_confidence = distance_confidence * variance_confidence.clip(0.5, 1.5)
-
-     # Calibration weight: higher when models disagree and we have good local data
-     calibration_weight = (
-         calibration_strength *
-         local_confidence *  # Weight by local data quality
-         disagreement_score.clip(0.3, 1.0)  # More calibration when models disagree
-     )
+     # Training mode
+     if category_mappings == {}:
+         for col in df.select_dtypes(include=["object", "string"]):
+             if col in features and df[col].nunique() < 20:
+                 print(f"Training mode: Converting {col} to category")
+                 df[col] = df[col].astype("category")
+                 category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+
+     # Inference mode
+     else:
+         for col, categories in category_mappings.items():
+             if col in df.columns:
+                 print(f"Inference mode: Applying categorical mapping for {col}")
+                 df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping

-     # Consensus prediction (slight preference for NGBoost since it provides intervals)
-     consensus_pred = 0.65 * out["prediction_uq"] + 0.35 * out["prediction"]
+     return df, category_mappings

-     # Re-center local intervals around consensus prediction
-     local_center_offset = consensus_pred - out["local_median"]

-     # Apply calibration to each quantile
-     quantile_pairs = [
-         ("q_025", "local_q025"),
-         ("q_25", "local_q25"),
-         ("q_75", "local_q75"),
-         ("q_975", "local_q975")
-     ]
+ def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
+     """Prepare features for the XGBoost model
+
+     Args:
+         df (pd.DataFrame): The features DataFrame
+         features (List[str]): Full list of feature names
+         compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+     Returns:
+         pd.DataFrame: DataFrame with the decompressed features
+         List[str]: Updated list of feature names after decompression

-     for model_q, local_q in quantile_pairs:
-         # Adjust local quantiles to be centered around consensus
-         adjusted_local_q = out[local_q] + local_center_offset
+     Raises:
+         ValueError: If any missing values are found in the specified features
+     """

-         # Blend model and local intervals
-         out[model_q] = (
-             (1 - calibration_weight) * out[model_q] +
-             calibration_weight * adjusted_local_q
+     # Check for any missing values in the required features
+     missing_counts = df[features].isna().sum()
+     if missing_counts.any():
+         missing_features = missing_counts[missing_counts > 0]
+         print(
+             f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+             "WARNING: You might want to remove/replace all NaN values before processing."
          )

-     # Ensure proper interval ordering and bounds using pandas
-     out["q_025"] = pd.concat([out["q_025"], consensus_pred], axis=1).min(axis=1)
-     out["q_975"] = pd.concat([out["q_975"], consensus_pred], axis=1).max(axis=1)
-     out["q_25"] = pd.concat([out["q_25"], out["q_75"]], axis=1).min(axis=1)
+     # Decompress the specified compressed features
+     decompressed_features = features
+     for feature in compressed_features:
+         if (feature not in df.columns) or (feature not in features):
+             print(f"Feature '{feature}' not in the features list, skipping decompression.")
+             continue

-     # Optional: Add some interval expansion when neighbors are very far
-     # (indicates we're in a sparse region of feature space)
-     sparse_region_mask = out['min_distance'] > out['min_distance'].quantile(0.9)
-     expansion_factor = 1 + 0.2 * sparse_region_mask  # 20% expansion in sparse regions
+         # Remove the feature from the list of features to avoid duplication
+         decompressed_features.remove(feature)

-     for q in ["q_025", "q_25", "q_75", "q_975"]:
-         interval_width = out[q] - consensus_pred
-         out[q] = consensus_pred + interval_width * expansion_factor
+         # Handle all compressed features as bitstrings
+         bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+         prefix = feature[:3]

-     # Clean up temporary columns
-     cleanup_cols = [col for col in out.columns if col.startswith("local_")] + \
-         ['avg_distance', 'min_distance', 'max_distance']
+         # Create all new columns at once - avoids fragmentation
+         new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+         new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)

-     return out.drop(columns=cleanup_cols)
+         # Add to features list
+         decompressed_features.extend(new_col_names)
+
+         # Drop original column and concatenate new ones
+         df = df.drop(columns=[feature])
+         df = pd.concat([df, new_df], axis=1)
+
+     return df, decompressed_features


- # TRAINING SECTION
- #
- # This section (__main__) is where SageMaker will execute the training job
- # and save the model artifacts to the model directory.
- #
  if __name__ == "__main__":
      # Template Parameters
      id_column = TEMPLATE_PARAMS["id_column"]
-     features = TEMPLATE_PARAMS["features"]
      target = TEMPLATE_PARAMS["target"]
+     features = TEMPLATE_PARAMS["features"]
+     orig_features = features.copy()
+     compressed_features = TEMPLATE_PARAMS["compressed_features"]
      train_all_data = TEMPLATE_PARAMS["train_all_data"]
      track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
      validation_split = 0.2
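
An illustrative call (not part of the diff) showing what the new decompress_features() helper does, assuming the function above and its numpy/pandas imports are in scope: a bitstring column is expanded into one uint8 column per bit, and the new column names reuse the first three characters of the feature name. The column name and values below are made up.

import pandas as pd

df = pd.DataFrame({"fp_bits": ["1010", "0111"], "molwt": [180.2, 212.3]})
df, feature_list = decompress_features(df, features=["fp_bits", "molwt"], compressed_features=["fp_bits"])
print(feature_list)           # ['molwt', 'fp__0', 'fp__1', 'fp__2', 'fp__3']
print(df.columns.tolist())    # ['molwt', 'fp__0', 'fp__1', 'fp__2', 'fp__3']
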
@@ -216,34 +179,51 @@ if __name__ == "__main__":
      )
      args = parser.parse_args()

-     # Load training data from the specified directory
+     # Read the training data into DataFrames
      training_files = [
          os.path.join(args.train, file)
-         for file in os.listdir(args.train) if file.endswith(".csv")
+         for file in os.listdir(args.train)
+         if file.endswith(".csv")
      ]
      print(f"Training Files: {training_files}")

      # Combine files and read them all into a single pandas dataframe
-     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

-     # Check if the DataFrame is empty
-     check_dataframe(df, "training_df")
+     # Check if the dataframe is empty
+     check_dataframe(all_df, "training_df")

-     # Training data split logic
+     # Features/Target output
+     print(f"Target: {target}")
+     print(f"Features: {str(features)}")
+
+     # Convert any features that might be categorical to 'category' type
+     all_df, category_mappings = convert_categorical_types(all_df, features)
+
+     # If we have compressed features, decompress them
+     if compressed_features:
+         print(f"Decompressing features {compressed_features}...")
+         all_df, features = decompress_features(all_df, features, compressed_features)
+
+     # Do we want to train on all the data?
      if train_all_data:
-         # Use all data for both training and validation
-         print("Training on all data...")
-         df_train = df.copy()
-         df_val = df.copy()
-     elif "training" in df.columns:
-         # Split data based on a 'training' column if it exists
-         print("Splitting data based on 'training' column...")
-         df_train = df[df["training"]].copy()
-         df_val = df[~df["training"]].copy()
+         print("Training on ALL of the data")
+         df_train = all_df.copy()
+         df_val = all_df.copy()
+
+     # Does the dataframe have a training column?
+     elif "training" in all_df.columns:
+         print("Found training column, splitting data based on training column")
+         df_train = all_df[all_df["training"]]
+         df_val = all_df[~all_df["training"]]
      else:
-         # Perform a random split if no 'training' column is found
-         print("Splitting data randomly...")
-         df_train, df_val = train_test_split(df, test_size=validation_split, random_state=42)
+         # Just do a random training Split
+         print("WARNING: No training column found, splitting data with random state=42")
+         df_train, df_val = train_test_split(
+             all_df, test_size=validation_split, random_state=42
+         )
+     print(f"FIT/TRAIN: {df_train.shape}")
+     print(f"VALIDATION: {df_val.shape}")

      # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
      xgb_model = XGBRegressor()
@@ -251,18 +231,16 @@

      # Prepare features and targets for training
      X_train = df_train[features]
-     X_val = df_val[features]
+     X_validate = df_val[features]
      y_train = df_train[target]
-     y_val = df_val[target]
+     y_validate = df_val[target]

      # Train both models using the training data
      xgb_model.fit(X_train, y_train)
-     ngb_model.fit(X_train, y_train, X_val=X_val, Y_val=y_val)
+     ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)

      # Make Predictions on the Validation Set
      print(f"Making Predictions on Validation Set...")
-     y_validate = df_val[target]
-     X_validate = df_val[features]
      preds = xgb_model.predict(X_validate)

      # Calculate various model performance metrics (regression)
@@ -280,9 +258,9 @@
      # Save the trained NGBoost model
      joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))

-     # Save the feature list to validate input during predictions
+     # Save the features (this will validate input during predictions)
      with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-         json.dump(features, fp)
+         json.dump(orig_features, fp)  # We save the original features, not the decompressed ones

      # Now the Proximity model
      model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
@@ -295,7 +273,7 @@
  # Inference Section
  #
  def model_fn(model_dir) -> dict:
-     """Load and return XGBoost and NGBoost regressors from model directory."""
+     """Load and return XGBoost, NGBoost, and Prox Model from model directory."""

      # Load XGBoost regressor
      xgb_path = os.path.join(model_dir, "xgb_model.json")
@@ -376,18 +354,30 @@ def predict_fn(df, models) -> pd.DataFrame:
      df["prediction_std"] = dist_params['scale']  # standard deviation

      # Add 95% prediction intervals using ppf (percent point function)
-     df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
-     df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile
+     # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
+     # so we need to adjust the bounds to include the point prediction
+     df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
+     df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
+
+     # Add 90% prediction intervals
+     df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+     df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+     # Add 80% prediction intervals
+     df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+     df["q_90"] = y_dists.ppf(0.90)  # 90th percentile

      # Add 50% prediction intervals
-     df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
-     df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
+     df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
+     df["q_75"] = y_dists.ppf(0.75)  # 75th percentile

-     # Compute Nearest neighbors with Proximity model
-     prox_df = models["proximity"].neighbors(df)
+     # Reorder the quantile columns for easier reading
+     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+     other_cols = [col for col in df.columns if col not in quantile_cols]
+     df = df[other_cols + quantile_cols]

-     # Shrink prediction intervals based on KNN variance
-     df = distance_weighted_calibrated_intervals(df, prox_df)
+     # Compute Nearest neighbors with Proximity model
+     models["proximity"].neighbors(df)

      # Return the modified DataFrame
      return df
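
A tiny numeric illustration (values made up) of the hybrid-bound adjustment above: the NGBoost interval is widened, never narrowed, so it always contains the XGBoost point prediction.

import numpy as np
import pandas as pd

df = pd.DataFrame({"prediction": [3.2, 7.5]})           # XGBoost point predictions
ngb_q025 = np.array([3.5, 6.0])                          # NGBoost 2.5th percentiles
ngb_q975 = np.array([5.0, 9.0])                          # NGBoost 97.5th percentiles
df["q_025"] = np.minimum(ngb_q025, df["prediction"])     # first row: 3.2 pulls the lower bound down
df["q_975"] = np.maximum(ngb_q975, df["prediction"])     # upper bounds already cover both predictions
print(df)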