workbench 0.8.162__py3-none-any.whl → 0.8.220__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of workbench might be problematic.
- workbench/algorithms/dataframe/__init__.py +1 -2
- workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
- workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
- workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
- workbench/algorithms/dataframe/projection_2d.py +44 -21
- workbench/algorithms/dataframe/proximity.py +259 -305
- workbench/algorithms/graph/light/proximity_graph.py +14 -12
- workbench/algorithms/models/cleanlab_model.py +382 -0
- workbench/algorithms/models/noise_model.py +388 -0
- workbench/algorithms/sql/outliers.py +3 -3
- workbench/api/__init__.py +5 -1
- workbench/api/compound.py +1 -1
- workbench/api/df_store.py +17 -108
- workbench/api/endpoint.py +18 -5
- workbench/api/feature_set.py +121 -15
- workbench/api/meta.py +5 -2
- workbench/api/meta_model.py +289 -0
- workbench/api/model.py +55 -21
- workbench/api/monitor.py +1 -16
- workbench/api/parameter_store.py +3 -52
- workbench/cached/cached_model.py +4 -4
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +16 -8
- workbench/core/artifacts/data_capture_core.py +355 -0
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +382 -253
- workbench/core/artifacts/feature_set_core.py +249 -45
- workbench/core/artifacts/model_core.py +135 -80
- workbench/core/artifacts/monitor_core.py +33 -248
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
- workbench/core/cloud_platform/aws/aws_meta.py +12 -5
- workbench/core/cloud_platform/aws/aws_session.py +4 -4
- workbench/core/pipelines/pipeline_executor.py +1 -1
- workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
- workbench/core/transforms/features_to_model/features_to_model.py +62 -40
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +76 -15
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
- workbench/core/views/training_view.py +113 -42
- workbench/core/views/view.py +53 -3
- workbench/core/views/view_utils.py +4 -4
- workbench/model_script_utils/model_script_utils.py +339 -0
- workbench/model_script_utils/pytorch_utils.py +405 -0
- workbench/model_script_utils/uq_harness.py +278 -0
- workbench/model_scripts/chemprop/chemprop.template +649 -0
- workbench/model_scripts/chemprop/generated_model_script.py +649 -0
- workbench/model_scripts/chemprop/model_script_utils.py +339 -0
- workbench/model_scripts/chemprop/requirements.txt +3 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
- workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
- workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
- workbench/model_scripts/meta_model/generated_model_script.py +209 -0
- workbench/model_scripts/meta_model/meta_model.template +209 -0
- workbench/model_scripts/pytorch_model/generated_model_script.py +444 -500
- workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
- workbench/model_scripts/pytorch_model/pytorch.template +440 -496
- workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
- workbench/model_scripts/pytorch_model/requirements.txt +1 -1
- workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
- workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +20 -11
- workbench/model_scripts/uq_models/generated_model_script.py +248 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +372 -404
- workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
- workbench/model_scripts/xgb_model/uq_harness.py +278 -0
- workbench/model_scripts/xgb_model/xgb_model.template +369 -401
- workbench/repl/workbench_shell.py +28 -19
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/lambda_test.py +73 -0
- workbench/scripts/meta_model_sim.py +35 -0
- workbench/scripts/ml_pipeline_batch.py +137 -0
- workbench/scripts/ml_pipeline_sqs.py +186 -0
- workbench/scripts/monitor_cloud_watch.py +20 -100
- workbench/scripts/training_test.py +85 -0
- workbench/utils/aws_utils.py +4 -3
- workbench/utils/chem_utils/__init__.py +0 -0
- workbench/utils/chem_utils/fingerprints.py +175 -0
- workbench/utils/chem_utils/misc.py +194 -0
- workbench/utils/chem_utils/mol_descriptors.py +483 -0
- workbench/utils/chem_utils/mol_standardize.py +450 -0
- workbench/utils/chem_utils/mol_tagging.py +348 -0
- workbench/utils/chem_utils/projections.py +219 -0
- workbench/utils/chem_utils/salts.py +256 -0
- workbench/utils/chem_utils/sdf.py +292 -0
- workbench/utils/chem_utils/toxicity.py +250 -0
- workbench/utils/chem_utils/vis.py +253 -0
- workbench/utils/chemprop_utils.py +141 -0
- workbench/utils/cloudwatch_handler.py +1 -1
- workbench/utils/cloudwatch_utils.py +137 -0
- workbench/utils/config_manager.py +3 -7
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/meta_model_simulator.py +499 -0
- workbench/utils/metrics_utils.py +256 -0
- workbench/utils/model_utils.py +278 -79
- workbench/utils/monitor_utils.py +44 -62
- workbench/utils/pandas_utils.py +3 -3
- workbench/utils/pytorch_utils.py +87 -0
- workbench/utils/shap_utils.py +11 -57
- workbench/utils/workbench_logging.py +0 -3
- workbench/utils/workbench_sqs.py +1 -1
- workbench/utils/xgboost_local_crossfold.py +267 -0
- workbench/utils/xgboost_model_utils.py +127 -219
- workbench/web_interface/components/model_plot.py +14 -2
- workbench/web_interface/components/plugin_unit_test.py +5 -2
- workbench/web_interface/components/plugins/dashboard_status.py +3 -1
- workbench/web_interface/components/plugins/generated_compounds.py +1 -1
- workbench/web_interface/components/plugins/model_details.py +38 -74
- workbench/web_interface/components/plugins/scatter_plot.py +6 -10
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/METADATA +31 -9
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/RECORD +128 -96
- workbench-0.8.220.dist-info/entry_points.txt +11 -0
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +1 -1
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
- workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
- workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
- workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -273
- workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/utils/chem_utils.py +0 -1556
- workbench/utils/execution_environment.py +0 -211
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- workbench-0.8.162.dist-info/entry_points.txt +0 -5
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
workbench/scripts/ml_pipeline_sqs.py
ADDED
@@ -0,0 +1,186 @@
+import argparse
+import logging
+import json
+from pathlib import Path
+
+# Workbench Imports
+from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
+from workbench.utils.config_manager import ConfigManager
+from workbench.utils.s3_utils import upload_content_to_s3
+
+log = logging.getLogger("workbench")
+cm = ConfigManager()
+workbench_bucket = cm.get_config("WORKBENCH_BUCKET")
+
+
+def submit_to_sqs(
+    script_path: str,
+    size: str = "small",
+    realtime: bool = False,
+    dt: bool = False,
+    promote: bool = False,
+) -> None:
+    """
+    Upload script to S3 and submit message to SQS queue for processing.
+
+    Args:
+        script_path: Local path to the ML pipeline script
+        size: Job size tier - "small" (default), "medium", or "large"
+        realtime: If True, sets serverless=False for real-time processing (default: False)
+        dt: If True, sets DT=True in environment (default: False)
+        promote: If True, sets PROMOTE=True in environment (default: False)
+
+    Raises:
+        ValueError: If size is invalid or script file not found
+    """
+    print(f"\n{'=' * 60}")
+    print("🚀 SUBMITTING ML PIPELINE JOB")
+    print(f"{'=' * 60}")
+    if size not in ["small", "medium", "large"]:
+        raise ValueError(f"Invalid size '{size}'. Must be 'small', 'medium', or 'large'")
+
+    # Validate script exists
+    script_file = Path(script_path)
+    if not script_file.exists():
+        raise FileNotFoundError(f"Script not found: {script_path}")
+
+    print(f"📄 Script: {script_file.name}")
+    print(f"📏 Size tier: {size}")
+    print(f"⚡ Mode: {'Real-time' if realtime else 'Serverless'} (serverless={'False' if realtime else 'True'})")
+    print(f"🔄 DynamicTraining: {dt}")
+    print(f"🆕 Promote: {promote}")
+    print(f"🪣 Bucket: {workbench_bucket}")
+    sqs = AWSAccountClamp().boto3_session.client("sqs")
+    script_name = script_file.name
+
+    # List Workbench queues
+    print("\n📋 Listing Workbench SQS queues...")
+    try:
+        queues = sqs.list_queues(QueueNamePrefix="workbench-")
+        queue_urls = queues.get("QueueUrls", [])
+        if queue_urls:
+            print(f"✅ Found {len(queue_urls)} workbench queue(s):")
+            for url in queue_urls:
+                queue_name = url.split("/")[-1]
+                print(f"   • {queue_name}")
+        else:
+            print("⚠️ No workbench queues found")
+    except Exception as e:
+        print(f"❌ Error listing queues: {e}")
+
+    # Upload script to S3
+    s3_path = f"s3://{workbench_bucket}/batch-jobs/{script_name}"
+    print("\n📤 Uploading script to S3...")
+    print(f"   Source: {script_path}")
+    print(f"   Destination: {s3_path}")
+
+    try:
+        upload_content_to_s3(script_file.read_text(), s3_path)
+        print("✅ Script uploaded successfully")
+    except Exception as e:
+        print(f"❌ Upload failed: {e}")
+        raise
+
+    # Get queue URL and info
+    queue_name = "workbench-ml-pipeline-queue.fifo"
+    print("\n🎯 Getting queue information...")
+    print(f"   Queue name: {queue_name}")
+
+    try:
+        queue_url = sqs.get_queue_url(QueueName=queue_name)["QueueUrl"]
+        print(f"   Queue URL: {queue_url}")
+
+        # Get queue attributes for additional info
+        attrs = sqs.get_queue_attributes(
+            QueueUrl=queue_url, AttributeNames=["ApproximateNumberOfMessages", "ApproximateNumberOfMessagesNotVisible"]
+        )
+        messages_available = attrs["Attributes"].get("ApproximateNumberOfMessages", "0")
+        messages_in_flight = attrs["Attributes"].get("ApproximateNumberOfMessagesNotVisible", "0")
+        print(f"   Messages in queue: {messages_available}")
+        print(f"   Messages in flight: {messages_in_flight}")
+
+    except Exception as e:
+        print(f"❌ Error accessing queue: {e}")
+        raise
+
+    # Prepare message
+    message = {"script_path": s3_path, "size": size}
+
+    # Set environment variables
+    message["environment"] = {
+        "SERVERLESS": "False" if realtime else "True",
+        "DT": str(dt),
+        "PROMOTE": str(promote),
+    }
+
+    # Send the message to SQS
+    try:
+        print("\n📨 Sending message to SQS...")
+        response = sqs.send_message(
+            QueueUrl=queue_url,
+            MessageBody=json.dumps(message, indent=2),
+            MessageGroupId="ml-pipeline-jobs",  # Required for FIFO
+        )
+        message_id = response["MessageId"]
+        print("✅ Message sent successfully!")
+        print(f"   Message ID: {message_id}")
+    except Exception as e:
+        print(f"❌ Failed to send message: {e}")
+        raise
+
+    # Success summary
+    print(f"\n{'=' * 60}")
+    print("✅ JOB SUBMISSION COMPLETE")
+    print(f"{'=' * 60}")
+    print(f"📄 Script: {script_name}")
+    print(f"📏 Size: {size}")
+    print(f"⚡ Mode: {'Real-time' if realtime else 'Serverless'} (SERVERLESS={'False' if realtime else 'True'})")
+    print(f"🔄 DynamicTraining: {dt}")
+    print(f"🆕 Promote: {promote}")
+    print(f"🆔 Message ID: {message_id}")
+    print("\n🔍 MONITORING LOCATIONS:")
+    print(f"   • SQS Queue: AWS Console → SQS → {queue_name}")
+    print("   • Lambda Logs: AWS Console → Lambda → Functions")
+    print("   • Batch Jobs: AWS Console → Batch → Jobs")
+    print("   • CloudWatch: AWS Console → CloudWatch → Log groups")
+    print("\n⏳ Your job should start processing soon...")
+
+
+def main():
+    """CLI entry point for submitting ML pipelines via SQS."""
+    parser = argparse.ArgumentParser(description="Submit ML pipeline to SQS queue for Batch processing")
+    parser.add_argument("script_file", help="Local path to ML pipeline script")
+    parser.add_argument(
+        "--size", default="small", choices=["small", "medium", "large"], help="Job size tier (default: small)"
+    )
+    parser.add_argument(
+        "--realtime",
+        action="store_true",
+        help="Create realtime endpoints (default is serverless)",
+    )
+    parser.add_argument(
+        "--dt",
+        action="store_true",
+        help="Set DT=True (models and endpoints will have '-dt' suffix)",
+    )
+    parser.add_argument(
+        "--promote",
+        action="store_true",
+        help="Set Promote=True (models and endpoints will use promoted naming)",
+    )
+    args = parser.parse_args()
+    try:
+        submit_to_sqs(
+            args.script_file,
+            args.size,
+            realtime=args.realtime,
+            dt=args.dt,
+            promote=args.promote,
+        )
+    except Exception as e:
+        print(f"\n❌ ERROR: {e}")
+        log.error(f"Error: {e}")
+        exit(1)
+
+
+if __name__ == "__main__":
+    main()
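For reference, a minimal sketch of driving this new submission helper from Python instead of the CLI. The import path follows the workbench/scripts/ml_pipeline_sqs.py entry in the file list above (an inference, not confirmed by the diff viewer), and the script path and option values are placeholders:

from workbench.scripts.ml_pipeline_sqs import submit_to_sqs

# Upload a local pipeline script ("my_pipeline.py" is a placeholder) and queue it
# as a "medium" serverless Batch job with promoted naming requested
submit_to_sqs("my_pipeline.py", size="medium", realtime=False, promote=True)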
workbench/scripts/monitor_cloud_watch.py
CHANGED
@@ -4,8 +4,10 @@ import sys
 import time
 import argparse
 from datetime import datetime, timedelta, timezone
-
+
+# Workbench Imports
 from workbench.utils.repl_utils import cprint, Spinner
+from workbench.utils.cloudwatch_utils import get_cloudwatch_client, get_active_log_streams, stream_log_events
 
 # Define the log levels to include all log levels above the specified level
 log_level_map = {
@@ -33,64 +35,6 @@ def date_display(dt):
     return dt.strftime("%Y-%m-%d %I:%M%p") + "(UTC)"
 
 
-def get_cloudwatch_client():
-    """Get the CloudWatch Logs client using the Workbench assumed role session."""
-    session = AWSAccountClamp().boto3_session
-    return session.client("logs")
-
-
-def get_active_log_streams(client, log_group_name, start_time_ms, stream_filter=None):
-    """Retrieve log streams that have events after the specified start time."""
-
-    # Get all the streams in the log group
-    active_streams = []
-    stream_params = {
-        "logGroupName": log_group_name,
-        "orderBy": "LastEventTime",
-        "descending": True,
-    }
-
-    # Loop to retrieve all log streams (maximum 50 per call)
-    while True:
-        response = client.describe_log_streams(**stream_params)
-        log_streams = response.get("logStreams", [])
-
-        for log_stream in log_streams:
-            log_stream_name = log_stream["logStreamName"]
-            last_event_timestamp = log_stream.get("lastEventTimestamp")
-
-            # Include streams with events since the specified start time
-            # Note: There's some issue where the last event timestamp is 'off'
-            # so we're going to add 60 minutes from the last event timestamp
-            last_event_timestamp += 60 * 60 * 1000
-            if last_event_timestamp >= start_time_ms:
-                active_streams.append(log_stream_name)
-            else:
-                break  # Stop if we reach streams older than the start time
-
-        # Check if there are more streams to retrieve
-        if "nextToken" in response:
-            stream_params["nextToken"] = response["nextToken"]
-        else:
-            break
-
-    # Sort and report the active log streams
-    active_streams.sort()
-    if active_streams:
-        print("Active log streams:", len(active_streams))
-
-    # Filter the active streams by a substring if provided
-    if stream_filter and active_streams:
-        print(f"Filtering active log streams by '{stream_filter}'...")
-        active_streams = [stream for stream in active_streams if stream_filter in stream]
-
-    for stream in active_streams:
-        print(f"\t - {stream}")
-
-    # Return the active log streams
-    return active_streams
-
-
 def get_latest_log_events(client, log_group_name, start_time, end_time=None, stream_filter=None):
     """Retrieve the latest log events from the active/filtered log streams in a CloudWatch Logs group."""
 
@@ -99,11 +43,15 @@ def get_latest_log_events(client, log_group_name, start_time, end_time=None, stream_filter=None):
         get_latest_log_events.first_run = True
 
     log_events = []
-    start_time_ms = int(start_time.timestamp() * 1000)
+    start_time_ms = int(start_time.timestamp() * 1000)
+
+    # Use the util function to get active streams
+    active_streams = get_active_log_streams(log_group_name, start_time_ms, stream_filter, client)
 
-    # Get the active log streams with events since start_time
-    active_streams = get_active_log_streams(client, log_group_name, start_time_ms, stream_filter)
     if active_streams:
+        print(f"Active log streams: {len(active_streams)}")
+        for stream in active_streams:
+            print(f"\t - {stream}")
         print(f"Processing log events from {date_display(start_time)} on {len(active_streams)} active log streams...")
         get_latest_log_events.first_run = False
     else:
@@ -114,50 +62,22 @@ def get_latest_log_events(client, log_group_name, start_time, end_time=None, stream_filter=None):
            print("Monitoring for new events...")
         return log_events
 
-    #
+    # Use the util function to stream events from each log stream
     for log_stream_name in active_streams:
-        params = {
-            "logGroupName": log_group_name,
-            "logStreamName": log_stream_name,
-            "startTime": start_time_ms,  # Use start_time in milliseconds
-            "startFromHead": True,  # Start from the nearest event to start_time
-        }
-        next_event_token = None
-        if end_time is not None:
-            params["endTime"] = int(end_time.timestamp() * 1000)
-
-        # Process the log events from this log stream
         spinner = Spinner("lightpurple", f"Pulling events from {log_stream_name}:")
         spinner.start()
         log_stream_events = 0
 
-        #
-
-
-
-
+        # Stream events using the util function
+        for event in stream_log_events(
+            log_group_name, log_stream_name, start_time, end_time, follow=False, client=client
+        ):
+            log_stream_events += 1
+            log_events.append(event)
 
-
-
+        spinner.stop()
+        print(f"Processed {log_stream_events} events from {log_stream_name} (Total: {len(log_events)})")
 
-        events = events_response.get("events", [])
-        for event in events:
-            event["logStreamName"] = log_stream_name
-
-        # Add the log stream events to our list of all log events
-        log_stream_events += len(events)
-        log_events.extend(events)
-
-        # Handle pagination for log events
-        next_event_token = events_response.get("nextForwardToken")
-
-        # Break the loop if there are no more events to fetch
-        if not next_event_token or next_event_token == params.get("nextToken"):
-            spinner.stop()
-            print(f"Processed {log_stream_events} events from {log_stream_name} (Total: {len(log_events)})")
-            break
-
-    # Return the log events
     return log_events
 
 
@@ -206,6 +126,7 @@ def monitor_log_group(
     print(f"Monitoring log group: {log_group_name} from {date_display(start_time)}")
     print(f"Log levels: {log_levels}")
     print(f"Search terms: {search_terms}")
+
     while True:
         # Get the latest log events with stream filtering if provided
         all_log_events = get_latest_log_events(client, log_group_name, start_time, end_time, stream_filter)
@@ -218,7 +139,6 @@ def monitor_log_group(
 
             # Check the search terms
             if not search_terms or any(term in event["message"].lower() for term in search_terms):
-
                 # Calculate the start and end index for this match
                 start_index = max(i - before, 0)
                 end_index = min(i + after, len(all_log_events) - 1)
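A hedged sketch of consuming the new cloudwatch_utils helpers directly, with argument orders taken from the calls visible in the diff above. It assumes get_cloudwatch_client() still takes no arguments (as its removed local predecessor did) and uses a placeholder log group name:

from datetime import datetime, timedelta, timezone
from workbench.utils.cloudwatch_utils import get_cloudwatch_client, get_active_log_streams, stream_log_events

client = get_cloudwatch_client()
start_time = datetime.now(timezone.utc) - timedelta(hours=1)
start_time_ms = int(start_time.timestamp() * 1000)

group = "/aws/sagemaker/Endpoints/my-endpoint"  # placeholder log group name
# Find streams with recent events, then pull their events (no stream filter, no end time)
for stream_name in get_active_log_streams(group, start_time_ms, None, client):
    for event in stream_log_events(group, stream_name, start_time, None, follow=False, client=client):
        print(event.get("message", ""))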
workbench/scripts/training_test.py
ADDED
@@ -0,0 +1,85 @@
+"""
+Local test harness for SageMaker training scripts.
+
+Usage:
+    python training_test.py <model_script.py> <featureset_name>
+
+Example:
+    python training_test.py ../model_scripts/pytorch_model/generated_model_script.py caco2-class-features
+"""
+
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pandas as pd
+
+from workbench.api import FeatureSet
+
+
+def get_training_data(featureset_name: str) -> pd.DataFrame:
+    """Get training data from the FeatureSet."""
+    fs = FeatureSet(featureset_name)
+    return fs.pull_dataframe()
+
+
+def main():
+    if len(sys.argv) < 3:
+        print("Usage: python training_test.py <model_script.py> <featureset_name>")
+        sys.exit(1)
+
+    script_path = sys.argv[1]
+    featureset_name = sys.argv[2]
+
+    if not os.path.exists(script_path):
+        print(f"Error: Script not found: {script_path}")
+        sys.exit(1)
+
+    # Create temp directories
+    model_dir = tempfile.mkdtemp(prefix="training_model_")
+    train_dir = tempfile.mkdtemp(prefix="training_data_")
+    output_dir = tempfile.mkdtemp(prefix="training_output_")
+
+    print(f"Model dir: {model_dir}")
+    print(f"Train dir: {train_dir}")
+
+    try:
+        # Get training data and save to CSV
+        print(f"Loading FeatureSet: {featureset_name}")
+        df = get_training_data(featureset_name)
+        print(f"Data shape: {df.shape}")
+
+        train_file = os.path.join(train_dir, "training_data.csv")
+        df.to_csv(train_file, index=False)
+
+        # Set up environment
+        env = os.environ.copy()
+        env["SM_MODEL_DIR"] = model_dir
+        env["SM_CHANNEL_TRAIN"] = train_dir
+        env["SM_OUTPUT_DATA_DIR"] = output_dir
+
+        print("\n" + "=" * 60)
+        print("Starting training...")
+        print("=" * 60 + "\n")
+
+        # Run the script
+        cmd = [sys.executable, script_path, "--model-dir", model_dir, "--train", train_dir]
+        result = subprocess.run(cmd, env=env)
+
+        print("\n" + "=" * 60)
+        if result.returncode == 0:
+            print("Training completed successfully!")
+        else:
+            print(f"Training failed with return code: {result.returncode}")
+        print("=" * 60)
+
+    finally:
+        shutil.rmtree(model_dir, ignore_errors=True)
+        shutil.rmtree(train_dir, ignore_errors=True)
+        shutil.rmtree(output_dir, ignore_errors=True)
+
+
+if __name__ == "__main__":
+    main()
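For context, a hedged sketch of the minimal argument contract a model script needs in order to run under this harness: the harness passes --model-dir and --train, sets the SM_* environment variables, and writes a single training_data.csv into the training channel. This is an illustrative skeleton, not the generated template shipped with the package:

import argparse
import os
import pandas as pd

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    args = parser.parse_args()

    # The harness (and SageMaker) provides the training channel as a directory of CSVs
    df = pd.read_csv(os.path.join(args.train, "training_data.csv"))
    print(f"Loaded {len(df)} training rows; model artifacts would be written to {args.model_dir}")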
workbench/utils/aws_utils.py
CHANGED
@@ -55,7 +55,8 @@ def aws_throttle(func=None, retry_intervals=None):
     if func is None:
         return lambda f: aws_throttle(f, retry_intervals=retry_intervals)
 
-
+    # This is currently commented out (we might want to use it later)
+    # service_hold_time = 2  # Seconds to wait before calling AWS function
     default_intervals = [2**i for i in range(1, 9)]  # Default exponential backoff: 2, 4, 8... 256 seconds
     intervals = retry_intervals or default_intervals
 
@@ -64,8 +65,8 @@ def aws_throttle(func=None, retry_intervals=None):
         for attempt, delay in enumerate(intervals, start=1):
             try:
                 # Add sleep before calling AWS func if running as a service
-                if cm.running_as_service:
-
+                # if cm.running_as_service:
+                #     time.sleep(service_hold_time)
                 return func(*args, **kwargs)
             except ClientError as e:
                 if e.response["Error"]["Code"] == "ThrottlingException":
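A short sketch of how the aws_throttle decorator shown above is applied, in both its bare and parameterized forms (the boto3 calls are illustrative; Workbench normally obtains clients through its assumed-role session):

import boto3
from workbench.utils.aws_utils import aws_throttle

sm_client = boto3.client("sagemaker")  # illustrative client setup

@aws_throttle
def list_feature_groups():
    # Retries on ThrottlingException using the default exponential backoff (2, 4, 8, ... 256s)
    return sm_client.list_feature_groups()

@aws_throttle(retry_intervals=[1, 2, 4, 8])
def describe_endpoint(name):
    # Custom, shorter retry schedule for interactive use
    return sm_client.describe_endpoint(EndpointName=name)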
File without changes
@@ -0,0 +1,175 @@
+"""Molecular fingerprint computation utilities for ADMET modeling.
+
+This module provides Morgan count fingerprints, the standard for ADMET prediction.
+Count fingerprints outperform binary fingerprints for molecular property prediction.
+
+References:
+    - Count vs Binary: https://pubs.acs.org/doi/10.1021/acs.est.3c02198
+    - ECFP/Morgan: https://pubs.acs.org/doi/10.1021/ci100050t
+"""
+
+import logging
+
+import numpy as np
+import pandas as pd
+from rdkit import Chem, RDLogger
+from rdkit.Chem import AllChem
+from rdkit.Chem.MolStandardize import rdMolStandardize
+
+# Suppress RDKit warnings (e.g., "not removing hydrogen atom without neighbors")
+# Keep errors enabled so we see actual problems
+RDLogger.DisableLog("rdApp.warning")
+
+# Set up the logger
+log = logging.getLogger("workbench")
+
+
+def compute_morgan_fingerprints(df: pd.DataFrame, radius: int = 2, n_bits: int = 2048) -> pd.DataFrame:
+    """Compute Morgan count fingerprints for ADMET modeling.
+
+    Generates true count fingerprints where each bit position contains the
+    number of times that substructure appears in the molecule (clamped to 0-255).
+    This is the recommended approach for ADMET prediction per 2025 research.
+
+    Args:
+        df: Input DataFrame containing SMILES strings.
+        radius: Radius for the Morgan fingerprint (default 2 = ECFP4 equivalent).
+        n_bits: Number of bits for the fingerprint (default 2048).
+
+    Returns:
+        pd.DataFrame: Input DataFrame with 'fingerprint' column added.
+            Values are comma-separated uint8 counts.
+
+    Note:
+        Count fingerprints outperform binary for ADMET prediction.
+        See: https://pubs.acs.org/doi/10.1021/acs.est.3c02198
+    """
+    delete_mol_column = False
+
+    # Check for the SMILES column (case-insensitive)
+    smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
+    if smiles_column is None:
+        raise ValueError("Input DataFrame must have a 'smiles' column")
+
+    # Sanity check the molecule column (sometimes it gets serialized, which doesn't work)
+    if "molecule" in df.columns and df["molecule"].dtype == "string":
+        log.warning("Detected serialized molecules in 'molecule' column. Removing...")
+        del df["molecule"]
+
+    # Convert SMILES to RDKit molecule objects
+    if "molecule" not in df.columns:
+        log.info("Converting SMILES to RDKit Molecules...")
+        delete_mol_column = True
+        df["molecule"] = df[smiles_column].apply(Chem.MolFromSmiles)
+        # Make sure our molecules are not None
+        failed_smiles = df[df["molecule"].isnull()][smiles_column].tolist()
+        if failed_smiles:
+            log.warning(f"Failed to convert {len(failed_smiles)} SMILES to molecules ({failed_smiles})")
+        df = df.dropna(subset=["molecule"]).copy()
+
+    # If we have fragments in our compounds, get the largest fragment before computing fingerprints
+    largest_frags = df["molecule"].apply(
+        lambda mol: rdMolStandardize.LargestFragmentChooser().choose(mol) if mol else None
+    )
+
+    def mol_to_count_string(mol):
+        """Convert molecule to comma-separated count fingerprint string."""
+        if mol is None:
+            return pd.NA
+
+        # Get hashed Morgan fingerprint with counts
+        fp = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=n_bits)
+
+        # Initialize array and populate with counts (clamped to uint8 range)
+        counts = np.zeros(n_bits, dtype=np.uint8)
+        for idx, count in fp.GetNonzeroElements().items():
+            counts[idx] = min(count, 255)
+
+        # Return as comma-separated string
+        return ",".join(map(str, counts))
+
+    # Compute Morgan count fingerprints
+    fingerprints = largest_frags.apply(mol_to_count_string)
+
+    # Add the fingerprints to the DataFrame
+    df["fingerprint"] = fingerprints
+
+    # Drop the intermediate 'molecule' column if it was added
+    if delete_mol_column:
+        del df["molecule"]
+
+    return df
+
+
+if __name__ == "__main__":
+    print("Running Morgan count fingerprint tests...")
+
+    # Test molecules
+    test_molecules = {
+        "aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
+        "caffeine": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
+        "glucose": "C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O",  # With stereochemistry
+        "sodium_acetate": "CC(=O)[O-].[Na+]",  # Salt (largest fragment used)
+        "benzene": "c1ccccc1",
+        "butene_e": "C/C=C/C",  # E-butene
+        "butene_z": "C/C=C\\C",  # Z-butene
+    }
+
+    # Test 1: Morgan Count Fingerprints (default parameters)
+    print("\n1. Testing Morgan fingerprint generation (radius=2, n_bits=2048)...")
+
+    test_df = pd.DataFrame({"SMILES": list(test_molecules.values()), "name": list(test_molecules.keys())})
+    fp_df = compute_morgan_fingerprints(test_df.copy())
+
+    print("  Fingerprint generation results:")
+    for _, row in fp_df.iterrows():
+        fp = row.get("fingerprint", "N/A")
+        if pd.notna(fp):
+            counts = [int(x) for x in fp.split(",")]
+            non_zero = sum(1 for c in counts if c > 0)
+            max_count = max(counts)
+            print(f"    {row['name']:15} → {len(counts)} features, {non_zero} non-zero, max={max_count}")
+        else:
+            print(f"    {row['name']:15} → N/A")
+
+    # Test 2: Different parameters
+    print("\n2. Testing with different parameters (radius=3, n_bits=1024)...")
+
+    fp_df_custom = compute_morgan_fingerprints(test_df.copy(), radius=3, n_bits=1024)
+
+    for _, row in fp_df_custom.iterrows():
+        fp = row.get("fingerprint", "N/A")
+        if pd.notna(fp):
+            counts = [int(x) for x in fp.split(",")]
+            non_zero = sum(1 for c in counts if c > 0)
+            print(f"    {row['name']:15} → {len(counts)} features, {non_zero} non-zero")
+        else:
+            print(f"    {row['name']:15} → N/A")
+
+    # Test 3: Edge cases
+    print("\n3. Testing edge cases...")
+
+    # Invalid SMILES
+    invalid_df = pd.DataFrame({"SMILES": ["INVALID", ""]})
+    fp_invalid = compute_morgan_fingerprints(invalid_df.copy())
+    print(f"  ✓ Invalid SMILES handled: {len(fp_invalid)} rows returned")
+
+    # Test with pre-existing molecule column
+    mol_df = test_df.copy()
+    mol_df["molecule"] = mol_df["SMILES"].apply(Chem.MolFromSmiles)
+    fp_with_mol = compute_morgan_fingerprints(mol_df)
+    print(f"  ✓ Pre-existing molecule column handled: {len(fp_with_mol)} fingerprints generated")
+
+    # Test 4: Verify count values are reasonable
+    print("\n4. Verifying count distribution...")
+    all_counts = []
+    for _, row in fp_df.iterrows():
+        fp = row.get("fingerprint", "N/A")
+        if pd.notna(fp):
+            counts = [int(x) for x in fp.split(",")]
+            all_counts.extend([c for c in counts if c > 0])
+
+    if all_counts:
+        print(f"  Non-zero counts: min={min(all_counts)}, max={max(all_counts)}, mean={np.mean(all_counts):.2f}")
+
+    print("\n✅ All fingerprint tests completed!")