workbench-0.8.168-py3-none-any.whl → workbench-0.8.192-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. workbench/algorithms/dataframe/proximity.py +143 -102
  2. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  3. workbench/api/compound.py +1 -1
  4. workbench/api/endpoint.py +3 -2
  5. workbench/api/feature_set.py +4 -4
  6. workbench/api/model.py +16 -12
  7. workbench/api/monitor.py +1 -16
  8. workbench/core/artifacts/artifact.py +11 -3
  9. workbench/core/artifacts/data_capture_core.py +355 -0
  10. workbench/core/artifacts/endpoint_core.py +113 -27
  11. workbench/core/artifacts/feature_set_core.py +72 -13
  12. workbench/core/artifacts/model_core.py +50 -15
  13. workbench/core/artifacts/monitor_core.py +33 -249
  14. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  15. workbench/core/cloud_platform/aws/aws_meta.py +11 -4
  16. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  17. workbench/core/transforms/features_to_model/features_to_model.py +9 -4
  18. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  19. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  20. workbench/core/views/training_view.py +49 -53
  21. workbench/core/views/view.py +51 -1
  22. workbench/core/views/view_utils.py +4 -4
  23. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  24. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  25. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  26. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  27. workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
  28. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  29. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  30. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  31. workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
  32. workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
  33. workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
  34. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  35. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  36. workbench/model_scripts/pytorch_model/pytorch.template +9 -18
  37. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  38. workbench/model_scripts/script_generation.py +7 -2
  39. workbench/model_scripts/uq_models/mapie.template +492 -0
  40. workbench/model_scripts/uq_models/requirements.txt +1 -0
  41. workbench/model_scripts/xgb_model/xgb_model.template +31 -40
  42. workbench/repl/workbench_shell.py +4 -4
  43. workbench/scripts/lambda_launcher.py +63 -0
  44. workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +49 -51
  45. workbench/scripts/ml_pipeline_sqs.py +186 -0
  46. workbench/utils/chem_utils/__init__.py +0 -0
  47. workbench/utils/chem_utils/fingerprints.py +134 -0
  48. workbench/utils/chem_utils/misc.py +194 -0
  49. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  50. workbench/utils/chem_utils/mol_standardize.py +450 -0
  51. workbench/utils/chem_utils/mol_tagging.py +348 -0
  52. workbench/utils/chem_utils/projections.py +209 -0
  53. workbench/utils/chem_utils/salts.py +256 -0
  54. workbench/utils/chem_utils/sdf.py +292 -0
  55. workbench/utils/chem_utils/toxicity.py +250 -0
  56. workbench/utils/chem_utils/vis.py +253 -0
  57. workbench/utils/config_manager.py +2 -6
  58. workbench/utils/endpoint_utils.py +5 -7
  59. workbench/utils/license_manager.py +2 -6
  60. workbench/utils/model_utils.py +76 -30
  61. workbench/utils/monitor_utils.py +44 -62
  62. workbench/utils/pandas_utils.py +3 -3
  63. workbench/utils/shap_utils.py +10 -2
  64. workbench/utils/workbench_sqs.py +1 -1
  65. workbench/utils/xgboost_model_utils.py +283 -145
  66. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  67. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  68. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  69. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/METADATA +2 -1
  70. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/RECORD +74 -70
  71. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/entry_points.txt +3 -1
  72. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  73. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  74. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  75. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  76. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  77. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  78. workbench/model_scripts/pytorch_model/generated_model_script.py +0 -576
  79. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  80. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  81. workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
  82. workbench/model_scripts/xgb_model/generated_model_script.py +0 -477
  83. workbench/utils/chem_utils.py +0 -1556
  84. workbench/utils/fast_inference.py +0 -167
  85. workbench/utils/resource_utils.py +0 -39
  86. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/WHEEL +0 -0
  87. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/licenses/LICENSE +0 -0
  88. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/top_level.txt +0 -0
@@ -41,7 +41,7 @@ from workbench.cached.cached_meta import CachedMeta
  try:
      import rdkit  # noqa
      import mordred  # noqa
-     from workbench.utils import chem_utils
+     from workbench.utils.chem_utils import vis

      HAVE_CHEM_UTILS = True
  except ImportError:
@@ -178,12 +178,12 @@ class WorkbenchShell:

          # Add cheminformatics utils if available
          if HAVE_CHEM_UTILS:
-             self.commands["show"] = chem_utils.show
+             self.commands["show"] = vis.show

      def start(self):
          """Start the Workbench IPython shell"""
          cprint("magenta", "\nWelcome to Workbench!")
-         if self.aws_status is False:
+         if not self.aws_status:
              cprint("red", "AWS Account Connection Failed...Review/Fix the Workbench Config:")
              cprint("red", f"Path: {self.cm.site_config_path}")
              self.show_config()
@@ -560,7 +560,7 @@ class WorkbenchShell:
          from workbench.web_interface.components.plugin_unit_test import PluginUnitTest

          # Get kwargs
-         theme = kwargs.get("theme", "dark")
+         theme = kwargs.get("theme", "midnight_blue")

          plugin_test = PluginUnitTest(plugin_class, theme=theme, input_data=data, **kwargs)

@@ -0,0 +1,63 @@
+ import sys
+ import os
+ import json
+ import importlib.util
+
+
+ def main():
+     if len(sys.argv) != 2:
+         print("Usage: lambda_launcher <handler_module_name>")
+         print("\nOptional: testing/event.json with test event")
+         print("Optional: testing/env.json with environment variables")
+         sys.exit(1)
+
+     handler_file = sys.argv[1]
+
+     # Add .py if not present
+     if not handler_file.endswith(".py"):
+         handler_file += ".py"
+
+     # Check if file exists
+     if not os.path.exists(handler_file):
+         print(f"Error: File '{handler_file}' not found")
+         sys.exit(1)
+
+     # Load environment variables from env.json if it exists
+     if os.path.exists("testing/env.json"):
+         print("Loading environment variables from testing/env.json")
+         with open("testing/env.json") as f:
+             env_vars = json.load(f)
+         for key, value in env_vars.items():
+             os.environ[key] = value
+             print(f"  Set {key} = {value}")
+         print()
+
+     # Load event configuration
+     if os.path.exists("testing/event.json"):
+         print("Loading event from testing/event.json")
+         with open("testing/event.json") as f:
+             event = json.load(f)
+     else:
+         print("No testing/event.json found, using empty event")
+         event = {}
+
+     # Load the module dynamically
+     spec = importlib.util.spec_from_file_location("lambda_module", handler_file)
+     lambda_module = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(lambda_module)
+
+     # Call the lambda_handler
+     print(f"Invoking lambda_handler from {handler_file}...")
+     print("-" * 50)
+     print(f"Event: {json.dumps(event, indent=2)}")
+     print("-" * 50)
+
+     result = lambda_module.lambda_handler(event, {})
+
+     print("-" * 50)
+     print("Result:")
+     print(json.dumps(result, indent=2))
+
+
+ if __name__ == "__main__":
+     main()
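The launcher reads optional fixtures from a testing/ directory (testing/env.json and testing/event.json, as shown above). A minimal sketch of creating those fixtures locally — the key names, values, and handler name here are illustrative, not from the package:

    # Hypothetical fixtures for lambda_launcher; contents are examples only
    import json
    from pathlib import Path

    Path("testing").mkdir(exist_ok=True)
    # Injected into os.environ before the handler runs
    Path("testing/env.json").write_text(json.dumps({"WORKBENCH_BUCKET": "my-bucket"}))
    # Passed as the event argument to lambda_handler
    Path("testing/event.json").write_text(json.dumps({"action": "ping"}))
    # Then run: lambda_launcher my_handler   (my_handler.py must define lambda_handler)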
@@ -27,60 +27,56 @@ def get_batch_role_arn() -> str:
      return f"arn:aws:iam::{account_id}:role/Workbench-BatchRole"


- def ensure_job_definition():
-     """Register or update the Batch job definition for ML pipeline runner."""
-     batch = AWSAccountClamp().boto3_session.client("batch")
-     name = "workbench-ml-pipeline-runner"
-     response = batch.register_job_definition(
-         jobDefinitionName=name,
-         type="container",
-         platformCapabilities=["FARGATE"],
-         containerProperties={
-             "image": get_ecr_image_uri(),
-             "resourceRequirements": [{"type": "VCPU", "value": "2"}, {"type": "MEMORY", "value": "4096"}],
-             "jobRoleArn": get_batch_role_arn(),
-             "executionRoleArn": get_batch_role_arn(),
-             "environment": [
-                 {"name": "WORKBENCH_BUCKET", "value": workbench_bucket},
-                 {"name": "PYTHONUNBUFFERED", "value": "1"},
-             ],
-             # "networkConfiguration": {"assignPublicIp": "ENABLED"},  # Required for ECR Image Pull (when not in VPC)
-         },
-         timeout={"attemptDurationSeconds": 10800},  # 3 hours
-     )
-     log.info(f"Job definition ready: {name} (revision {response['revision']})")
-     return name
+ def _log_cloudwatch_link(job: dict, message_prefix: str = "View logs") -> None:
+     """
+     Helper to log a CloudWatch logs link with a clickable URL and full URL display.
+
+     Args:
+         job: Batch job description dictionary
+         message_prefix: Prefix for the log message (default: "View logs")
+     """
+     log_stream = job.get("container", {}).get("logStreamName")
+     logs_url = get_cloudwatch_logs_url(log_group="/aws/batch/job", log_stream=log_stream)
+     if logs_url:
+         clickable_url = f"\033]8;;{logs_url}\033\\{logs_url}\033]8;;\033\\"
+         log.info(f"{message_prefix}: {clickable_url}")
+     else:
+         log.info("Check AWS Batch console for logs")


- def run_batch_job(script_path: str) -> int:
+ def run_batch_job(script_path: str, size: str = "small") -> int:
      """
      Submit and monitor an AWS Batch job for ML pipeline execution.
-     This function:
-     1. Uploads the ML pipeline script to S3
-     2. Submits a Batch job to run the script in a container
-     3. Monitors job status until completion
-     4. Returns the job's exit code
+
+     Uploads script to S3, submits Batch job, monitors until completion or 2 minutes of RUNNING.

      Args:
          script_path: Local path to the ML pipeline script
+         size: Job size tier - "small" (default), "medium", or "large"
+             - small: 2 vCPU, 4GB RAM for lightweight processing
+             - medium: 4 vCPU, 8GB RAM for standard ML workloads
+             - large: 8 vCPU, 16GB RAM for heavy training/inference

      Returns:
-         Exit code from the batch job (0 for success, non-zero for failure)
+         Exit code (0 for success/disconnected, non-zero for failure)
      """
+     if size not in ["small", "medium", "large"]:
+         raise ValueError(f"Invalid size '{size}'. Must be 'small', 'medium', or 'large'")
+
      batch = AWSAccountClamp().boto3_session.client("batch")
      script_name = Path(script_path).stem

-     # Upload script to S3 for the container to download
+     # Upload script to S3
      s3_path = f"s3://{workbench_bucket}/batch-jobs/{Path(script_path).name}"
      log.info(f"Uploading script to {s3_path}")
      upload_content_to_s3(Path(script_path).read_text(), s3_path)

-     # Submit the Batch job
+     # Submit job
      job_name = f"workbench_{script_name}_{datetime.now():%Y%m%d_%H%M%S}"
      response = batch.submit_job(
          jobName=job_name,
          jobQueue="workbench-job-queue",
-         jobDefinition=ensure_job_definition(),
+         jobDefinition=f"workbench-batch-{size}",
          containerOverrides={
              "environment": [
                  {"name": "ML_PIPELINE_S3_PATH", "value": s3_path},
@@ -89,36 +85,38 @@ def run_batch_job(script_path: str) -> int:
          },
      )
      job_id = response["jobId"]
-     log.info(f"Submitted job: {job_name} ({job_id})")
+     log.info(f"Submitted job: {job_name} ({job_id}) using {size} tier")

-     # Monitor job execution
-     last_status = None
+     # Monitor job
+     last_status, running_start = None, None
      while True:
-         # Check job status
          job = batch.describe_jobs(jobs=[job_id])["jobs"][0]
          status = job["status"]
+
          if status != last_status:
              log.info(f"Job status: {status}")
              last_status = status
+             if status == "RUNNING":
+                 running_start = time.time()
+
+         # Disconnect after 2 minutes of running
+         if status == "RUNNING" and running_start and (time.time() - running_start >= 120):
+             log.info("✅ ML Pipeline is running successfully!")
+             _log_cloudwatch_link(job, "📊 Monitor logs")
+             return 0

-         # Check if job completed
+         # Handle completion
          if status in ["SUCCEEDED", "FAILED"]:
              exit_code = job.get("attempts", [{}])[-1].get("exitCode", 1)
-             if status == "FAILED":
-                 log.error(f"Job failed: {job.get('statusReason', 'Unknown reason')}")
-             else:
-                 log.info("Job completed successfully")
-
-             # Get CloudWatch logs URL
-             log_stream_name = job.get("container", {}).get("logStreamName")
-             logs_url = get_cloudwatch_logs_url(log_group="/aws/batch/job", log_stream=log_stream_name)
-             if logs_url:
-                 # OSC 8 hyperlink format for modern terminals
-                 clickable_url = f"\033]8;;{logs_url}\033\\{logs_url}\033]8;;\033\\"
-                 log.info(f"View logs: {clickable_url}")
+             msg = (
+                 "Job completed successfully"
+                 if status == "SUCCEEDED"
+                 else f"Job failed: {job.get('statusReason', 'Unknown')}"
+             )
+             log.info(msg) if status == "SUCCEEDED" else log.error(msg)
+             _log_cloudwatch_link(job)
              return exit_code

-         # Sleep a bit before next status check
          time.sleep(10)

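For context, a minimal usage sketch of the reworked runner. The import path is assumed from the renamed script ml_pipeline_batch.py; only the run_batch_job signature in the diff above is taken from the package:

    # Sketch only: module path assumed from workbench/scripts/ml_pipeline_batch.py
    from workbench.scripts.ml_pipeline_batch import run_batch_job

    # "medium" maps to the pre-registered workbench-batch-medium job definition (4 vCPU / 8GB)
    exit_code = run_batch_job("my_pipeline.py", size="medium")
    # 0 on SUCCEEDED, or once the job has been RUNNING for 2 minutes (monitor detaches)
    raise SystemExit(exit_code)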
@@ -0,0 +1,186 @@
+ import argparse
+ import logging
+ import json
+ from pathlib import Path
+
+ # Workbench Imports
+ from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
+ from workbench.utils.config_manager import ConfigManager
+ from workbench.utils.s3_utils import upload_content_to_s3
+
+ log = logging.getLogger("workbench")
+ cm = ConfigManager()
+ workbench_bucket = cm.get_config("WORKBENCH_BUCKET")
+
+
+ def submit_to_sqs(
+     script_path: str,
+     size: str = "small",
+     realtime: bool = False,
+     dt: bool = False,
+     promote: bool = False,
+ ) -> None:
+     """
+     Upload script to S3 and submit message to SQS queue for processing.
+
+     Args:
+         script_path: Local path to the ML pipeline script
+         size: Job size tier - "small" (default), "medium", or "large"
+         realtime: If True, sets serverless=False for real-time processing (default: False)
+         dt: If True, sets DT=True in environment (default: False)
+         promote: If True, sets PROMOTE=True in environment (default: False)
+
+     Raises:
+         ValueError: If size is invalid or script file not found
+     """
+     print(f"\n{'=' * 60}")
+     print("🚀 SUBMITTING ML PIPELINE JOB")
+     print(f"{'=' * 60}")
+     if size not in ["small", "medium", "large"]:
+         raise ValueError(f"Invalid size '{size}'. Must be 'small', 'medium', or 'large'")
+
+     # Validate script exists
+     script_file = Path(script_path)
+     if not script_file.exists():
+         raise FileNotFoundError(f"Script not found: {script_path}")
+
+     print(f"📄 Script: {script_file.name}")
+     print(f"📏 Size tier: {size}")
+     print(f"⚡ Mode: {'Real-time' if realtime else 'Serverless'} (serverless={'False' if realtime else 'True'})")
+     print(f"🔄 DynamicTraining: {dt}")
+     print(f"🆕 Promote: {promote}")
+     print(f"🪣 Bucket: {workbench_bucket}")
+     sqs = AWSAccountClamp().boto3_session.client("sqs")
+     script_name = script_file.name
+
+     # List Workbench queues
+     print("\n📋 Listing Workbench SQS queues...")
+     try:
+         queues = sqs.list_queues(QueueNamePrefix="workbench-")
+         queue_urls = queues.get("QueueUrls", [])
+         if queue_urls:
+             print(f"✅ Found {len(queue_urls)} workbench queue(s):")
+             for url in queue_urls:
+                 queue_name = url.split("/")[-1]
+                 print(f"   • {queue_name}")
+         else:
+             print("⚠️ No workbench queues found")
+     except Exception as e:
+         print(f"❌ Error listing queues: {e}")
+
+     # Upload script to S3
+     s3_path = f"s3://{workbench_bucket}/batch-jobs/{script_name}"
+     print("\n📤 Uploading script to S3...")
+     print(f"   Source: {script_path}")
+     print(f"   Destination: {s3_path}")
+
+     try:
+         upload_content_to_s3(script_file.read_text(), s3_path)
+         print("✅ Script uploaded successfully")
+     except Exception as e:
+         print(f"❌ Upload failed: {e}")
+         raise
+     # Get queue URL and info
+     queue_name = "workbench-ml-pipeline-queue.fifo"
+     print("\n🎯 Getting queue information...")
+     print(f"   Queue name: {queue_name}")
+
+     try:
+         queue_url = sqs.get_queue_url(QueueName=queue_name)["QueueUrl"]
+         print(f"   Queue URL: {queue_url}")
+
+         # Get queue attributes for additional info
+         attrs = sqs.get_queue_attributes(
+             QueueUrl=queue_url, AttributeNames=["ApproximateNumberOfMessages", "ApproximateNumberOfMessagesNotVisible"]
+         )
+         messages_available = attrs["Attributes"].get("ApproximateNumberOfMessages", "0")
+         messages_in_flight = attrs["Attributes"].get("ApproximateNumberOfMessagesNotVisible", "0")
+         print(f"   Messages in queue: {messages_available}")
+         print(f"   Messages in flight: {messages_in_flight}")
+
+     except Exception as e:
+         print(f"❌ Error accessing queue: {e}")
+         raise
+
+     # Prepare message
+     message = {"script_path": s3_path, "size": size}
+
+     # Set environment variables
+     message["environment"] = {
+         "SERVERLESS": "False" if realtime else "True",
+         "DT": str(dt),
+         "PROMOTE": str(promote),
+     }
+
+     # Send the message to SQS
+     try:
+         print("\n📨 Sending message to SQS...")
+         response = sqs.send_message(
+             QueueUrl=queue_url,
+             MessageBody=json.dumps(message, indent=2),
+             MessageGroupId="ml-pipeline-jobs",  # Required for FIFO
+         )
+         message_id = response["MessageId"]
+         print("✅ Message sent successfully!")
+         print(f"   Message ID: {message_id}")
+     except Exception as e:
+         print(f"❌ Failed to send message: {e}")
+         raise
+
+     # Success summary
+     print(f"\n{'=' * 60}")
+     print("✅ JOB SUBMISSION COMPLETE")
+     print(f"{'=' * 60}")
+     print(f"📄 Script: {script_name}")
+     print(f"📏 Size: {size}")
+     print(f"⚡ Mode: {'Real-time' if realtime else 'Serverless'} (SERVERLESS={'False' if realtime else 'True'})")
+     print(f"🔄 DynamicTraining: {dt}")
+     print(f"🆕 Promote: {promote}")
+     print(f"🆔 Message ID: {message_id}")
+     print("\n🔍 MONITORING LOCATIONS:")
+     print(f"   • SQS Queue: AWS Console → SQS → {queue_name}")
+     print("   • Lambda Logs: AWS Console → Lambda → Functions")
+     print("   • Batch Jobs: AWS Console → Batch → Jobs")
+     print("   • CloudWatch: AWS Console → CloudWatch → Log groups")
+     print("\n⏳ Your job should start processing soon...")
+
+
+ def main():
+     """CLI entry point for submitting ML pipelines via SQS."""
+     parser = argparse.ArgumentParser(description="Submit ML pipeline to SQS queue for Batch processing")
+     parser.add_argument("script_file", help="Local path to ML pipeline script")
+     parser.add_argument(
+         "--size", default="small", choices=["small", "medium", "large"], help="Job size tier (default: small)"
+     )
+     parser.add_argument(
+         "--realtime",
+         action="store_true",
+         help="Create realtime endpoints (default is serverless)",
+     )
+     parser.add_argument(
+         "--dt",
+         action="store_true",
+         help="Set DT=True (models and endpoints will have '-dt' suffix)",
+     )
+     parser.add_argument(
+         "--promote",
+         action="store_true",
+         help="Set Promote=True (models and endpoints will use promoted naming)",
+     )
+     args = parser.parse_args()
+     try:
+         submit_to_sqs(
+             args.script_file,
+             args.size,
+             realtime=args.realtime,
+             dt=args.dt,
+             promote=args.promote,
+         )
+     except Exception as e:
+         print(f"\n❌ ERROR: {e}")
+         log.error(f"Error: {e}")
+         exit(1)
+
+
+ if __name__ == "__main__":
+     main()
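A sketch of calling submit_to_sqs programmatically (equivalent to the CLI entry point); the script name is illustrative, and the enqueued message body follows the shape built in the function above:

    # Sketch: programmatic submission, same effect as `--size large` via the CLI
    from workbench.scripts.ml_pipeline_sqs import submit_to_sqs

    # Enqueues JSON to workbench-ml-pipeline-queue.fifo shaped like:
    #   {"script_path": "s3://<bucket>/batch-jobs/my_pipeline.py", "size": "large",
    #    "environment": {"SERVERLESS": "True", "DT": "False", "PROMOTE": "False"}}
    submit_to_sqs("my_pipeline.py", size="large")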
workbench/utils/chem_utils/__init__.py — new empty file (no content to display)
@@ -0,0 +1,134 @@
+ """Molecular fingerprint computation utilities"""
+
+ import logging
+ import pandas as pd
+
+ # Molecular Descriptor Imports
+ from rdkit import Chem
+ from rdkit.Chem import rdFingerprintGenerator
+ from rdkit.Chem.MolStandardize import rdMolStandardize
+
+ # Set up the logger
+ log = logging.getLogger("workbench")
+
+
+ def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=True) -> pd.DataFrame:
+     """Compute and add Morgan fingerprints to the DataFrame.
+
+     Args:
+         df (pd.DataFrame): Input DataFrame containing SMILES strings.
+         radius (int): Radius for the Morgan fingerprint.
+         n_bits (int): Number of bits for the fingerprint.
+         counts (bool): Count simulation for the fingerprint.
+
+     Returns:
+         pd.DataFrame: The input DataFrame with the Morgan fingerprints added as bit strings.
+
+     Note:
+         See: https://greglandrum.github.io/rdkit-blog/posts/2021-07-06-simulating-counts.html
+     """
+     delete_mol_column = False
+
+     # Check for the SMILES column (case-insensitive)
+     smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
+     if smiles_column is None:
+         raise ValueError("Input DataFrame must have a 'smiles' column")
+
+     # Sanity check the molecule column (sometimes it gets serialized, which doesn't work)
+     if "molecule" in df.columns and df["molecule"].dtype == "string":
+         log.warning("Detected serialized molecules in 'molecule' column. Removing...")
+         del df["molecule"]
+
+     # Convert SMILES to RDKit molecule objects (vectorized)
+     if "molecule" not in df.columns:
+         log.info("Converting SMILES to RDKit Molecules...")
+         delete_mol_column = True
+         df["molecule"] = df[smiles_column].apply(Chem.MolFromSmiles)
+         # Make sure our molecules are not None
+         failed_smiles = df[df["molecule"].isnull()][smiles_column].tolist()
+         if failed_smiles:
+             log.error(f"Failed to convert the following SMILES to molecules: {failed_smiles}")
+         df = df.dropna(subset=["molecule"])
+
+     # If we have fragments in our compounds, get the largest fragment before computing fingerprints
+     largest_frags = df["molecule"].apply(
+         lambda mol: rdMolStandardize.LargestFragmentChooser().choose(mol) if mol else None
+     )
+
+     # Create a Morgan fingerprint generator
+     if counts:
+         n_bits *= 4  # Multiply by 4 to simulate counts
+     morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits, countSimulation=counts)
+
+     # Compute Morgan fingerprints (vectorized)
+     fingerprints = largest_frags.apply(
+         lambda mol: (morgan_generator.GetFingerprint(mol).ToBitString() if mol else pd.NA)
+     )
+
+     # Add the fingerprints to the DataFrame
+     df["fingerprint"] = fingerprints
+
+     # Drop the intermediate 'molecule' column if it was added
+     if delete_mol_column:
+         del df["molecule"]
+     return df
+
+
+ if __name__ == "__main__":
+     print("Running molecular fingerprint tests...")
+     print("Note: This requires molecular_screening module to be available")
+
+     # Test molecules
+     test_molecules = {
+         "aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
+         "caffeine": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
+         "glucose": "C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O",  # With stereochemistry
+         "sodium_acetate": "CC(=O)[O-].[Na+]",  # Salt
+         "benzene": "c1ccccc1",
+         "butene_e": "C/C=C/C",  # E-butene
+         "butene_z": "C/C=C\\C",  # Z-butene
+     }
+
+     # Test 1: Morgan Fingerprints
+     print("\n1. Testing Morgan fingerprint generation...")
+
+     test_df = pd.DataFrame({"SMILES": list(test_molecules.values()), "name": list(test_molecules.keys())})
+
+     fp_df = compute_morgan_fingerprints(test_df.copy(), radius=2, n_bits=512, counts=False)
+
+     print("   Fingerprint generation results:")
+     for _, row in fp_df.iterrows():
+         fp = row.get("fingerprint", "N/A")
+         fp_len = len(fp) if fp != "N/A" else 0
+         print(f"   {row['name']:15} → {fp_len} bits")
+
+     # Test 2: Different fingerprint parameters
+     print("\n2. Testing different fingerprint parameters...")
+
+     # Test with counts enabled
+     fp_counts_df = compute_morgan_fingerprints(test_df.copy(), radius=3, n_bits=256, counts=True)
+
+     print("   With count simulation (256 bits * 4):")
+     for _, row in fp_counts_df.iterrows():
+         fp = row.get("fingerprint", "N/A")
+         fp_len = len(fp) if fp != "N/A" else 0
+         print(f"   {row['name']:15} → {fp_len} bits")
+
+     # Test 3: Edge cases
+     print("\n3. Testing edge cases...")
+
+     # Invalid SMILES
+     invalid_df = pd.DataFrame({"SMILES": ["INVALID", ""]})
+     try:
+         fp_invalid = compute_morgan_fingerprints(invalid_df.copy())
+         print(f"   ✓ Invalid SMILES handled: {len(fp_invalid)} valid molecules")
+     except Exception as e:
+         print(f"   ✓ Invalid SMILES properly raised error: {type(e).__name__}")
+
+     # Test with pre-existing molecule column
+     mol_df = test_df.copy()
+     mol_df["molecule"] = mol_df["SMILES"].apply(Chem.MolFromSmiles)
+     fp_with_mol = compute_morgan_fingerprints(mol_df)
+     print(f"   ✓ Pre-existing molecule column handled: {len(fp_with_mol)} fingerprints generated")
+
+     print("\n✅ All fingerprint tests completed!")