workbench-0.8.168-py3-none-any.whl → workbench-0.8.192-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. workbench/algorithms/dataframe/proximity.py +143 -102
  2. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  3. workbench/api/compound.py +1 -1
  4. workbench/api/endpoint.py +3 -2
  5. workbench/api/feature_set.py +4 -4
  6. workbench/api/model.py +16 -12
  7. workbench/api/monitor.py +1 -16
  8. workbench/core/artifacts/artifact.py +11 -3
  9. workbench/core/artifacts/data_capture_core.py +355 -0
  10. workbench/core/artifacts/endpoint_core.py +113 -27
  11. workbench/core/artifacts/feature_set_core.py +72 -13
  12. workbench/core/artifacts/model_core.py +50 -15
  13. workbench/core/artifacts/monitor_core.py +33 -249
  14. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  15. workbench/core/cloud_platform/aws/aws_meta.py +11 -4
  16. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  17. workbench/core/transforms/features_to_model/features_to_model.py +9 -4
  18. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  19. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  20. workbench/core/views/training_view.py +49 -53
  21. workbench/core/views/view.py +51 -1
  22. workbench/core/views/view_utils.py +4 -4
  23. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  24. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  25. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  26. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  27. workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
  28. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  29. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  30. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  31. workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
  32. workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
  33. workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
  34. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  35. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  36. workbench/model_scripts/pytorch_model/pytorch.template +9 -18
  37. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  38. workbench/model_scripts/script_generation.py +7 -2
  39. workbench/model_scripts/uq_models/mapie.template +492 -0
  40. workbench/model_scripts/uq_models/requirements.txt +1 -0
  41. workbench/model_scripts/xgb_model/xgb_model.template +31 -40
  42. workbench/repl/workbench_shell.py +4 -4
  43. workbench/scripts/lambda_launcher.py +63 -0
  44. workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +49 -51
  45. workbench/scripts/ml_pipeline_sqs.py +186 -0
  46. workbench/utils/chem_utils/__init__.py +0 -0
  47. workbench/utils/chem_utils/fingerprints.py +134 -0
  48. workbench/utils/chem_utils/misc.py +194 -0
  49. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  50. workbench/utils/chem_utils/mol_standardize.py +450 -0
  51. workbench/utils/chem_utils/mol_tagging.py +348 -0
  52. workbench/utils/chem_utils/projections.py +209 -0
  53. workbench/utils/chem_utils/salts.py +256 -0
  54. workbench/utils/chem_utils/sdf.py +292 -0
  55. workbench/utils/chem_utils/toxicity.py +250 -0
  56. workbench/utils/chem_utils/vis.py +253 -0
  57. workbench/utils/config_manager.py +2 -6
  58. workbench/utils/endpoint_utils.py +5 -7
  59. workbench/utils/license_manager.py +2 -6
  60. workbench/utils/model_utils.py +76 -30
  61. workbench/utils/monitor_utils.py +44 -62
  62. workbench/utils/pandas_utils.py +3 -3
  63. workbench/utils/shap_utils.py +10 -2
  64. workbench/utils/workbench_sqs.py +1 -1
  65. workbench/utils/xgboost_model_utils.py +283 -145
  66. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  67. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  68. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  69. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/METADATA +2 -1
  70. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/RECORD +74 -70
  71. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/entry_points.txt +3 -1
  72. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  73. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  74. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  75. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  76. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  77. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  78. workbench/model_scripts/pytorch_model/generated_model_script.py +0 -576
  79. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  80. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  81. workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
  82. workbench/model_scripts/xgb_model/generated_model_script.py +0 -477
  83. workbench/utils/chem_utils.py +0 -1556
  84. workbench/utils/fast_inference.py +0 -167
  85. workbench/utils/resource_utils.py +0 -39
  86. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/WHEEL +0 -0
  87. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/licenses/LICENSE +0 -0
  88. {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/top_level.txt +0 -0
@@ -41,7 +41,7 @@ from workbench.cached.cached_meta import CachedMeta
  try:
      import rdkit  # noqa
      import mordred  # noqa
-     from workbench.utils import chem_utils
+     from workbench.utils.chem_utils import vis

      HAVE_CHEM_UTILS = True
  except ImportError:
@@ -178,12 +178,12 @@ class WorkbenchShell:

          # Add cheminformatics utils if available
          if HAVE_CHEM_UTILS:
-             self.commands["show"] = chem_utils.show
+             self.commands["show"] = vis.show

      def start(self):
          """Start the Workbench IPython shell"""
          cprint("magenta", "\nWelcome to Workbench!")
-         if self.aws_status is False:
+         if not self.aws_status:
              cprint("red", "AWS Account Connection Failed...Review/Fix the Workbench Config:")
              cprint("red", f"Path: {self.cm.site_config_path}")
              self.show_config()
@@ -560,7 +560,7 @@ class WorkbenchShell:
          from workbench.web_interface.components.plugin_unit_test import PluginUnitTest

          # Get kwargs
-         theme = kwargs.get("theme", "dark")
+         theme = kwargs.get("theme", "midnight_blue")

          plugin_test = PluginUnitTest(plugin_class, theme=theme, input_data=data, **kwargs)

@@ -0,0 +1,63 @@
+ import sys
+ import os
+ import json
+ import importlib.util
+
+
+ def main():
+     if len(sys.argv) != 2:
+         print("Usage: lambda_launcher <handler_module_name>")
+         print("\nOptional: testing/event.json with test event")
+         print("Optional: testing/env.json with environment variables")
+         sys.exit(1)
+
+     handler_file = sys.argv[1]
+
+     # Add .py if not present
+     if not handler_file.endswith(".py"):
+         handler_file += ".py"
+
+     # Check if file exists
+     if not os.path.exists(handler_file):
+         print(f"Error: File '{handler_file}' not found")
+         sys.exit(1)
+
+     # Load environment variables from env.json if it exists
+     if os.path.exists("testing/env.json"):
+         print("Loading environment variables from testing/env.json")
+         with open("testing/env.json") as f:
+             env_vars = json.load(f)
+         for key, value in env_vars.items():
+             os.environ[key] = value
+             print(f"  Set {key} = {value}")
+         print()
+
+     # Load event configuration
+     if os.path.exists("testing/event.json"):
+         print("Loading event from testing/event.json")
+         with open("testing/event.json") as f:
+             event = json.load(f)
+     else:
+         print("No testing/event.json found, using empty event")
+         event = {}
+
+     # Load the module dynamically
+     spec = importlib.util.spec_from_file_location("lambda_module", handler_file)
+     lambda_module = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(lambda_module)
+
+     # Call the lambda_handler
+     print(f"Invoking lambda_handler from {handler_file}...")
+     print("-" * 50)
+     print(f"Event: {json.dumps(event, indent=2)}")
+     print("-" * 50)
+
+     result = lambda_module.lambda_handler(event, {})
+
+     print("-" * 50)
+     print("Result:")
+     print(json.dumps(result, indent=2))
+
+
+ if __name__ == "__main__":
+     main()
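The launcher reads optional fixtures from a testing/ directory (testing/env.json and testing/event.json, as shown above). A minimal sketch of creating those fixtures locally — the key names, values, and handler name here are illustrative, not from the package:

    # Hypothetical fixtures for lambda_launcher; contents are examples only
    import json
    from pathlib import Path

    Path("testing").mkdir(exist_ok=True)
    # Injected into os.environ before the handler runs
    Path("testing/env.json").write_text(json.dumps({"WORKBENCH_BUCKET": "my-bucket"}))
    # Passed as the event argument to lambda_handler
    Path("testing/event.json").write_text(json.dumps({"action": "ping"}))
    # Then run: lambda_launcher my_handler   (my_handler.py must define lambda_handler)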
@@ -27,60 +27,56 @@ def get_batch_role_arn() -> str:
      return f"arn:aws:iam::{account_id}:role/Workbench-BatchRole"


- def ensure_job_definition():
-     """Register or update the Batch job definition for ML pipeline runner."""
-     batch = AWSAccountClamp().boto3_session.client("batch")
-     name = "workbench-ml-pipeline-runner"
-     response = batch.register_job_definition(
-         jobDefinitionName=name,
-         type="container",
-         platformCapabilities=["FARGATE"],
-         containerProperties={
-             "image": get_ecr_image_uri(),
-             "resourceRequirements": [{"type": "VCPU", "value": "2"}, {"type": "MEMORY", "value": "4096"}],
-             "jobRoleArn": get_batch_role_arn(),
-             "executionRoleArn": get_batch_role_arn(),
-             "environment": [
-                 {"name": "WORKBENCH_BUCKET", "value": workbench_bucket},
-                 {"name": "PYTHONUNBUFFERED", "value": "1"},
-             ],
-             # "networkConfiguration": {"assignPublicIp": "ENABLED"},  # Required for ECR Image Pull (when not in VPC)
-         },
-         timeout={"attemptDurationSeconds": 10800},  # 3 hours
-     )
-     log.info(f"Job definition ready: {name} (revision {response['revision']})")
-     return name
+ def _log_cloudwatch_link(job: dict, message_prefix: str = "View logs") -> None:
+     """
+     Helper to log a CloudWatch logs link with a clickable URL and full URL display.
+
+     Args:
+         job: Batch job description dictionary
+         message_prefix: Prefix for the log message (default: "View logs")
+     """
+     log_stream = job.get("container", {}).get("logStreamName")
+     logs_url = get_cloudwatch_logs_url(log_group="/aws/batch/job", log_stream=log_stream)
+     if logs_url:
+         clickable_url = f"\033]8;;{logs_url}\033\\{logs_url}\033]8;;\033\\"
+         log.info(f"{message_prefix}: {clickable_url}")
+     else:
+         log.info("Check AWS Batch console for logs")


- def run_batch_job(script_path: str) -> int:
+ def run_batch_job(script_path: str, size: str = "small") -> int:
      """
      Submit and monitor an AWS Batch job for ML pipeline execution.
-     This function:
-     1. Uploads the ML pipeline script to S3
-     2. Submits a Batch job to run the script in a container
-     3. Monitors job status until completion
-     4. Returns the job's exit code
+
+     Uploads script to S3, submits Batch job, monitors until completion or 2 minutes of RUNNING.

      Args:
          script_path: Local path to the ML pipeline script
+         size: Job size tier - "small" (default), "medium", or "large"
+             - small: 2 vCPU, 4GB RAM for lightweight processing
+             - medium: 4 vCPU, 8GB RAM for standard ML workloads
+             - large: 8 vCPU, 16GB RAM for heavy training/inference

      Returns:
-         Exit code from the batch job (0 for success, non-zero for failure)
+         Exit code (0 for success/disconnected, non-zero for failure)
      """
+     if size not in ["small", "medium", "large"]:
+         raise ValueError(f"Invalid size '{size}'. Must be 'small', 'medium', or 'large'")
+
      batch = AWSAccountClamp().boto3_session.client("batch")
      script_name = Path(script_path).stem

-     # Upload script to S3 for the container to download
+     # Upload script to S3
      s3_path = f"s3://{workbench_bucket}/batch-jobs/{Path(script_path).name}"
      log.info(f"Uploading script to {s3_path}")
      upload_content_to_s3(Path(script_path).read_text(), s3_path)

-     # Submit the Batch job
+     # Submit job
      job_name = f"workbench_{script_name}_{datetime.now():%Y%m%d_%H%M%S}"
      response = batch.submit_job(
          jobName=job_name,
          jobQueue="workbench-job-queue",
-         jobDefinition=ensure_job_definition(),
+         jobDefinition=f"workbench-batch-{size}",
          containerOverrides={
              "environment": [
                  {"name": "ML_PIPELINE_S3_PATH", "value": s3_path},
@@ -89,36 +85,38 @@ def run_batch_job(script_path: str) -> int:
          },
      )
      job_id = response["jobId"]
-     log.info(f"Submitted job: {job_name} ({job_id})")
+     log.info(f"Submitted job: {job_name} ({job_id}) using {size} tier")

-     # Monitor job execution
-     last_status = None
+     # Monitor job
+     last_status, running_start = None, None
      while True:
-         # Check job status
          job = batch.describe_jobs(jobs=[job_id])["jobs"][0]
          status = job["status"]
+
          if status != last_status:
              log.info(f"Job status: {status}")
              last_status = status
+             if status == "RUNNING":
+                 running_start = time.time()
+
+         # Disconnect after 2 minutes of running
+         if status == "RUNNING" and running_start and (time.time() - running_start >= 120):
+             log.info("✅ ML Pipeline is running successfully!")
+             _log_cloudwatch_link(job, "📊 Monitor logs")
+             return 0

-         # Check if job completed
+         # Handle completion
          if status in ["SUCCEEDED", "FAILED"]:
              exit_code = job.get("attempts", [{}])[-1].get("exitCode", 1)
-             if status == "FAILED":
-                 log.error(f"Job failed: {job.get('statusReason', 'Unknown reason')}")
-             else:
-                 log.info("Job completed successfully")
-
-             # Get CloudWatch logs URL
-             log_stream_name = job.get("container", {}).get("logStreamName")
-             logs_url = get_cloudwatch_logs_url(log_group="/aws/batch/job", log_stream=log_stream_name)
-             if logs_url:
-                 # OSC 8 hyperlink format for modern terminals
-                 clickable_url = f"\033]8;;{logs_url}\033\\{logs_url}\033]8;;\033\\"
-                 log.info(f"View logs: {clickable_url}")
+             msg = (
+                 "Job completed successfully"
+                 if status == "SUCCEEDED"
+                 else f"Job failed: {job.get('statusReason', 'Unknown')}"
+             )
+             log.info(msg) if status == "SUCCEEDED" else log.error(msg)
+             _log_cloudwatch_link(job)
              return exit_code

-         # Sleep a bit before next status check
          time.sleep(10)

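For context, a minimal usage sketch of the reworked runner. The import path is assumed from the renamed script ml_pipeline_batch.py; only the run_batch_job signature in the diff above is taken from the package:

    # Sketch only: module path assumed from workbench/scripts/ml_pipeline_batch.py
    from workbench.scripts.ml_pipeline_batch import run_batch_job

    # "medium" maps to the pre-registered workbench-batch-medium job definition (4 vCPU / 8GB)
    exit_code = run_batch_job("my_pipeline.py", size="medium")
    # 0 on SUCCEEDED, or once the job has been RUNNING for 2 minutes (monitor detaches)
    raise SystemExit(exit_code)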
@@ -0,0 +1,186 @@
+ import argparse
+ import logging
+ import json
+ from pathlib import Path
+
+ # Workbench Imports
+ from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
+ from workbench.utils.config_manager import ConfigManager
+ from workbench.utils.s3_utils import upload_content_to_s3
+
+ log = logging.getLogger("workbench")
+ cm = ConfigManager()
+ workbench_bucket = cm.get_config("WORKBENCH_BUCKET")
+
+
+ def submit_to_sqs(
+     script_path: str,
+     size: str = "small",
+     realtime: bool = False,
+     dt: bool = False,
+     promote: bool = False,
+ ) -> None:
+     """
+     Upload script to S3 and submit message to SQS queue for processing.
+
+     Args:
+         script_path: Local path to the ML pipeline script
+         size: Job size tier - "small" (default), "medium", or "large"
+         realtime: If True, sets serverless=False for real-time processing (default: False)
+         dt: If True, sets DT=True in environment (default: False)
+         promote: If True, sets PROMOTE=True in environment (default: False)
+
+     Raises:
+         ValueError: If size is invalid or script file not found
+     """
+     print(f"\n{'=' * 60}")
+     print("🚀 SUBMITTING ML PIPELINE JOB")
+     print(f"{'=' * 60}")
+     if size not in ["small", "medium", "large"]:
+         raise ValueError(f"Invalid size '{size}'. Must be 'small', 'medium', or 'large'")
+
+     # Validate script exists
+     script_file = Path(script_path)
+     if not script_file.exists():
+         raise FileNotFoundError(f"Script not found: {script_path}")
+
+     print(f"📄 Script: {script_file.name}")
+     print(f"📏 Size tier: {size}")
+     print(f"⚡ Mode: {'Real-time' if realtime else 'Serverless'} (serverless={'False' if realtime else 'True'})")
+     print(f"🔄 DynamicTraining: {dt}")
+     print(f"🆕 Promote: {promote}")
+     print(f"🪣 Bucket: {workbench_bucket}")
+     sqs = AWSAccountClamp().boto3_session.client("sqs")
+     script_name = script_file.name
+
+     # List Workbench queues
+     print("\n📋 Listing Workbench SQS queues...")
+     try:
+         queues = sqs.list_queues(QueueNamePrefix="workbench-")
+         queue_urls = queues.get("QueueUrls", [])
+         if queue_urls:
+             print(f"✅ Found {len(queue_urls)} workbench queue(s):")
+             for url in queue_urls:
+                 queue_name = url.split("/")[-1]
+                 print(f"   • {queue_name}")
+         else:
+             print("⚠️ No workbench queues found")
+     except Exception as e:
+         print(f"❌ Error listing queues: {e}")
+
+     # Upload script to S3
+     s3_path = f"s3://{workbench_bucket}/batch-jobs/{script_name}"
+     print("\n📤 Uploading script to S3...")
+     print(f"   Source: {script_path}")
+     print(f"   Destination: {s3_path}")
+
+     try:
+         upload_content_to_s3(script_file.read_text(), s3_path)
+         print("✅ Script uploaded successfully")
+     except Exception as e:
+         print(f"❌ Upload failed: {e}")
+         raise
+     # Get queue URL and info
+     queue_name = "workbench-ml-pipeline-queue.fifo"
+     print("\n🎯 Getting queue information...")
+     print(f"   Queue name: {queue_name}")
+
+     try:
+         queue_url = sqs.get_queue_url(QueueName=queue_name)["QueueUrl"]
+         print(f"   Queue URL: {queue_url}")
+
+         # Get queue attributes for additional info
+         attrs = sqs.get_queue_attributes(
+             QueueUrl=queue_url, AttributeNames=["ApproximateNumberOfMessages", "ApproximateNumberOfMessagesNotVisible"]
+         )
+         messages_available = attrs["Attributes"].get("ApproximateNumberOfMessages", "0")
+         messages_in_flight = attrs["Attributes"].get("ApproximateNumberOfMessagesNotVisible", "0")
+         print(f"   Messages in queue: {messages_available}")
+         print(f"   Messages in flight: {messages_in_flight}")
+
+     except Exception as e:
+         print(f"❌ Error accessing queue: {e}")
+         raise
+
+     # Prepare message
+     message = {"script_path": s3_path, "size": size}
+
+     # Set environment variables
+     message["environment"] = {
+         "SERVERLESS": "False" if realtime else "True",
+         "DT": str(dt),
+         "PROMOTE": str(promote),
+     }
+
+     # Send the message to SQS
+     try:
+         print("\n📨 Sending message to SQS...")
+         response = sqs.send_message(
+             QueueUrl=queue_url,
+             MessageBody=json.dumps(message, indent=2),
+             MessageGroupId="ml-pipeline-jobs",  # Required for FIFO
+         )
+         message_id = response["MessageId"]
+         print("✅ Message sent successfully!")
+         print(f"   Message ID: {message_id}")
+     except Exception as e:
+         print(f"❌ Failed to send message: {e}")
+         raise
+
+     # Success summary
+     print(f"\n{'=' * 60}")
+     print("✅ JOB SUBMISSION COMPLETE")
+     print(f"{'=' * 60}")
+     print(f"📄 Script: {script_name}")
+     print(f"📏 Size: {size}")
+     print(f"⚡ Mode: {'Real-time' if realtime else 'Serverless'} (SERVERLESS={'False' if realtime else 'True'})")
+     print(f"🔄 DynamicTraining: {dt}")
+     print(f"🆕 Promote: {promote}")
+     print(f"🆔 Message ID: {message_id}")
+     print("\n🔍 MONITORING LOCATIONS:")
+     print(f"   • SQS Queue: AWS Console → SQS → {queue_name}")
+     print("   • Lambda Logs: AWS Console → Lambda → Functions")
+     print("   • Batch Jobs: AWS Console → Batch → Jobs")
+     print("   • CloudWatch: AWS Console → CloudWatch → Log groups")
+     print("\n⏳ Your job should start processing soon...")
+
+
+ def main():
+     """CLI entry point for submitting ML pipelines via SQS."""
+     parser = argparse.ArgumentParser(description="Submit ML pipeline to SQS queue for Batch processing")
+     parser.add_argument("script_file", help="Local path to ML pipeline script")
+     parser.add_argument(
+         "--size", default="small", choices=["small", "medium", "large"], help="Job size tier (default: small)"
+     )
+     parser.add_argument(
+         "--realtime",
+         action="store_true",
+         help="Create realtime endpoints (default is serverless)",
+     )
+     parser.add_argument(
+         "--dt",
+         action="store_true",
+         help="Set DT=True (models and endpoints will have '-dt' suffix)",
+     )
+     parser.add_argument(
+         "--promote",
+         action="store_true",
+         help="Set Promote=True (models and endpoints will use promoted naming)",
+     )
+     args = parser.parse_args()
+     try:
+         submit_to_sqs(
+             args.script_file,
+             args.size,
+             realtime=args.realtime,
+             dt=args.dt,
+             promote=args.promote,
+         )
+     except Exception as e:
+         print(f"\n❌ ERROR: {e}")
+         log.error(f"Error: {e}")
+         exit(1)
+
+
+ if __name__ == "__main__":
+     main()
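A sketch of calling submit_to_sqs programmatically (equivalent to the CLI entry point); the script name is illustrative, and the enqueued message body follows the shape built in the function above:

    # Sketch: programmatic submission, same effect as `--size large` via the CLI
    from workbench.scripts.ml_pipeline_sqs import submit_to_sqs

    # Enqueues JSON to workbench-ml-pipeline-queue.fifo shaped like:
    #   {"script_path": "s3://<bucket>/batch-jobs/my_pipeline.py", "size": "large",
    #    "environment": {"SERVERLESS": "True", "DT": "False", "PROMOTE": "False"}}
    submit_to_sqs("my_pipeline.py", size="large")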
workbench/utils/chem_utils/__init__.py — new empty file (no content to display)
@@ -0,0 +1,134 @@
+ """Molecular fingerprint computation utilities"""
+
+ import logging
+ import pandas as pd
+
+ # Molecular Descriptor Imports
+ from rdkit import Chem
+ from rdkit.Chem import rdFingerprintGenerator
+ from rdkit.Chem.MolStandardize import rdMolStandardize
+
+ # Set up the logger
+ log = logging.getLogger("workbench")
+
+
+ def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=True) -> pd.DataFrame:
+     """Compute and add Morgan fingerprints to the DataFrame.
+
+     Args:
+         df (pd.DataFrame): Input DataFrame containing SMILES strings.
+         radius (int): Radius for the Morgan fingerprint.
+         n_bits (int): Number of bits for the fingerprint.
+         counts (bool): Count simulation for the fingerprint.
+
+     Returns:
+         pd.DataFrame: The input DataFrame with the Morgan fingerprints added as bit strings.
+
+     Note:
+         See: https://greglandrum.github.io/rdkit-blog/posts/2021-07-06-simulating-counts.html
+     """
+     delete_mol_column = False
+
+     # Check for the SMILES column (case-insensitive)
+     smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
+     if smiles_column is None:
+         raise ValueError("Input DataFrame must have a 'smiles' column")
+
+     # Sanity check the molecule column (sometimes it gets serialized, which doesn't work)
+     if "molecule" in df.columns and df["molecule"].dtype == "string":
+         log.warning("Detected serialized molecules in 'molecule' column. Removing...")
+         del df["molecule"]
+
+     # Convert SMILES to RDKit molecule objects (vectorized)
+     if "molecule" not in df.columns:
+         log.info("Converting SMILES to RDKit Molecules...")
+         delete_mol_column = True
+         df["molecule"] = df[smiles_column].apply(Chem.MolFromSmiles)
+         # Make sure our molecules are not None
+         failed_smiles = df[df["molecule"].isnull()][smiles_column].tolist()
+         if failed_smiles:
+             log.error(f"Failed to convert the following SMILES to molecules: {failed_smiles}")
+         df = df.dropna(subset=["molecule"])
+
+     # If we have fragments in our compounds, get the largest fragment before computing fingerprints
+     largest_frags = df["molecule"].apply(
+         lambda mol: rdMolStandardize.LargestFragmentChooser().choose(mol) if mol else None
+     )
+
+     # Create a Morgan fingerprint generator
+     if counts:
+         n_bits *= 4  # Multiply by 4 to simulate counts
+     morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits, countSimulation=counts)
+
+     # Compute Morgan fingerprints (vectorized)
+     fingerprints = largest_frags.apply(
+         lambda mol: (morgan_generator.GetFingerprint(mol).ToBitString() if mol else pd.NA)
+     )
+
+     # Add the fingerprints to the DataFrame
+     df["fingerprint"] = fingerprints
+
+     # Drop the intermediate 'molecule' column if it was added
+     if delete_mol_column:
+         del df["molecule"]
+     return df
+
+
+ if __name__ == "__main__":
+     print("Running molecular fingerprint tests...")
+     print("Note: This requires molecular_screening module to be available")
+
+     # Test molecules
+     test_molecules = {
+         "aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
+         "caffeine": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
+         "glucose": "C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O",  # With stereochemistry
+         "sodium_acetate": "CC(=O)[O-].[Na+]",  # Salt
+         "benzene": "c1ccccc1",
+         "butene_e": "C/C=C/C",  # E-butene
+         "butene_z": "C/C=C\\C",  # Z-butene
+     }
+
+     # Test 1: Morgan Fingerprints
+     print("\n1. Testing Morgan fingerprint generation...")
+
+     test_df = pd.DataFrame({"SMILES": list(test_molecules.values()), "name": list(test_molecules.keys())})
+
+     fp_df = compute_morgan_fingerprints(test_df.copy(), radius=2, n_bits=512, counts=False)
+
+     print("   Fingerprint generation results:")
+     for _, row in fp_df.iterrows():
+         fp = row.get("fingerprint", "N/A")
+         fp_len = len(fp) if fp != "N/A" else 0
+         print(f"   {row['name']:15} → {fp_len} bits")
+
+     # Test 2: Different fingerprint parameters
+     print("\n2. Testing different fingerprint parameters...")
+
+     # Test with counts enabled
+     fp_counts_df = compute_morgan_fingerprints(test_df.copy(), radius=3, n_bits=256, counts=True)
+
+     print("   With count simulation (256 bits * 4):")
+     for _, row in fp_counts_df.iterrows():
+         fp = row.get("fingerprint", "N/A")
+         fp_len = len(fp) if fp != "N/A" else 0
+         print(f"   {row['name']:15} → {fp_len} bits")
+
+     # Test 3: Edge cases
+     print("\n3. Testing edge cases...")
+
+     # Invalid SMILES
+     invalid_df = pd.DataFrame({"SMILES": ["INVALID", ""]})
+     try:
+         fp_invalid = compute_morgan_fingerprints(invalid_df.copy())
+         print(f"   ✓ Invalid SMILES handled: {len(fp_invalid)} valid molecules")
+     except Exception as e:
+         print(f"   ✓ Invalid SMILES properly raised error: {type(e).__name__}")
+
+     # Test with pre-existing molecule column
+     mol_df = test_df.copy()
+     mol_df["molecule"] = mol_df["SMILES"].apply(Chem.MolFromSmiles)
+     fp_with_mol = compute_morgan_fingerprints(mol_df)
+     print(f"   ✓ Pre-existing molecule column handled: {len(fp_with_mol)} fingerprints generated")
+
+     print("\n✅ All fingerprint tests completed!")