workbench 0.8.168__py3-none-any.whl → 0.8.193__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. workbench/algorithms/dataframe/proximity.py +143 -102
  2. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  3. workbench/api/compound.py +1 -1
  4. workbench/api/endpoint.py +3 -2
  5. workbench/api/feature_set.py +4 -4
  6. workbench/api/model.py +16 -12
  7. workbench/api/monitor.py +1 -16
  8. workbench/core/artifacts/artifact.py +11 -3
  9. workbench/core/artifacts/data_capture_core.py +355 -0
  10. workbench/core/artifacts/endpoint_core.py +113 -27
  11. workbench/core/artifacts/feature_set_core.py +72 -13
  12. workbench/core/artifacts/model_core.py +71 -49
  13. workbench/core/artifacts/monitor_core.py +33 -249
  14. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  15. workbench/core/cloud_platform/aws/aws_meta.py +11 -4
  16. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  17. workbench/core/transforms/features_to_model/features_to_model.py +11 -6
  18. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  19. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  20. workbench/core/views/training_view.py +49 -53
  21. workbench/core/views/view.py +51 -1
  22. workbench/core/views/view_utils.py +4 -4
  23. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  24. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  25. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  26. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  27. workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
  28. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  29. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  30. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  31. workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
  32. workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
  33. workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
  34. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  35. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  36. workbench/model_scripts/pytorch_model/pytorch.template +9 -18
  37. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  38. workbench/model_scripts/script_generation.py +7 -2
  39. workbench/model_scripts/uq_models/mapie.template +492 -0
  40. workbench/model_scripts/uq_models/requirements.txt +1 -0
  41. workbench/model_scripts/xgb_model/generated_model_script.py +34 -43
  42. workbench/model_scripts/xgb_model/xgb_model.template +31 -40
  43. workbench/repl/workbench_shell.py +4 -4
  44. workbench/scripts/lambda_launcher.py +63 -0
  45. workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +49 -51
  46. workbench/scripts/ml_pipeline_sqs.py +186 -0
  47. workbench/utils/chem_utils/__init__.py +0 -0
  48. workbench/utils/chem_utils/fingerprints.py +134 -0
  49. workbench/utils/chem_utils/misc.py +194 -0
  50. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  51. workbench/utils/chem_utils/mol_standardize.py +450 -0
  52. workbench/utils/chem_utils/mol_tagging.py +348 -0
  53. workbench/utils/chem_utils/projections.py +209 -0
  54. workbench/utils/chem_utils/salts.py +256 -0
  55. workbench/utils/chem_utils/sdf.py +292 -0
  56. workbench/utils/chem_utils/toxicity.py +250 -0
  57. workbench/utils/chem_utils/vis.py +253 -0
  58. workbench/utils/config_manager.py +2 -6
  59. workbench/utils/endpoint_utils.py +5 -7
  60. workbench/utils/license_manager.py +2 -6
  61. workbench/utils/model_utils.py +89 -31
  62. workbench/utils/monitor_utils.py +44 -62
  63. workbench/utils/pandas_utils.py +3 -3
  64. workbench/utils/shap_utils.py +10 -2
  65. workbench/utils/workbench_sqs.py +1 -1
  66. workbench/utils/xgboost_model_utils.py +300 -151
  67. workbench/web_interface/components/model_plot.py +7 -1
  68. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  69. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  70. workbench/web_interface/components/plugins/model_details.py +7 -2
  71. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  72. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/METADATA +24 -2
  73. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/RECORD +77 -72
  74. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/entry_points.txt +3 -1
  75. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/licenses/LICENSE +1 -1
  76. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  77. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  78. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  79. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  80. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  81. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  82. workbench/model_scripts/pytorch_model/generated_model_script.py +0 -576
  83. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  84. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  85. workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
  86. workbench/utils/chem_utils.py +0 -1556
  87. workbench/utils/fast_inference.py +0 -167
  88. workbench/utils/resource_utils.py +0 -39
  89. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/WHEEL +0 -0
  90. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,186 @@
1
+ import argparse
2
+ import logging
3
+ import json
4
+ from pathlib import Path
5
+
6
+ # Workbench Imports
7
+ from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
8
+ from workbench.utils.config_manager import ConfigManager
9
+ from workbench.utils.s3_utils import upload_content_to_s3
10
+
11
+ log = logging.getLogger("workbench")
12
+ cm = ConfigManager()
13
+ workbench_bucket = cm.get_config("WORKBENCH_BUCKET")
14
+
15
+
16
def submit_to_sqs(
    script_path: str,
    size: str = "small",
    realtime: bool = False,
    dt: bool = False,
    promote: bool = False,
) -> None:
    """
    Upload script to S3 and submit message to SQS queue for processing.

    Args:
        script_path: Local path to the ML pipeline script
        size: Job size tier - "small" (default), "medium", or "large"
        realtime: If True, sets serverless=False for real-time processing (default: False)
        dt: If True, sets DT=True in environment (default: False)
        promote: If True, sets PROMOTE=True in environment (default: False)

    Raises:
        ValueError: If size is not "small", "medium", or "large"
        FileNotFoundError: If script_path is not an existing regular file
        Exception: Errors from the S3 upload, the queue lookup, or the send are
            printed and then re-raised unchanged
    """
    print(f"\n{'=' * 60}")
    print("🚀 SUBMITTING ML PIPELINE JOB")
    print(f"{'=' * 60}")
    if size not in ("small", "medium", "large"):
        raise ValueError(f"Invalid size '{size}'. Must be 'small', 'medium', or 'large'")

    # Validate script exists (is_file() also rejects directories, which would
    # otherwise only fail later when the content is read)
    script_file = Path(script_path)
    if not script_file.is_file():
        raise FileNotFoundError(f"Script not found: {script_path}")

    # Computed once so the banner, the message, and the summary cannot drift apart
    mode_label = "Real-time" if realtime else "Serverless"
    serverless_flag = "False" if realtime else "True"

    print(f"📄 Script: {script_file.name}")
    print(f"📏 Size tier: {size}")
    print(f"⚡ Mode: {mode_label} (serverless={serverless_flag})")
    print(f"🔄 DynamicTraining: {dt}")
    print(f"🆕 Promote: {promote}")
    print(f"🪣 Bucket: {workbench_bucket}")
    sqs = AWSAccountClamp().boto3_session.client("sqs")
    script_name = script_file.name

    # List Workbench queues (informational only: a failure here is reported but not fatal)
    print("\n📋 Listing Workbench SQS queues...")
    try:
        queues = sqs.list_queues(QueueNamePrefix="workbench-")
        queue_urls = queues.get("QueueUrls", [])
        if queue_urls:
            print(f"✅ Found {len(queue_urls)} workbench queue(s):")
            for url in queue_urls:
                listed_name = url.split("/")[-1]
                print(f" • {listed_name}")
        else:
            print("⚠️ No workbench queues found")
    except Exception as e:
        print(f"❌ Error listing queues: {e}")

    # Upload script to S3
    s3_path = f"s3://{workbench_bucket}/batch-jobs/{script_name}"
    print("\n📤 Uploading script to S3...")
    print(f" Source: {script_path}")
    print(f" Destination: {s3_path}")

    try:
        # Python source is UTF-8 by default; don't depend on the platform's locale encoding
        upload_content_to_s3(script_file.read_text(encoding="utf-8"), s3_path)
        print("✅ Script uploaded successfully")
    except Exception as e:
        print(f"❌ Upload failed: {e}")
        raise

    # Get queue URL and info
    queue_name = "workbench-ml-pipeline-queue.fifo"
    print("\n🎯 Getting queue information...")
    print(f" Queue name: {queue_name}")

    try:
        queue_url = sqs.get_queue_url(QueueName=queue_name)["QueueUrl"]
        print(f" Queue URL: {queue_url}")

        # Get queue attributes for additional info
        attrs = sqs.get_queue_attributes(
            QueueUrl=queue_url, AttributeNames=["ApproximateNumberOfMessages", "ApproximateNumberOfMessagesNotVisible"]
        )
        messages_available = attrs["Attributes"].get("ApproximateNumberOfMessages", "0")
        messages_in_flight = attrs["Attributes"].get("ApproximateNumberOfMessagesNotVisible", "0")
        print(f" Messages in queue: {messages_available}")
        print(f" Messages in flight: {messages_in_flight}")

    except Exception as e:
        print(f"❌ Error accessing queue: {e}")
        raise

    # Prepare message (environment values are strings, as consumed by the job)
    message = {
        "script_path": s3_path,
        "size": size,
        "environment": {
            "SERVERLESS": serverless_flag,
            "DT": str(dt),
            "PROMOTE": str(promote),
        },
    }

    # Send the message to SQS
    # NOTE(review): no MessageDeduplicationId is supplied, so this presumably relies on
    # content-based deduplication being enabled on the FIFO queue; if so, an identical
    # re-submission inside the deduplication window would not be delivered - confirm.
    try:
        print("\n📨 Sending message to SQS...")
        response = sqs.send_message(
            QueueUrl=queue_url,
            MessageBody=json.dumps(message, indent=2),
            MessageGroupId="ml-pipeline-jobs",  # Required for FIFO
        )
        message_id = response["MessageId"]
        print("✅ Message sent successfully!")
        print(f" Message ID: {message_id}")
    except Exception as e:
        print(f"❌ Failed to send message: {e}")
        raise

    # Success summary
    print(f"\n{'=' * 60}")
    print("✅ JOB SUBMISSION COMPLETE")
    print(f"{'=' * 60}")
    print(f"📄 Script: {script_name}")
    print(f"📏 Size: {size}")
    print(f"⚡ Mode: {mode_label} (SERVERLESS={serverless_flag})")
    print(f"🔄 DynamicTraining: {dt}")
    print(f"🆕 Promote: {promote}")
    print(f"🆔 Message ID: {message_id}")
    print("\n🔍 MONITORING LOCATIONS:")
    print(f" • SQS Queue: AWS Console → SQS → {queue_name}")
    print(" • Lambda Logs: AWS Console → Lambda → Functions")
    print(" • Batch Jobs: AWS Console → Batch → Jobs")
    print(" • CloudWatch: AWS Console → CloudWatch → Log groups")
    print("\n⏳ Your job should start processing soon...")
146
+
147
+
148
def main():
    """CLI entry point for submitting ML pipelines via SQS.

    Parses the command line, forwards the options to submit_to_sqs(), and
    exits with status 1 (after printing and logging the error) if the
    submission raises.
    """
    import sys  # local import: only needed for the failure exit status

    parser = argparse.ArgumentParser(description="Submit ML pipeline to SQS queue for Batch processing")
    parser.add_argument("script_file", help="Local path to ML pipeline script")
    parser.add_argument(
        "--size", default="small", choices=["small", "medium", "large"], help="Job size tier (default: small)"
    )
    parser.add_argument(
        "--realtime",
        action="store_true",
        help="Create realtime endpoints (default is serverless)",
    )
    parser.add_argument(
        "--dt",
        action="store_true",
        help="Set DT=True (models and endpoints will have '-dt' suffix)",
    )
    parser.add_argument(
        "--promote",
        action="store_true",
        help="Set Promote=True (models and endpoints will use promoted naming)",
    )
    args = parser.parse_args()
    try:
        submit_to_sqs(
            args.script_file,
            args.size,
            realtime=args.realtime,
            dt=args.dt,
            promote=args.promote,
        )
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        log.error(f"Error: {e}")
        # sys.exit instead of the builtin exit(): the latter is injected by the
        # `site` module and is not guaranteed to exist (e.g. `python -S`)
        sys.exit(1)


if __name__ == "__main__":
    main()
File without changes
@@ -0,0 +1,134 @@
1
+ """Molecular fingerprint computation utilities"""
2
+
3
+ import logging
4
+ import pandas as pd
5
+
6
+ # Molecular Descriptor Imports
7
+ from rdkit import Chem
8
+ from rdkit.Chem import rdFingerprintGenerator
9
+ from rdkit.Chem.MolStandardize import rdMolStandardize
10
+
11
+ # Set up the logger
12
+ log = logging.getLogger("workbench")
13
+
14
+
15
def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=True) -> pd.DataFrame:
    """Compute and add Morgan fingerprints to the DataFrame.

    Args:
        df (pd.DataFrame): Input DataFrame containing SMILES strings.
        radius (int): Radius for the Morgan fingerprint.
        n_bits (int): Number of bits for the fingerprint.
        counts (bool): Count simulation for the fingerprint.

    Returns:
        pd.DataFrame: DataFrame with the Morgan fingerprints added as bit strings in a
            'fingerprint' column. When molecules are built from SMILES, rows whose SMILES
            cannot be parsed are dropped and a new DataFrame is returned; when a usable
            'molecule' column already exists, the column is added to the input in place.

    Raises:
        ValueError: If the DataFrame has no 'smiles' column (case-insensitive).

    Note:
        See: https://greglandrum.github.io/rdkit-blog/posts/2021-07-06-simulating-counts.html
    """
    # Check for the SMILES column (case-insensitive)
    smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
    if smiles_column is None:
        raise ValueError("Input DataFrame must have a 'smiles' column")

    # Sanity check the molecule column (sometimes it gets serialized, which doesn't work)
    if "molecule" in df.columns and df["molecule"].dtype == "string":
        log.warning("Detected serialized molecules in 'molecule' column. Removing...")
        del df["molecule"]

    if "molecule" in df.columns:
        molecules = df["molecule"]
    else:
        # Build the molecules in a local Series so no temporary 'molecule' column
        # is ever written to (and left behind on) the caller's DataFrame
        log.info("Converting SMILES to RDKit Molecules...")
        molecules = df[smiles_column].apply(Chem.MolFromSmiles)

        # Make sure our molecules are not None
        failed = molecules.isnull()
        failed_smiles = df.loc[failed, smiles_column].tolist()
        if failed_smiles:
            log.error(f"Failed to convert the following SMILES to molecules: {failed_smiles}")

        # Explicit copy: the 'fingerprint' column is assigned below, and writing
        # into a filtered view of the input would be ambiguous
        df = df[~failed].copy()
        molecules = molecules[~failed]

    # If we have fragments in our compounds, get the largest fragment before computing fingerprints
    fragment_chooser = rdMolStandardize.LargestFragmentChooser()  # one chooser, reused for every row
    largest_frags = molecules.apply(lambda mol: fragment_chooser.choose(mol) if mol else None)

    # Create a Morgan fingerprint generator
    if counts:
        n_bits *= 4  # Multiply by 4 to simulate counts
    morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits, countSimulation=counts)

    # Compute Morgan fingerprints (missing molecules yield pd.NA)
    fingerprints = largest_frags.apply(
        lambda mol: (morgan_generator.GetFingerprint(mol).ToBitString() if mol else pd.NA)
    )

    # Add the fingerprints to the DataFrame (index-aligned with the surviving rows)
    df["fingerprint"] = fingerprints
    return df
75
+
76
+
77
if __name__ == "__main__":
    # Smoke tests for compute_morgan_fingerprints(); results are printed, not asserted

    def _print_bit_counts(frame):
        """Print one line per row giving the length of its fingerprint bit string."""
        for _, record in frame.iterrows():
            bits = record.get("fingerprint", "N/A")
            n_bits_found = len(bits) if bits != "N/A" else 0
            print(f" {record['name']:15} → {n_bits_found} bits")

    print("Running molecular fingerprint tests...")
    print("Note: This requires molecular_screening module to be available")

    # Test molecules
    test_molecules = {
        "aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
        "caffeine": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
        "glucose": "C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O",  # With stereochemistry
        "sodium_acetate": "CC(=O)[O-].[Na+]",  # Salt
        "benzene": "c1ccccc1",
        "butene_e": "C/C=C/C",  # E-butene
        "butene_z": "C/C=C\\C",  # Z-butene
    }
    test_df = pd.DataFrame({"SMILES": list(test_molecules.values()), "name": list(test_molecules)})

    # Test 1: Morgan Fingerprints
    print("\n1. Testing Morgan fingerprint generation...")
    plain_fp_df = compute_morgan_fingerprints(test_df.copy(), radius=2, n_bits=512, counts=False)
    print(" Fingerprint generation results:")
    _print_bit_counts(plain_fp_df)

    # Test 2: Different fingerprint parameters (count simulation enabled)
    print("\n2. Testing different fingerprint parameters...")
    counted_fp_df = compute_morgan_fingerprints(test_df.copy(), radius=3, n_bits=256, counts=True)
    print(" With count simulation (256 bits * 4):")
    _print_bit_counts(counted_fp_df)

    # Test 3: Edge cases
    print("\n3. Testing edge cases...")

    # Invalid SMILES: either outcome (handled or raised) is reported as a pass
    invalid_df = pd.DataFrame({"SMILES": ["INVALID", ""]})
    try:
        fp_invalid = compute_morgan_fingerprints(invalid_df.copy())
    except Exception as e:
        print(f" ✓ Invalid SMILES properly raised error: {type(e).__name__}")
    else:
        print(f" ✓ Invalid SMILES handled: {len(fp_invalid)} valid molecules")

    # Test with pre-existing molecule column
    mol_df = test_df.copy()
    mol_df["molecule"] = mol_df["SMILES"].apply(Chem.MolFromSmiles)
    fp_with_mol = compute_morgan_fingerprints(mol_df)
    print(f" ✓ Pre-existing molecule column handled: {len(fp_with_mol)} fingerprints generated")

    print("\n✅ All fingerprint tests completed!")
@@ -0,0 +1,194 @@
1
+ """Miscellaneous processing functions for molecular data."""
2
+
3
+ import logging
4
+ import numpy as np
5
+ import pandas as pd
6
+ from typing import List, Optional
7
+
8
+ # Set up the logger
9
+ log = logging.getLogger("workbench")
10
+
11
+
12
def geometric_mean(series: pd.Series) -> float:
    """Return the geometric mean of ``series``, computed with NumPy only (no scipy)."""
    # exp(mean(log(x))) is the geometric mean of x
    log_values = np.log(series)
    return np.exp(log_values.mean())
15
+
16
+
17
def rollup_experimental_data(
    df: pd.DataFrame, id: str, time: str, target: str, use_gmean: bool = False
) -> pd.DataFrame:
    """
    Collapse a dataset to one row per ID at that ID's latest time point.

    Only the rows recorded at the maximum time of each ID are kept. If several
    such rows exist, the target value is averaged (arithmetic mean, or geometric
    mean when requested); every other column takes the value of the first row.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        id (str): Column representing the unique molecule ID.
        time (str): Column representing the time.
        target (str): Column representing the target value.
        use_gmean (bool): Whether to use the geometric mean instead of the arithmetic mean.

    Returns:
        pd.DataFrame: Rolled-up dataframe with all original columns retained.
    """
    # Keep only the rows taken at each ID's latest time
    latest_time = df.groupby(id)[time].transform("max")
    latest_rows = df[df[time] == latest_time]

    # Choose the averaging function
    if use_gmean:
        average = geometric_mean
    else:
        average = np.mean

    def _rollup_target(values):
        # A lone measurement passes through untouched; replicates are averaged
        if len(values) > 1:
            return average(values)
        return values.iloc[0]

    # Non-key, non-target columns keep their first value; the target is averaged
    passthrough_cols = [col for col in df.columns if col not in (target, id, time)]
    aggregations = dict.fromkeys(passthrough_cols, "first")
    aggregations[target] = _rollup_target

    grouped = latest_rows.groupby([id, time])
    return grouped.agg(aggregations).reset_index()
47
+
48
+
49
def micromolar_to_log(series_µM: pd.Series) -> pd.Series:
    """
    Convert concentrations given in µM (micromolar) to log10 of the molar concentration.

    Parameters:
        series_µM (pd.Series): Series of concentrations in micromolar.

    Returns:
        pd.Series: Series of logarithmic values (log10 of mol/L).
    """
    # Floor zero/negative inputs at 1e-9 µM so log10 is defined
    # (the floor value is kept in alignment with another project)
    floored_µM = series_µM.clip(lower=1e-9)

    # µM -> mol/L, then take log10
    return np.log10(floored_µM * 1e-6)
65
+
66
+
67
def log_to_micromolar(log_series: pd.Series) -> pd.Series:
    """
    Convert log10 molar concentrations back to concentrations in µM (micromolar).

    Parameters:
        log_series (pd.Series): Series of logarithmic values (log10).

    Returns:
        pd.Series: Series of concentrations in micromolar.
    """
    # 10**x undoes the log10 (giving mol/L); the 1e6 factor rescales mol/L to µM
    return (10**log_series) * 1e6
80
+
81
+
82
def feature_resolution_issues(df: pd.DataFrame, features: List[str], show_cols: Optional[List[str]] = None) -> None:
    """
    Print every group of rows that share the same feature values but have different SMILES.

    Groups are printed largest first (by number of unique SMILES in the group).

    Args:
        df (pd.DataFrame): Input DataFrame containing SMILES strings.
        features (List[str]): List of features to check.
        show_cols (Optional[List[str]]): Columns to display; defaults to all columns.

    Raises:
        ValueError: If the DataFrame has no 'smiles' column (case-insensitive).
    """
    # Locate the SMILES column regardless of capitalisation
    smiles_column = None
    for col in df.columns:
        if col.lower() == "smiles":
            smiles_column = col
            break
    if smiles_column is None:
        raise ValueError("Input DataFrame must have a 'smiles' column")

    if show_cols is None:
        show_cols = df.columns.tolist()

    # One row per distinct (SMILES, feature values) combination
    deduped = df.drop_duplicates(subset=[smiles_column] + features)

    # Feature combinations that are shared by more than one SMILES, largest first
    sizes = deduped.groupby(features).size()
    collisions = sizes[sizes > 1].sort_values(ascending=False)

    for key, n_smiles in collisions.items():
        # Tuple keys are matched across all feature columns, scalar keys against the first one
        if isinstance(key, tuple):
            in_group = (deduped[features] == key).all(axis=1)
        else:
            in_group = deduped[features[0]] == key

        print(f"Feature Group (unique SMILES: {n_smiles}):")
        print(deduped[in_group][show_cols])
        print("\n")
119
+
120
+
121
if __name__ == "__main__":
    # Smoke tests for the helpers in this module; results are printed, not asserted
    print("Running molecular processing and transformation tests...")
    print("Note: This requires the molecular_filters module to be available")

    # Test 1: Concentration conversions (µM -> log10 -> µM round trip)
    print("\n1. Testing concentration conversions...")
    test_conc = pd.Series([1.0, 10.0, 100.0, 1000.0, 0.001])
    log_values = micromolar_to_log(test_conc)
    back_to_uM = log_to_micromolar(log_values)

    print(" µM → log10 → µM:")
    for original, logged, restored in zip(test_conc, log_values, back_to_uM):
        print(f" {original:8.3f} µM → {logged:6.2f} → {restored:8.3f} µM")

    # Test 2: Geometric mean
    print("\n2. Testing geometric mean...")
    test_series = pd.Series([2, 4, 8, 16])
    geo_mean = geometric_mean(test_series)
    arith_mean = np.mean(test_series)
    print(f" Series: {list(test_series)}")
    print(f" Arithmetic mean: {arith_mean:.2f}")
    print(f" Geometric mean: {geo_mean:.2f}")

    # Test 3: Experimental data rollup
    print("\n3. Testing experimental data rollup...")

    # Test data with multiple timepoints and replicates
    rollup_columns = {
        "compound_id": ["A", "A", "A", "B", "B", "C", "C", "C"],
        "time": [1, 2, 2, 1, 2, 1, 1, 2],
        "activity": [10, 20, 22, 5, 8, 100, 110, 200],
        "assay": ["kinase", "kinase", "kinase", "kinase", "kinase", "cell", "cell", "cell"],
    }
    test_data = pd.DataFrame(rollup_columns)
    report_cols = ["compound_id", "time", "activity"]

    # Rollup with arithmetic mean
    rolled_arith = rollup_experimental_data(test_data, "compound_id", "time", "activity", use_gmean=False)
    print(" Arithmetic mean rollup:")
    print(rolled_arith[report_cols])

    # Rollup with geometric mean
    rolled_geo = rollup_experimental_data(test_data, "compound_id", "time", "activity", use_gmean=True)
    print("\n Geometric mean rollup:")
    print(rolled_geo[report_cols])

    # Test 4: Feature resolution issues
    print("\n4. Testing feature resolution identification...")

    # Data where some rows share a feature value but carry different SMILES
    resolution_columns = {
        "smiles": ["CCO", "C(C)O", "CC(C)O", "CCC(C)O", "CCCO"],
        "assay_id": ["A1", "A1", "A2", "A2", "A3"],
        "value": [1.0, 1.5, 2.0, 2.2, 3.0],
    }
    resolution_df = pd.DataFrame(resolution_columns)

    print(" Checking for feature collisions in 'assay_id':")
    feature_resolution_issues(resolution_df, ["assay_id"], show_cols=["smiles", "assay_id", "value"])

    # Test 7: Edge cases
    print("\n7. Testing edge cases...")

    # Zero and negative concentrations
    edge_conc = pd.Series([0, -1, 1e-10])
    edge_log = micromolar_to_log(edge_conc)
    print(" Edge concentration handling:")
    for conc, logged in zip(edge_conc, edge_log):
        print(f" {conc:6.2e} µM → {logged:6.2f}")

    print("\n✅ All molecular processing tests completed!")