rdkit-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdkit_cli/__init__.py +4 -0
- rdkit_cli/__main__.py +6 -0
- rdkit_cli/cli.py +162 -0
- rdkit_cli/commands/__init__.py +1 -0
- rdkit_cli/commands/conformers.py +220 -0
- rdkit_cli/commands/convert.py +162 -0
- rdkit_cli/commands/depict.py +311 -0
- rdkit_cli/commands/descriptors.py +251 -0
- rdkit_cli/commands/diversity.py +232 -0
- rdkit_cli/commands/enumerate.py +229 -0
- rdkit_cli/commands/filter.py +384 -0
- rdkit_cli/commands/fingerprints.py +179 -0
- rdkit_cli/commands/fragment.py +284 -0
- rdkit_cli/commands/mcs.py +162 -0
- rdkit_cli/commands/reactions.py +191 -0
- rdkit_cli/commands/scaffold.py +243 -0
- rdkit_cli/commands/similarity.py +359 -0
- rdkit_cli/commands/standardize.py +138 -0
- rdkit_cli/core/__init__.py +1 -0
- rdkit_cli/core/conformers.py +197 -0
- rdkit_cli/core/depict.py +241 -0
- rdkit_cli/core/descriptors.py +248 -0
- rdkit_cli/core/diversity.py +174 -0
- rdkit_cli/core/enumerate.py +190 -0
- rdkit_cli/core/filters.py +443 -0
- rdkit_cli/core/fingerprints.py +265 -0
- rdkit_cli/core/fragment.py +237 -0
- rdkit_cli/core/mcs.py +128 -0
- rdkit_cli/core/reactions.py +159 -0
- rdkit_cli/core/scaffold.py +174 -0
- rdkit_cli/core/similarity.py +206 -0
- rdkit_cli/core/standardizer.py +141 -0
- rdkit_cli/io/__init__.py +7 -0
- rdkit_cli/io/formats.py +109 -0
- rdkit_cli/io/readers.py +352 -0
- rdkit_cli/io/writers.py +275 -0
- rdkit_cli/parallel/__init__.py +5 -0
- rdkit_cli/parallel/batch.py +181 -0
- rdkit_cli/parallel/executor.py +180 -0
- rdkit_cli/progress/__init__.py +5 -0
- rdkit_cli/progress/ninja.py +195 -0
- rdkit_cli/utils/__init__.py +1 -0
- rdkit_cli-0.1.0.dist-info/METADATA +380 -0
- rdkit_cli-0.1.0.dist-info/RECORD +47 -0
- rdkit_cli-0.1.0.dist-info/WHEEL +4 -0
- rdkit_cli-0.1.0.dist-info/entry_points.txt +2 -0
- rdkit_cli-0.1.0.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""Scaffold command implementation."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from rdkit_cli.cli import RdkitHelpFormatter, add_common_io_options, add_common_processing_options
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def register_parser(subparsers):
    """Attach the ``scaffold`` command and its subcommands to *subparsers*.

    Three subcommands are registered, each bound to its handler through the
    ``func`` default: ``murcko`` -> ``run_murcko``, ``decompose`` ->
    ``run_decompose`` and ``analyze`` -> ``run_analyze``.  The ``scaffold``
    parser itself gets a fallback ``func`` that prints its help text and
    returns 1.
    """
    scaffold_cmd = subparsers.add_parser(
        "scaffold",
        help="Analyze molecular scaffolds",
        description="Extract and analyze Murcko scaffolds.",
        formatter_class=RdkitHelpFormatter,
    )
    children = scaffold_cmd.add_subparsers(
        title="Subcommands",
        dest="subcommand",
        metavar="<subcommand>",
    )

    def _show_help(args):
        # Fallback handler used when no subcommand is selected.
        scaffold_cmd.print_help()
        return 1

    # --- scaffold murcko -------------------------------------------------
    murcko_cmd = children.add_parser(
        "murcko",
        help="Extract Murcko scaffolds",
        formatter_class=RdkitHelpFormatter,
    )
    add_common_io_options(murcko_cmd)
    add_common_processing_options(murcko_cmd)
    murcko_flags = (
        ("--generic", "Generate generic (element-agnostic) scaffolds"),
        ("--include-sidechains", "Include side chains in output"),
        ("--rings-only", "Extract only ring systems (no linkers)"),
        ("--include-original", "Include original SMILES in output"),
    )
    for flag, help_text in murcko_flags:
        murcko_cmd.add_argument(flag, action="store_true", help=help_text)
    murcko_cmd.set_defaults(func=run_murcko)

    # --- scaffold decompose ----------------------------------------------
    decompose_cmd = children.add_parser(
        "decompose",
        help="Decompose molecules into scaffold components",
        formatter_class=RdkitHelpFormatter,
    )
    add_common_io_options(decompose_cmd)
    add_common_processing_options(decompose_cmd)
    decompose_cmd.set_defaults(func=run_decompose)

    # --- scaffold analyze ------------------------------------------------
    # This subcommand declares its own -i/-o options instead of using the
    # common I/O helpers.
    analyze_cmd = children.add_parser(
        "analyze",
        help="Analyze scaffold frequency distribution",
        formatter_class=RdkitHelpFormatter,
    )
    analyze_cmd.add_argument(
        "-i", "--input",
        required=True,
        metavar="FILE",
        help="Input file with scaffold column",
    )
    analyze_cmd.add_argument(
        "-o", "--output",
        metavar="FILE",
        help="Output file (optional, prints to stdout if not specified)",
    )
    analyze_cmd.add_argument(
        "--scaffold-column",
        default="scaffold",
        help="Name of scaffold column (default: scaffold)",
    )
    analyze_cmd.add_argument(
        "--top",
        type=int,
        default=20,
        help="Number of top scaffolds to show (default: 20)",
    )
    analyze_cmd.add_argument(
        "--no-header",
        action="store_true",
        help="Input file has no header row",
    )
    analyze_cmd.set_defaults(func=run_analyze)

    # Bare "scaffold" with no subcommand: show help.
    scaffold_cmd.set_defaults(func=_show_help)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def run_murcko(args) -> int:
    """Extract a Murcko scaffold for every molecule in the input file.

    Reads ``args.input``, runs ``ScaffoldExtractor.extract`` over each record
    via ``process_molecules`` and writes the results to ``args.output``.

    Returns:
        1 if the input file does not exist or at least one molecule failed,
        otherwise 0.

    NOTE(review): only ``args.generic`` is forwarded to the extractor; the
    ``--include-sidechains``, ``--rings-only`` and ``--include-original``
    options registered for this subcommand are not read here.
    """
    # Heavy modules are imported on demand, not at CLI startup.
    from rdkit_cli.core.scaffold import ScaffoldExtractor
    from rdkit_cli.io import create_reader, create_writer
    from rdkit_cli.parallel.batch import process_molecules

    scaffold_extractor = ScaffoldExtractor(
        generic=args.generic, include_smiles=True, include_name=True
    )

    source = Path(args.input)
    if not source.exists():
        print(f"Error: Input file not found: {source}", file=sys.stderr)
        return 1

    mol_reader = create_reader(
        source,
        smiles_column=args.smiles_column,
        name_column=args.name_column,
        has_header=not args.no_header,
    )
    mol_writer = create_writer(Path(args.output))

    with mol_reader, mol_writer:
        result = process_molecules(
            reader=mol_reader,
            writer=mol_writer,
            processor=scaffold_extractor.extract,
            n_workers=args.ncpu,
            quiet=args.quiet,
        )

    if not args.quiet:
        summary = (
            f"Extracted scaffolds for {result.successful}/{result.total_processed} molecules "
            f"({result.failed} failed) in {result.elapsed_time:.1f}s"
        )
        print(summary, file=sys.stderr)

    # Any per-molecule failure turns into a non-zero status.
    if result.failed == 0:
        return 0
    return 1
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def run_decompose(args) -> int:
    """Decompose every molecule in the input file into scaffold components.

    Reads ``args.input``, runs ``ScaffoldDecomposer.decompose`` over each
    record via ``process_molecules`` and writes the results to ``args.output``.

    Returns:
        1 if the input file does not exist or at least one molecule failed,
        otherwise 0.
    """
    # Heavy modules are imported on demand, not at CLI startup.
    from rdkit_cli.core.scaffold import ScaffoldDecomposer
    from rdkit_cli.io import create_reader, create_writer
    from rdkit_cli.parallel.batch import process_molecules

    scaffold_decomposer = ScaffoldDecomposer(include_smiles=True, include_name=True)

    source = Path(args.input)
    if not source.exists():
        print(f"Error: Input file not found: {source}", file=sys.stderr)
        return 1

    mol_reader = create_reader(
        source,
        smiles_column=args.smiles_column,
        name_column=args.name_column,
        has_header=not args.no_header,
    )
    mol_writer = create_writer(Path(args.output))

    with mol_reader, mol_writer:
        result = process_molecules(
            reader=mol_reader,
            writer=mol_writer,
            processor=scaffold_decomposer.decompose,
            n_workers=args.ncpu,
            quiet=args.quiet,
        )

    if not args.quiet:
        summary = (
            f"Decomposed {result.successful}/{result.total_processed} molecules "
            f"({result.failed} failed) in {result.elapsed_time:.1f}s"
        )
        print(summary, file=sys.stderr)

    # Any per-molecule failure turns into a non-zero status.
    if result.failed == 0:
        return 0
    return 1
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def run_analyze(args) -> int:
    """Report how often each scaffold occurs in a CSV column.

    The file at ``args.input`` is loaded with pandas.  With ``--no-header``
    the first column is used as the scaffold column; otherwise the column
    named by ``args.scaffold_column`` is used.  Missing values are dropped,
    the remaining values are handed to ``analyze_scaffolds`` and the result
    is emitted as ``scaffold,count,percentage`` CSV, either to ``args.output``
    or to stdout.  A total/unique summary always goes to stderr.

    Returns:
        0 on success; 1 if the input file is missing, cannot be parsed as
        CSV, or does not contain the scaffold column.
    """
    # Heavy modules are imported on demand, not at CLI startup.
    import pandas as pd
    from rdkit_cli.core.scaffold import analyze_scaffolds

    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        return 1

    # Read scaffold data.  An empty or malformed file is reported like every
    # other input problem instead of surfacing a pandas traceback.
    header = None if args.no_header else 0
    try:
        df = pd.read_csv(input_path, header=header)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        print(f"Error: Could not read {input_path}: {e}", file=sys.stderr)
        return 1

    # Without a header row pandas numbers the columns, so the first column
    # label is used and --scaffold-column is not consulted.
    scaffold_col = df.columns[0] if args.no_header else args.scaffold_column

    if scaffold_col not in df.columns:
        print(f"Error: Scaffold column '{scaffold_col}' not found", file=sys.stderr)
        return 1

    scaffolds = df[scaffold_col].dropna().tolist()
    results = analyze_scaffolds(scaffolds, top_n=args.top)

    # Output: each result is unpacked as (scaffold, count, percentage).
    output_lines = ["scaffold,count,percentage"]
    for scaffold, count, pct in results:
        # str() guards against non-string cells (e.g. a numeric first column);
        # embedded double quotes are doubled per CSV quoting rules.
        scaffold_escaped = str(scaffold).replace('"', '""')
        output_lines.append(f'"{scaffold_escaped}",{count},{pct}')

    output_text = "\n".join(output_lines)

    if args.output:
        output_path = Path(args.output)
        with open(output_path, "w") as f:
            f.write(output_text + "\n")
        print(f"Wrote scaffold analysis to {output_path}", file=sys.stderr)
    else:
        print(output_text)

    print(f"\nTotal scaffolds: {len(scaffolds)}, Unique: {len(set(scaffolds))}", file=sys.stderr)

    return 0
|
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
"""Similarity command implementation."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from rdkit_cli.cli import RdkitHelpFormatter, add_common_io_options, add_common_processing_options
|
|
7
|
+
|
|
8
|
+
# Define here to avoid loading core at startup
|
|
9
|
+
# Accepted values for the -m/--metric option of the "search" and "matrix"
# subcommands; the handlers pass the chosen name to SimilarityMetric(...).
SIMILARITY_METRICS = ["tanimoto", "dice", "cosine", "sokal", "russel"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def register_parser(subparsers):
    """Attach the ``similarity`` command and its subcommands to *subparsers*.

    Three subcommands are registered, each bound to its handler through the
    ``func`` default: ``search`` -> ``run_search``, ``matrix`` ->
    ``run_matrix`` and ``cluster`` -> ``run_cluster``.  The ``similarity``
    parser itself gets a fallback ``func`` that prints its help text and
    returns 1.
    """
    similarity_cmd = subparsers.add_parser(
        "similarity",
        help="Compute molecular similarity",
        description="Search, compare, and cluster molecules by similarity.",
        formatter_class=RdkitHelpFormatter,
    )
    children = similarity_cmd.add_subparsers(
        title="Subcommands",
        dest="subcommand",
        metavar="<subcommand>",
    )

    # Options shared verbatim by several subcommands.  Each helper adds one
    # option so callers keep control over the order shown in --help.
    def _add_metric(target):
        target.add_argument(
            "-m", "--metric",
            choices=SIMILARITY_METRICS,
            default="tanimoto",
            help="Similarity metric (default: tanimoto)",
        )

    def _add_radius(target):
        target.add_argument(
            "-r", "--radius",
            type=int,
            default=2,
            help="Morgan fingerprint radius (default: 2)",
        )

    def _add_bits(target):
        target.add_argument(
            "-b", "--bits",
            type=int,
            default=2048,
            help="Fingerprint bit size (default: 2048)",
        )

    def _add_fp_type(target):
        target.add_argument(
            "--fp-type",
            choices=["morgan", "maccs", "rdkit", "atompair", "torsion"],
            default="morgan",
            help="Fingerprint type (default: morgan)",
        )

    def _add_flag(target, flag, help_text):
        target.add_argument(flag, action="store_true", help=help_text)

    def _show_help(args):
        # Fallback handler used when no subcommand is selected.
        similarity_cmd.print_help()
        return 1

    # --- similarity search -----------------------------------------------
    search_cmd = children.add_parser(
        "search",
        help="Search for molecules similar to a query",
        formatter_class=RdkitHelpFormatter,
    )
    add_common_io_options(search_cmd)
    add_common_processing_options(search_cmd)
    search_cmd.add_argument(
        "--query",
        required=True,
        metavar="SMILES",
        help="Query molecule SMILES",
    )
    search_cmd.add_argument(
        "-t", "--threshold",
        type=float,
        default=0.7,
        metavar="T",
        help="Minimum similarity threshold (default: 0.7)",
    )
    _add_metric(search_cmd)
    _add_radius(search_cmd)
    _add_bits(search_cmd)
    search_cmd.add_argument(
        "--top-n",
        type=int,
        default=None,
        metavar="N",
        help="Return only top N most similar molecules",
    )
    _add_flag(search_cmd, "--sort", "Sort output by similarity (descending)")
    _add_fp_type(search_cmd)
    _add_flag(search_cmd, "--include-query", "Include query molecule in output")
    _add_flag(search_cmd, "--add-rank", "Add similarity rank column")
    search_cmd.set_defaults(func=run_search)

    # --- similarity matrix -----------------------------------------------
    matrix_cmd = children.add_parser(
        "matrix",
        help="Compute pairwise similarity matrix",
        formatter_class=RdkitHelpFormatter,
    )
    add_common_io_options(matrix_cmd)
    add_common_processing_options(matrix_cmd)
    _add_metric(matrix_cmd)
    _add_fp_type(matrix_cmd)
    _add_radius(matrix_cmd)
    _add_bits(matrix_cmd)
    _add_flag(
        matrix_cmd,
        "--distance",
        "Output distance matrix (1 - similarity) instead of similarity",
    )
    matrix_cmd.add_argument(
        "--precision",
        type=int,
        default=4,
        help="Decimal precision (default: 4)",
    )
    matrix_cmd.set_defaults(func=run_matrix)

    # --- similarity cluster ----------------------------------------------
    cluster_cmd = children.add_parser(
        "cluster",
        help="Cluster molecules by similarity",
        formatter_class=RdkitHelpFormatter,
    )
    add_common_io_options(cluster_cmd)
    add_common_processing_options(cluster_cmd)
    cluster_cmd.add_argument(
        "-c", "--cutoff",
        type=float,
        default=0.3,
        metavar="C",
        help="Distance cutoff (1-similarity, default: 0.3)",
    )
    _add_radius(cluster_cmd)
    _add_bits(cluster_cmd)
    cluster_cmd.add_argument(
        "--min-cluster-size",
        type=int,
        default=1,
        help="Minimum cluster size to include (default: 1)",
    )
    _add_fp_type(cluster_cmd)
    cluster_cmd.add_argument(
        "--method",
        choices=["butina", "hierarchical"],
        default="butina",
        help="Clustering method (default: butina)",
    )
    _add_flag(cluster_cmd, "--add-centroid", "Mark cluster centroids")
    cluster_cmd.set_defaults(func=run_cluster)

    # Bare "similarity" with no subcommand: show help.
    similarity_cmd.set_defaults(func=_show_help)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def run_search(args) -> int:
    """Search the input file for molecules similar to ``args.query``.

    Builds a ``SimilaritySearcher`` from the query SMILES, threshold, metric,
    radius and bit size, then runs ``searcher.search`` over every input record
    via ``process_molecules``, writing results to ``args.output``.

    Returns:
        1 if constructing the searcher raises ``ValueError`` or the input
        file does not exist; otherwise 0, regardless of per-molecule failures.

    NOTE(review): ``--top-n``, ``--sort``, ``--fp-type``, ``--include-query``
    and ``--add-rank`` are registered for this subcommand but not read here.
    """
    # Heavy modules are imported on demand, not at CLI startup.
    from rdkit_cli.core.similarity import SimilaritySearcher, SimilarityMetric
    from rdkit_cli.io import create_reader, create_writer
    from rdkit_cli.parallel.batch import process_molecules

    try:
        searcher = SimilaritySearcher(
            query_smiles=args.query,
            threshold=args.threshold,
            metric=SimilarityMetric(args.metric),
            radius=args.radius,
            n_bits=args.bits,
        )
    except ValueError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    source = Path(args.input)
    if not source.exists():
        print(f"Error: Input file not found: {source}", file=sys.stderr)
        return 1

    mol_reader = create_reader(
        source,
        smiles_column=args.smiles_column,
        name_column=args.name_column,
        has_header=not args.no_header,
    )
    mol_writer = create_writer(Path(args.output))

    with mol_reader, mol_writer:
        result = process_molecules(
            reader=mol_reader,
            writer=mol_writer,
            processor=searcher.search,
            n_workers=args.ncpu,
            quiet=args.quiet,
        )

    if args.quiet:
        return 0

    hits, seen = result.successful, result.total_processed
    summary = (
        f"Found {hits}/{seen} molecules above threshold "
        f"({result.failed} failed) in {result.elapsed_time:.1f}s"
    )
    print(summary, file=sys.stderr)
    return 0
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def run_matrix(args) -> int:
    """Compute a pairwise similarity (or distance) matrix and write it as CSV.

    All records are read from ``args.input`` into memory, their molecules are
    passed to ``compute_similarity_matrix`` with the selected metric, and the
    result is written to ``args.output`` with one header row and one label
    column.  A record's label is its name, or the first 20 characters of its
    SMILES when it has no name.

    Honours ``--precision`` (number of decimals, negative values are treated
    as 0) and ``--distance`` (each value is written as ``1 - similarity``).

    Returns:
        1 if the input file does not exist, otherwise 0.

    NOTE(review): ``--fp-type``, ``--radius`` and ``--bits`` are registered
    for this subcommand but are not forwarded to ``compute_similarity_matrix``
    here; confirm against its signature before wiring them through.
    """
    import csv

    # Heavy modules are imported on demand, not at CLI startup.
    from rdkit_cli.core.similarity import compute_similarity_matrix, SimilarityMetric
    from rdkit_cli.io import create_reader

    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        return 1

    reader = create_reader(
        input_path,
        smiles_column=args.smiles_column,
        name_column=args.name_column,
        has_header=not args.no_header,
    )

    # Read all molecules
    if not args.quiet:
        print("Reading molecules...", file=sys.stderr)

    # Use the reader as a context manager, as the streaming handlers do, so
    # it is released once everything has been loaded.
    with reader:
        records = list(reader)
    mols = [r.mol for r in records]
    names = [r.name or r.smiles[:20] for r in records]

    if not args.quiet:
        print(f"Computing {len(mols)}x{len(mols)} similarity matrix...", file=sys.stderr)

    matrix = compute_similarity_matrix(
        mols,
        metric=SimilarityMetric(args.metric),
    )

    # getattr keeps the handler usable with namespaces that predate these
    # options; the fallbacks mirror the parser defaults.
    as_distance = bool(getattr(args, "distance", False))
    precision = max(0, getattr(args, "precision", 4))
    label = "distance" if as_distance else "similarity"

    # Write output
    output_path = Path(args.output)
    with open(output_path, "w") as f:
        # csv.writer quotes labels containing commas, quotes or newlines.
        # lineterminator="\n" together with the default text-mode newline
        # handling keeps the platform line endings the plain writes produced.
        out = csv.writer(f, lineterminator="\n")
        # Header: empty corner cell followed by the column labels.
        out.writerow([""] + names)
        # Data
        for name, row in zip(names, matrix):
            values = ((1.0 - v) if as_distance else v for v in row)
            out.writerow([name] + [f"{v:.{precision}f}" for v in values])

    if not args.quiet:
        print(f"Wrote {label} matrix to {output_path}", file=sys.stderr)

    return 0
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def run_cluster(args) -> int:
    """Cluster the input molecules and write one CSV row per cluster member.

    All records are read from ``args.input`` into memory and their molecules
    are passed to ``cluster_molecules`` with the cutoff, radius and bit size
    from *args*.  Clusters smaller than ``args.min_cluster_size`` are dropped
    and the remaining ones are numbered from 0 in the order returned.  The
    output columns are ``smiles,name,cluster,cluster_size``.

    Returns:
        1 if the input file does not exist, otherwise 0.

    NOTE(review): ``--fp-type``, ``--method`` and ``--add-centroid`` are
    registered for this subcommand but are not read here.
    """
    # Heavy modules are imported on demand, not at CLI startup.
    from rdkit_cli.core.similarity import cluster_molecules
    from rdkit_cli.io import create_reader

    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        return 1

    reader = create_reader(
        input_path,
        smiles_column=args.smiles_column,
        name_column=args.name_column,
        has_header=not args.no_header,
    )

    # Read all molecules
    if not args.quiet:
        print("Reading molecules...", file=sys.stderr)

    # Use the reader as a context manager, as the streaming handlers do, so
    # it is released once everything has been loaded.
    with reader:
        records = list(reader)
    mols = [r.mol for r in records]

    if not args.quiet:
        print(f"Clustering {len(mols)} molecules...", file=sys.stderr)

    clusters = cluster_molecules(
        mols,
        cutoff=args.cutoff,
        radius=args.radius,
        n_bits=args.bits,
    )

    # Filter by minimum cluster size
    min_size = getattr(args, "min_cluster_size", 1)
    clusters = [c for c in clusters if len(c) >= min_size]

    # Write output.  Each cluster is iterated as indices into ``records``.
    output_path = Path(args.output)
    with open(output_path, "w") as f:
        f.write("smiles,name,cluster,cluster_size\n")
        for cluster_id, cluster in enumerate(clusters):
            cluster_size = len(cluster)
            for idx in cluster:
                r = records[idx]
                # Text fields are always quoted; embedded quotes are doubled.
                smiles = r.smiles.replace('"', '""')
                name = (r.name or "").replace('"', '""')
                f.write(f'"{smiles}","{name}",{cluster_id},{cluster_size}\n')

    if not args.quiet:
        print(
            f"Found {len(clusters)} clusters from {len(mols)} molecules. "
            f"Wrote to {output_path}",
            file=sys.stderr,
        )

    return 0
|