rdkit-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. rdkit_cli/__init__.py +4 -0
  2. rdkit_cli/__main__.py +6 -0
  3. rdkit_cli/cli.py +162 -0
  4. rdkit_cli/commands/__init__.py +1 -0
  5. rdkit_cli/commands/conformers.py +220 -0
  6. rdkit_cli/commands/convert.py +162 -0
  7. rdkit_cli/commands/depict.py +311 -0
  8. rdkit_cli/commands/descriptors.py +251 -0
  9. rdkit_cli/commands/diversity.py +232 -0
  10. rdkit_cli/commands/enumerate.py +229 -0
  11. rdkit_cli/commands/filter.py +384 -0
  12. rdkit_cli/commands/fingerprints.py +179 -0
  13. rdkit_cli/commands/fragment.py +284 -0
  14. rdkit_cli/commands/mcs.py +162 -0
  15. rdkit_cli/commands/reactions.py +191 -0
  16. rdkit_cli/commands/scaffold.py +243 -0
  17. rdkit_cli/commands/similarity.py +359 -0
  18. rdkit_cli/commands/standardize.py +138 -0
  19. rdkit_cli/core/__init__.py +1 -0
  20. rdkit_cli/core/conformers.py +197 -0
  21. rdkit_cli/core/depict.py +241 -0
  22. rdkit_cli/core/descriptors.py +248 -0
  23. rdkit_cli/core/diversity.py +174 -0
  24. rdkit_cli/core/enumerate.py +190 -0
  25. rdkit_cli/core/filters.py +443 -0
  26. rdkit_cli/core/fingerprints.py +265 -0
  27. rdkit_cli/core/fragment.py +237 -0
  28. rdkit_cli/core/mcs.py +128 -0
  29. rdkit_cli/core/reactions.py +159 -0
  30. rdkit_cli/core/scaffold.py +174 -0
  31. rdkit_cli/core/similarity.py +206 -0
  32. rdkit_cli/core/standardizer.py +141 -0
  33. rdkit_cli/io/__init__.py +7 -0
  34. rdkit_cli/io/formats.py +109 -0
  35. rdkit_cli/io/readers.py +352 -0
  36. rdkit_cli/io/writers.py +275 -0
  37. rdkit_cli/parallel/__init__.py +5 -0
  38. rdkit_cli/parallel/batch.py +181 -0
  39. rdkit_cli/parallel/executor.py +180 -0
  40. rdkit_cli/progress/__init__.py +5 -0
  41. rdkit_cli/progress/ninja.py +195 -0
  42. rdkit_cli/utils/__init__.py +1 -0
  43. rdkit_cli-0.1.0.dist-info/METADATA +380 -0
  44. rdkit_cli-0.1.0.dist-info/RECORD +47 -0
  45. rdkit_cli-0.1.0.dist-info/WHEEL +4 -0
  46. rdkit_cli-0.1.0.dist-info/entry_points.txt +2 -0
  47. rdkit_cli-0.1.0.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,243 @@
1
+ """Scaffold command implementation."""
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ from rdkit_cli.cli import RdkitHelpFormatter, add_common_io_options, add_common_processing_options
7
+
8
+
9
def register_parser(subparsers):
    """Attach the ``scaffold`` command and its subcommands to *subparsers*.

    Three subcommands are registered, each dispatching through the ``func``
    default: ``murcko`` -> ``run_murcko``, ``decompose`` -> ``run_decompose``
    and ``analyze`` -> ``run_analyze``.  When ``scaffold`` is invoked without
    a subcommand, the ``func`` default prints the command help and yields 1.
    """
    scaffold_cmd = subparsers.add_parser(
        "scaffold",
        formatter_class=RdkitHelpFormatter,
        help="Analyze molecular scaffolds",
        description="Extract and analyze Murcko scaffolds.",
    )
    children = scaffold_cmd.add_subparsers(
        title="Subcommands",
        dest="subcommand",
        metavar="<subcommand>",
    )

    def new_subcommand(name, summary):
        # Every subcommand shares the same help formatter.
        return children.add_parser(
            name,
            help=summary,
            formatter_class=RdkitHelpFormatter,
        )

    # scaffold murcko
    murcko = new_subcommand("murcko", "Extract Murcko scaffolds")
    add_common_io_options(murcko)
    add_common_processing_options(murcko)
    murcko_switches = (
        ("--generic", "Generate generic (element-agnostic) scaffolds"),
        ("--include-sidechains", "Include side chains in output"),
        ("--rings-only", "Extract only ring systems (no linkers)"),
        ("--include-original", "Include original SMILES in output"),
    )
    for switch, description in murcko_switches:
        murcko.add_argument(switch, action="store_true", help=description)
    murcko.set_defaults(func=run_murcko)

    # scaffold decompose
    decompose = new_subcommand(
        "decompose",
        "Decompose molecules into scaffold components",
    )
    add_common_io_options(decompose)
    add_common_processing_options(decompose)
    decompose.set_defaults(func=run_decompose)

    # scaffold analyze -- declares its own input/output options rather than
    # using the common option helpers.
    analyze = new_subcommand("analyze", "Analyze scaffold frequency distribution")
    analyze.add_argument(
        "-i", "--input",
        required=True,
        metavar="FILE",
        help="Input file with scaffold column",
    )
    analyze.add_argument(
        "-o", "--output",
        metavar="FILE",
        help="Output file (optional, prints to stdout if not specified)",
    )
    analyze.add_argument(
        "--scaffold-column",
        default="scaffold",
        help="Name of scaffold column (default: scaffold)",
    )
    analyze.add_argument(
        "--top",
        type=int,
        default=20,
        help="Number of top scaffolds to show (default: 20)",
    )
    analyze.add_argument(
        "--no-header",
        action="store_true",
        help="Input file has no header row",
    )
    analyze.set_defaults(func=run_analyze)

    def show_help(args):
        # Fallback used when no subcommand was given: print help, exit status 1.
        scaffold_cmd.print_help()
        return 1

    scaffold_cmd.set_defaults(func=show_help)
101
+
102
+
103
def run_murcko(args) -> int:
    """Extract Murcko scaffolds for the molecules in ``args.input``.

    Records are read with ``create_reader``, passed through
    ``ScaffoldExtractor.extract`` by ``process_molecules`` and written to
    ``args.output``.  Unless ``args.quiet`` is set, a one-line summary is
    printed to stderr.

    Returns:
        0 when no molecule failed; 1 when at least one failed or the input
        file does not exist.
    """
    # Lazy imports
    from rdkit_cli.core.scaffold import ScaffoldExtractor
    from rdkit_cli.io import create_reader, create_writer
    from rdkit_cli.parallel.batch import process_molecules

    # NOTE(review): args.include_sidechains, args.rings_only and
    # args.include_original are not consulted here -- confirm whether
    # ScaffoldExtractor is meant to receive them.
    extractor = ScaffoldExtractor(
        generic=args.generic,
        include_smiles=True,
        include_name=True,
    )

    source = Path(args.input)
    if not source.exists():
        print(f"Error: Input file not found: {source}", file=sys.stderr)
        return 1

    reader = create_reader(
        source,
        smiles_column=args.smiles_column,
        name_column=args.name_column,
        has_header=not args.no_header,
    )
    writer = create_writer(Path(args.output))

    with reader, writer:
        stats = process_molecules(
            reader=reader,
            writer=writer,
            processor=extractor.extract,
            n_workers=args.ncpu,
            quiet=args.quiet,
        )

    if not args.quiet:
        summary = (
            f"Extracted scaffolds for {stats.successful}/{stats.total_processed} molecules "
            f"({stats.failed} failed) in {stats.elapsed_time:.1f}s"
        )
        print(summary, file=sys.stderr)

    # Any failed molecule turns the exit status non-zero.
    if stats.failed == 0:
        return 0
    return 1
148
+
149
+
150
def run_decompose(args) -> int:
    """Decompose the molecules in ``args.input`` into scaffold components.

    Records are read with ``create_reader``, passed through
    ``ScaffoldDecomposer.decompose`` by ``process_molecules`` and written to
    ``args.output``.  Unless ``args.quiet`` is set, a one-line summary is
    printed to stderr.

    Returns:
        0 when no molecule failed; 1 when at least one failed or the input
        file does not exist.
    """
    # Lazy imports
    from rdkit_cli.core.scaffold import ScaffoldDecomposer
    from rdkit_cli.io import create_reader, create_writer
    from rdkit_cli.parallel.batch import process_molecules

    decomposer = ScaffoldDecomposer(include_smiles=True, include_name=True)

    source = Path(args.input)
    if not source.exists():
        print(f"Error: Input file not found: {source}", file=sys.stderr)
        return 1

    reader_options = {
        "smiles_column": args.smiles_column,
        "name_column": args.name_column,
        "has_header": not args.no_header,
    }
    reader = create_reader(source, **reader_options)

    destination = Path(args.output)
    writer = create_writer(destination)

    with reader, writer:
        stats = process_molecules(
            reader=reader,
            writer=writer,
            processor=decomposer.decompose,
            n_workers=args.ncpu,
            quiet=args.quiet,
        )

    if not args.quiet:
        summary = (
            f"Decomposed {stats.successful}/{stats.total_processed} molecules "
            f"({stats.failed} failed) in {stats.elapsed_time:.1f}s"
        )
        print(summary, file=sys.stderr)

    # Any failed molecule turns the exit status non-zero.
    if stats.failed == 0:
        return 0
    return 1
194
+
195
+
196
def run_analyze(args) -> int:
    """Report the most frequent scaffolds found in one column of a CSV file.

    The file named by ``args.input`` is loaded with ``pandas.read_csv``.  With
    ``args.no_header`` the first column is used; otherwise the column named
    ``args.scaffold_column`` is used.  Missing values are dropped and the
    remaining values are handed to ``analyze_scaffolds`` with
    ``top_n=args.top``.  The result is emitted as CSV
    (``scaffold,count,percentage``) to ``args.output`` or, when that is unset,
    to stdout.  Totals are always printed to stderr.

    Returns:
        0 on success; 1 when the input file is missing, cannot be parsed as
        CSV (including an empty file), or lacks the scaffold column.
    """
    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        return 1

    # Lazy import, done only once there is a file to read.
    import pandas as pd

    # Read scaffold data
    header = 0 if not args.no_header else None
    try:
        df = pd.read_csv(input_path, header=header)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        # Report unreadable input like the other error paths instead of
        # letting a traceback escape.
        print(f"Error: Could not read {input_path}: {exc}", file=sys.stderr)
        return 1

    if args.no_header:
        # Assume first column is scaffold
        scaffold_col = df.columns[0]
    else:
        scaffold_col = args.scaffold_column

    if scaffold_col not in df.columns:
        print(f"Error: Scaffold column '{scaffold_col}' not found", file=sys.stderr)
        return 1

    # Lazy import, deferred until the input has been validated.
    from rdkit_cli.core.scaffold import analyze_scaffolds

    scaffolds = df[scaffold_col].dropna().tolist()
    results = analyze_scaffolds(scaffolds, top_n=args.top)

    # Output
    output_lines = ["scaffold,count,percentage"]
    for scaffold, count, pct in results:
        # Escape quotes in scaffold SMILES; str() guards against cells that
        # pandas parsed as a non-string type.
        scaffold_escaped = str(scaffold).replace('"', '""')
        output_lines.append(f'"{scaffold_escaped}",{count},{pct}')

    output_text = "\n".join(output_lines)

    if args.output:
        output_path = Path(args.output)
        with open(output_path, "w") as f:
            f.write(output_text + "\n")
        print(f"Wrote scaffold analysis to {output_path}", file=sys.stderr)
    else:
        print(output_text)

    print(f"\nTotal scaffolds: {len(scaffolds)}, Unique: {len(set(scaffolds))}", file=sys.stderr)

    return 0
@@ -0,0 +1,359 @@
1
+ """Similarity command implementation."""
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ from rdkit_cli.cli import RdkitHelpFormatter, add_common_io_options, add_common_processing_options
7
+
8
# Metric names offered as choices for the -m/--metric option of the search and
# matrix subcommands; the selected value is later passed to SimilarityMetric(...).
# Define here to avoid loading core at startup
SIMILARITY_METRICS = ["tanimoto", "dice", "cosine", "sokal", "russel"]
10
+
11
+
12
def register_parser(subparsers):
    """Attach the ``similarity`` command and its subcommands to *subparsers*.

    Three subcommands are registered, each dispatching through the ``func``
    default: ``search`` -> ``run_search``, ``matrix`` -> ``run_matrix`` and
    ``cluster`` -> ``run_cluster``.  When ``similarity`` is invoked without a
    subcommand, the ``func`` default prints the command help and yields 1.
    """
    similarity_cmd = subparsers.add_parser(
        "similarity",
        formatter_class=RdkitHelpFormatter,
        help="Compute molecular similarity",
        description="Search, compare, and cluster molecules by similarity.",
    )
    children = similarity_cmd.add_subparsers(
        title="Subcommands",
        dest="subcommand",
        metavar="<subcommand>",
    )

    def new_subcommand(name, summary):
        # Create a subcommand parser pre-loaded with the common option groups.
        sub = children.add_parser(
            name,
            help=summary,
            formatter_class=RdkitHelpFormatter,
        )
        add_common_io_options(sub)
        add_common_processing_options(sub)
        return sub

    def add_metric(sub):
        sub.add_argument(
            "-m", "--metric",
            choices=SIMILARITY_METRICS,
            default="tanimoto",
            help="Similarity metric (default: tanimoto)",
        )

    def add_radius_and_bits(sub):
        sub.add_argument(
            "-r", "--radius",
            type=int,
            default=2,
            help="Morgan fingerprint radius (default: 2)",
        )
        sub.add_argument(
            "-b", "--bits",
            type=int,
            default=2048,
            help="Fingerprint bit size (default: 2048)",
        )

    def add_fp_type(sub):
        sub.add_argument(
            "--fp-type",
            choices=["morgan", "maccs", "rdkit", "atompair", "torsion"],
            default="morgan",
            help="Fingerprint type (default: morgan)",
        )

    def add_switch(sub, flag, description):
        sub.add_argument(flag, action="store_true", help=description)

    # similarity search
    search = new_subcommand("search", "Search for molecules similar to a query")
    search.add_argument(
        "--query",
        required=True,
        metavar="SMILES",
        help="Query molecule SMILES",
    )
    search.add_argument(
        "-t", "--threshold",
        type=float,
        default=0.7,
        metavar="T",
        help="Minimum similarity threshold (default: 0.7)",
    )
    add_metric(search)
    add_radius_and_bits(search)
    search.add_argument(
        "--top-n",
        type=int,
        default=None,
        metavar="N",
        help="Return only top N most similar molecules",
    )
    add_switch(search, "--sort", "Sort output by similarity (descending)")
    add_fp_type(search)
    add_switch(search, "--include-query", "Include query molecule in output")
    add_switch(search, "--add-rank", "Add similarity rank column")
    search.set_defaults(func=run_search)

    # similarity matrix
    matrix = new_subcommand("matrix", "Compute pairwise similarity matrix")
    add_metric(matrix)
    add_fp_type(matrix)
    add_radius_and_bits(matrix)
    add_switch(
        matrix,
        "--distance",
        "Output distance matrix (1 - similarity) instead of similarity",
    )
    matrix.add_argument(
        "--precision",
        type=int,
        default=4,
        help="Decimal precision (default: 4)",
    )
    matrix.set_defaults(func=run_matrix)

    # similarity cluster
    cluster = new_subcommand("cluster", "Cluster molecules by similarity")
    cluster.add_argument(
        "-c", "--cutoff",
        type=float,
        default=0.3,
        metavar="C",
        help="Distance cutoff (1-similarity, default: 0.3)",
    )
    add_radius_and_bits(cluster)
    cluster.add_argument(
        "--min-cluster-size",
        type=int,
        default=1,
        help="Minimum cluster size to include (default: 1)",
    )
    add_fp_type(cluster)
    cluster.add_argument(
        "--method",
        choices=["butina", "hierarchical"],
        default="butina",
        help="Clustering method (default: butina)",
    )
    add_switch(cluster, "--add-centroid", "Mark cluster centroids")
    cluster.set_defaults(func=run_cluster)

    def show_help(args):
        # Fallback used when no subcommand was given: print help, exit status 1.
        similarity_cmd.print_help()
        return 1

    similarity_cmd.set_defaults(func=show_help)
195
+
196
+
197
def run_search(args) -> int:
    """Search ``args.input`` for molecules similar to ``args.query``.

    A ``SimilaritySearcher`` is built from the query SMILES, threshold,
    metric, radius and bit size; its ``search`` method is applied to every
    record by ``process_molecules`` and results go to ``args.output``.
    Unless ``args.quiet`` is set, a one-line summary is printed to stderr.

    Returns:
        1 when constructing the searcher raises ``ValueError`` or the input
        file does not exist; otherwise 0, regardless of how many molecules
        failed.
    """
    # Lazy imports
    from rdkit_cli.core.similarity import SimilaritySearcher, SimilarityMetric
    from rdkit_cli.io import create_reader, create_writer
    from rdkit_cli.parallel.batch import process_molecules

    # NOTE(review): args.fp_type, args.top_n, args.sort, args.include_query and
    # args.add_rank are not consulted here -- confirm whether the searcher or
    # the writer is meant to receive them.
    try:
        searcher = SimilaritySearcher(
            query_smiles=args.query,
            threshold=args.threshold,
            metric=SimilarityMetric(args.metric),
            radius=args.radius,
            n_bits=args.bits,
        )
    except ValueError as err:
        print(f"Error: {err}", file=sys.stderr)
        return 1

    source = Path(args.input)
    if not source.exists():
        print(f"Error: Input file not found: {source}", file=sys.stderr)
        return 1

    reader = create_reader(
        source,
        smiles_column=args.smiles_column,
        name_column=args.name_column,
        has_header=not args.no_header,
    )
    writer = create_writer(Path(args.output))

    with reader, writer:
        stats = process_molecules(
            reader=reader,
            writer=writer,
            processor=searcher.search,
            n_workers=args.ncpu,
            quiet=args.quiet,
        )

    if not args.quiet:
        summary = (
            f"Found {stats.successful}/{stats.total_processed} molecules above threshold "
            f"({stats.failed} failed) in {stats.elapsed_time:.1f}s"
        )
        print(summary, file=sys.stderr)

    return 0
250
+
251
+
252
def run_matrix(args) -> int:
    """Compute a pairwise similarity (or distance) matrix and write it as CSV.

    All records of ``args.input`` are loaded into memory, the matrix is
    computed with ``compute_similarity_matrix`` using ``args.metric``, and the
    result is written to ``args.output`` with molecule names as row and column
    labels.  A record without a name is labelled with the first 20 characters
    of its SMILES.

    Honours ``args.precision`` (number of decimals, default 4) and
    ``args.distance`` (write ``1 - similarity`` instead of similarity).

    Returns:
        0 on success; 1 when ``args.precision`` is negative or the input file
        does not exist.
    """
    import csv

    # getattr keeps hand-built namespaces lacking these attributes working.
    precision = getattr(args, "precision", 4)
    as_distance = getattr(args, "distance", False)
    if precision < 0:
        print(f"Error: --precision must be non-negative, got {precision}", file=sys.stderr)
        return 1

    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        return 1

    # Lazy imports, deferred until the arguments have been validated.
    from rdkit_cli.core.similarity import compute_similarity_matrix, SimilarityMetric
    from rdkit_cli.io import create_reader

    reader = create_reader(
        input_path,
        smiles_column=args.smiles_column,
        name_column=args.name_column,
        has_header=not args.no_header,
    )

    # Read all molecules
    if not args.quiet:
        print("Reading molecules...", file=sys.stderr)

    # Use the reader as a context manager so it is closed after loading.
    with reader:
        records = list(reader)
    mols = [r.mol for r in records]
    names = [r.name or r.smiles[:20] for r in records]

    if not args.quiet:
        print(f"Computing {len(mols)}x{len(mols)} similarity matrix...", file=sys.stderr)

    # NOTE(review): args.fp_type, args.radius and args.bits are not forwarded;
    # the signature of compute_similarity_matrix is not visible here -- confirm
    # whether it accepts fingerprint settings.
    matrix = compute_similarity_matrix(
        mols,
        metric=SimilarityMetric(args.metric),
    )

    # Write output.  csv.writer quotes names containing commas, quotes or
    # newlines, which would otherwise corrupt the table.
    output_path = Path(args.output)
    with open(output_path, "w", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        # Header: the top-left corner cell is empty, then one column per molecule.
        f.write(",")
        writer.writerow(names)
        # Data
        for name, row in zip(names, matrix):
            values = ((1.0 - v) if as_distance else v for v in row)
            writer.writerow([name] + [f"{v:.{precision}f}" for v in values])

    if not args.quiet:
        kind = "distance" if as_distance else "similarity"
        print(f"Wrote {kind} matrix to {output_path}", file=sys.stderr)

    return 0
299
+
300
+
301
def run_cluster(args) -> int:
    """Cluster the molecules in ``args.input`` and write cluster assignments.

    All records are loaded into memory and passed to ``cluster_molecules``
    with ``args.cutoff``, ``args.radius`` and ``args.bits``.  Clusters smaller
    than ``args.min_cluster_size`` (default 1) are dropped; the remaining
    clusters are numbered from 0 in the order returned, and one CSV row per
    member (``smiles,name,cluster,cluster_size``) is written to
    ``args.output``.

    Returns:
        0 on success; 1 when the input file does not exist.
    """
    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        return 1

    # Lazy imports, deferred until the input path has been validated.
    from rdkit_cli.core.similarity import cluster_molecules
    from rdkit_cli.io import create_reader

    reader = create_reader(
        input_path,
        smiles_column=args.smiles_column,
        name_column=args.name_column,
        has_header=not args.no_header,
    )

    # Read all molecules
    if not args.quiet:
        print("Reading molecules...", file=sys.stderr)

    # Use the reader as a context manager so it is closed after loading.
    with reader:
        records = list(reader)
    mols = [r.mol for r in records]

    if not args.quiet:
        print(f"Clustering {len(mols)} molecules...", file=sys.stderr)

    # NOTE(review): args.method, args.fp_type and args.add_centroid are not
    # consulted here; the signature of cluster_molecules is not visible --
    # confirm whether it accepts them.
    clusters = cluster_molecules(
        mols,
        cutoff=args.cutoff,
        radius=args.radius,
        n_bits=args.bits,
    )

    # Filter by minimum cluster size
    min_size = getattr(args, "min_cluster_size", 1)
    clusters = [c for c in clusters if len(c) >= min_size]

    # Write output
    output_path = Path(args.output)
    with open(output_path, "w") as f:
        f.write("smiles,name,cluster,cluster_size\n")
        for cluster_id, cluster in enumerate(clusters):
            cluster_size = len(cluster)
            for idx in cluster:
                r = records[idx]
                # Double embedded quotes so the quoted CSV fields stay valid.
                smiles = r.smiles.replace('"', '""')
                name = (r.name or "").replace('"', '""')
                f.write(f'"{smiles}","{name}",{cluster_id},{cluster_size}\n')

    if not args.quiet:
        print(
            f"Found {len(clusters)} clusters from {len(mols)} molecules. "
            f"Wrote to {output_path}",
            file=sys.stderr,
        )

    return 0