rdkit-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. rdkit_cli/__init__.py +4 -0
  2. rdkit_cli/__main__.py +6 -0
  3. rdkit_cli/cli.py +162 -0
  4. rdkit_cli/commands/__init__.py +1 -0
  5. rdkit_cli/commands/conformers.py +220 -0
  6. rdkit_cli/commands/convert.py +162 -0
  7. rdkit_cli/commands/depict.py +311 -0
  8. rdkit_cli/commands/descriptors.py +251 -0
  9. rdkit_cli/commands/diversity.py +232 -0
  10. rdkit_cli/commands/enumerate.py +229 -0
  11. rdkit_cli/commands/filter.py +384 -0
  12. rdkit_cli/commands/fingerprints.py +179 -0
  13. rdkit_cli/commands/fragment.py +284 -0
  14. rdkit_cli/commands/mcs.py +162 -0
  15. rdkit_cli/commands/reactions.py +191 -0
  16. rdkit_cli/commands/scaffold.py +243 -0
  17. rdkit_cli/commands/similarity.py +359 -0
  18. rdkit_cli/commands/standardize.py +138 -0
  19. rdkit_cli/core/__init__.py +1 -0
  20. rdkit_cli/core/conformers.py +197 -0
  21. rdkit_cli/core/depict.py +241 -0
  22. rdkit_cli/core/descriptors.py +248 -0
  23. rdkit_cli/core/diversity.py +174 -0
  24. rdkit_cli/core/enumerate.py +190 -0
  25. rdkit_cli/core/filters.py +443 -0
  26. rdkit_cli/core/fingerprints.py +265 -0
  27. rdkit_cli/core/fragment.py +237 -0
  28. rdkit_cli/core/mcs.py +128 -0
  29. rdkit_cli/core/reactions.py +159 -0
  30. rdkit_cli/core/scaffold.py +174 -0
  31. rdkit_cli/core/similarity.py +206 -0
  32. rdkit_cli/core/standardizer.py +141 -0
  33. rdkit_cli/io/__init__.py +7 -0
  34. rdkit_cli/io/formats.py +109 -0
  35. rdkit_cli/io/readers.py +352 -0
  36. rdkit_cli/io/writers.py +275 -0
  37. rdkit_cli/parallel/__init__.py +5 -0
  38. rdkit_cli/parallel/batch.py +181 -0
  39. rdkit_cli/parallel/executor.py +180 -0
  40. rdkit_cli/progress/__init__.py +5 -0
  41. rdkit_cli/progress/ninja.py +195 -0
  42. rdkit_cli/utils/__init__.py +1 -0
  43. rdkit_cli-0.1.0.dist-info/METADATA +380 -0
  44. rdkit_cli-0.1.0.dist-info/RECORD +47 -0
  45. rdkit_cli-0.1.0.dist-info/WHEEL +4 -0
  46. rdkit_cli-0.1.0.dist-info/entry_points.txt +2 -0
  47. rdkit_cli-0.1.0.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,232 @@
1
+ """Diversity command implementation."""
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ from rdkit_cli.cli import RdkitHelpFormatter, add_common_processing_options
7
+
8
+
9
def register_parser(subparsers):
    """Attach the ``diversity`` command and its subcommands to *subparsers*.

    Two subcommands are registered: ``pick`` (dispatches to ``run_pick``) and
    ``analyze`` (dispatches to ``run_analyze``). When ``diversity`` is invoked
    without a subcommand, the default handler prints the command help and
    returns 1.

    Args:
        subparsers: The subparsers action of the top-level CLI parser.
    """

    def add_fingerprint_options(target):
        # Fingerprint settings that both subcommands declare identically.
        target.add_argument(
            "-r", "--radius",
            type=int,
            default=2,
            help="Morgan fingerprint radius (default: 2)",
        )
        target.add_argument(
            "-b", "--bits",
            type=int,
            default=2048,
            help="Fingerprint bit size (default: 2048)",
        )

    def show_help(args):
        # Fallback handler: no subcommand was chosen, so show usage and fail.
        diversity_cmd.print_help()
        return 1

    diversity_cmd = subparsers.add_parser(
        "diversity",
        help="Analyze and select diverse molecules",
        description="Analyze molecular diversity and select diverse subsets.",
        formatter_class=RdkitHelpFormatter,
    )
    nested = diversity_cmd.add_subparsers(
        title="Subcommands",
        dest="subcommand",
        metavar="<subcommand>",
    )

    # --- diversity pick -------------------------------------------------
    pick_cmd = nested.add_parser(
        "pick",
        help="Select diverse subset using MaxMin algorithm",
        formatter_class=RdkitHelpFormatter,
    )
    pick_cmd.add_argument(
        "-i", "--input", required=True, metavar="FILE", help="Input file"
    )
    pick_cmd.add_argument(
        "-o", "--output", required=True, metavar="FILE", help="Output file"
    )
    add_common_processing_options(pick_cmd)
    pick_cmd.add_argument(
        "-k", "--num-picks",
        type=int,
        default=100,
        metavar="N",
        help="Number of molecules to pick (default: 100)",
    )
    pick_cmd.add_argument(
        "-m", "--method",
        choices=["maxmin", "leader"],
        default="maxmin",
        help="Picking method (default: maxmin)",
    )
    add_fingerprint_options(pick_cmd)
    pick_cmd.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed (default: 42)",
    )
    pick_cmd.set_defaults(func=run_pick)

    # --- diversity analyze ----------------------------------------------
    analyze_cmd = nested.add_parser(
        "analyze",
        help="Analyze diversity of a molecule set",
        formatter_class=RdkitHelpFormatter,
    )
    analyze_cmd.add_argument(
        "-i", "--input", required=True, metavar="FILE", help="Input file"
    )
    analyze_cmd.add_argument(
        "-o", "--output",
        metavar="FILE",
        help="Output file (optional, prints to stdout if not specified)",
    )
    add_common_processing_options(analyze_cmd)
    add_fingerprint_options(analyze_cmd)
    analyze_cmd.add_argument(
        "--sample-size",
        type=int,
        default=1000,
        help="Max molecules to sample for analysis (default: 1000)",
    )
    analyze_cmd.set_defaults(func=run_analyze)

    # Bare `diversity` (no subcommand) falls through to the help handler.
    diversity_cmd.set_defaults(func=show_help)
116
+
117
+
118
def run_pick(args) -> int:
    """Select a diverse subset of the input molecules and write it out.

    All records are loaded into memory, their molecules are handed to
    ``DiversityPicker.pick``, and one row per selected index is written to
    the output file. Each row holds the record's SMILES, its position in the
    picker's returned order (``diversity_rank``, starting at 0) and, when the
    record has one, its name.

    Args:
        args: Parsed command-line namespace; reads ``input``, ``output``,
            ``num_picks``, ``seed``, ``radius``, ``bits``, ``method``,
            ``smiles_column``, ``name_column``, ``no_header`` and ``quiet``.

    Returns:
        1 when the input file does not exist, otherwise 0.
    """
    from rdkit_cli.core.diversity import DiversityPicker
    from rdkit_cli.io import create_reader, create_writer

    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        return 1

    reader = create_reader(
        input_path,
        smiles_column=args.smiles_column,
        name_column=args.name_column,
        has_header=not args.no_header,
    )

    if not args.quiet:
        print("Reading molecules...", file=sys.stderr)

    # The picker needs the full set at once, so everything is materialized.
    # The reader is used as a context manager so its underlying file is
    # released as soon as the records are in memory.
    with reader:
        records = list(reader)
    mols = [record.mol for record in records]

    if not args.quiet:
        print(
            f"Picking {args.num_picks} diverse molecules from {len(mols)}...",
            file=sys.stderr,
        )

    picker = DiversityPicker(
        n_picks=args.num_picks,
        seed=args.seed,
        radius=args.radius,
        n_bits=args.bits,
        method=args.method,
    )
    selected_indices = picker.pick(mols)

    output_path = Path(args.output)
    writer = create_writer(output_path)

    with writer:
        # enumerate() yields the rank directly: no repeated linear .index()
        # scans, and the rank stays correct even if an index were repeated.
        for rank, idx in enumerate(selected_indices):
            record = records[idx]
            result = {
                "smiles": record.smiles,
                "diversity_rank": rank,
            }
            if record.name:
                result["name"] = record.name
            writer.write_row(result)

    if not args.quiet:
        print(
            f"Selected {len(selected_indices)} diverse molecules. Wrote to {output_path}",
            file=sys.stderr,
        )

    return 0
179
+
180
+
181
def run_analyze(args) -> int:
    """Compute diversity statistics for the input molecules.

    All molecules are loaded into memory and passed to
    ``DiversityAnalyzer.analyze``. The resulting mapping is written as a
    single row to ``args.output`` when that is given; otherwise it is printed
    to stdout as ``key: value`` lines between two rules of 40 ``=`` characters.

    Args:
        args: Parsed command-line namespace; reads ``input``, ``output``,
            ``radius``, ``bits``, ``sample_size``, ``smiles_column``,
            ``name_column``, ``no_header`` and ``quiet``.

    Returns:
        1 when the input file does not exist, otherwise 0.
    """
    from rdkit_cli.core.diversity import DiversityAnalyzer
    from rdkit_cli.io import create_reader, create_writer

    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        return 1

    reader = create_reader(
        input_path,
        smiles_column=args.smiles_column,
        name_column=args.name_column,
        has_header=not args.no_header,
    )

    if not args.quiet:
        print("Reading molecules...", file=sys.stderr)

    # Use the reader as a context manager so its underlying file is released
    # once every molecule has been collected.
    with reader:
        mols = [record.mol for record in reader]

    if not args.quiet:
        print(f"Analyzing diversity of {len(mols)} molecules...", file=sys.stderr)

    analyzer = DiversityAnalyzer(
        radius=args.radius,
        n_bits=args.bits,
        sample_size=args.sample_size,
    )
    stats = analyzer.analyze(mols)

    if args.output:
        output_path = Path(args.output)
        writer = create_writer(output_path)
        with writer:
            writer.write_row(stats)
        if not args.quiet:
            print(f"Wrote diversity analysis to {output_path}", file=sys.stderr)
    else:
        # No output file: the report itself goes to stdout (status messages
        # above go to stderr), so it is printed regardless of --quiet.
        print("\nDiversity Analysis Results")
        print("=" * 40)
        for key, value in stats.items():
            print(f"{key}: {value}")
        print("=" * 40)

    return 0
@@ -0,0 +1,229 @@
1
+ """Enumerate command implementation."""
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ from rdkit_cli.cli import RdkitHelpFormatter, add_common_io_options, add_common_processing_options
7
+
8
+
9
def register_parser(subparsers):
    """Register the ``enumerate`` command and its subcommands.

    Three subcommands are registered: ``stereoisomers``
    (``run_stereoisomers``), ``tautomers`` (``run_tautomers``) and
    ``canonical-tautomer`` (``run_canonical_tautomer``). When ``enumerate``
    is invoked without a subcommand, the default handler prints the command
    help and returns 1.

    Args:
        subparsers: The subparsers action of the top-level CLI parser.
    """
    parser = subparsers.add_parser(
        "enumerate",
        help="Enumerate molecular variants",
        description="Enumerate stereoisomers, tautomers, and other molecular variants.",
        formatter_class=RdkitHelpFormatter,
    )

    enum_subparsers = parser.add_subparsers(
        title="Subcommands",
        dest="subcommand",
        metavar="<subcommand>",
    )

    # enumerate stereoisomers
    stereo_parser = enum_subparsers.add_parser(
        "stereoisomers",
        help="Enumerate stereoisomers",
        formatter_class=RdkitHelpFormatter,
    )
    add_common_io_options(stereo_parser)
    add_common_processing_options(stereo_parser)
    stereo_parser.add_argument(
        "--max-isomers",
        type=int,
        default=32,
        metavar="N",
        help="Maximum stereoisomers per molecule (default: 32)",
    )
    # --only-unassigned is already the default (store_true with default=True),
    # so it only restates the default mode; --all-centers is the real switch.
    # Grouping them makes argparse reject the contradictory combination
    # instead of silently letting --all-centers win.
    center_mode = stereo_parser.add_mutually_exclusive_group()
    center_mode.add_argument(
        "--only-unassigned",
        action="store_true",
        default=True,
        help="Only enumerate unassigned stereocenters (default: True)",
    )
    center_mode.add_argument(
        "--all-centers",
        action="store_true",
        help="Enumerate all stereocenters, not just unassigned",
    )
    stereo_parser.set_defaults(func=run_stereoisomers)

    # enumerate tautomers
    taut_parser = enum_subparsers.add_parser(
        "tautomers",
        help="Enumerate tautomers",
        formatter_class=RdkitHelpFormatter,
    )
    add_common_io_options(taut_parser)
    add_common_processing_options(taut_parser)
    taut_parser.add_argument(
        "--max-tautomers",
        type=int,
        default=50,
        metavar="N",
        help="Maximum tautomers per molecule (default: 50)",
    )
    taut_parser.add_argument(
        "--max-transforms",
        type=int,
        default=1000,
        metavar="N",
        help="Maximum transforms to apply (default: 1000)",
    )
    taut_parser.set_defaults(func=run_tautomers)

    # enumerate canonical-tautomer
    canon_parser = enum_subparsers.add_parser(
        "canonical-tautomer",
        help="Get canonical tautomer",
        formatter_class=RdkitHelpFormatter,
    )
    add_common_io_options(canon_parser)
    add_common_processing_options(canon_parser)
    canon_parser.add_argument(
        "--include-original",
        action="store_true",
        help="Include original SMILES in output",
    )
    canon_parser.set_defaults(func=run_canonical_tautomer)

    # Bare `enumerate` (no subcommand): print_help() returns None, so the
    # `or 1` makes the handler's result the exit status 1.
    parser.set_defaults(func=lambda args: parser.print_help() or 1)
93
+
94
+
95
def run_stereoisomers(args) -> int:
    """Enumerate stereoisomers for every record of the input file.

    Each record is passed to ``StereoisomerEnumerator.enumerate`` and every
    row it produces is written to the output file. Passing ``--all-centers``
    turns off the enumerator's ``only_unassigned`` mode.

    Args:
        args: Parsed command-line namespace; reads ``input``, ``output``,
            ``max_isomers``, ``all_centers``, ``smiles_column``,
            ``name_column``, ``no_header`` and ``quiet``.

    Returns:
        1 when the input file does not exist, otherwise 0.
    """
    from rdkit_cli.core.enumerate import StereoisomerEnumerator
    from rdkit_cli.io import create_reader, create_writer

    # The enumerator is constructed before the input path is checked.
    stereo = StereoisomerEnumerator(
        max_isomers=args.max_isomers,
        only_unassigned=not args.all_centers,
    )

    source = Path(args.input)
    if not source.exists():
        print(f"Error: Input file not found: {source}", file=sys.stderr)
        return 1

    reader = create_reader(
        source,
        smiles_column=args.smiles_column,
        name_column=args.name_column,
        has_header=not args.no_header,
    )
    writer = create_writer(Path(args.output))

    molecules_seen = 0
    rows_written = 0

    with reader, writer:
        for record in reader:
            molecules_seen += 1
            for row in stereo.enumerate(record):
                writer.write_row(row)
                rows_written += 1

    if args.quiet:
        return 0

    print(
        f"Enumerated {rows_written} stereoisomers from {molecules_seen} molecules",
        file=sys.stderr,
    )
    return 0
138
+
139
+
140
def run_tautomers(args) -> int:
    """Enumerate tautomers for every record of the input file.

    Each record is passed to ``TautomerEnumerator.enumerate`` and every row
    it produces is written to the output file.

    Args:
        args: Parsed command-line namespace; reads ``input``, ``output``,
            ``max_tautomers``, ``max_transforms``, ``smiles_column``,
            ``name_column``, ``no_header`` and ``quiet``.

    Returns:
        1 when the input file does not exist, otherwise 0.
    """
    from rdkit_cli.core.enumerate import TautomerEnumerator
    from rdkit_cli.io import create_reader, create_writer

    # The enumerator is constructed before the input path is checked.
    tautomerizer = TautomerEnumerator(
        max_tautomers=args.max_tautomers,
        max_transforms=args.max_transforms,
    )

    source = Path(args.input)
    if not source.exists():
        print(f"Error: Input file not found: {source}", file=sys.stderr)
        return 1

    reader = create_reader(
        source,
        smiles_column=args.smiles_column,
        name_column=args.name_column,
        has_header=not args.no_header,
    )
    writer = create_writer(Path(args.output))

    molecules_seen = 0
    rows_written = 0

    with reader, writer:
        for record in reader:
            molecules_seen += 1
            for row in tautomerizer.enumerate(record):
                writer.write_row(row)
                rows_written += 1

    if args.quiet:
        return 0

    print(
        f"Enumerated {rows_written} tautomers from {molecules_seen} molecules",
        file=sys.stderr,
    )
    return 0
183
+
184
+
185
def run_canonical_tautomer(args) -> int:
    """Write the canonical tautomer of every record of the input file.

    Each record is passed to ``CanonicalTautomerizer.canonicalize``. Records
    for which it returns ``None`` are counted as failed and skipped; all
    other results are written to the output file.

    Args:
        args: Parsed command-line namespace; reads ``input``, ``output``,
            ``include_original``, ``smiles_column``, ``name_column``,
            ``no_header`` and ``quiet``.

    Returns:
        1 when the input file does not exist, otherwise 0 (failed records
        do not change the exit status).
    """
    from rdkit_cli.core.enumerate import CanonicalTautomerizer
    from rdkit_cli.io import create_reader, create_writer

    canonicalizer = CanonicalTautomerizer(
        include_original=args.include_original,
    )

    source = Path(args.input)
    if not source.exists():
        print(f"Error: Input file not found: {source}", file=sys.stderr)
        return 1

    reader = create_reader(
        source,
        smiles_column=args.smiles_column,
        name_column=args.name_column,
        has_header=not args.no_header,
    )
    writer = create_writer(Path(args.output))

    # Deliberately single-threaded: per the original author's note, RDKit
    # TautomerEnumerator objects can't be pickled for multiprocessing.
    seen = 0
    written = 0

    with reader, writer:
        for record in reader:
            seen += 1
            row = canonicalizer.canonicalize(record)
            if row is None:
                continue
            writer.write_row(row)
            written += 1

    if not args.quiet:
        failed = seen - written
        print(
            f"Canonicalized {written}/{seen} molecules ({failed} failed)",
            file=sys.stderr,
        )

    return 0