rdkit-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. rdkit_cli/__init__.py +4 -0
  2. rdkit_cli/__main__.py +6 -0
  3. rdkit_cli/cli.py +162 -0
  4. rdkit_cli/commands/__init__.py +1 -0
  5. rdkit_cli/commands/conformers.py +220 -0
  6. rdkit_cli/commands/convert.py +162 -0
  7. rdkit_cli/commands/depict.py +311 -0
  8. rdkit_cli/commands/descriptors.py +251 -0
  9. rdkit_cli/commands/diversity.py +232 -0
  10. rdkit_cli/commands/enumerate.py +229 -0
  11. rdkit_cli/commands/filter.py +384 -0
  12. rdkit_cli/commands/fingerprints.py +179 -0
  13. rdkit_cli/commands/fragment.py +284 -0
  14. rdkit_cli/commands/mcs.py +162 -0
  15. rdkit_cli/commands/reactions.py +191 -0
  16. rdkit_cli/commands/scaffold.py +243 -0
  17. rdkit_cli/commands/similarity.py +359 -0
  18. rdkit_cli/commands/standardize.py +138 -0
  19. rdkit_cli/core/__init__.py +1 -0
  20. rdkit_cli/core/conformers.py +197 -0
  21. rdkit_cli/core/depict.py +241 -0
  22. rdkit_cli/core/descriptors.py +248 -0
  23. rdkit_cli/core/diversity.py +174 -0
  24. rdkit_cli/core/enumerate.py +190 -0
  25. rdkit_cli/core/filters.py +443 -0
  26. rdkit_cli/core/fingerprints.py +265 -0
  27. rdkit_cli/core/fragment.py +237 -0
  28. rdkit_cli/core/mcs.py +128 -0
  29. rdkit_cli/core/reactions.py +159 -0
  30. rdkit_cli/core/scaffold.py +174 -0
  31. rdkit_cli/core/similarity.py +206 -0
  32. rdkit_cli/core/standardizer.py +141 -0
  33. rdkit_cli/io/__init__.py +7 -0
  34. rdkit_cli/io/formats.py +109 -0
  35. rdkit_cli/io/readers.py +352 -0
  36. rdkit_cli/io/writers.py +275 -0
  37. rdkit_cli/parallel/__init__.py +5 -0
  38. rdkit_cli/parallel/batch.py +181 -0
  39. rdkit_cli/parallel/executor.py +180 -0
  40. rdkit_cli/progress/__init__.py +5 -0
  41. rdkit_cli/progress/ninja.py +195 -0
  42. rdkit_cli/utils/__init__.py +1 -0
  43. rdkit_cli-0.1.0.dist-info/METADATA +380 -0
  44. rdkit_cli-0.1.0.dist-info/RECORD +47 -0
  45. rdkit_cli-0.1.0.dist-info/WHEEL +4 -0
  46. rdkit_cli-0.1.0.dist-info/entry_points.txt +2 -0
  47. rdkit_cli-0.1.0.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,248 @@
1
+ """Molecular descriptor computation engine."""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional, Any
5
+
6
+ from rdkit import Chem
7
+ from rdkit.Chem import Descriptors, rdMolDescriptors, QED
8
+
9
+ from rdkit_cli.io.readers import MoleculeRecord
10
+
11
+
12
# Descriptor categories
# Category labels used to group descriptors. The name-based heuristic in
# this module only ever assigns "topological", "electronic",
# "constitutional" or "molecular"; "geometric" is listed here but is not
# assigned by any code visible in this file.
DESCRIPTOR_CATEGORIES = [
    "constitutional",
    "topological",
    "electronic",
    "geometric",
    "molecular",
]
20
+
21
+
22
@dataclass
class DescriptorInfo:
    """Information about a descriptor.

    Plain record pairing a descriptor's name with its description text
    and category label.
    """

    # Descriptor name (the key used to look the descriptor up).
    name: str
    # Human-readable description text.
    description: str
    # Category label the descriptor was filed under.
    category: str
29
+
30
+
31
+ # Build descriptor registry from RDKit
32
def _build_descriptor_registry() -> dict[str, tuple[callable, str, str]]:
    """Build the registry of every descriptor listed in ``Descriptors.descList``.

    Each descriptor is filed under a category chosen by substring matching
    on its lower-cased name. Rules are tried in order and the first match
    wins; names matching no rule are filed as ``"molecular"``.

    Returns:
        Mapping of descriptor name to ``(function, description, category)``.
    """
    # Ordered (category, substrings) rules -- order matters because a name
    # may contain substrings from more than one rule.
    rules = (
        ("topological", ("chi", "kappa", "hall", "balaban", "bertz")),
        ("electronic", ("tpsa", "labute", "peoe", "gasteiger")),
        ("constitutional", ("num", "count", "heavy", "ring", "rotatable")),
    )

    def classify(descriptor_name: str) -> str:
        """Return the category label for a single descriptor name."""
        lowered = descriptor_name.lower()
        for label, needles in rules:
            if any(needle in lowered for needle in needles):
                return label
        # Fallback bucket for everything the rules above do not claim.
        return "molecular"

    return {
        name: (func, f"RDKit descriptor: {name}", classify(name))
        for name, func in Descriptors.descList
    }
54
+
55
+
56
# Module-level registry, built once at import time. Maps a descriptor name
# to a (function, description, category) tuple.
DESCRIPTOR_REGISTRY = _build_descriptor_registry()

# Add QED (not in Descriptors.descList)
DESCRIPTOR_REGISTRY["QED"] = (QED.qed, "Quantitative Estimate of Drug-likeness", "molecular")
60
+
61
+
62
def compute_lipinski_violations(mol: Chem.Mol) -> int:
    """
    Count how many Lipinski Rule of 5 limits a molecule exceeds.

    The four limits checked are: molecular weight > 500, logP > 5,
    hydrogen-bond donors > 5 and hydrogen-bond acceptors > 10.

    Args:
        mol: RDKit molecule

    Returns:
        Number of violations (0-4)
    """
    # One boolean per rule; True means the limit is exceeded.
    exceeded = (
        Descriptors.MolWt(mol) > 500,
        Descriptors.MolLogP(mol) > 5,
        Descriptors.NumHDonors(mol) > 5,
        Descriptors.NumHAcceptors(mol) > 10,
    )
    return sum(exceeded)
84
+
85
+
86
def list_descriptors(
    category: Optional[str] = None,
    verbose: bool = False,
) -> list[DescriptorInfo]:
    """
    List the descriptors held in ``DESCRIPTOR_REGISTRY``, sorted by name.

    Args:
        category: If given, keep only descriptors whose category equals it.
            A category that matches nothing yields an empty list.
        verbose: Accepted for interface compatibility; it is not consulted
            here -- descriptions are always included.

    Returns:
        List of DescriptorInfo objects
    """
    entries = sorted(DESCRIPTOR_REGISTRY.items())
    return [
        DescriptorInfo(name=key, description=text, category=label)
        for key, (_func, text, label) in entries
        if category is None or label == category
    ]
107
+
108
+
109
def compute_descriptor(mol: Chem.Mol, name: str) -> Optional[float]:
    """
    Compute a single descriptor for a molecule.

    Args:
        mol: RDKit molecule
        name: Descriptor name (a key of ``DESCRIPTOR_REGISTRY``)

    Returns:
        The descriptor value as a ``float``, or None if the computation
        raised, returned None, or produced a non-finite value (NaN/inf).

    Raises:
        ValueError: If ``name`` is not a registered descriptor.
    """
    import math

    if name not in DESCRIPTOR_REGISTRY:
        raise ValueError(f"Unknown descriptor: {name}")

    func = DESCRIPTOR_REGISTRY[name][0]

    # Best effort: a descriptor that fails on this molecule yields None
    # instead of aborting the whole run.
    try:
        value = func(mol)
        if value is None:
            return None
        number = float(value)
    except Exception:
        return None

    # Check finiteness *after* conversion so NaN/inf is rejected whatever
    # numeric type the descriptor function returned, not only for ``float``.
    return number if math.isfinite(number) else None
133
+
134
+
135
class DescriptorCalculator:
    """Calculator for molecular descriptors.

    Produces one dictionary ("row") per molecule record. The keys of every
    row match ``get_column_names()`` exactly, in the same order.
    """

    def __init__(
        self,
        descriptors: Optional[list[str]] = None,
        include_smiles: bool = True,
        include_name: bool = True,
        precision: int = 4,
        error_value: str = "NaN",
    ):
        """
        Initialize descriptor calculator.

        Args:
            descriptors: List of descriptor names (None for all)
            include_smiles: Include SMILES in output
            include_name: Include molecule name in output
            precision: Decimal precision for float values
            error_value: Value to use for failed calculations

        Raises:
            ValueError: If any requested descriptor name is not registered.
        """
        if descriptors is None:
            self.descriptors = list(DESCRIPTOR_REGISTRY)
        else:
            # Sorted so the error message is deterministic (set iteration
            # order is not).
            unknown = sorted(set(descriptors) - DESCRIPTOR_REGISTRY.keys())
            if unknown:
                raise ValueError(f"Unknown descriptors: {', '.join(unknown)}")
            # Copy so later mutation of the caller's list cannot change
            # which columns this calculator emits.
            self.descriptors = list(descriptors)

        self.include_smiles = include_smiles
        self.include_name = include_name
        self.precision = precision
        self.error_value = error_value

    def _format_value(self, value: Optional[float]) -> Any:
        """Format a descriptor value with precision and error handling.

        None becomes ``error_value``; floats are rounded to ``precision``
        decimals; anything else is passed through unchanged.
        """
        if value is None:
            return self.error_value
        if isinstance(value, float):
            return round(value, self.precision)
        return value

    def compute(self, record: "MoleculeRecord") -> Optional[dict[str, Any]]:
        """
        Compute descriptors for a molecule record.

        Args:
            record: MoleculeRecord to process

        Returns:
            Dictionary with descriptor values or None if molecule is invalid
        """
        if record.mol is None:
            return None

        result: dict[str, Any] = {}

        if self.include_smiles:
            result["smiles"] = record.smiles
        if self.include_name:
            # Always emit the key (empty string for unnamed records) so the
            # row's keys line up with get_column_names() for every record.
            result["name"] = record.name or ""

        for desc_name in self.descriptors:
            value = compute_descriptor(record.mol, desc_name)
            result[desc_name] = self._format_value(value)

        return result

    def get_column_names(self) -> list[str]:
        """Get output column names in order."""
        cols = []
        if self.include_smiles:
            cols.append("smiles")
        if self.include_name:
            cols.append("name")
        cols.extend(self.descriptors)
        return cols
213
+
214
+
215
# Common descriptor sets
# Named presets of descriptor names. The names are expected to be keys of
# DESCRIPTOR_REGISTRY -- verify against the installed RDKit's descList.

# General-purpose set of physico-chemical and count descriptors.
COMMON_DESCRIPTORS = [
    "MolWt",
    "ExactMolWt",
    "HeavyAtomCount",
    "NumHAcceptors",
    "NumHDonors",
    "NumRotatableBonds",
    "NumHeteroatoms",
    "NumAromaticRings",
    "RingCount",
    "TPSA",
    "MolLogP",
    "MolMR",
    "FractionCSP3",
]

# The four properties checked by compute_lipinski_violations.
LIPINSKI_DESCRIPTORS = [
    "MolWt",
    "MolLogP",
    "NumHDonors",
    "NumHAcceptors",
]

# The Lipinski properties plus TPSA, rotatable bonds, ring count and
# heavy-atom count.
DRUGLIKE_DESCRIPTORS = [
    "MolWt",
    "MolLogP",
    "NumHDonors",
    "NumHAcceptors",
    "TPSA",
    "NumRotatableBonds",
    "RingCount",
    "HeavyAtomCount",
]
@@ -0,0 +1,174 @@
1
+ """Molecular diversity analysis engine."""
2
+
3
+ from typing import Optional, Any
4
+
5
+ from rdkit import Chem, DataStructs
6
+ from rdkit.Chem import rdMolDescriptors
7
+ from rdkit.SimDivFilters import rdSimDivPickers
8
+
9
+
10
def get_morgan_fingerprint(mol: Chem.Mol, radius: int = 2, n_bits: int = 2048):
    """Return the Morgan bit-vector fingerprint of ``mol``.

    Args:
        mol: RDKit molecule
        radius: Morgan fingerprint radius
        n_bits: Length of the bit vector

    Returns:
        The bit vector produced by ``GetMorganFingerprintAsBitVect``.
    """
    # NOTE(review): newer RDKit releases appear to favour
    # rdFingerprintGenerator over this call -- confirm against the RDKit
    # version this package targets before migrating.
    bit_vector = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        mol, radius, nBits=n_bits
    )
    return bit_vector
13
+
14
+
15
class DiversityPicker:
    """Select a diverse subset of molecules with RDKit's MaxMin or Leader picker."""

    def __init__(
        self,
        n_picks: int = 100,
        seed: Optional[int] = None,
        radius: int = 2,
        n_bits: int = 2048,
        method: str = "maxmin",
        threshold: float = 0.65,
    ):
        """
        Initialize diversity picker.

        Args:
            n_picks: Number of molecules to pick
            seed: Random seed for reproducibility (used by 'maxmin' only;
                the Leader picker call takes no seed)
            radius: Morgan fingerprint radius
            n_bits: Fingerprint bit size
            method: Picking method. 'maxmin' uses MaxMinPicker; any other
                value uses LeaderPicker.
            threshold: Distance threshold handed to LeaderPicker (ignored
                for 'maxmin').
                NOTE(review): 0.65 is a starting value, not tuned for any
                particular dataset -- adjust as needed.
        """
        self.n_picks = n_picks
        self.seed = seed
        self.radius = radius
        self.n_bits = n_bits
        self.method = method
        self.threshold = threshold

    def pick(
        self,
        mols: "list[Chem.Mol]",
        first_picks: Optional[list[int]] = None,
    ) -> list[int]:
        """
        Pick diverse subset of molecules.

        Args:
            mols: List of molecules; None entries are skipped
            first_picks: Indices (into ``mols``) of molecules that must be
                included. Indices that are out of range or point at None
                entries are silently dropped.

        Returns:
            List of selected indices into ``mols``
        """
        # Positions of the usable molecules in the caller's list.
        valid_indices = [i for i, mol in enumerate(mols) if mol is not None]
        if not valid_indices:
            return []

        # Never ask for more than is available; nothing to do for <= 0.
        # (For LeaderPicker a pickSize of 0 would mean "no limit".)
        n_to_pick = min(self.n_picks, len(valid_indices))
        if n_to_pick <= 0:
            return []

        fps = [
            get_morgan_fingerprint(mols[i], self.radius, self.n_bits)
            for i in valid_indices
        ]

        # Original index -> position in ``fps`` (O(1) lookups instead of
        # repeated list.index scans).
        position_of = {orig: pos for pos, orig in enumerate(valid_indices)}
        mapped_first = [position_of[i] for i in (first_picks or ()) if i in position_of]

        if self.method == "maxmin":
            picker = rdSimDivPickers.MaxMinPicker()
            extra: dict[str, Any] = {}
            if mapped_first:
                extra["firstPicks"] = mapped_first
            # The seed is honoured with or without first_picks.
            if self.seed is not None:
                extra["seed"] = self.seed
            picks = picker.LazyBitVectorPick(fps, len(fps), n_to_pick, **extra)
        else:
            # LeaderPicker's third positional argument is the distance
            # threshold, not the pick count, and it has no ``seed`` keyword.
            picker = rdSimDivPickers.LeaderPicker()
            extra = {"pickSize": n_to_pick}
            if mapped_first:
                extra["firstPicks"] = mapped_first
            picks = picker.LazyBitVectorPick(fps, len(fps), self.threshold, **extra)

        # Map fingerprint positions back to indices in the caller's list.
        return [valid_indices[i] for i in picks]
97
+
98
+
99
class DiversityAnalyzer:
    """Analyze diversity of a molecule set via pairwise Tanimoto similarity."""

    def __init__(
        self,
        radius: int = 2,
        n_bits: int = 2048,
        sample_size: int = 1000,
        seed: Optional[int] = None,
    ):
        """
        Initialize diversity analyzer.

        Args:
            radius: Morgan fingerprint radius
            n_bits: Fingerprint bit size
            sample_size: Max molecules to sample for analysis
            seed: Optional seed making the sub-sampling reproducible. When
                None, the module-level ``random`` generator is used.
        """
        self.radius = radius
        self.n_bits = n_bits
        self.sample_size = sample_size
        self.seed = seed

    def analyze(self, mols: "list[Chem.Mol]") -> dict[str, Any]:
        """
        Analyze diversity of molecule set.

        Args:
            mols: List of molecules; None entries are ignored

        Returns:
            Dictionary with diversity statistics (counts, mean / median /
            min / max / stdev of the pairwise similarities, and
            ``diversity_score`` = 1 - mean similarity, all rounded to 4
            decimals), or ``{"error": ...}`` when no pair can be formed.
        """
        import random
        import statistics

        # Filter None molecules
        valid_mols = [mol for mol in mols if mol is not None]

        if len(valid_mols) < 2:
            return {"error": "Need at least 2 valid molecules"}

        # Sample if too large. A private generator is used only when a seed
        # was given, so callers relying on the global RNG are unaffected.
        if len(valid_mols) > self.sample_size:
            rng = random if self.seed is None else random.Random(self.seed)
            valid_mols = rng.sample(valid_mols, self.sample_size)

        # Generate fingerprints
        fps = [get_morgan_fingerprint(mol, self.radius, self.n_bits) for mol in valid_mols]

        # Pairwise similarities for every i < j, one bulk call per row.
        similarities: list[float] = []
        for i in range(len(fps) - 1):
            similarities.extend(
                DataStructs.BulkTanimotoSimilarity(fps[i], fps[i + 1:])
            )

        # Reachable when sample_size < 2 left fewer than two molecules.
        if not similarities:
            return {"error": "Could not compute similarities"}

        mean_sim = statistics.mean(similarities)
        median_sim = statistics.median(similarities)
        min_sim = min(similarities)
        max_sim = max(similarities)
        # stdev needs at least two data points.
        stdev_sim = statistics.stdev(similarities) if len(similarities) > 1 else 0

        return {
            "n_molecules": len(valid_mols),
            "n_pairs": len(similarities),
            "mean_similarity": round(mean_sim, 4),
            "median_similarity": round(median_sim, 4),
            "min_similarity": round(min_sim, 4),
            "max_similarity": round(max_sim, 4),
            "stdev_similarity": round(stdev_sim, 4),
            "diversity_score": round(1 - mean_sim, 4),
        }
@@ -0,0 +1,190 @@
1
+ """Molecular enumeration engine."""
2
+
3
+ from typing import Optional, Any
4
+
5
+ from rdkit import Chem
6
+ from rdkit.Chem import AllChem, EnumerateStereoisomers
7
+ from rdkit.Chem.MolStandardize import rdMolStandardize
8
+
9
+ from rdkit_cli.io.readers import MoleculeRecord
10
+
11
+
12
class StereoisomerEnumerator:
    """Enumerate stereoisomers of molecules."""

    def __init__(
        self,
        max_isomers: int = 32,
        include_given: bool = True,
        only_unassigned: bool = True,
        include_smiles: bool = True,
        include_name: bool = True,
    ):
        """
        Initialize stereoisomer enumerator.

        Args:
            max_isomers: Maximum stereoisomers to generate
            include_given: Include the input stereoisomer.
                NOTE(review): stored but not consulted by ``enumerate``.
            only_unassigned: Only enumerate unassigned stereocenters
            include_smiles: Include original SMILES in output
                (``original_smiles`` key)
            include_name: When true, the record name is emitted with an
                ``_iso{i}`` suffix; when false the unsuffixed name is
                emitted instead.
        """
        self.max_isomers = max_isomers
        self.include_given = include_given
        self.only_unassigned = only_unassigned
        self.include_smiles = include_smiles
        self.include_name = include_name

    def _build_row(self, record: "MoleculeRecord", smiles: str, index: int) -> dict[str, Any]:
        """Assemble the output dictionary for one enumerated stereoisomer."""
        row: dict[str, Any] = {"smiles": smiles}

        if record.name:
            row["name"] = f"{record.name}_iso{index}" if self.include_name else record.name

        row["stereoisomer_idx"] = index
        # Honour include_smiles: previously the flag was stored but the
        # original SMILES was emitted unconditionally.
        if self.include_smiles:
            row["original_smiles"] = record.smiles

        return row

    def enumerate(self, record: "MoleculeRecord") -> list[dict[str, Any]]:
        """
        Enumerate stereoisomers.

        Args:
            record: MoleculeRecord to process

        Returns:
            List of dictionaries with stereoisomer SMILES; empty when the
            record has no molecule or enumeration fails.
        """
        if record.mol is None:
            return []

        # Best effort: any failure for this molecule yields an empty list
        # rather than aborting the caller's loop.
        try:
            opts = EnumerateStereoisomers.StereoEnumerationOptions()
            opts.maxIsomers = self.max_isomers
            opts.onlyUnassigned = self.only_unassigned

            isomers = list(EnumerateStereoisomers.EnumerateStereoisomers(record.mol, opts))

            return [
                self._build_row(record, Chem.MolToSmiles(iso, isomericSmiles=True), i)
                for i, iso in enumerate(isomers[: self.max_isomers])
            ]
        except Exception:
            return []
78
+
79
+
80
class TautomerEnumerator:
    """Enumerate tautomers of molecules."""

    def __init__(
        self,
        max_tautomers: int = 50,
        max_transforms: int = 1000,
        include_smiles: bool = True,
        include_name: bool = True,
    ):
        """
        Initialize tautomer enumerator.

        Args:
            max_tautomers: Maximum tautomers to generate
            max_transforms: Maximum transforms to apply
            include_smiles: Include original SMILES in output
                (``original_smiles`` key)
            include_name: When true, the record name is emitted with a
                ``_taut{i}`` suffix; when false the unsuffixed name is
                emitted instead.
        """
        self.max_tautomers = max_tautomers
        self.max_transforms = max_transforms
        self.include_smiles = include_smiles
        self.include_name = include_name

        # Create the underlying RDKit enumerator once and reuse it.
        self._enumerator = rdMolStandardize.TautomerEnumerator()
        self._enumerator.SetMaxTautomers(max_tautomers)
        self._enumerator.SetMaxTransforms(max_transforms)

    def _build_row(self, record: "MoleculeRecord", smiles: str, index: int) -> dict[str, Any]:
        """Assemble the output dictionary for one enumerated tautomer."""
        row: dict[str, Any] = {"smiles": smiles}

        if record.name:
            row["name"] = f"{record.name}_taut{index}" if self.include_name else record.name

        row["tautomer_idx"] = index
        # Honour include_smiles: previously the flag was stored but the
        # original SMILES was emitted unconditionally.
        if self.include_smiles:
            row["original_smiles"] = record.smiles

        return row

    def enumerate(self, record: "MoleculeRecord") -> list[dict[str, Any]]:
        """
        Enumerate tautomers.

        Args:
            record: MoleculeRecord to process

        Returns:
            List of dictionaries with tautomer SMILES; empty when the
            record has no molecule or enumeration fails.
        """
        if record.mol is None:
            return []

        # Best effort: any failure for this molecule yields an empty list
        # rather than aborting the caller's loop.
        try:
            tautomers = list(self._enumerator.Enumerate(record.mol))

            return [
                self._build_row(record, Chem.MolToSmiles(taut, isomericSmiles=True), i)
                for i, taut in enumerate(tautomers[: self.max_tautomers])
            ]
        except Exception:
            return []
144
+
145
+
146
class CanonicalTautomerizer:
    """Reduce molecules to the canonical tautomer chosen by RDKit."""

    def __init__(
        self,
        include_original: bool = False,
    ):
        """
        Set up the canonical tautomerizer.

        Args:
            include_original: Add the input SMILES to each result under
                ``original_smiles``
        """
        self._canonicalizer = rdMolStandardize.TautomerEnumerator()
        self.include_original = include_original

    def canonicalize(self, record: MoleculeRecord) -> Optional[dict[str, Any]]:
        """
        Compute the canonical tautomer of one record.

        Args:
            record: MoleculeRecord to process

        Returns:
            Dictionary holding the canonical tautomer SMILES (plus name and
            original SMILES when applicable), or None when the record has
            no molecule or canonicalization fails.
        """
        if record.mol is None:
            return None

        # Best effort: a failure on this molecule is reported as None.
        try:
            canonical_mol = self._canonicalizer.Canonicalize(record.mol)
            row: dict[str, Any] = {
                "smiles": Chem.MolToSmiles(canonical_mol, isomericSmiles=True),
            }

            # The name is carried over only when the record has one.
            if record.name:
                row["name"] = record.name

            if self.include_original:
                row["original_smiles"] = record.smiles

            return row
        except Exception:
            return None