rdkit-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. rdkit_cli/__init__.py +4 -0
  2. rdkit_cli/__main__.py +6 -0
  3. rdkit_cli/cli.py +162 -0
  4. rdkit_cli/commands/__init__.py +1 -0
  5. rdkit_cli/commands/conformers.py +220 -0
  6. rdkit_cli/commands/convert.py +162 -0
  7. rdkit_cli/commands/depict.py +311 -0
  8. rdkit_cli/commands/descriptors.py +251 -0
  9. rdkit_cli/commands/diversity.py +232 -0
  10. rdkit_cli/commands/enumerate.py +229 -0
  11. rdkit_cli/commands/filter.py +384 -0
  12. rdkit_cli/commands/fingerprints.py +179 -0
  13. rdkit_cli/commands/fragment.py +284 -0
  14. rdkit_cli/commands/mcs.py +162 -0
  15. rdkit_cli/commands/reactions.py +191 -0
  16. rdkit_cli/commands/scaffold.py +243 -0
  17. rdkit_cli/commands/similarity.py +359 -0
  18. rdkit_cli/commands/standardize.py +138 -0
  19. rdkit_cli/core/__init__.py +1 -0
  20. rdkit_cli/core/conformers.py +197 -0
  21. rdkit_cli/core/depict.py +241 -0
  22. rdkit_cli/core/descriptors.py +248 -0
  23. rdkit_cli/core/diversity.py +174 -0
  24. rdkit_cli/core/enumerate.py +190 -0
  25. rdkit_cli/core/filters.py +443 -0
  26. rdkit_cli/core/fingerprints.py +265 -0
  27. rdkit_cli/core/fragment.py +237 -0
  28. rdkit_cli/core/mcs.py +128 -0
  29. rdkit_cli/core/reactions.py +159 -0
  30. rdkit_cli/core/scaffold.py +174 -0
  31. rdkit_cli/core/similarity.py +206 -0
  32. rdkit_cli/core/standardizer.py +141 -0
  33. rdkit_cli/io/__init__.py +7 -0
  34. rdkit_cli/io/formats.py +109 -0
  35. rdkit_cli/io/readers.py +352 -0
  36. rdkit_cli/io/writers.py +275 -0
  37. rdkit_cli/parallel/__init__.py +5 -0
  38. rdkit_cli/parallel/batch.py +181 -0
  39. rdkit_cli/parallel/executor.py +180 -0
  40. rdkit_cli/progress/__init__.py +5 -0
  41. rdkit_cli/progress/ninja.py +195 -0
  42. rdkit_cli/utils/__init__.py +1 -0
  43. rdkit_cli-0.1.0.dist-info/METADATA +380 -0
  44. rdkit_cli-0.1.0.dist-info/RECORD +47 -0
  45. rdkit_cli-0.1.0.dist-info/WHEEL +4 -0
  46. rdkit_cli-0.1.0.dist-info/entry_points.txt +2 -0
  47. rdkit_cli-0.1.0.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,206 @@
1
+ """Molecular similarity computation engine."""
2
+
3
+ from dataclasses import dataclass
4
+ from enum import Enum
5
+ from typing import Optional, Any
6
+
7
+ from rdkit import Chem, DataStructs
8
+ from rdkit.Chem import AllChem, rdMolDescriptors
9
+ from rdkit.ML.Cluster import Butina
10
+
11
+ from rdkit_cli.io.readers import MoleculeRecord
12
+
13
+
14
class SimilarityMetric(Enum):
    """Fingerprint similarity coefficients that can be selected by name.

    Each member's value is the lowercase identifier of the metric.
    """

    TANIMOTO = "tanimoto"
    DICE = "dice"
    COSINE = "cosine"
    SOKAL = "sokal"
    RUSSEL = "russel"
22
+
23
+
24
def get_morgan_fingerprint(mol: Chem.Mol, radius: int = 2, n_bits: int = 2048):
    """
    Build a Morgan bit-vector fingerprint for a molecule.

    Args:
        mol: Molecule to fingerprint
        radius: Morgan radius forwarded to RDKit
        n_bits: Number of bits requested for the fingerprint

    Returns:
        The result of ``rdMolDescriptors.GetMorganFingerprintAsBitVect``
    """
    bit_vector = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        mol,
        radius,
        nBits=n_bits,
    )
    return bit_vector
27
+
28
+
29
def compute_similarity(
    fp1,
    fp2,
    metric: SimilarityMetric = SimilarityMetric.TANIMOTO,
) -> float:
    """
    Score two fingerprints against each other with the chosen metric.

    Args:
        fp1: First fingerprint
        fp2: Second fingerprint
        metric: Which similarity coefficient to apply

    Returns:
        The value produced by the matching ``DataStructs`` similarity function

    Raises:
        ValueError: If ``metric`` matches none of the supported members
    """
    # Ordered (metric, DataStructs function name) pairs. The function is only
    # looked up for the pair that matches, and matching uses ``==`` so the
    # comparison semantics are the same as a plain if/elif chain.
    dispatch = (
        (SimilarityMetric.TANIMOTO, "TanimotoSimilarity"),
        (SimilarityMetric.DICE, "DiceSimilarity"),
        (SimilarityMetric.COSINE, "CosineSimilarity"),
        (SimilarityMetric.SOKAL, "SokalSimilarity"),
        (SimilarityMetric.RUSSEL, "RusselSimilarity"),
    )

    for candidate, function_name in dispatch:
        if metric == candidate:
            return getattr(DataStructs, function_name)(fp1, fp2)

    raise ValueError(f"Unknown metric: {metric}")
57
+
58
+
59
def bulk_tanimoto_similarity(query_fp, fps: list) -> list[float]:
    """
    Compare one query fingerprint against many using Tanimoto similarity.

    Args:
        query_fp: Fingerprint to compare from
        fps: Fingerprints to compare against

    Returns:
        The scores from ``DataStructs.BulkTanimotoSimilarity`` as a plain list
    """
    scores = DataStructs.BulkTanimotoSimilarity(query_fp, fps)
    return [*scores]
62
+
63
+
64
class SimilaritySearcher:
    """Screen molecule records against a single query by fingerprint similarity."""

    def __init__(
        self,
        query_smiles: str,
        threshold: float = 0.7,
        metric: SimilarityMetric = SimilarityMetric.TANIMOTO,
        radius: int = 2,
        n_bits: int = 2048,
    ):
        """
        Parse the query and precompute its fingerprint.

        Args:
            query_smiles: SMILES of the query molecule
            threshold: Records scoring below this value are rejected
            metric: Similarity metric used for scoring
            radius: Morgan fingerprint radius
            n_bits: Fingerprint bit size

        Raises:
            ValueError: If ``query_smiles`` cannot be parsed
        """
        # Reject an unparsable query before storing any configuration.
        parsed_query = Chem.MolFromSmiles(query_smiles)
        if parsed_query is None:
            raise ValueError(f"Invalid query SMILES: {query_smiles}")

        self.threshold = threshold
        self.metric = metric
        self.radius = radius
        self.n_bits = n_bits

        # The query fingerprint is computed once and reused for every record.
        self.query_fp = get_morgan_fingerprint(parsed_query, radius, n_bits)

    def search(self, record: MoleculeRecord) -> Optional[dict[str, Any]]:
        """
        Score one record against the query.

        Args:
            record: MoleculeRecord to score

        Returns:
            A dict with ``smiles`` and ``similarity`` (rounded to 4 decimals),
            plus ``name`` when the record has a truthy name; None when the
            record has no molecule or scores below the threshold
        """
        molecule = record.mol
        if molecule is None:
            return None

        candidate_fp = get_morgan_fingerprint(molecule, self.radius, self.n_bits)
        score = compute_similarity(self.query_fp, candidate_fp, self.metric)
        if score < self.threshold:
            return None

        hit: dict[str, Any] = {
            "smiles": record.smiles,
            "similarity": round(score, 4),
        }
        if record.name:
            hit["name"] = record.name
        return hit
125
+
126
+
127
def compute_similarity_matrix(
    mols: list[Chem.Mol],
    metric: SimilarityMetric = SimilarityMetric.TANIMOTO,
    radius: int = 2,
    n_bits: int = 2048,
) -> list[list[float]]:
    """
    Build the all-against-all similarity matrix for a set of molecules.

    ``None`` entries in ``mols`` are skipped, so the matrix has one row and
    column per non-None molecule, in input order.

    Args:
        mols: List of molecules
        metric: Similarity metric
        radius: Morgan fingerprint radius
        n_bits: Fingerprint bit size

    Returns:
        Symmetric matrix as a list of rows, with 1.0 on the diagonal
    """
    fingerprints = [
        get_morgan_fingerprint(molecule, radius, n_bits)
        for molecule in mols
        if molecule is not None
    ]
    size = len(fingerprints)
    matrix = [[0.0] * size for _ in range(size)]

    # Only the upper triangle is computed; each score is mirrored below it.
    for row, row_fp in enumerate(fingerprints):
        matrix[row][row] = 1.0
        for col in range(row + 1, size):
            score = compute_similarity(row_fp, fingerprints[col], metric)
            matrix[row][col] = score
            matrix[col][row] = score

    return matrix
160
+
161
+
162
def cluster_molecules(
    mols: list[Chem.Mol],
    cutoff: float = 0.3,
    radius: int = 2,
    n_bits: int = 2048,
) -> list[list[int]]:
    """
    Group molecules with RDKit's Butina clustering on Tanimoto distances.

    ``None`` entries in ``mols`` are left out of the clustering; the indices
    in the result still refer to positions in the original ``mols`` list.

    Args:
        mols: List of molecules
        cutoff: Distance cutoff (1 - similarity)
        radius: Morgan fingerprint radius
        n_bits: Fingerprint bit size

    Returns:
        List of clusters (each cluster is a list of molecule indices);
        an empty list when there is no non-None molecule
    """
    # Remember where every usable molecule came from so the cluster members
    # can be reported as positions in the caller's list.
    usable = [(position, mol) for position, mol in enumerate(mols) if mol is not None]
    valid_indices = [position for position, _ in usable]
    fps = [get_morgan_fingerprint(mol, radius, n_bits) for _, mol in usable]

    if not fps:
        return []
    count = len(fps)

    # Flattened lower-triangle distance matrix: for each fingerprint, the
    # distances to all fingerprints that precede it.
    distances = []
    for current in range(1, count):
        similarities = DataStructs.BulkTanimotoSimilarity(fps[current], fps[:current])
        distances.extend(1 - value for value in similarities)

    clusters = Butina.ClusterData(distances, count, cutoff, isDistData=True)

    return [[valid_indices[member] for member in cluster] for cluster in clusters]
@@ -0,0 +1,141 @@
1
+ """Molecule standardization engine."""
2
+
3
+ from typing import Optional, Any
4
+
5
+ from rdkit import Chem
6
+ from rdkit.Chem import AllChem
7
+ from rdkit.Chem.MolStandardize import rdMolStandardize
8
+
9
+ from rdkit_cli.io.readers import MoleculeRecord
10
+
11
+
12
class MoleculeStandardizer:
    """Standardizer for molecular structures.

    Enabled steps always run in the same order: metal disconnection,
    normalization, reionization, uncharging, largest-fragment selection,
    tautomer canonicalization and, last, stereochemistry removal.
    """

    def __init__(
        self,
        canonicalize: bool = True,
        remove_stereo: bool = False,
        disconnect_metals: bool = False,
        normalize: bool = False,
        reionize: bool = False,
        uncharge: bool = False,
        fragment_parent: bool = False,
        tautomer_parent: bool = False,
        include_original: bool = False,
    ):
        """
        Initialize standardizer.

        Args:
            canonicalize: Write canonical SMILES (otherwise non-canonical output)
            remove_stereo: Remove stereochemistry information
            disconnect_metals: Disconnect metal atoms
            normalize: Apply normalization transforms
            reionize: Standardize ionization state
            uncharge: Neutralize charges
            fragment_parent: Keep only largest fragment
            tautomer_parent: Canonicalize tautomer
            include_original: Include original SMILES in output
        """
        self.canonicalize = canonicalize
        self.remove_stereo = remove_stereo
        self.disconnect_metals = disconnect_metals
        self.normalize = normalize
        self.reionize = reionize
        self.uncharge = uncharge
        self.fragment_parent = fragment_parent
        self.tautomer_parent = tautomer_parent
        self.include_original = include_original

        # Each RDKit helper is only constructed when its step is enabled;
        # a disabled step is represented by None.
        self._metal_disconnector = rdMolStandardize.MetalDisconnector() if disconnect_metals else None
        self._normalizer = rdMolStandardize.Normalizer() if normalize else None
        self._reionizer = rdMolStandardize.Reionizer() if reionize else None
        self._uncharger = rdMolStandardize.Uncharger() if uncharge else None
        self._fragment_chooser = rdMolStandardize.LargestFragmentChooser() if fragment_parent else None
        # rdMolStandardize exposes tautomer canonicalization through
        # TautomerEnumerator.Canonicalize().
        self._tautomer_canon = rdMolStandardize.TautomerEnumerator() if tautomer_parent else None

    def standardize(self, record: MoleculeRecord) -> Optional[dict[str, Any]]:
        """
        Standardize a molecule record.

        The molecule held by ``record`` is never modified.

        Args:
            record: MoleculeRecord to process

        Returns:
            Dictionary with ``smiles`` (plus ``original_smiles`` when
            ``include_original`` is set and ``name`` when the record has a
            truthy name), or None if the record has no molecule or any
            step raised
        """
        if record.mol is None:
            return None

        try:
            mol = record.mol

            # Apply transformations in order
            if self._metal_disconnector is not None:
                mol = self._metal_disconnector.Disconnect(mol)

            if self._normalizer is not None:
                mol = self._normalizer.normalize(mol)

            if self._reionizer is not None:
                mol = self._reionizer.reionize(mol)

            if self._uncharger is not None:
                mol = self._uncharger.uncharge(mol)

            if self._fragment_chooser is not None:
                mol = self._fragment_chooser.choose(mol)

            if self._tautomer_canon is not None:
                mol = self._tautomer_canon.Canonicalize(mol)

            if self.remove_stereo:
                # RemoveStereochemistry edits the molecule in place, so work
                # on a copy to keep the caller's record.mol intact.
                mol = Chem.Mol(mol)
                Chem.RemoveStereochemistry(mol)

            # MolToSmiles defaults to canonical output, so the flag must be
            # passed explicitly for canonicalize=False to have any effect.
            output_smiles = Chem.MolToSmiles(mol, canonical=self.canonicalize)

            result: dict[str, Any] = {}

            if self.include_original:
                result["original_smiles"] = record.smiles

            result["smiles"] = output_smiles

            if record.name:
                result["name"] = record.name

            return result

        except Exception:
            # Deliberate best effort: a molecule that cannot be standardized
            # is reported as None instead of aborting the whole run.
            return None

    def get_column_names(self) -> list[str]:
        """Get output column names in order.

        ``name`` is always listed, even though ``standardize`` only emits it
        for records that have a truthy name.
        """
        cols = []
        if self.include_original:
            cols.append("original_smiles")
        cols.append("smiles")
        cols.append("name")
        return cols
126
+
127
+
128
def canonicalize_smiles(smiles: str) -> Optional[str]:
    """
    Round-trip a SMILES string through RDKit to get its canonical form.

    Args:
        smiles: Input SMILES

    Returns:
        Canonical SMILES, or None when RDKit cannot parse the input
    """
    parsed = Chem.MolFromSmiles(smiles)
    return None if parsed is None else Chem.MolToSmiles(parsed, canonical=True)
@@ -0,0 +1,7 @@
1
+ """I/O handling for multiple file formats."""
2
+
3
+ from rdkit_cli.io.formats import FileFormat, FormatConfig, detect_format
4
+ from rdkit_cli.io.readers import create_reader
5
+ from rdkit_cli.io.writers import create_writer
6
+
7
# Names re-exported as the public interface of this package.
__all__ = [
    "FileFormat",
    "FormatConfig",
    "detect_format",
    "create_reader",
    "create_writer",
]
@@ -0,0 +1,109 @@
1
+ """File format detection and configuration."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from enum import Enum
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+
9
class FileFormat(Enum):
    """File formats recognised by the I/O layer.

    Each member's value is the lowercase identifier of the format.
    """

    CSV = "csv"
    TSV = "tsv"
    SMI = "smi"
    SDF = "sdf"
    PARQUET = "parquet"
17
+
18
+
19
@dataclass
class FormatConfig:
    """Settings describing how a file of a given format is laid out."""

    # Format these settings apply to.
    format: FileFormat
    # Whether a header row is present; forced to False for SMI.
    has_header: bool = True
    # Name of the column holding SMILES strings.
    smiles_column: str = "smiles"
    # Name of the column holding molecule names, if any.
    name_column: Optional[str] = None
    # Field separator; forced to tab for TSV and to a space for SMI.
    delimiter: str = ","
    # Additional column names; each instance gets its own list.
    extra_columns: list[str] = field(default_factory=list)

    def __post_init__(self):
        """Overwrite fields with the fixed values TSV and SMI require."""
        chosen = self.format

        if chosen == FileFormat.TSV:
            self.delimiter = "\t"
            return

        if chosen == FileFormat.SMI:
            self.has_header, self.delimiter = False, " "
37
+
38
+
39
# Lowercase file extension (including the dot) -> format. Built from a
# per-format listing; insertion order is kept because the keys are joined
# into the "supported extensions" error message.
EXTENSION_MAP: dict[str, FileFormat] = {
    extension: file_format
    for file_format, extensions in (
        (FileFormat.CSV, (".csv",)),
        (FileFormat.TSV, (".tsv",)),
        (FileFormat.SMI, (".smi", ".smiles")),
        (FileFormat.SDF, (".sdf", ".mol")),
        (FileFormat.PARQUET, (".parquet", ".pq")),
    )
    for extension in extensions
}
50
+
51
+
52
def detect_format(path: str | Path) -> FileFormat:
    """
    Work out a file's format from its extension.

    Only the final suffix is considered, and it is matched
    case-insensitively against ``EXTENSION_MAP``.

    Args:
        path: Path to the file

    Returns:
        Detected FileFormat

    Raises:
        ValueError: If the extension is not a known one
    """
    path = Path(path)
    detected = EXTENSION_MAP.get(path.suffix.lower())

    if detected is not None:
        return detected

    raise ValueError(
        f"Cannot detect format for '{path}'. "
        f"Supported extensions: {', '.join(EXTENSION_MAP.keys())}"
    )
75
+
76
+
77
def create_format_config(
    path: str | Path,
    format_override: Optional[FileFormat] = None,
    has_header: Optional[bool] = None,
    smiles_column: str = "smiles",
    name_column: Optional[str] = None,
) -> FormatConfig:
    """
    Assemble the FormatConfig for a file.

    Args:
        path: Path to the file; only consulted when no override is given
        format_override: Format to use instead of auto-detection
        has_header: Explicit header setting; None keeps the format's default
        smiles_column: Name of the SMILES column
        name_column: Name of the molecule name column

    Returns:
        Configured FormatConfig
    """
    if format_override:
        file_format = format_override
    else:
        file_format = detect_format(path)

    config = FormatConfig(
        format=file_format,
        smiles_column=smiles_column,
        name_column=name_column,
    )

    # An explicit header choice is applied after construction so that it
    # wins over the value FormatConfig sets for the format.
    if has_header is None:
        return config

    config.has_header = has_header
    return config