emap2lig 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- emap2lig/__init__.py +0 -0
- emap2lig/data/__init__.py +0 -0
- emap2lig/data/ccd.py +407 -0
- emap2lig/data/const.py +25 -0
- emap2lig/data/dataset.py +579 -0
- emap2lig/data/download.py +29 -0
- emap2lig/data/io/__init__.py +5 -0
- emap2lig/data/io/map.py +173 -0
- emap2lig/data/io/mmcif.py +255 -0
- emap2lig/data/io/writer.py +338 -0
- emap2lig/data/map.py +524 -0
- emap2lig/data/simulate.py +169 -0
- emap2lig/data/transforms.py +98 -0
- emap2lig/data/types.py +265 -0
- emap2lig/emap2lig.yaml +140 -0
- emap2lig/frag.py +146 -0
- emap2lig/main.py +1417 -0
- emap2lig/model/__init__.py +19 -0
- emap2lig/model/layers/__init__.py +55 -0
- emap2lig/model/layers/attention.py +179 -0
- emap2lig/model/layers/decoder.py +52 -0
- emap2lig/model/layers/dropout.py +43 -0
- emap2lig/model/layers/fourier.py +58 -0
- emap2lig/model/layers/instance.py +310 -0
- emap2lig/model/layers/outer_product_mean.py +90 -0
- emap2lig/model/layers/positional_encoding.py +149 -0
- emap2lig/model/layers/primitives.py +149 -0
- emap2lig/model/layers/selected_attention.py +149 -0
- emap2lig/model/layers/transition.py +80 -0
- emap2lig/model/layers/triangle_mult.py +283 -0
- emap2lig/model/layers/triangular_attention.py +397 -0
- emap2lig/model/model.py +466 -0
- emap2lig/model/modules/__init__.py +18 -0
- emap2lig/model/modules/conditioning.py +372 -0
- emap2lig/model/modules/conf_embedder.py +90 -0
- emap2lig/model/modules/diffusion.py +509 -0
- emap2lig/model/modules/instance_seg.py +355 -0
- emap2lig/model/modules/pairformer.py +440 -0
- emap2lig/model/seg/__init__.py +3 -0
- emap2lig/model/seg/model.py +544 -0
- emap2lig/model/seg/munet/__init__.py +11 -0
- emap2lig/model/seg/munet/backbone.py +158 -0
- emap2lig/model/seg/munet/conv.py +109 -0
- emap2lig/model/seg/munet/transformer.py +263 -0
- emap2lig/model/seg/threshold.py +78 -0
- emap2lig/web/__init__.py +0 -0
- emap2lig/web/__main__.py +6 -0
- emap2lig/web/app.py +125 -0
- emap2lig/web/cli.py +206 -0
- emap2lig/web/frontend/.gitignore +26 -0
- emap2lig/web/frontend/dist/assets/BuildTab.js +1 -0
- emap2lig/web/frontend/dist/assets/DirPicker.js +1 -0
- emap2lig/web/frontend/dist/assets/FindTab.js +1 -0
- emap2lig/web/frontend/dist/assets/ResultsTable.js +1 -0
- emap2lig/web/frontend/dist/assets/VisualizationTab.js +1 -0
- emap2lig/web/frontend/dist/assets/emap2lig-logo-sm.ico +0 -0
- emap2lig/web/frontend/dist/assets/index.css +1 -0
- emap2lig/web/frontend/dist/assets/index.js +2 -0
- emap2lig/web/frontend/dist/assets/molstar.js +7403 -0
- emap2lig/web/frontend/dist/assets/react-vendor.js +34 -0
- emap2lig/web/frontend/dist/assets/useBlobs.js +1 -0
- emap2lig/web/frontend/dist/assets/useJob.js +1 -0
- emap2lig/web/frontend/dist/assets/vendor.js +34 -0
- emap2lig/web/frontend/dist/index.html +17 -0
- emap2lig/web/frontend/dist/kihara-logo.ico +0 -0
- emap2lig/web/results_scan.py +123 -0
- emap2lig/web/routers/__init__.py +0 -0
- emap2lig/web/routers/detect.py +167 -0
- emap2lig/web/routers/download.py +56 -0
- emap2lig/web/routers/files.py +365 -0
- emap2lig/web/routers/jobs.py +156 -0
- emap2lig/web/routers/model.py +134 -0
- emap2lig/web/schemas.py +116 -0
- emap2lig/web/services.py +522 -0
- emap2lig/web/state.py +114 -0
- emap2lig-0.4.1.dist-info/METADATA +893 -0
- emap2lig-0.4.1.dist-info/RECORD +79 -0
- emap2lig-0.4.1.dist-info/WHEEL +4 -0
- emap2lig-0.4.1.dist-info/entry_points.txt +5 -0
emap2lig/__init__.py
ADDED
|
File without changes
|
|
File without changes
|
emap2lig/data/ccd.py
ADDED
|
@@ -0,0 +1,407 @@
|
|
|
1
|
+
"""CCD conformer utilities and on-demand local caching."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import pickle
|
|
5
|
+
import shutil
|
|
6
|
+
import tempfile
|
|
7
|
+
from collections.abc import Mapping
|
|
8
|
+
from functools import cache
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
import requests
|
|
12
|
+
from huggingface_hub import hf_hub_download
|
|
13
|
+
from pdbeccdutils.core import ccd_reader
|
|
14
|
+
from pdbeccdutils.core.component import ConformerType
|
|
15
|
+
from rdkit import Chem
|
|
16
|
+
from rdkit.Chem import rdDistGeom, rdForceFieldHelpers
|
|
17
|
+
from rdkit.Chem.rdchem import Conformer, Mol
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)

# HuggingFace repository that hosts the bulk CCD dictionary pickle.
_HF_REPO_ID = "KiharaLab/Emap2lig"
# Release tag embedded in the bulk dictionary filename (``ccd_dict_<date>.pkl``).
_DEFAULT_CCD_DATE = "250523"
# Current on-disk cache root, and the older location files are migrated from.
_CCD_DIR = Path.home().joinpath(".emap2lig", "ccd")
_LEGACY_CCD_DIR = Path.home().joinpath(".emap2lig", "models", "ccd")
# Per-ligand CIF download endpoint; ``{code}`` is filled with the CCD code.
_RCSB_CIF_URL = "https://files.rcsb.org/ligands/download/{code}.cif"

# Module-level side effect: ask RDKit to include every property category when
# molecules are pickled, since this module caches molecules as pickles.
Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class CCDFetchError(RuntimeError):
    """Signal that a CCD molecule could not be fetched or parsed from RCSB.

    Also raised by the lookup entry point when the supplied code is empty.
    """
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _ensure_ccd_dir() -> None:
    """Make sure the CCD cache directory exists.

    Missing parent directories are created as well; an already existing
    directory is left untouched.
    """
    _CCD_DIR.mkdir(exist_ok=True, parents=True)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _migrate_legacy_bulk_dict(date: str) -> None:
    """Relocate a bulk CCD dictionary from the legacy directory, if present.

    Nothing happens when the legacy file is absent or when a file already
    exists at the new location.

    Args:
        date: CCD release date string that forms part of the filename.
    """
    filename = f"ccd_dict_{date}.pkl"
    source = _LEGACY_CCD_DIR / filename
    destination = _CCD_DIR / filename

    # Guard clause: only migrate when there is something to move and the
    # destination slot is still free.
    if not source.exists() or destination.exists():
        return

    _ensure_ccd_dir()
    shutil.move(str(source), str(destination))
    logger.info("Migrated CCD dictionary from %s to %s", source, destination)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _download_bulk_dict(date: str) -> Path:
    """Download the bulk CCD dictionary and place it in the CCD cache directory.

    If the target file already exists it is returned without any download.
    Otherwise the file is fetched through ``hf_hub_download`` and copied into
    the cache directory. The copy goes to a uniquely named staging file first
    and is then renamed over the target, so an interrupted copy never leaves
    a truncated ``ccd_dict_<date>.pkl`` that later lookups would try to load.

    Args:
        date: CCD release date string used in the HuggingFace filename.

    Returns:
        Local path to the downloaded dictionary file.
    """
    target_path = _CCD_DIR / f"ccd_dict_{date}.pkl"
    if target_path.exists():
        return target_path

    # Do not rely on the caller having created the cache directory.
    _ensure_ccd_dir()

    downloaded_path = Path(
        hf_hub_download(
            repo_id=_HF_REPO_ID,
            filename=f"ccd/ccd_dict_{date}.pkl",
        )
    )

    # Reserve a unique staging name inside the cache directory so the final
    # rename stays on one filesystem.
    with tempfile.NamedTemporaryFile(
        dir=_CCD_DIR,
        prefix=f"{target_path.name}.",
        suffix=".part",
        delete=False,
    ) as staging:
        staging_path = Path(staging.name)

    try:
        shutil.copy2(downloaded_path, staging_path)
        staging_path.replace(target_path)
    finally:
        # No-op after a successful rename; removes the partial copy otherwise.
        staging_path.unlink(missing_ok=True)

    return target_path
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@cache
def _load_bulk_dict(date: str = _DEFAULT_CCD_DATE) -> Mapping[str, Mol]:
    """Return the bulk CCD dictionary, obtaining the file first if necessary.

    The cache directory is created, a dictionary left in the legacy location
    is migrated, and the file is downloaded from HuggingFace when it is still
    missing. The result is memoized per ``date`` by ``functools.cache``, so
    the same mapping object is returned on repeated calls.

    Args:
        date: CCD release date string (default ``"250523"``).

    Returns:
        Mapping from CCD three-letter code to RDKit ``Mol``.
    """
    _ensure_ccd_dir()
    _migrate_legacy_bulk_dict(date)

    dict_path = _CCD_DIR / f"ccd_dict_{date}.pkl"
    if not dict_path.exists():
        dict_path = _download_bulk_dict(date)

    # NOTE(review): unpickling can execute arbitrary code; this trusts both
    # the HuggingFace artifact and whatever is stored in the local cache.
    with dict_path.open("rb") as stream:
        return pickle.load(stream)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _fetch_from_rcsb(code: str) -> Mol:
    """Fetch a CCD component from the RCSB CIF endpoint.

    Downloads the per-ligand CCD CIF file, writes it to a temporary file and
    parses it with ``pdbeccdutils`` (``sanitize=False``). The temporary file
    is always removed, including when writing or parsing fails.

    Args:
        code: Normalized CCD three-letter code.

    Returns:
        Molecule parsed from the CIF, tagged with a ``PDB_NAME`` property
        holding ``code``.

    Raises:
        CCDFetchError: If the request fails, RCSB answers with a non-200
            status, the CIF cannot be written or parsed, or the parsed
            molecule has no atoms.
    """
    url = _RCSB_CIF_URL.format(code=code)
    try:
        response = requests.get(url, timeout=20)
    except requests.RequestException as exc:
        raise CCDFetchError(f"Network error fetching CCD {code}: {exc}") from exc
    if response.status_code != 200:
        raise CCDFetchError(
            f"RCSB returned status {response.status_code} for CCD {code}"
        )

    tmp_path: str | None = None
    try:
        # Explicit UTF-8 keeps the write independent of the platform locale.
        with tempfile.NamedTemporaryFile(
            mode="w",
            suffix=".cif",
            delete=False,
            encoding="utf-8",
        ) as tmp:
            # Record the name before writing so the ``finally`` clause can
            # clean up even if the write itself raises.
            tmp_path = tmp.name
            tmp.write(response.text)

        result = ccd_reader.read_pdb_cif_file(tmp_path, sanitize=False)
        mol = result.component.mol
    except Exception as exc:
        raise CCDFetchError(f"Failed to parse RCSB CIF for CCD {code}: {exc}") from exc
    finally:
        if tmp_path is not None:
            Path(tmp_path).unlink(missing_ok=True)

    if mol.GetNumAtoms() == 0:
        raise CCDFetchError(f"RCSB CIF for CCD {code} contains no atoms")

    mol.SetProp("PDB_NAME", code)
    return mol
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def get_ccd_mol(code: str, date: str = _DEFAULT_CCD_DATE) -> Mol:
    """Resolve a CCD molecule from the bulk dict, per-CCD cache, then RCSB.

    Lookup order:
        1. Bulk CCD dictionary (downloaded from HuggingFace).
        2. Per-CCD pickle in ``~/.emap2lig/ccd/<CODE>.pkl`` for fallback entries.
        3. RCSB CIF endpoint (parsed by ``pdbeccdutils``).

    RCSB fallback hits are persisted to the per-CCD pickle so subsequent
    lookups for CCD entries missing from the bulk dictionary skip the
    network. The pickle is written to a staging file and renamed into place,
    and an unreadable per-CCD pickle is treated as a cache miss rather than
    a fatal error.

    Args:
        code: CCD three-letter code (case-insensitive, whitespace trimmed).
        date: CCD release date string used to locate the bulk dictionary.

    Returns:
        RDKit ``Mol`` for the requested code. Molecules that come from the
        bulk dictionary are the objects stored in that memoized mapping, not
        copies.

    Raises:
        CCDFetchError: If the code is empty or cannot be resolved from any
            source.
    """
    normalized_code = code.strip().upper()
    if not normalized_code:
        raise CCDFetchError(f"Empty CCD code: {code!r}")

    _ensure_ccd_dir()
    bulk_dict = _load_bulk_dict(date)
    if normalized_code in bulk_dict:
        return bulk_dict[normalized_code]

    # NOTE(review): the code is interpolated into a filename and a URL;
    # presumably callers pass plain CCD identifiers - confirm for web input.
    ccd_pickle = _CCD_DIR / f"{normalized_code}.pkl"
    if ccd_pickle.exists():
        try:
            # NOTE(review): unpickling trusts the contents of the local cache.
            with ccd_pickle.open("rb") as handle:
                return pickle.load(handle)
        except (pickle.UnpicklingError, EOFError) as exc:
            # A truncated or corrupt cache entry must not block the lookup
            # forever; fall through and rebuild it from RCSB.
            logger.warning(
                "Discarding unreadable CCD cache file %s: %s", ccd_pickle, exc
            )

    mol = _fetch_from_rcsb(normalized_code)

    # Write to a unique staging file first so an interrupted dump can never
    # be mistaken for a valid cache entry.
    staging_path: Path | None = None
    try:
        with tempfile.NamedTemporaryFile(
            dir=_CCD_DIR,
            prefix=f"{normalized_code}.",
            suffix=".tmp",
            delete=False,
        ) as handle:
            staging_path = Path(handle.name)
            pickle.dump(mol, handle)
        staging_path.replace(ccd_pickle)
    finally:
        if staging_path is not None:
            # No-op after a successful rename.
            staging_path.unlink(missing_ok=True)

    return mol
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _etkdg_embed(mol: Mol, version: str, *, use_random_coords: bool) -> int:
    """Embed one ETKDG conformer and relax it with UFF.

    Existing conformers are kept (``clearConfs`` is switched off), so the
    new conformer is added alongside them.

    Args:
        mol: RDKit molecule to process (modified in place).
        version: ETKDG version, either ``"v3"`` or ``"v2"``.
        use_random_coords: Value assigned to the embedder's
            ``useRandomCoords`` option.

    Returns:
        Conformer id on success, or ``-1`` when embedding fails or when
        embedding/optimization raises ``RuntimeError`` or ``ValueError``.

    Raises:
        ValueError: If ``version`` is neither ``"v3"`` nor ``"v2"``.
    """
    param_factories = {
        "v3": rdDistGeom.ETKDGv3,
        "v2": rdDistGeom.ETKDGv2,
    }
    make_params = param_factories.get(version)
    if make_params is None:
        raise ValueError(f"Unsupported ETKDG version: {version}")

    params = make_params()
    params.clearConfs = False
    params.useRandomCoords = use_random_coords

    try:
        embedded_id = rdDistGeom.EmbedMolecule(mol, params)
        # Only relax when the embedder actually produced a conformer.
        if embedded_id != -1:
            rdForceFieldHelpers.UFFOptimizeMolecule(
                mol, confId=embedded_id, maxIters=1000
            )
    except (RuntimeError, ValueError):
        logger.debug(
            "ETKDG embedding failed: version=%s random_coords=%s",
            version,
            use_random_coords,
        )
        return -1

    return embedded_id
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def compute_3d(mol: Mol, version: str = "v3") -> bool:
    """Generate 3D coordinates using the ETKDG method.

    Adapted from ``pdbeccdutils.core.component.Component``.

    Attempts are made in a fixed order: the requested version without and
    then with random starting coordinates, followed (only when the request
    was ``"v3"``) by the same two attempts with ``"v2"``. The first
    successful conformer is tagged with ``name`` and ``coord_generation``
    properties.

    Args:
        mol: RDKit molecule to process (modified in place).
        version: ETKDG version, ``"v3"`` or ``"v2"`` (defaults to ``"v3"``).

    Returns:
        ``True`` if a 3D conformer was successfully embedded.
    """
    fallback_chain = [version, "v2"] if version == "v3" else [version]
    attempts = [
        (candidate_version, randomize)
        for candidate_version in fallback_chain
        for randomize in (False, True)
    ]

    for etkdg_version, randomize in attempts:
        conf_id = _etkdg_embed(mol, etkdg_version, use_random_coords=randomize)
        if conf_id != -1:
            embedded = mol.GetConformer(conf_id)
            embedded.SetProp("name", ConformerType.Computed.name)
            embedded.SetProp("coord_generation", f"ETKDG{etkdg_version}")
            return True

    return False
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def get_conformer(mol: Mol, c_type: ConformerType) -> Conformer:
    """Look up the conformer tagged with the requested type.

    Adapted from ``pdbeccdutils.core.component.Component``.

    Args:
        mol: Molecule to search.
        c_type: Desired conformer type.

    Returns:
        The first conformer whose ``name`` property matches *c_type*.

    Raises:
        ValueError: If no conformer of the requested type exists.
    """
    wanted = c_type.name
    for candidate in mol.GetConformers():
        try:
            label = candidate.GetProp("name")
        except KeyError:
            # Conformers without a ``name`` property cannot match.
            continue
        if label == wanted:
            return candidate

    raise ValueError(f"Conformer {c_type.name} does not exist.")
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def compute_symmetries(mol: Mol) -> list[list[int]]:
    """Compute the automorphism permutations of a molecule.

    The search runs on a hydrogen-stripped copy of *mol*. Each permutation
    maps non-leaving atom indices to their symmetric counterparts. The
    result is also serialized into a hex-encoded pickle stored as the
    ``symmetries`` property on the molecule that was passed in.

    Args:
        mol: Molecule to process (modified in place: gains the
            ``symmetries`` property). Every heavy atom is expected to carry
            a ``leaving_atom`` property that parses as an integer.

    Returns:
        List of index permutations (one per automorphism).
    """
    # ``Chem.RemoveHs`` returns a new molecule; keep it under a separate name
    # so the property below lands on the caller's object, not on the copy.
    heavy_mol = Chem.RemoveHs(mol)

    # Map heavy-atom index -> dense index among non-leaving atoms.
    idx_map: dict[int, int] = {}
    for i, atom in enumerate(heavy_mol.GetAtoms()):
        if int(atom.GetProp("leaving_atom")):
            continue
        idx_map[i] = len(idx_map)
    kept_indices = set(idx_map)

    permutations: list[list[int]] = []
    raw_permutations = heavy_mol.GetSubstructMatches(heavy_mol, uniquify=False)
    for raw_permutation in raw_permutations:
        try:
            # Keep only automorphisms that map the non-leaving atoms onto
            # themselves as a set.
            if {raw_permutation[idx] for idx in idx_map} == kept_indices:
                permutations.append(
                    [idx_map[idx] for idx in raw_permutation if idx in idx_map]
                )
        except IndexError:
            logger.debug("Skipping malformed symmetry permutation")

    mol.SetProp("symmetries", pickle.dumps(permutations).hex())
    return permutations
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def add_conformer(mol: Mol) -> tuple[str, Mol]:
    """Try to equip a molecule with a usable 3D conformer.

    Single-atom molecules are reported as ``"single"`` without any
    embedding. Otherwise an ETKDGv3-based conformer is attempted; when that
    does not succeed, an existing ideal conformer is looked up instead. If
    the required conformer cannot be found the outcome is ``"failed"``.

    Args:
        mol: Molecule to process (modified in place).

    Returns:
        Tuple of (result_tag, molecule). *result_tag* is one of
        ``"single"``, ``"computed"``, ``"ideal"``, or ``"failed"``.
    """
    if mol.GetNumAtoms() == 1:
        return "single", mol

    try:
        if compute_3d(mol, version="v3"):
            tag, required_type = "computed", ConformerType.Computed
        else:
            # Embedding failed: fall back to the ideal coordinates.
            tag, required_type = "ideal", ConformerType.Ideal
        # Raises ValueError when the expected conformer is not present.
        get_conformer(mol, required_type)
    except ValueError:
        tag = "failed"

    return tag, mol
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def _assign_canonical_atom_names(mol: Mol, smiles: str) -> None:
    """Label each atom with ``<SYMBOL><RANK>`` based on canonical ranking.

    The rank comes from ``Chem.CanonicalRankAtoms`` and is written one-based
    after the upper-cased element symbol into the atom's ``name`` property.
    Atoms are processed in order, so atoms visited before an offending one
    keep their newly assigned names.

    Args:
        mol: Heavy-atom RDKit molecule (modified in place).
        smiles: Source SMILES string, included in error messages.

    Raises:
        ValueError: If an atom name exceeds 4 characters.
    """
    ranks = Chem.CanonicalRankAtoms(mol)
    for atom, rank in zip(mol.GetAtoms(), ranks):
        label = f"{atom.GetSymbol().upper()}{rank + 1}"
        if len(label) > 4:
            raise ValueError(
                f"{smiles} has an atom with a name longer than 4 characters: {label}"
            )
        atom.SetProp("name", label)
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def get_conformer_from_smiles(smiles: str) -> tuple[str, Mol]:
    """Build a molecule from a SMILES string and generate a 3D conformer.

    Hydrogens are added for ETKDG embedding, then removed before
    canonical atom names are assigned on the heavy-atom molecule.
    When conformer generation fails, the hydrogen-bearing molecule is
    returned as is, without atom names.

    Args:
        smiles: SMILES string to parse.

    Returns:
        Tuple of (result_tag, molecule) from :func:`add_conformer`.

    Raises:
        ValueError: If the SMILES string cannot be parsed, or if an atom
            name exceeds 4 characters.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        # RDKit reports parse failures by returning None; fail here with a
        # clear message instead of an opaque argument error further down.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    mol = Chem.AddHs(parsed)
    result, mol = add_conformer(mol)

    if result == "failed":
        return result, mol

    mol = Chem.RemoveHs(mol)
    _assign_canonical_atom_names(mol, smiles)
    return result, mol
|
emap2lig/data/const.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Numeric constant named ``num_elements``.
num_elements = 128

####################################################################################################
# ATOMS
####################################################################################################

# Chirality tag names; the position in this list is the integer id.
chirality_types = [
    "CHI_OTHER",
    "CHI_OCTAHEDRAL",
    "CHI_TETRAHEDRAL_CW",
    "CHI_TRIGONALBIPYRAMIDAL",
    "CHI_UNSPECIFIED",
    "CHI_TETRAHEDRAL_CCW",
    "CHI_SQUAREPLANAR",
]
# Reverse lookup: chirality name -> index in ``chirality_types``.
chirality_type_ids = dict(zip(chirality_types, range(len(chirality_types))))

# Bond type names; the position in this list is the integer id.
bond_types = [
    "SINGLE",
    "DOUBLE",
    "TRIPLE",
    "DATIVE",
    "AROMATIC",
]
# Reverse lookup: bond type name -> index in ``bond_types``.
bond_type_ids = dict(zip(bond_types, range(len(bond_types))))
|