emap2lig 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. emap2lig/__init__.py +0 -0
  2. emap2lig/data/__init__.py +0 -0
  3. emap2lig/data/ccd.py +407 -0
  4. emap2lig/data/const.py +25 -0
  5. emap2lig/data/dataset.py +579 -0
  6. emap2lig/data/download.py +29 -0
  7. emap2lig/data/io/__init__.py +5 -0
  8. emap2lig/data/io/map.py +173 -0
  9. emap2lig/data/io/mmcif.py +255 -0
  10. emap2lig/data/io/writer.py +338 -0
  11. emap2lig/data/map.py +524 -0
  12. emap2lig/data/simulate.py +169 -0
  13. emap2lig/data/transforms.py +98 -0
  14. emap2lig/data/types.py +265 -0
  15. emap2lig/emap2lig.yaml +140 -0
  16. emap2lig/frag.py +146 -0
  17. emap2lig/main.py +1417 -0
  18. emap2lig/model/__init__.py +19 -0
  19. emap2lig/model/layers/__init__.py +55 -0
  20. emap2lig/model/layers/attention.py +179 -0
  21. emap2lig/model/layers/decoder.py +52 -0
  22. emap2lig/model/layers/dropout.py +43 -0
  23. emap2lig/model/layers/fourier.py +58 -0
  24. emap2lig/model/layers/instance.py +310 -0
  25. emap2lig/model/layers/outer_product_mean.py +90 -0
  26. emap2lig/model/layers/positional_encoding.py +149 -0
  27. emap2lig/model/layers/primitives.py +149 -0
  28. emap2lig/model/layers/selected_attention.py +149 -0
  29. emap2lig/model/layers/transition.py +80 -0
  30. emap2lig/model/layers/triangle_mult.py +283 -0
  31. emap2lig/model/layers/triangular_attention.py +397 -0
  32. emap2lig/model/model.py +466 -0
  33. emap2lig/model/modules/__init__.py +18 -0
  34. emap2lig/model/modules/conditioning.py +372 -0
  35. emap2lig/model/modules/conf_embedder.py +90 -0
  36. emap2lig/model/modules/diffusion.py +509 -0
  37. emap2lig/model/modules/instance_seg.py +355 -0
  38. emap2lig/model/modules/pairformer.py +440 -0
  39. emap2lig/model/seg/__init__.py +3 -0
  40. emap2lig/model/seg/model.py +544 -0
  41. emap2lig/model/seg/munet/__init__.py +11 -0
  42. emap2lig/model/seg/munet/backbone.py +158 -0
  43. emap2lig/model/seg/munet/conv.py +109 -0
  44. emap2lig/model/seg/munet/transformer.py +263 -0
  45. emap2lig/model/seg/threshold.py +78 -0
  46. emap2lig/web/__init__.py +0 -0
  47. emap2lig/web/__main__.py +6 -0
  48. emap2lig/web/app.py +125 -0
  49. emap2lig/web/cli.py +206 -0
  50. emap2lig/web/frontend/.gitignore +26 -0
  51. emap2lig/web/frontend/dist/assets/BuildTab.js +1 -0
  52. emap2lig/web/frontend/dist/assets/DirPicker.js +1 -0
  53. emap2lig/web/frontend/dist/assets/FindTab.js +1 -0
  54. emap2lig/web/frontend/dist/assets/ResultsTable.js +1 -0
  55. emap2lig/web/frontend/dist/assets/VisualizationTab.js +1 -0
  56. emap2lig/web/frontend/dist/assets/emap2lig-logo-sm.ico +0 -0
  57. emap2lig/web/frontend/dist/assets/index.css +1 -0
  58. emap2lig/web/frontend/dist/assets/index.js +2 -0
  59. emap2lig/web/frontend/dist/assets/molstar.js +7403 -0
  60. emap2lig/web/frontend/dist/assets/react-vendor.js +34 -0
  61. emap2lig/web/frontend/dist/assets/useBlobs.js +1 -0
  62. emap2lig/web/frontend/dist/assets/useJob.js +1 -0
  63. emap2lig/web/frontend/dist/assets/vendor.js +34 -0
  64. emap2lig/web/frontend/dist/index.html +17 -0
  65. emap2lig/web/frontend/dist/kihara-logo.ico +0 -0
  66. emap2lig/web/results_scan.py +123 -0
  67. emap2lig/web/routers/__init__.py +0 -0
  68. emap2lig/web/routers/detect.py +167 -0
  69. emap2lig/web/routers/download.py +56 -0
  70. emap2lig/web/routers/files.py +365 -0
  71. emap2lig/web/routers/jobs.py +156 -0
  72. emap2lig/web/routers/model.py +134 -0
  73. emap2lig/web/schemas.py +116 -0
  74. emap2lig/web/services.py +522 -0
  75. emap2lig/web/state.py +114 -0
  76. emap2lig-0.4.1.dist-info/METADATA +893 -0
  77. emap2lig-0.4.1.dist-info/RECORD +79 -0
  78. emap2lig-0.4.1.dist-info/WHEEL +4 -0
  79. emap2lig-0.4.1.dist-info/entry_points.txt +5 -0
emap2lig/__init__.py ADDED
File without changes
File without changes
emap2lig/data/ccd.py ADDED
@@ -0,0 +1,407 @@
1
+ """CCD conformer utilities and on-demand local caching."""
2
+
3
+ import logging
4
+ import pickle
5
+ import shutil
6
+ import tempfile
7
+ from collections.abc import Mapping
8
+ from functools import cache
9
+ from pathlib import Path
10
+
11
+ import requests
12
+ from huggingface_hub import hf_hub_download
13
+ from pdbeccdutils.core import ccd_reader
14
+ from pdbeccdutils.core.component import ConformerType
15
+ from rdkit import Chem
16
+ from rdkit.Chem import rdDistGeom, rdForceFieldHelpers
17
+ from rdkit.Chem.rdchem import Conformer, Mol
18
+
19
logger = logging.getLogger(__name__)

# HuggingFace repository id passed to ``hf_hub_download`` for the bulk dictionary.
_HF_REPO_ID = "KiharaLab/Emap2lig"
# Release-date tag used by default when building ``ccd_dict_<date>.pkl`` names.
_DEFAULT_CCD_DATE = "250523"
# Current cache root, and the older location that bulk dictionaries are migrated from.
_CCD_DIR = Path.home().joinpath(".emap2lig", "ccd")
_LEGACY_CCD_DIR = Path.home().joinpath(".emap2lig", "models", "ccd")
# Per-ligand CIF download URL template; ``{code}`` is filled with the CCD code.
_RCSB_CIF_URL = "https://files.rcsb.org/ligands/download/{code}.cif"

# Ask RDKit to include all property kinds when molecules are pickled
# (presumably so atom/molecule properties survive the pickle caches used in
# this module -- confirm against RDKit's PropertyPickleOptions docs).
Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps)
28
+
29
+
30
class CCDFetchError(RuntimeError):
    """Error signalling that a CCD molecule could not be fetched or parsed from RCSB."""
32
+
33
+
34
def _ensure_ccd_dir() -> None:
    """Make sure the CCD cache directory exists, creating missing parents too."""
    _CCD_DIR.mkdir(exist_ok=True, parents=True)
37
+
38
+
39
def _migrate_legacy_bulk_dict(date: str) -> None:
    """Relocate a bulk CCD dictionary from the legacy directory to the CCD root.

    Nothing happens unless the legacy file exists and no file is present yet
    at the new location.

    Args:
        date: CCD release date string embedded in the pickle filename.
    """
    filename = f"ccd_dict_{date}.pkl"
    source = _LEGACY_CCD_DIR / filename
    destination = _CCD_DIR / filename

    # Guard clause: nothing to migrate, or the new location is already populated.
    if not source.exists() or destination.exists():
        return

    _ensure_ccd_dir()
    shutil.move(str(source), str(destination))
    logger.info("Migrated CCD dictionary from %s to %s", source, destination)
47
+
48
+
49
def _download_bulk_dict(date: str) -> Path:
    """Download the bulk CCD dictionary and place it in the CCD cache directory.

    If the target file already exists it is returned untouched. Otherwise the
    file is fetched with ``hf_hub_download`` and copied into the cache. The
    copy goes through a temporary file in the same directory followed by a
    rename, so an interrupted copy can never leave a truncated
    ``ccd_dict_<date>.pkl`` that later calls would mistake for a valid cache.

    Args:
        date: CCD release date string used in the HuggingFace filename.

    Returns:
        Local path to the downloaded dictionary file.
    """
    target_path = _CCD_DIR / f"ccd_dict_{date}.pkl"
    if target_path.exists():
        return target_path

    downloaded_path = Path(
        hf_hub_download(
            repo_id=_HF_REPO_ID,
            filename=f"ccd/ccd_dict_{date}.pkl",
        )
    )

    # The temporary file below is created inside the cache directory, so the
    # directory must exist even when this helper is called on its own.
    _ensure_ccd_dir()

    partial_path: Path | None = None
    try:
        with tempfile.NamedTemporaryFile(
            dir=_CCD_DIR, suffix=".part", delete=False
        ) as tmp:
            partial_path = Path(tmp.name)
        shutil.copy2(downloaded_path, partial_path)
        # Same-directory rename: the final name only ever refers to a
        # completely written file.
        partial_path.replace(target_path)
    finally:
        # After a successful rename the partial file is already gone.
        if partial_path is not None:
            partial_path.unlink(missing_ok=True)
    return target_path
70
+
71
+
72
@cache
def _load_bulk_dict(date: str = _DEFAULT_CCD_DATE) -> Mapping[str, Mol]:
    """Return the bulk CCD dictionary for *date*, loading it at most once.

    The cache directory is created, a dictionary in the legacy location is
    migrated if present, and the file is downloaded from HuggingFace only
    when it is still missing locally. The result is memoized per *date* by
    ``functools.cache``.

    Args:
        date: CCD release date string (default ``"250523"``).

    Returns:
        Mapping from CCD three-letter code to RDKit ``Mol``.
    """
    _ensure_ccd_dir()
    _migrate_legacy_bulk_dict(date)

    dict_path = _CCD_DIR / f"ccd_dict_{date}.pkl"
    dict_path = dict_path if dict_path.exists() else _download_bulk_dict(date)

    # NOTE(review): unpickling runs code embedded in the file; this trusts
    # both the HuggingFace artifact and the local cache directory.
    with dict_path.open("rb") as stream:
        return pickle.load(stream)
94
+
95
+
96
def _fetch_from_rcsb(code: str) -> Mol:
    """Fetch a CCD component from the RCSB CIF endpoint.

    Downloads the per-ligand CCD CIF file and parses it with
    ``pdbeccdutils``, which sets proper atom names, leaving-atom
    flags, and Ideal/Model conformers from the CCD definition.
    The CIF text is staged in a temporary file that is always removed,
    whether writing or parsing succeeds or fails.

    Args:
        code: Normalized CCD three-letter code.

    Returns:
        Molecule with CCD atom names, leaving-atom flags, and
        Ideal/Model conformers. Its ``PDB_NAME`` property is set to *code*.

    Raises:
        CCDFetchError: If the CIF download or parsing fails, or the parsed
            molecule has no atoms.
    """
    url = _RCSB_CIF_URL.format(code=code)
    try:
        response = requests.get(url, timeout=20)
    except requests.RequestException as exc:
        raise CCDFetchError(f"Network error fetching CCD {code}: {exc}") from exc
    if response.status_code != 200:
        raise CCDFetchError(
            f"RCSB returned status {response.status_code} for CCD {code}"
        )

    tmp_path: str | None = None
    try:
        # Explicit UTF-8 so the write cannot fail (or vary) with the locale's
        # default encoding.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".cif", delete=False, encoding="utf-8"
        ) as tmp:
            # Record the name before writing: with delete=False the file
            # would otherwise be leaked if the write itself raised.
            tmp_path = tmp.name
            tmp.write(response.text)

        result = ccd_reader.read_pdb_cif_file(tmp_path, sanitize=False)
        mol = result.component.mol
    except Exception as exc:
        # Deliberately broad: any staging/parsing failure is reported to
        # callers as a single CCDFetchError with the cause chained.
        raise CCDFetchError(f"Failed to parse RCSB CIF for CCD {code}: {exc}") from exc
    finally:
        if tmp_path is not None:
            Path(tmp_path).unlink(missing_ok=True)

    if mol.GetNumAtoms() == 0:
        raise CCDFetchError(f"RCSB CIF for CCD {code} contains no atoms")

    mol.SetProp("PDB_NAME", code)
    return mol
142
+
143
+
144
def get_ccd_mol(code: str, date: str = _DEFAULT_CCD_DATE) -> Mol:
    """Resolve a CCD molecule from the bulk dict, per-CCD cache, then RCSB.

    Lookup order:
    1. Bulk CCD dictionary (downloaded from HuggingFace).
    2. Per-CCD pickle in ``~/.emap2lig/ccd/<CODE>.pkl`` for fallback entries.
    3. RCSB CIF endpoint (parsed by ``pdbeccdutils``).

    RCSB fallback hits are persisted to the per-CCD pickle so subsequent
    lookups for CCD entries missing from the bulk dictionary are instant.
    The pickle is written through a temporary file and renamed into place,
    and an unreadable (e.g. truncated) cached pickle is discarded and
    re-fetched instead of failing every later lookup.

    Bulk-dictionary hits return the object stored in the memoized dictionary
    itself, not a copy.
    NOTE(review): callers that modify the molecule in place probably want to
    copy it first -- confirm against call sites.

    Args:
        code: CCD three-letter code (case-insensitive, whitespace trimmed).
        date: CCD release date string used to locate the bulk dictionary.

    Returns:
        RDKit ``Mol`` with 3D coordinates and atom names.

    Raises:
        CCDFetchError: If the code is empty, contains characters that are
            unsafe in a filename/URL, or cannot be resolved from any source.
    """
    normalized_code = code.strip().upper()
    if not normalized_code:
        raise CCDFetchError(f"Empty CCD code: {code!r}")

    _ensure_ccd_dir()
    bulk_dict = _load_bulk_dict(date)
    if normalized_code in bulk_dict:
        return bulk_dict[normalized_code]

    # From here on the code is embedded in a cache filename and in the RCSB
    # URL. Reject separators, dots, etc. so it cannot escape the cache
    # directory or alter the request path.
    if not normalized_code.isascii() or not all(
        ch.isalnum() or ch in "_-" for ch in normalized_code
    ):
        raise CCDFetchError(f"Invalid CCD code: {code!r}")

    ccd_pickle = _CCD_DIR / f"{normalized_code}.pkl"
    if ccd_pickle.exists():
        try:
            # NOTE(review): unpickling trusts the contents of the local cache.
            with ccd_pickle.open("rb") as handle:
                return pickle.load(handle)
        except (pickle.UnpicklingError, EOFError):
            # A damaged cache entry must not block the code forever;
            # fall through and rebuild it from RCSB.
            logger.warning("Discarding unreadable CCD cache file %s", ccd_pickle)

    mol = _fetch_from_rcsb(normalized_code)

    partial_path: Path | None = None
    try:
        with tempfile.NamedTemporaryFile(
            mode="wb", dir=_CCD_DIR, suffix=".part", delete=False
        ) as handle:
            partial_path = Path(handle.name)
            pickle.dump(mol, handle)
        # Same-directory rename: the final name only ever refers to a
        # completely written pickle.
        partial_path.replace(ccd_pickle)
    finally:
        # After a successful rename the partial file is already gone.
        if partial_path is not None:
            partial_path.unlink(missing_ok=True)
    return mol
183
+
184
+
185
def _etkdg_embed(mol: Mol, version: str, *, use_random_coords: bool) -> int:
    """Embed one ETKDG conformer into *mol* and relax it with UFF.

    Args:
        mol: RDKit molecule to process (modified in place).
        version: ETKDG version — ``"v3"`` or ``"v2"``.
        use_random_coords: When ``True``, seed the embedder with random
            coordinates. This helps large or charged molecules that fail
            distance-geometry initialization.

    Returns:
        Conformer id on success, or ``-1`` when embedding fails.

    Raises:
        ValueError: If *version* is neither ``"v3"`` nor ``"v2"``.
    """
    if version not in ("v3", "v2"):
        raise ValueError(f"Unsupported ETKDG version: {version}")
    params = rdDistGeom.ETKDGv3() if version == "v3" else rdDistGeom.ETKDGv2()

    # Keep conformers already present on the molecule.
    params.clearConfs = False
    params.useRandomCoords = use_random_coords

    try:
        embedded_id = rdDistGeom.EmbedMolecule(mol, params)
        # -1 is the embedder's failure value; only relax a real conformer.
        if embedded_id != -1:
            rdForceFieldHelpers.UFFOptimizeMolecule(
                mol, confId=embedded_id, maxIters=1000
            )
    except (RuntimeError, ValueError):
        logger.debug(
            "ETKDG embedding failed: version=%s random_coords=%s",
            version,
            use_random_coords,
        )
        return -1

    return embedded_id
222
+
223
+
224
def compute_3d(mol: Mol, version: str = "v3") -> bool:
    """Embed a 3D conformer with ETKDG, working through a fallback chain.

    Adapted from ``pdbeccdutils.core.component.Component``.

    For each ETKDG version in the chain the default initialization is tried
    first and random starting coordinates second. The chain is the requested
    version, followed by ``"v2"`` when the request was ``"v3"``. The first
    successful conformer is tagged with its ``name`` and
    ``coord_generation`` properties.

    Args:
        mol: RDKit molecule to process (modified in place).
        version: ETKDG version — ``"v3"`` or ``"v2"`` (defaults to ``"v3"``).

    Returns:
        ``True`` if a 3D conformer was successfully embedded.
    """
    fallback_chain = [version, "v2"] if version == "v3" else [version]
    attempts = [
        (etkdg_version, randomize)
        for etkdg_version in fallback_chain
        for randomize in (False, True)
    ]

    for etkdg_version, randomize in attempts:
        conf_id = _etkdg_embed(mol, etkdg_version, use_random_coords=randomize)
        if conf_id != -1:
            embedded = mol.GetConformer(conf_id)
            embedded.SetProp("name", ConformerType.Computed.name)
            embedded.SetProp("coord_generation", f"ETKDG{etkdg_version}")
            return True

    return False
259
+
260
+
261
def get_conformer(mol: Mol, c_type: ConformerType) -> Conformer:
    """Look up the conformer tagged with the requested type.

    Adapted from ``pdbeccdutils.core.component.Component``.

    Conformers that carry no ``name`` property are skipped.

    Args:
        mol: Molecule to search.
        c_type: Desired conformer type.

    Returns:
        The first conformer whose ``name`` property matches *c_type*.

    Raises:
        ValueError: If no conformer of the requested type exists.
    """
    for conformer in mol.GetConformers():
        try:
            label = conformer.GetProp("name")
        except KeyError:
            # Untagged conformer: cannot match any type.
            continue
        if label == c_type.name:
            return conformer

    raise ValueError(f"Conformer {c_type.name} does not exist.")
284
+
285
+
286
def compute_symmetries(mol: Mol) -> list[list[int]]:
    """Compute the automorphism permutations of a molecule.

    Each permutation maps non-leaving atom indices to their symmetric
    counterparts. Matching is done on a hydrogen-stripped copy of *mol*.
    The result is also serialized into a hex-encoded pickle stored as the
    ``symmetries`` property on the molecule that was passed in.

    Args:
        mol: Molecule to process (modified in place: gains the
            ``symmetries`` property). Every heavy atom is read through its
            ``leaving_atom`` property.

    Returns:
        List of index permutations (one per automorphism).
    """
    # RemoveHs returns a new molecule. Keep it under a separate name so the
    # property is written to the caller's molecule at the end, not to this
    # temporary copy.
    heavy_mol = Chem.RemoveHs(mol)

    # Map heavy-atom index -> compact index that skips leaving atoms.
    idx_map: dict[int, int] = {}
    for i, atom in enumerate(heavy_mol.GetAtoms()):
        if int(atom.GetProp("leaving_atom")):
            continue
        idx_map[i] = len(idx_map)
    kept_indices = set(idx_map)  # loop-invariant, built once

    permutations: list[list[int]] = []
    raw_permutations = heavy_mol.GetSubstructMatches(heavy_mol, uniquify=False)
    for raw_permutation in raw_permutations:
        try:
            # Keep only automorphisms that send the non-leaving atoms onto
            # themselves as a set.
            if {raw_permutation[idx] for idx in idx_map} == kept_indices:
                permutations.append(
                    [idx_map[idx] for idx in raw_permutation if idx in idx_map]
                )
        except IndexError:
            logger.debug("Skipping malformed symmetry permutation")

    serialized_permutations = pickle.dumps(permutations)
    mol.SetProp("symmetries", serialized_permutations.hex())
    return permutations
322
+
323
+
324
def add_conformer(mol: Mol) -> tuple[str, Mol]:
    """Try to equip a molecule with a usable 3D conformer.

    Single-atom molecules are reported as ``"single"`` without embedding.
    Otherwise an ETKDGv3 embedding is attempted. When it fails, the
    molecule's existing ideal conformer is looked up instead. A missing
    conformer in either branch yields ``"failed"``.

    Args:
        mol: Molecule to process (modified in place).

    Returns:
        Tuple of (result_tag, molecule). *result_tag* is one of
        ``"single"``, ``"computed"``, ``"ideal"``, or ``"failed"``.
    """
    if mol.GetNumAtoms() == 1:
        return "single", mol

    try:
        if compute_3d(mol, version="v3"):
            # Lookup is only a presence check; it raises ValueError if absent.
            get_conformer(mol, ConformerType.Computed)
            outcome = "computed"
        else:
            # Embedding failed: fall back to the ideal coordinates, if any.
            get_conformer(mol, ConformerType.Ideal)
            outcome = "ideal"
    except ValueError:
        outcome = "failed"

    return outcome, mol
360
+
361
+
362
def _assign_canonical_atom_names(mol: Mol, smiles: str) -> None:
    """Label every atom ``<SYMBOL><RANK>`` using RDKit's canonical ranking.

    The rank is one-based and the element symbol is upper-cased. Atoms
    processed before a failing one keep the names already assigned.

    Args:
        mol: Heavy-atom RDKit molecule (modified in place).
        smiles: Source SMILES string, included in error messages.

    Raises:
        ValueError: If an atom name exceeds 4 characters.
    """
    ranks = Chem.CanonicalRankAtoms(mol)
    for atom, rank in zip(mol.GetAtoms(), ranks):
        label = f"{atom.GetSymbol().upper()}{rank + 1}"
        if len(label) <= 4:
            atom.SetProp("name", label)
            continue
        raise ValueError(
            f"{smiles} has an atom with a name longer than 4 characters: {label}"
        )
380
+
381
+
382
def get_conformer_from_smiles(smiles: str) -> tuple[str, Mol]:
    """Build a molecule from a SMILES string and generate a 3D conformer.

    Hydrogens are added for ETKDG embedding, then removed before
    canonical atom names are assigned on the heavy-atom molecule.
    Atom names longer than 4 characters raise ``ValueError``.
    When conformer generation reports ``"failed"``, the hydrogen-added
    molecule is returned as-is, without atom names.

    Args:
        smiles: SMILES string to parse.

    Returns:
        Tuple of (result_tag, molecule) from :func:`add_conformer`.

    Raises:
        ValueError: If *smiles* cannot be parsed, or if an atom name
            exceeds 4 characters.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a clear message instead of an opaque error inside AddHs.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    mol = Chem.AddHs(mol)
    result, mol = add_conformer(mol)

    if result == "failed":
        return result, mol

    mol = Chem.RemoveHs(mol)
    _assign_canonical_atom_names(mol, smiles)
    return result, mol
emap2lig/data/const.py ADDED
@@ -0,0 +1,25 @@
1
# Size of the element vocabulary.
num_elements = 128

####################################################################################################
# ATOMS
####################################################################################################

# Chirality tag names; a tag's position in this list is its integer id.
chirality_types = [
    "CHI_OTHER",
    "CHI_OCTAHEDRAL",
    "CHI_TETRAHEDRAL_CW",
    "CHI_TRIGONALBIPYRAMIDAL",
    "CHI_UNSPECIFIED",
    "CHI_TETRAHEDRAL_CCW",
    "CHI_SQUAREPLANAR",
]
chirality_type_ids = {name: index for index, name in enumerate(chirality_types)}

# Bond type names; a type's position in this list is its integer id.
bond_types = [
    "SINGLE",
    "DOUBLE",
    "TRIPLE",
    "DATIVE",
    "AROMATIC",
]
bond_type_ids = {name: index for index, name in enumerate(bond_types)}