@datagrok/bio 2.25.1 → 2.25.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/package-test.js +5 -5
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +3 -3
- package/dist/package.js.map +1 -1
- package/package.json +2 -2
- package/scripts/mol-to-helm.py +1279 -0
- package/src/package-api.ts +14 -0
- package/src/package.g.ts +9 -0
- package/src/package.ts +27 -1
- package/src/utils/monomer-lib/library-file-manager/ui.ts +2 -0
- package/src/utils/monomer-lib/monomer-manager/monomer-manager.ts +34 -13
- package/src/widgets/sequence-scrolling-widget.ts +195 -183
- package/test-console-output-1.log +338 -342
- package/test-record-1.mp4 +0 -0
|
@@ -0,0 +1,1279 @@
|
|
|
1
|
+
#language: python
|
|
2
|
+
#name: molToHelmConverterPy
|
|
3
|
+
#description: Converts molecules to HELM notation based on monomer library
|
|
4
|
+
#input: dataframe moleculesDataframe
|
|
5
|
+
#input: column moleculesColumn {semType: Molecule}
|
|
6
|
+
#input: string libraryJSON
|
|
7
|
+
#output: dataframe result_helm {action:join(moleculesDataframe)} [Sequences, in HELM format]
|
|
8
|
+
molListToProcess = moleculesDataframe[moleculesColumn].tolist()  # values of the selected molecule column, taken from the input dataframe as a list
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import numpy as np
|
|
11
|
+
"""
|
|
12
|
+
Aggregated file combining all modules from the logics folder.
|
|
13
|
+
Generated automatically - do not edit manually.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
# External library imports
|
|
17
|
+
from collections import defaultdict
|
|
18
|
+
from enum import Enum
|
|
19
|
+
from itertools import combinations
|
|
20
|
+
from rdkit import Chem
|
|
21
|
+
from rdkit import RDLogger
|
|
22
|
+
from typing import Dict
|
|
23
|
+
from typing import List
|
|
24
|
+
from typing import Optional
|
|
25
|
+
from typing import Tuple
|
|
26
|
+
import json
|
|
27
|
+
import os
|
|
28
|
+
|
|
29
|
+
# ============================================================================
|
|
30
|
+
# Content from: fragment_graph.py
|
|
31
|
+
# ============================================================================
|
|
32
|
+
|
|
33
|
+
from rdkit import Chem
|
|
34
|
+
from typing import Optional, List, Dict, Tuple
|
|
35
|
+
from enum import Enum
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class LinkageType(Enum):
    """
    Closed set of linkage kinds that can connect two fragments.

    Each member's value is the lower-case string form of its name.
    """

    # Declaration order is the iteration order of the enum.
    PEPTIDE = "peptide"
    DISULFIDE = "disulfide"
    ESTER = "ester"
    ETHER = "ether"
    THIOETHER = "thioether"
    # Catch-all member for linkages that are none of the above.
    UNKNOWN = "unknown"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class FragmentNode:
    """
    One fragment of a molecule (amino acid/monomer) stored as a graph node.

    Attributes:
        id: identifier supplied by the caller.
        mol: the RDKit molecule object for this fragment.
        smiles: canonical SMILES of ``mol``; empty string when ``mol`` is falsy.
        monomer: ``None`` until a matcher assigns a MonomerData object.
        is_n_terminal / is_c_terminal: terminal flags, both ``False`` initially.
    """

    def __init__(self, fragment_id: int, mol: Chem.Mol):
        self.id = fragment_id
        self.mol = mol
        # The SMILES is computed once at construction time.
        if mol:
            self.smiles = Chem.MolToSmiles(mol, canonical=True)
        else:
            self.smiles = ""
        # Assigned later by the matcher.
        self.monomer = None
        self.is_n_terminal = False
        self.is_c_terminal = False

    def __repr__(self):
        # Only the first 20 SMILES characters are shown; "..." is always appended.
        label = "Unknown" if not self.monomer else self.monomer.symbol
        preview = self.smiles[:20]
        return f"FragmentNode(id={self.id}, monomer={label}, smiles={preview}...)"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class FragmentLink:
    """
    Edge of the fragment graph: a typed connection between two node ids.

    Attributes:
        from_node_id / to_node_id: ids of the two connected nodes.
        linkage_type: the ``LinkageType`` of the connection.
        from_atom_idx / to_atom_idx: optional atom indices inside the
            respective node's molecule; ``None`` when not tracked.
    """

    def __init__(
        self,
        from_node_id: int,
        to_node_id: int,
        linkage_type: LinkageType,
        from_atom_idx: Optional[int] = None,
        to_atom_idx: Optional[int] = None
    ):
        # Endpoints of the link.
        self.from_node_id, self.to_node_id = from_node_id, to_node_id
        self.linkage_type = linkage_type
        # Atom positions within each endpoint's molecule (may be None).
        self.from_atom_idx, self.to_atom_idx = from_atom_idx, to_atom_idx

    def __repr__(self):
        kind = self.linkage_type.value
        return f"FragmentLink({self.from_node_id} --{kind}--> {self.to_node_id})"
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class FragmentGraph:
    """
    A molecule modelled as a graph: fragments are nodes, linkages are edges.

    Nodes live in a dict keyed by node id; links are kept in a list in the
    order they were added. The structure can describe:
    - Linear peptides (chain of peptide bonds)
    - Cyclic peptides (peptide bond from last to first)
    - Disulfide bridges (additional S-S links)
    - Branched structures (multiple links per fragment)
    """

    def __init__(self):
        self.nodes: Dict[int, FragmentNode] = {}  # node_id -> FragmentNode
        self.links: List[FragmentLink] = []

    def add_node(self, node: FragmentNode) -> int:
        """Store ``node`` under its own id (replacing any node with that id) and return the id."""
        node_id = node.id
        self.nodes[node_id] = node
        return node_id

    def add_link(self, link: FragmentLink):
        """
        Append ``link`` to the graph.

        Raises:
            ValueError: if either endpoint id is not a node of this graph.
        """
        both_known = link.from_node_id in self.nodes and link.to_node_id in self.nodes
        if not both_known:
            raise ValueError(f"Cannot add link: nodes {link.from_node_id} or {link.to_node_id} not in graph")
        self.links.append(link)

    def get_node(self, node_id: int) -> Optional[FragmentNode]:
        """Return the node stored under ``node_id``, or ``None`` if there is none."""
        try:
            return self.nodes[node_id]
        except KeyError:
            return None

    def get_neighbors(self, node_id: int) -> List[Tuple[int, LinkageType]]:
        """
        List ``(other_node_id, linkage_type)`` for every link touching ``node_id``.

        Links are examined in insertion order and in both directions; each
        link contributes at most one entry.
        """
        found = []
        for link in self.links:
            src, dst = link.from_node_id, link.to_node_id
            if src == node_id:
                other = dst
            elif dst == node_id:
                other = src
            else:
                continue
            found.append((other, link.linkage_type))
        return found

    def get_ordered_nodes(self) -> List[FragmentNode]:
        """
        Return nodes in traversal order (sequential for linear/cyclic peptides,
        depth-first for branched structures).

        The walk starts at the first node flagged ``is_n_terminal`` or, when no
        node carries that flag, at the node with the smallest id. Links are
        followed only from their ``from`` end to their ``to`` end, so nodes
        that cannot be reached that way from the start are not in the result.
        """
        if not self.nodes:
            return []

        # Prefer the N-terminal node as the starting point.
        start_id = next(
            (nid for nid, candidate in self.nodes.items() if candidate.is_n_terminal),
            None,
        )
        if start_id is None:
            start_id = min(self.nodes.keys())

        sequence = []
        seen = set()
        self._traverse_from_node(start_id, seen, sequence)
        return sequence

    def _traverse_from_node(self, node_id: int, visited: set, ordered: list):
        """Recursive depth-first step: record ``node_id``, then descend into its outgoing links."""
        if node_id in visited:
            return

        visited.add(node_id)
        ordered.append(self.nodes[node_id])

        # Split the not-yet-visited targets of outgoing links into peptide
        # and non-peptide so that the peptide chain is walked first.
        peptide_targets = []
        other_targets = []
        for link in self.links:
            if link.from_node_id != node_id or link.to_node_id in visited:
                continue
            bucket = peptide_targets if link.linkage_type == LinkageType.PEPTIDE else other_targets
            bucket.append(link.to_node_id)

        for target_id in peptide_targets + other_targets:
            self._traverse_from_node(target_id, visited, ordered)

    def get_fragment_sequence(self) -> List[str]:
        """
        Return one symbol per ordered node: the monomer symbol when a monomer
        is assigned, otherwise ``X`` followed by the node id.
        """
        symbols = []
        for node in self.get_ordered_nodes():
            if node.monomer:
                symbols.append(node.monomer.symbol)
            else:
                symbols.append(f"X{node.id}")
        return symbols

    def __len__(self):
        """Number of nodes in the graph."""
        return len(self.nodes)

    def __repr__(self):
        node_count, link_count = len(self.nodes), len(self.links)
        return f"FragmentGraph(nodes={node_count}, links={link_count})"

    def to_dict(self) -> dict:
        """Return a plain-dict snapshot of the graph with ``"nodes"`` and ``"links"`` lists."""
        node_records = []
        for node in self.nodes.values():
            matched = node.monomer
            node_records.append({
                "id": node.id,
                "smiles": node.smiles,
                "monomer": matched.symbol if matched else None,
                "is_n_terminal": node.is_n_terminal,
                "is_c_terminal": node.is_c_terminal
            })

        link_records = []
        for link in self.links:
            link_records.append({
                "from": link.from_node_id,
                "to": link.to_node_id,
                "type": link.linkage_type.value,
                "from_atom": link.from_atom_idx,
                "to_atom": link.to_atom_idx
            })

        return {"nodes": node_records, "links": link_records}
|
|
212
|
+
|
|
213
|
+
# ============================================================================
|
|
214
|
+
# Content from: fragment_processor.py
|
|
215
|
+
# ============================================================================
|
|
216
|
+
|
|
217
|
+
from rdkit import Chem
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
class BondDetector:
    """
    Finds the bonds at which a molecule is cut into fragments.

    Three SMARTS queries are compiled once per instance (peptide bond,
    disulfide bond, primary amine). All public and private finders are
    best-effort: any exception is swallowed and an empty or partial result
    is returned instead.
    """
    #GENERALIZATION ITEM: BOND PATTERNS SHOULD BE DERIVED FROM LIBRARY
    def __init__(self):
        # Peptide bond with both C and N in the backbone (each bonded to a carbon).
        # The flanking alpha carbons may be sp3 (X4) or sp2 (X3, dehydroamino acids).
        self.peptide_bond = Chem.MolFromSmarts('[C;X3,X4]-[C;X3](=[O;X1])-[N;X3]-[C;X3,X4]')
        # Disulfide bond: S-S where each sulfur is attached to a carbon (cysteine residues).
        self.disulfide_bond = Chem.MolFromSmarts('[C;X4]-[S;X2]-[S;X2]-[C;X4]')
        # N-terminal primary amine (NH2 or NH3+); the alpha carbon may be sp3 or sp2.
        self.primary_amine = Chem.MolFromSmarts('[N;H2,H3;X3,X4]-[C;X3,X4]')

    def find_cleavable_bonds(self, mol: Chem.Mol):
        """
        Collect every cleavable bond of ``mol``.

        Returns:
            List of tuples ``(atom1_idx, atom2_idx, LinkageType)``: peptide
            bonds first (after the N-to-C ordering step), followed by the
            disulfide bonds in detection order. An empty list is returned if
            anything raises.
        """
        try:
            peptide_pairs = [(pair[0], pair[1]) for pair in self._find_peptide_bonds(mol)]
            disulfide_pairs = self._find_disulfide_bonds(mol)

            # Only the peptide bonds go through the ordering step.
            ordered_pairs = self._order_bonds_from_n_to_c(mol, peptide_pairs)

            result = [(pair[0], pair[1], LinkageType.PEPTIDE) for pair in ordered_pairs]
            result += [(pair[0], pair[1], LinkageType.DISULFIDE) for pair in disulfide_pairs]
            return result

        except Exception:
            return []

    def _find_peptide_bonds(self, mol: Chem.Mol):
        """
        Return ``(carbonyl_C_idx, N_idx)`` for each peptide-bond pattern match.

        Pattern atoms are, in order: alpha-C, carbonyl-C, O, N, next alpha-C.
        Matches found before an exception are still returned.
        """
        pairs = []
        try:
            for hit in mol.GetSubstructMatches(self.peptide_bond):
                if len(hit) < 5:
                    continue
                # hit[1] is the carbonyl carbon, hit[3] the amide nitrogen.
                pairs.append((hit[1], hit[3]))
        except Exception:
            pass
        return pairs

    def _find_disulfide_bonds(self, mol: Chem.Mol):
        """
        Return ``(S_idx, S_idx)`` for each disulfide pattern match (C-S-S-C).

        Matches found before an exception are still returned.
        """
        pairs = []
        try:
            for hit in mol.GetSubstructMatches(self.disulfide_bond):
                if len(hit) < 4:
                    continue
                # hit[1] and hit[2] are the two sulfur atoms.
                pairs.append((hit[1], hit[2]))
        except Exception:
            pass
        return pairs

    def _order_bonds_from_n_to_c(self, mol: Chem.Mol, bonds):
        """
        Reorder ``bonds`` by chaining them from the N-terminal atom.

        ``bonds`` is returned as-is when it is empty or when no N-terminal
        atom can be determined. Otherwise the chained bonds come first and
        every bond that was not reached is appended in its original order.

        NOTE(review): each step looks for a bond whose second atom (the amide
        N, as produced by ``_find_peptide_bonds``) equals the current atom and
        then moves on to that bond's first atom (the carbonyl C). Starting
        from a primary-amine nitrogen this looks unlikely to ever match, in
        which case the input order is simply preserved - confirm the intended
        walk direction.
        """
        if not bonds:
            return bonds

        start_atom = self._find_n_terminal(mol)
        if start_atom is None:
            return bonds

        chain = []
        used = set()
        cursor = start_atom

        while cursor is not None and len(chain) < len(bonds):
            # First unused bond ending on the cursor atom, if any.
            step = next(
                (candidate for candidate in bonds
                 if candidate not in used and candidate[1] == cursor),
                None,
            )
            if step is None:
                break

            chain.append(step)
            used.add(step)
            cursor = step[0]

        # Whatever was not chained keeps its relative input order.
        chain.extend(leftover for leftover in bonds if leftover not in used)
        return chain

    def _find_n_terminal(self, mol: Chem.Mol):
        """
        Return the index of the atom treated as the N-terminus.

        The nitrogen of the first primary-amine match wins. Without such a
        match, the first nitrogen carrying the highest total hydrogen count
        is used. ``None`` is returned when there is no nitrogen at all or
        when anything raises.
        """
        try:
            amine_hits = mol.GetSubstructMatches(self.primary_amine)
            if amine_hits:
                return amine_hits[0][0]

            best_h = -1
            best_idx = None
            for atom in mol.GetAtoms():
                if atom.GetAtomicNum() != 7:
                    continue
                hydrogens = atom.GetTotalNumHs()
                # Strict comparison keeps the earliest nitrogen on ties.
                if hydrogens > best_h:
                    best_h, best_idx = hydrogens, atom.GetIdx()
            return best_idx

        except Exception:
            return None
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
class FragmentProcessor:
    """
    Cuts a molecule into fragments at its cleavable bonds and, when asked,
    merges fragments back together so that unmatched pieces can be matched
    as one larger monomer.

    Attributes:
        monomer_library: the library object handed in by the caller (stored, not used here).
        bond_detector: the ``BondDetector`` used to find cleavable bonds.
    """

    def __init__(self, monomer_library):
        self.monomer_library = monomer_library
        self.bond_detector = BondDetector()

    @staticmethod
    def _reset_to_single_node(graph: FragmentGraph, mol: Chem.Mol) -> FragmentGraph:
        """Make ``graph`` hold exactly one node (id 0, both terminal flags set) wrapping ``mol``."""
        # Drop anything a partially completed fragmentation may have left behind,
        # so that the graph never mixes the fallback node with stale nodes/links.
        graph.nodes.clear()
        graph.links.clear()
        for stale_attr in ("cleaved_bond_indices", "bond_info"):
            if hasattr(graph, stale_attr):
                delattr(graph, stale_attr)

        node = FragmentNode(0, mol)
        node.is_n_terminal = True
        node.is_c_terminal = True
        graph.add_node(node)
        return graph

    def process_molecule(self, mol: Chem.Mol) -> FragmentGraph:
        """
        Process a molecule into a fragment graph.

        The molecule is cut at every bond reported by the bond detector.
        Fragments that still have at least three atoms after dummy-atom
        removal become nodes whose id is the fragment's position in the
        fragment list; the first fragment is flagged N-terminal and the last
        one C-terminal. Consecutive kept fragments are joined with PEPTIDE
        links.

        The graph additionally carries ``original_mol`` and, when the molecule
        was actually fragmented, ``cleaved_bond_indices`` and ``bond_info``
        (tuples ``(bond_idx, atom1, atom2, linkage_type)``) for later recovery.

        If there is nothing to cleave, or if anything raises, the graph holds
        a single node (id 0) wrapping the whole molecule.

        NOTE(review): linking fragment i to fragment i+1 assumes the fragment
        list comes back in chain order - verify against the RDKit fragment
        ordering for the inputs in use.

        Args:
            mol: RDKit molecule object

        Returns:
            FragmentGraph object containing fragments and their connections
        """
        graph = FragmentGraph()
        # Store original molecule for fragment recovery
        graph.original_mol = mol

        try:
            bonds_to_cleave = self.bond_detector.find_cleavable_bonds(mol)
            if not bonds_to_cleave:
                # Single fragment (no cleavable bonds)
                return self._reset_to_single_node(graph, mol)

            # Translate atom pairs into bond indices, keeping each bond once.
            bond_indices = []
            bond_info = []  # (bond_idx, atom1, atom2, linkage_type)
            seen_bonds = set()
            for atom1, atom2, linkage_type in bonds_to_cleave:
                bond = mol.GetBondBetweenAtoms(atom1, atom2)
                if not bond:
                    continue  # no such bond in the molecule - skip silently
                bond_idx = bond.GetIdx()
                if bond_idx in seen_bonds:
                    continue  # duplicate report of the same bond - skip silently
                seen_bonds.add(bond_idx)
                bond_indices.append(bond_idx)
                bond_info.append((bond_idx, atom1, atom2, linkage_type))

            if not bond_indices:
                # No valid bonds found
                return self._reset_to_single_node(graph, mol)

            # Fragment the molecule
            fragmented_mol = Chem.FragmentOnBonds(mol, bond_indices, addDummies=True)
            fragments = list(Chem.GetMolFrags(fragmented_mol, asMols=True, sanitizeFrags=True))

            # Store bond cleavage info for recovery - used to selectively re-fragment
            graph.cleaved_bond_indices = bond_indices
            graph.bond_info = bond_info
            print(f"DEBUG: Created {len(fragments)} fragments, cleaved {len(bond_indices)} bonds")

            # Create nodes for each usable fragment (no normalization, fragment used as-is)
            last_position = len(fragments) - 1
            kept_ids = []
            for position, frag in enumerate(fragments):
                clean_frag = self._clean_fragment(frag)
                if clean_frag and clean_frag.GetNumAtoms() >= 3:
                    node = FragmentNode(position, clean_frag)
                    node.is_n_terminal = (position == 0)
                    node.is_c_terminal = (position == last_position)
                    graph.add_node(node)
                    kept_ids.append(position)

            # Sequential links between neighbouring kept fragments.
            for from_id, to_id in zip(kept_ids, kept_ids[1:]):
                graph.add_link(FragmentLink(from_id, to_id, LinkageType.PEPTIDE))

            # TODO: Track which fragments contain the S atoms so that disulfide
            # bridges can be added as links; they are cleaved above but no
            # DISULFIDE link is recorded yet.

            return graph

        except Exception:
            # Fallback: single node with original molecule
            return self._reset_to_single_node(graph, mol)

    def _clean_fragment(self, mol: Chem.Mol):
        """
        Return a copy of ``mol`` with all dummy atoms (atomic number 0) removed.

        Returns ``None`` if anything raises.
        """
        try:
            mol_copy = Chem.Mol(mol)
            dummy_indices = [
                atom.GetIdx() for atom in mol_copy.GetAtoms()
                if atom.GetAtomicNum() == 0
            ]
            if not dummy_indices:
                return mol_copy

            emol = Chem.EditableMol(mol_copy)
            # Remove from the highest index down so earlier removals do not
            # shift the indices still to be removed.
            for atom_idx in sorted(dummy_indices, reverse=True):
                emol.RemoveAtom(atom_idx)
            return emol.GetMol()

        except Exception:
            return None

    def _reconstruct_fragment(self, node_ids: list, graph: FragmentGraph) -> Optional[Chem.Mol]:
        """
        Reconstruct a molecule by combining multiple fragment nodes.

        Re-fragments the original molecule while leaving intact the cleaved
        bond between each pair of consecutive node ids; the bond between node
        ``i`` and node ``i + 1`` is taken to be entry ``i`` of
        ``graph.cleaved_bond_indices``.

        Returns ``None`` when the graph carries no recovery data, when the
        requested nodes are not all joined by such bonds (the result would
        then not cover every requested node), when the target fragment is
        missing, or when anything raises.

        NOTE(review): ``cleaved_bond_indices`` is not updated after a merge,
        so the position-to-bond mapping presumably only holds for graphs that
        have not been merged yet - confirm.
        """
        if not node_ids or not hasattr(graph, 'original_mol') or not hasattr(graph, 'cleaved_bond_indices'):
            return None

        try:
            # Sort node IDs to ensure consistent ordering
            sorted_nodes = sorted(set(node_ids))
            cleaved = graph.cleaved_bond_indices

            # Positions (in the cleaved-bond list) of the bonds to keep intact.
            bonds_to_exclude = {
                left for left, right in zip(sorted_nodes, sorted_nodes[1:])
                if left + 1 == right and left < len(cleaved)
            }

            new_bond_indices = [
                bond_idx for position, bond_idx in enumerate(cleaved)
                if position not in bonds_to_exclude
            ]

            print(f"DEBUG reconstruct: Original had {len(graph.cleaved_bond_indices)} cleaved bonds, "
                  f"excluding {len(bonds_to_exclude)} bonds, new list has {len(new_bond_indices)} bonds")

            # Every adjacent pair needs its own kept bond; otherwise the
            # fragment picked below would be an unmerged piece, and matching it
            # would silently drop the other requested nodes.
            if len(bonds_to_exclude) != len(sorted_nodes) - 1:
                return None

            if not new_bond_indices:
                # No bonds to cleave - return whole molecule
                return graph.original_mol

            fragmented_mol = Chem.FragmentOnBonds(graph.original_mol, new_bond_indices, addDummies=True)
            fragments = list(Chem.GetMolFrags(fragmented_mol, asMols=True, sanitizeFrags=True))

            # The merged fragment sits at the position of the lowest merged id.
            # All kept bonds lie at or after that position, so no index shift
            # is needed for the fragments before it.
            target_idx = sorted_nodes[0]
            adjusted_idx = target_idx

            print(f"DEBUG reconstruct: Got {len(fragments)} fragments after re-fragmentation, "
                  f"target_idx={target_idx}, adjusted_idx={adjusted_idx}")

            if adjusted_idx < len(fragments):
                clean_frag = self._clean_fragment(fragments[adjusted_idx])
                return clean_frag if clean_frag else fragments[adjusted_idx]

            return None

        except Exception as e:
            print(f"DEBUG reconstruct: Exception: {e}")
            return None

    def _merge_nodes_in_graph(self, graph: FragmentGraph, nodes_to_merge: list,
                              new_node: FragmentNode) -> None:
        """
        Remove old nodes, add new merged node, update all links.

        The merged node inherits ``is_n_terminal`` from the lowest merged id
        and ``is_c_terminal`` from the highest. Links between two merged
        nodes are dropped, links with one merged end are re-pointed at the
        new node, and all other links are kept untouched.
        """
        if not nodes_to_merge:
            return

        merged_ids = set(nodes_to_merge)
        leftmost = min(merged_ids)
        rightmost = max(merged_ids)

        # Preserve terminal flags from the edge nodes
        if leftmost in graph.nodes:
            new_node.is_n_terminal = graph.nodes[leftmost].is_n_terminal
        if rightmost in graph.nodes:
            new_node.is_c_terminal = graph.nodes[rightmost].is_c_terminal

        updated_links = []
        for link in graph.links:
            from_in = link.from_node_id in merged_ids
            to_in = link.to_node_id in merged_ids

            if from_in and to_in:
                continue  # internal link between merged nodes

            if not from_in and not to_in:
                updated_links.append(link)  # unrelated link: keep as-is, atom indices included
                continue

            # One end is merged: an atom index recorded for that end referred
            # to the old node's molecule, so it is cleared; the other end's
            # index is carried over.
            updated_links.append(FragmentLink(
                new_node.id if from_in else link.from_node_id,
                new_node.id if to_in else link.to_node_id,
                link.linkage_type,
                None if from_in else link.from_atom_idx,
                None if to_in else link.to_atom_idx,
            ))

        # Remove old nodes
        for node_id in nodes_to_merge:
            graph.nodes.pop(node_id, None)

        # Add new node and update links
        graph.add_node(new_node)
        graph.links = updated_links

    def recover_unmatched_fragments(self, graph: FragmentGraph, matcher) -> bool:
        """
        Try to recover unmatched fragments by merging with neighbors.

        A node counts as unmatched when it has a monomer whose symbol starts
        with ``"X"`` (presumably the placeholder assigned by the matcher -
        verify against the matcher). For each such node the merges
        "left neighbour + node", "node + right neighbour" and
        "left + node + right" are tried in that order; the first one for which
        ``matcher.find_exact_match(mol, num_connections)`` returns a truthy
        monomer replaces the merged nodes with a single node carrying the
        lowest merged id.

        Args:
            graph: graph to repair in place.
            matcher: object providing ``find_exact_match(mol, num_connections)``.

        Returns:
            True if any merges were successful.
        """
        # Identify unmatched nodes
        unmatched_nodes = [
            node_id for node_id, node in graph.nodes.items()
            if node.monomer and node.monomer.symbol.startswith("X")
        ]

        if not unmatched_nodes:
            return False

        print(f"DEBUG: Found {len(unmatched_nodes)} unmatched nodes: {unmatched_nodes}")

        had_changes = False

        for node_id in unmatched_nodes:
            # The node might have been merged away by an earlier iteration
            if node_id not in graph.nodes:
                continue

            neighbor_ids = [neighbor[0] for neighbor in graph.get_neighbors(node_id)]
            if not neighbor_ids:
                continue

            # Separate left and right neighbors (assuming sequential order)
            left_neighbors = [n for n in neighbor_ids if n < node_id]
            right_neighbors = [n for n in neighbor_ids if n > node_id]

            # Merge combinations: left only, right only, both
            merge_attempts = []
            if left_neighbors:
                merge_attempts.append([left_neighbors[0], node_id])
            if right_neighbors:
                merge_attempts.append([node_id, right_neighbors[0]])
            if left_neighbors and right_neighbors:
                merge_attempts.append([left_neighbors[0], node_id, right_neighbors[0]])

            for nodes_to_merge in merge_attempts:
                print(f"DEBUG: Trying to merge nodes {nodes_to_merge}")

                # Reconstruct combined molecule
                combined_mol = self._reconstruct_fragment(nodes_to_merge, graph)
                if not combined_mol:
                    print(f"DEBUG: Failed to reconstruct molecule for {nodes_to_merge}")
                    continue

                print(f"DEBUG: Reconstructed mol with {combined_mol.GetNumAtoms()} atoms")

                # Expected connections = distinct neighbours outside the merged set
                all_neighbors = set()
                for nid in nodes_to_merge:
                    if nid not in graph.nodes:
                        continue
                    for neighbor_id, _ in graph.get_neighbors(nid):
                        if neighbor_id not in nodes_to_merge:
                            all_neighbors.add(neighbor_id)

                num_connections = len(all_neighbors)
                print(f"DEBUG: Expecting {num_connections} connections")

                # Try to match the combined fragment
                monomer = matcher.find_exact_match(combined_mol, num_connections)

                if monomer:
                    print(f"DEBUG: SUCCESS! Matched to {monomer.symbol}")
                    # Create the merged node under the lowest merged id
                    new_node = FragmentNode(min(nodes_to_merge), combined_mol)
                    new_node.monomer = monomer

                    self._merge_nodes_in_graph(graph, nodes_to_merge, new_node)

                    had_changes = True
                    break  # Stop trying other combinations for this node
                else:
                    print(f"DEBUG: No match found for merge {nodes_to_merge}")

        return had_changes
|
|
671
|
+
|
|
672
|
+
# ============================================================================
|
|
673
|
+
# Content from: helm_generator.py
|
|
674
|
+
# ============================================================================
|
|
675
|
+
|
|
676
|
+
class HELMGenerator:
    """
    Generates HELM notation from fragment graphs or monomer lists.

    Supports:
    - Linear peptides
    - Cyclic peptides
    - Disulfide bridges
    - Custom linkages

    All output is written as a single ``PEPTIDE1`` polymer.
    """

    def __init__(self):
        #GENERALIZATION ITEM: POLYMER TYPES SHOULD BE DERIVED FROM LIBRARY
        self.polymer_types = {
            "peptide": "PEPTIDE",
            "rna": "RNA",
            "dna": "DNA",
            "chemical": "CHEM"
        }

    def generate_helm_from_graph(self, graph: FragmentGraph) -> str:
        """
        Generate HELM notation from a FragmentGraph.

        The sequence is built from ``graph.get_ordered_nodes()``; a node
        without a monomer is written as ``X``. Every DISULFIDE link becomes an
        ``R3``-``R3`` connection. With at least one connection the result ends
        in ``$$$V2.0``, otherwise in ``$$$$``.

        Args:
            graph: FragmentGraph containing matched monomers and their connections

        Returns:
            HELM notation string, or an empty string for an empty graph
        """
        if len(graph) == 0:
            return ""

        # Get ordered sequence of monomers
        ordered_nodes = graph.get_ordered_nodes()
        sequence = ".".join(
            node.monomer.symbol if node.monomer else "X"
            for node in ordered_nodes
        )

        # Connections address monomers by their 1-based position in the
        # sequence. Node ids can have gaps or differ from the traversal order
        # (e.g. after nodes were merged or skipped), so map id -> position
        # instead of using the id itself.
        position_of = {node.id: position for position, node in enumerate(ordered_nodes, start=1)}

        connections = []
        for link in graph.links:
            if link.linkage_type == LinkageType.DISULFIDE:
                from_position = position_of.get(link.from_node_id)
                to_position = position_of.get(link.to_node_id)
                if from_position is None or to_position is None:
                    # An endpoint is not part of the written sequence, so the
                    # connection cannot be expressed.
                    continue
                # Format: PEPTIDE1,PEPTIDE1,from_pos:R3-to_pos:R3
                connections.append(
                    f"PEPTIDE1,PEPTIDE1,{from_position}:R3-{to_position}:R3"
                )

        if connections:
            connection_str = "|".join(connections)
            return f"PEPTIDE1{{{sequence}}}${connection_str}$$$V2.0"

        return f"PEPTIDE1{{{sequence}}}$$$$"

    def generate_helm_notation(self, monomers) -> str:
        """
        Legacy method: Generate HELM notation from a list of monomers.
        Kept for backward compatibility.

        Args:
            monomers: List of MonomerData objects (only ``symbol`` is read)

        Returns:
            HELM notation string, or an empty string when ``monomers`` is empty
        """
        if not monomers:
            return ""

        sequence = ".".join(monomer.symbol for monomer in monomers)
        return f"PEPTIDE1{{{sequence}}}$$$$"
|
|
760
|
+
|
|
761
|
+
# ============================================================================
|
|
762
|
+
# Content from: monomer_library.py
|
|
763
|
+
# ============================================================================
|
|
764
|
+
|
|
765
|
+
from rdkit import Chem
|
|
766
|
+
from rdkit import RDLogger
|
|
767
|
+
from collections import defaultdict
|
|
768
|
+
from itertools import combinations
|
|
769
|
+
import json
|
|
770
|
+
import os
|
|
771
|
+
|
|
772
|
+
# Suppress RDKit warnings
|
|
773
|
+
RDLogger.DisableLog('rdApp.warning')
|
|
774
|
+
|
|
775
|
+
class MonomerData:
    """
    One monomer of the library plus the R-group metadata needed for matching.

    Instances are created empty and filled in by the library parser. The
    class also lazily produces (and caches) canonical SMILES variants in
    which a chosen subset of R-groups is removed and the rest are capped.
    """

    def __init__(self):
        self.symbol = ""
        self.name = ""
        self.mol = None  # RDKit molecule, or None when parsing failed
        self.smiles = ""  # Original SMILES with R-groups
        self.r_groups = {}  # R-group label -> cap SMILES
        self.r_group_count = 0
        self.capped_smiles_cache = {}  # Cache: frozenset of removed R-groups -> canonical SMILES

    def __repr__(self):
        return f"Monomer({self.symbol}: {self.name}, R-groups: {self.r_group_count})"

    def get_capped_smiles_for_removed_rgroups(self, removed_rgroups: frozenset) -> str:
        """
        Get canonical SMILES with specific R-groups removed (lazy, cached).

        Args:
            removed_rgroups: R-group labels that were removed (e.g., {'R1', 'R2'}).
                Any iterable of labels is accepted; it is normalised to a
                frozenset so it can be used as a cache key.

        Returns:
            Canonical SMILES with those R-groups removed, or empty string on error.
            Failures are cached as well, so a monomer is not re-processed
            for the same combination.

        Example:
            For monomer with R1, R2:
            - get_capped_smiles_for_removed_rgroups({'R1'}) -> SMILES with R1 removed, R2 kept
            - get_capped_smiles_for_removed_rgroups({'R2'}) -> SMILES with R2 removed, R1 kept
            - get_capped_smiles_for_removed_rgroups({'R1', 'R2'}) -> SMILES with both removed
        """
        cache_key = frozenset(removed_rgroups)

        if cache_key in self.capped_smiles_cache:
            return self.capped_smiles_cache[cache_key]

        smiles = self._get_smiles_with_rgroups_removed(cache_key)
        self.capped_smiles_cache[cache_key] = smiles
        return smiles

    def _get_smiles_with_rgroups_removed(self, removed_rgroups: frozenset) -> str:
        """
        Generate canonical SMILES with specific R-groups removed and others capped.

        Args:
            removed_rgroups: Set of R-group labels where bonds were broken (e.g., {'R1', 'R2'})

        Returns:
            Canonical SMILES string without R-group markers for the handled
            dummies, or empty string when the monomer has no molecule or any
            RDKit step fails.

        Logic:
            - R-groups in removed_rgroups: remove the dummy atom (bond was broken)
            - R-groups NOT in removed_rgroups: cap according to the library.
              Only two cap shapes are distinguished: a cap whose SMILES
              contains both 'O' and '[*:' becomes a single-bonded oxygen on
              the dummy's neighbour; every other cap just drops the dummy
              (implicit hydrogens are added on sanitization).
            - Dummy atoms without an atom map number, or whose label is not
              listed in ``r_groups``, are left untouched.
        """
        if self.mol is None:
            return ""

        try:
            mol_copy = Chem.Mol(self.mol)
            removed = set(removed_rgroups)
            kept_rgroups = set(self.r_groups) - removed

            # Collect (dummy atom index, needs_hydroxyl) first; the molecule
            # is rebuilt on every edit, so atoms must not be edited while
            # iterating over them.
            # IMPORTANT: SMILES [*:1] uses atom map numbers, not isotopes!
            pending_edits = []
            for atom in mol_copy.GetAtoms():
                if atom.GetAtomicNum() != 0:  # not a dummy atom
                    continue
                map_num = atom.GetAtomMapNum()
                if map_num <= 0:
                    continue
                r_label = f"R{map_num}"
                if r_label in removed:
                    pending_edits.append((atom.GetIdx(), False))
                elif r_label in kept_rgroups:
                    cap_smiles = self.r_groups.get(r_label, '')
                    # Simplified cap detection: 'O[*:2]'-like caps need an oxygen.
                    needs_hydroxyl = 'O' in cap_smiles and '[*:' in cap_smiles
                    pending_edits.append((atom.GetIdx(), needs_hydroxyl))

            # Single pass in descending index order so that removing an atom
            # never shifts the index of a dummy that is still to be processed.
            for atom_idx, needs_hydroxyl in sorted(pending_edits, key=lambda edit: edit[0], reverse=True):
                if needs_hydroxyl:
                    neighbors = mol_copy.GetAtomWithIdx(atom_idx).GetNeighbors()
                    if not neighbors:
                        # Nothing to attach the oxygen to; leave the dummy as is.
                        continue
                    emol = Chem.EditableMol(mol_copy)
                    new_o_idx = emol.AddAtom(Chem.Atom(8))  # Oxygen
                    emol.AddBond(neighbors[0].GetIdx(), new_o_idx, Chem.BondType.SINGLE)
                else:
                    emol = Chem.EditableMol(mol_copy)
                emol.RemoveAtom(atom_idx)
                mol_copy = emol.GetMol()

            if mol_copy:
                # Sanitize to add implicit hydrogens where needed
                Chem.SanitizeMol(mol_copy)
                return Chem.MolToSmiles(mol_copy, canonical=True)
            return ""
        except Exception:
            # Best effort by contract: any RDKit failure means "no SMILES".
            return ""
|
|
894
|
+
|
|
895
|
+
|
|
896
|
+
class MonomerLibrary:
    """
    In-memory collection of PEPTIDE monomers with lookup by symbol, by
    normalised name, and by fragment SMILES.
    """

    def __init__(self):
        self.monomers = {}  # symbol -> MonomerData
        self.smiles_to_monomer = {}  # not populated by this class
        self.name_to_monomer = {}  # lower-cased name without ' ', '-', '_' -> MonomerData
        self.symbol_to_monomer = {}  # symbol -> MonomerData

    def _register_monomer(self, monomer) -> None:
        """Add a parsed monomer to every lookup table in one step."""
        # Compute the derived key first so a failure here cannot leave the
        # monomer registered in some tables but not in others.
        clean_name = monomer.name.lower().replace(" ", "").replace("-", "").replace("_", "")
        self.monomers[monomer.symbol] = monomer
        self.symbol_to_monomer[monomer.symbol] = monomer
        self.name_to_monomer[clean_name] = monomer

    def load_from_helm_json(self, json_path: str) -> None:
        """
        Load monomers from a HELM library JSON file (a JSON array of monomer
        objects).

        Loading is best effort: a missing or unreadable file, malformed JSON,
        or a top-level value that is not a list leaves the library unchanged,
        and individual entries that fail to parse are skipped.

        Args:
            json_path: Path to the JSON file.
        """
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # OSError: missing/unreadable file; ValueError: bad JSON or encoding.
            return

        if not isinstance(data, list):
            return

        for monomer_dict in data:
            try:
                monomer = self._parse_monomer(monomer_dict)
                if monomer is not None and monomer.mol is not None:
                    self._register_monomer(monomer)
            except Exception:
                # One malformed entry must not abort the whole library.
                continue

    def _parse_monomer(self, monomer_dict: dict):
        """
        Build a MonomerData from one library entry.

        Args:
            monomer_dict: Library entry; the keys read are 'polymerType',
                'symbol', 'name', 'smiles', 'molfile' and 'rgroups'
                (each R-group with 'label' and 'capGroupSMILES').

        Returns:
            MonomerData, or None when the entry is not a dict, is not a
            PEPTIDE monomer, has no symbol, or has no parsable structure.
        """
        if not isinstance(monomer_dict, dict):
            return None

        # IMPORTANT: Only load PEPTIDE monomers (amino acids)
        # The library contains RNA, CHEM, etc. with overlapping symbols (A, C, G, T, U)
        if monomer_dict.get('polymerType', '') != 'PEPTIDE':
            return None

        symbol = monomer_dict.get('symbol', '')
        if not symbol:
            return None

        mol = None
        structure_smiles = ""

        # Preferred source: the SMILES field.
        smiles = monomer_dict.get('smiles', '')
        if smiles:
            try:
                mol = Chem.MolFromSmiles(smiles)
            except Exception:
                mol = None
            if mol is not None:
                structure_smiles = smiles

        # Fallback source: the molfile field.
        molfile = monomer_dict.get('molfile', '')
        if mol is None and molfile:
            try:
                mol = Chem.MolFromMolBlock(molfile)
                if mol is not None:
                    structure_smiles = Chem.MolToSmiles(mol)
            except Exception:
                mol = None

        if mol is None:
            return None

        monomer = MonomerData()
        monomer.symbol = symbol
        monomer.name = monomer_dict.get('name', '')
        monomer.mol = mol
        monomer.smiles = structure_smiles

        # Parse R-groups; entries missing a label or a cap are ignored.
        for rgroup in monomer_dict.get('rgroups') or []:
            label = rgroup.get('label', '')
            cap_smiles = rgroup.get('capGroupSMILES', '')
            if label and cap_smiles:
                monomer.r_groups[label] = cap_smiles

        monomer.r_group_count = len(monomer.r_groups)

        return monomer

    def find_monomer_by_fragment_smiles(self, fragment_smiles: str, num_connections: int):
        """
        Find monomer by matching fragment SMILES with on-demand R-group removal.

        Args:
            fragment_smiles: Canonical SMILES of the fragment
            num_connections: Number of connections this fragment has in the graph

        Returns:
            The first MonomerData (in library insertion order) with a matching
            variant, None otherwise. An empty ``fragment_smiles`` or a
            negative ``num_connections`` never matches.

        Logic:
            - Fragment with N connections -> N R-groups were removed during fragmentation
            - For monomer with M R-groups, try all C(M,N) combinations of which N R-groups were removed
            - Generate SMILES for each combination on-demand (with caching)

        Example:
            Fragment has 1 connection, monomer has R1, R2:
            - Try removing R1 -> check if SMILES matches
            - Try removing R2 -> check if SMILES matches
        """
        # An empty candidate means SMILES generation failed for that monomer;
        # rejecting empty fragments up front keeps such failures from matching.
        if not fragment_smiles or num_connections < 0:
            return None

        for monomer in self.monomers.values():
            # Skip if monomer doesn't have enough R-groups
            if monomer.r_group_count < num_connections:
                continue

            # Every choice of num_connections R-groups that could have been removed
            for removed_combo in combinations(monomer.r_groups, num_connections):
                candidate_smiles = monomer.get_capped_smiles_for_removed_rgroups(
                    frozenset(removed_combo)
                )
                if candidate_smiles == fragment_smiles:
                    return monomer

        return None

    def find_monomer_by_symbol(self, symbol: str):
        """Return the monomer registered under ``symbol``, or None."""
        return self.symbol_to_monomer.get(symbol)
|
|
1020
|
+
|
|
1021
|
+
# ============================================================================
|
|
1022
|
+
# Content from: monomer_matcher.py
|
|
1023
|
+
# ============================================================================
|
|
1024
|
+
|
|
1025
|
+
from rdkit import Chem
|
|
1026
|
+
|
|
1027
|
+
|
|
1028
|
+
class MonomerMatcher:
    """
    Matches molecular fragments to monomers using graph-aware R-group analysis.

    Approach:
    - No hardcoded mappings
    - No complex normalization
    - Direct string comparison of canonical SMILES
    - Graph topology (the number of neighbours of a fragment) determines how
      many R-groups are treated as removed rather than capped
    """

    def __init__(self, monomer_library: MonomerLibrary):
        self.monomer_library = monomer_library

    def find_exact_match(self, fragment: Chem.Mol, num_connections: int = 0):
        """
        Find exact match for a fragment based on graph topology.

        Args:
            fragment: RDKit molecule object representing a fragment
            num_connections: Number of connections this fragment has in the graph

        Returns:
            MonomerData object if match found, None otherwise. Any error
            raised while generating the SMILES or searching the library is
            swallowed and reported as "no match".
        """
        try:
            # Canonical SMILES is the comparison key used by the library.
            frag_smiles = Chem.MolToSmiles(fragment, canonical=True)
            if not frag_smiles:
                return None

            return self.monomer_library.find_monomer_by_fragment_smiles(
                frag_smiles, num_connections
            )
        except Exception:
            return None

    def match_graph(self, graph: FragmentGraph):
        """
        Match all fragments in a graph to monomers.

        Matched nodes get their ``monomer`` attribute set; unmatched nodes are
        left as they are.

        Args:
            graph: FragmentGraph with unmatched nodes

        Returns:
            Number of successfully matched nodes
        """
        matched_count = 0

        for node_id, node in graph.nodes.items():
            # The neighbour count is the number of R-groups considered removed.
            num_connections = len(graph.get_neighbors(node_id))

            monomer = self.find_exact_match(node.mol, num_connections)
            if monomer:
                node.monomer = monomer
                matched_count += 1

        return matched_count
|
|
1094
|
+
|
|
1095
|
+
# ============================================================================
|
|
1096
|
+
# Content from: pipeline.py
|
|
1097
|
+
# ============================================================================
|
|
1098
|
+
|
|
1099
|
+
from rdkit import Chem
|
|
1100
|
+
import os
|
|
1101
|
+
import json
|
|
1102
|
+
|
|
1103
|
+
# Global variables for caching: created lazily on first use, then reused.
_MONOMER_LIBRARY = None
_PROCESSOR = None
_MATCHER = None
_HELM_GENERATOR = None


def _load_monomer_library():
    """
    Return the default monomer library, loading it on first use.

    The library is read from ``libraries/HELMCoreLibrary.json`` located one
    directory above the directory containing this file.

    Returns:
        The cached MonomerLibrary, or None when the script location is
        unknown, the file does not exist, or no monomers could be loaded.
        A failed load is not cached, so every call reports the failure the
        same way and a later call may retry.
    """
    global _MONOMER_LIBRARY
    if _MONOMER_LIBRARY is not None:
        return _MONOMER_LIBRARY

    # __file__ may be undefined when the code runs in an embedded or
    # interactive interpreter; treat that as "no default library available".
    script_path = globals().get("__file__")
    if not script_path:
        return None

    # Define path to library relative to current directory
    current_dir = os.path.dirname(os.path.abspath(script_path))
    project_root = os.path.dirname(current_dir)
    library_path = os.path.join(project_root, "libraries", "HELMCoreLibrary.json")

    if not os.path.exists(library_path):
        return None

    print("Loading monomer library...")
    library = MonomerLibrary()
    library.load_from_helm_json(library_path)

    if not library.monomers:
        # Do not publish an empty library to the cache: callers would
        # otherwise receive None now and an unusable library later.
        return None

    print(f"Monomer library loaded: {len(library.monomers)} monomers")

    _MONOMER_LIBRARY = library
    return _MONOMER_LIBRARY
|
|
1131
|
+
|
|
1132
|
+
|
|
1133
|
+
def _get_processors():
    """
    Get or create singleton instances of processors.

    All three instances are (re)created together from the default monomer
    library whenever any one of them is missing.

    Returns:
        Tuple ``(processor, matcher, helm_generator)``, or
        ``(None, None, None)`` when the default library is unavailable.
    """
    global _PROCESSOR, _MATCHER, _HELM_GENERATOR

    if _PROCESSOR is None or _MATCHER is None or _HELM_GENERATOR is None:
        library = _load_monomer_library()
        if library is None:
            return None, None, None

        _PROCESSOR = FragmentProcessor(library)
        _MATCHER = MonomerMatcher(library)
        _HELM_GENERATOR = HELMGenerator()

    return _PROCESSOR, _MATCHER, _HELM_GENERATOR
|
|
1150
|
+
|
|
1151
|
+
|
|
1152
|
+
def preload_library():
    """
    Preload the monomer library and initialize processors once at the start.

    Returns:
        True if the default library was loaded and the shared processor
        instances exist afterwards, False otherwise.
    """
    if _load_monomer_library() is None:
        return False

    # Initialize processors; only the first element is needed to judge success.
    processor, _, _ = _get_processors()
    return processor is not None
|
|
1164
|
+
|
|
1165
|
+
|
|
1166
|
+
def _convert_single_molfile(molfile, processor, matcher, helm_generator, max_recovery_attempts=3):
    """
    Convert one molfile into a ``(success, helm_or_message)`` tuple.

    Never raises: any exception, including one raised while parsing the
    molfile itself, is reported as ``(False, "Error: ...")``.
    """
    try:
        mol = Chem.MolFromMolBlock(molfile)
        if not mol:
            return (False, "")

        # Process molecule into fragment graph
        graph = processor.process_molecule(mol)

        # Match each fragment to a monomer using graph topology
        unknown_count = 0
        for node_id, node in graph.nodes.items():
            num_connections = len(graph.get_neighbors(node_id))

            monomer = matcher.find_exact_match(node.mol, num_connections)
            if not monomer:
                # Placeholder so every node carries a symbol: X1, X2, ...
                unknown_count += 1
                monomer = MonomerData()
                monomer.symbol = f"X{unknown_count}"
                monomer.name = f"Unknown_{unknown_count}"
            node.monomer = monomer

        # Try to recover unmatched fragments by merging with neighbors.
        # The attempt count is bounded to prevent infinite loops.
        for _ in range(max_recovery_attempts):
            if not processor.recover_unmatched_fragments(graph, matcher):
                break

        if len(graph.nodes) > 0:
            return (True, helm_generator.generate_helm_from_graph(graph))
        return (False, "")
    except Exception as e:
        return (False, f"Error: {e}")


def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
    """
    Convert a batch of molecules from molfile format to HELM notation.

    Args:
        molfiles: List of molfile strings
        library_json: Optional monomer library as JSON string (a JSON array
            of monomer objects). If None, uses default cached library from
            HELMCoreLibrary.json

    Returns:
        List of tuples: (success: bool, helm_notation: str), one per input
        molfile and in the same order.
        success is True if molecule was successfully converted, False otherwise.
        For failures the second element is either empty or a short message
        ("Library initialization failed", "Invalid JSON",
        "Library loading failed", or "Error: ...").
    """
    if library_json is None:
        # Use cached global library
        if _PROCESSOR is None:
            print("Initializing monomer library and processors...")
            if not preload_library():
                print("ERROR: Failed to load monomer library")
                return [(False, "Library initialization failed") for _ in molfiles]
            print()

        # Use shared processor instances
        processor, matcher, helm_generator = _get_processors()
        if not processor:
            return [(False, "") for _ in molfiles]
    else:
        # Load custom library from provided JSON string (no caching)
        try:
            library_data = json.loads(library_json)
        except Exception as e:
            print(f"ERROR: Failed to parse library JSON: {str(e)}")
            return [(False, "Invalid JSON") for _ in molfiles]

        print("Loading custom library from JSON string...")

        # Valid JSON that is not an array (number, null, object, ...) cannot
        # contain monomers; report it like an empty library instead of
        # failing while iterating over it.
        if not isinstance(library_data, list):
            print("ERROR: No monomers loaded from custom library")
            return [(False, "Library loading failed") for _ in molfiles]

        library = MonomerLibrary()

        # Parse the library data; entries that fail to parse are skipped.
        for monomer_dict in library_data:
            try:
                monomer = library._parse_monomer(monomer_dict)
                if monomer and monomer.mol is not None:
                    library.monomers[monomer.symbol] = monomer
                    library.symbol_to_monomer[monomer.symbol] = monomer
                    clean_name = monomer.name.lower().replace(" ", "").replace("-", "").replace("_", "")
                    library.name_to_monomer[clean_name] = monomer
            except Exception:
                continue

        if not library.monomers:
            print("ERROR: No monomers loaded from custom library")
            return [(False, "Library loading failed") for _ in molfiles]

        print(f"Custom library loaded: {len(library.monomers)} monomers")

        # Create processor instances for this library
        processor = FragmentProcessor(library)
        matcher = MonomerMatcher(library)
        helm_generator = HELMGenerator()

    return [
        _convert_single_molfile(molfile, processor, matcher, helm_generator)
        for molfile in molfiles
    ]
|
|
1277
|
+
|
|
1278
|
+
# Script entry point: convert every input molecule and expose the HELM strings
# (second element of each result tuple) as the single-column output dataframe.
res_helm_list = convert_molecules_batch(molListToProcess, library_json=libraryJSON)
result_helm = pd.DataFrame(
    [helm for _, helm in res_helm_list],
    columns=["regenerated sequences"],
)
|