@datagrok/bio 2.27.0 → 2.27.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/287.js +1 -1
- package/dist/287.js.map +1 -1
- package/dist/422.js +1 -1
- package/dist/package-test.js +2 -2
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +2 -2
- package/dist/package.js.map +1 -1
- package/package.json +3 -3
- package/scripts/mol-to-helm.py +642 -170
- package/src/analysis/sequence-activity-cliffs.ts +8 -6
- package/src/package-api.ts +2 -2
- package/src/package.g.ts +7 -0
- package/src/package.ts +12 -1
- package/src/utils/annotations/annotation-manager-ui.ts +1 -1
- package/src/utils/compare-sequences.ts +104 -0
- package/src/utils/monomer-lib/library-file-manager/ui.ts +1 -1
- package/src/utils/monomer-lib/monomer-manager/monomer-manager.ts +1 -1
- package/src/utils/multiple-sequence-alignment-ui.ts +2 -2
- package/test-console-output-1.log +518 -532
- package/test-record-1.mp4 +0 -0
package/scripts/mol-to-helm.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
#description: Converts molecules to HELM notation based on monomer library
|
|
4
4
|
#input: dataframe moleculesDataframe
|
|
5
5
|
#input: column moleculesColumn {semType: Molecule}
|
|
6
|
-
#input:
|
|
6
|
+
#input: file libraryFile
|
|
7
7
|
#output: dataframe result_helm {action:join(moleculesDataframe)} [Sequences, in HELM format]
|
|
8
8
|
molListToProcess = moleculesDataframe[moleculesColumn].tolist()
|
|
9
9
|
import pandas as pd
|
|
@@ -150,26 +150,35 @@ class FragmentGraph:
|
|
|
150
150
|
return ordered
|
|
151
151
|
|
|
152
152
|
def _traverse_from_node(self, node_id: int, visited: set, ordered: list):
|
|
153
|
-
"""Helper for depth-first traversal"""
|
|
153
|
+
"""Helper for depth-first traversal with bidirectional link support"""
|
|
154
154
|
if node_id in visited:
|
|
155
155
|
return
|
|
156
|
-
|
|
156
|
+
|
|
157
157
|
visited.add(node_id)
|
|
158
158
|
ordered.append(self.nodes[node_id])
|
|
159
|
-
|
|
160
|
-
#
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
159
|
+
|
|
160
|
+
# Follow links in BOTH directions but prefer the canonical (from→to)
|
|
161
|
+
# direction. Link direction depends on bond detection order and is not
|
|
162
|
+
# guaranteed to match backbone direction (e.g. FC01 stapled peptides).
|
|
163
|
+
peptide_fwd = []
|
|
164
|
+
peptide_bwd = []
|
|
165
|
+
other_fwd = []
|
|
166
|
+
other_bwd = []
|
|
167
|
+
|
|
164
168
|
for link in self.links:
|
|
165
169
|
if link.from_node_id == node_id and link.to_node_id not in visited:
|
|
166
170
|
if link.linkage_type == LinkageType.PEPTIDE:
|
|
167
|
-
|
|
171
|
+
peptide_fwd.append(link.to_node_id)
|
|
168
172
|
else:
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
+
other_fwd.append(link.to_node_id)
|
|
174
|
+
elif link.to_node_id == node_id and link.from_node_id not in visited:
|
|
175
|
+
if link.linkage_type == LinkageType.PEPTIDE:
|
|
176
|
+
peptide_bwd.append(link.from_node_id)
|
|
177
|
+
else:
|
|
178
|
+
other_bwd.append(link.from_node_id)
|
|
179
|
+
|
|
180
|
+
# Peptide links before other links; within each type, forward (from→to) before backward
|
|
181
|
+
for neighbor_id in peptide_fwd + peptide_bwd + other_fwd + other_bwd:
|
|
173
182
|
self._traverse_from_node(neighbor_id, visited, ordered)
|
|
174
183
|
|
|
175
184
|
def get_fragment_sequence(self) -> List[str]:
|
|
@@ -194,23 +203,87 @@ class FragmentGraph:
|
|
|
194
203
|
if len(ordered) < 3:
|
|
195
204
|
return False
|
|
196
205
|
|
|
197
|
-
#
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
first_few_ids = [ordered[i].id for i in range(min(3, len(ordered)))]
|
|
204
|
-
|
|
206
|
+
# Check if any of the last few residues connect back to any of the first few.
|
|
207
|
+
# Checking multiple positions on each end handles branch nodes (like 'ac')
|
|
208
|
+
# that the bidirectional traversal may place at the edges.
|
|
209
|
+
first_few_ids = set(ordered[i].id for i in range(min(3, len(ordered))))
|
|
210
|
+
last_few_ids = set(ordered[-i - 1].id for i in range(min(3, len(ordered))))
|
|
211
|
+
|
|
205
212
|
for link in self.links:
|
|
206
213
|
if link.linkage_type == LinkageType.PEPTIDE:
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
(link.to_node_id == last_id and link.from_node_id in first_few_ids):
|
|
214
|
+
if (link.from_node_id in last_few_ids and link.to_node_id in first_few_ids) or \
|
|
215
|
+
(link.to_node_id in last_few_ids and link.from_node_id in first_few_ids):
|
|
210
216
|
return True
|
|
211
|
-
|
|
217
|
+
|
|
212
218
|
return False
|
|
213
219
|
|
|
220
|
+
def find_all_cycles(self) -> List[List[int]]:
|
|
221
|
+
"""
|
|
222
|
+
Find all cycles in the graph using DFS.
|
|
223
|
+
Returns list of cycles, where each cycle is a list of node IDs.
|
|
224
|
+
"""
|
|
225
|
+
cycles = []
|
|
226
|
+
visited = set()
|
|
227
|
+
rec_stack = set()
|
|
228
|
+
parent = {}
|
|
229
|
+
|
|
230
|
+
def dfs(node_id: int, path: List[int]):
|
|
231
|
+
visited.add(node_id)
|
|
232
|
+
rec_stack.add(node_id)
|
|
233
|
+
path.append(node_id)
|
|
234
|
+
|
|
235
|
+
# Get peptide bond neighbors
|
|
236
|
+
neighbors = [n_id for n_id, link_type in self.get_neighbors(node_id)
|
|
237
|
+
if link_type == LinkageType.PEPTIDE]
|
|
238
|
+
|
|
239
|
+
for neighbor_id in neighbors:
|
|
240
|
+
if neighbor_id not in visited:
|
|
241
|
+
parent[neighbor_id] = node_id
|
|
242
|
+
dfs(neighbor_id, path[:])
|
|
243
|
+
elif neighbor_id in rec_stack and neighbor_id != parent.get(node_id):
|
|
244
|
+
# Found a cycle - extract it from path
|
|
245
|
+
cycle_start_idx = path.index(neighbor_id)
|
|
246
|
+
cycle = path[cycle_start_idx:] + [neighbor_id]
|
|
247
|
+
# Normalize cycle (start from smallest ID)
|
|
248
|
+
min_idx = cycle.index(min(cycle[:-1])) # Don't include duplicate last element
|
|
249
|
+
normalized = cycle[min_idx:-1] + cycle[:min_idx]
|
|
250
|
+
if normalized not in cycles:
|
|
251
|
+
cycles.append(normalized)
|
|
252
|
+
|
|
253
|
+
rec_stack.remove(node_id)
|
|
254
|
+
|
|
255
|
+
# Try starting DFS from each unvisited node
|
|
256
|
+
for node_id in self.nodes.keys():
|
|
257
|
+
if node_id not in visited:
|
|
258
|
+
parent[node_id] = None
|
|
259
|
+
dfs(node_id, [])
|
|
260
|
+
|
|
261
|
+
return cycles
|
|
262
|
+
|
|
263
|
+
def get_connected_components(self) -> List[List[int]]:
|
|
264
|
+
"""
|
|
265
|
+
Find all connected components in the graph.
|
|
266
|
+
Returns list of components, where each component is a list of node IDs.
|
|
267
|
+
"""
|
|
268
|
+
visited = set()
|
|
269
|
+
components = []
|
|
270
|
+
|
|
271
|
+
def dfs_component(node_id: int, component: List[int]):
|
|
272
|
+
visited.add(node_id)
|
|
273
|
+
component.append(node_id)
|
|
274
|
+
neighbors = self.get_neighbors(node_id)
|
|
275
|
+
for neighbor_id, _ in neighbors:
|
|
276
|
+
if neighbor_id not in visited:
|
|
277
|
+
dfs_component(neighbor_id, component)
|
|
278
|
+
|
|
279
|
+
for node_id in self.nodes.keys():
|
|
280
|
+
if node_id not in visited:
|
|
281
|
+
component = []
|
|
282
|
+
dfs_component(node_id, component)
|
|
283
|
+
components.append(sorted(component))
|
|
284
|
+
|
|
285
|
+
return components
|
|
286
|
+
|
|
214
287
|
def __len__(self):
|
|
215
288
|
return len(self.nodes)
|
|
216
289
|
|
|
@@ -302,10 +375,38 @@ class BondDetector:
|
|
|
302
375
|
bonds = []
|
|
303
376
|
try:
|
|
304
377
|
matches = mol.GetSubstructMatches(self.peptide_bond)
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
378
|
+
|
|
379
|
+
# Filter out internal amide bonds in CHEM linkers like FC01.
|
|
380
|
+
# FC01 pattern: C(=O)-N-ArRing-N-C(=O) — two amide bonds connect to the
|
|
381
|
+
# same aromatic ring via the alpha-C position (match[4]).
|
|
382
|
+
# Real aromatic amino acids (3Abz) only have ONE such bond per ring.
|
|
383
|
+
skip_indices = set()
|
|
384
|
+
ring_info = mol.GetRingInfo()
|
|
385
|
+
rings = ring_info.AtomRings()
|
|
386
|
+
|
|
387
|
+
# Map: ring_frozenset -> list of match indices where match[4] is aromatic on that ring
|
|
388
|
+
# Only consider small rings (5-6 atoms) — large macrocycles should not be filtered
|
|
389
|
+
ring_to_matches = {}
|
|
390
|
+
for i, match in enumerate(matches):
|
|
391
|
+
if len(match) < 5:
|
|
392
|
+
continue
|
|
393
|
+
alpha_c_atom = mol.GetAtomWithIdx(match[4])
|
|
394
|
+
if alpha_c_atom.GetIsAromatic():
|
|
395
|
+
for ring in rings:
|
|
396
|
+
if match[4] in ring and len(ring) <= 6:
|
|
397
|
+
ring_key = frozenset(ring)
|
|
398
|
+
if ring_key not in ring_to_matches:
|
|
399
|
+
ring_to_matches[ring_key] = []
|
|
400
|
+
ring_to_matches[ring_key].append(i)
|
|
401
|
+
break
|
|
402
|
+
|
|
403
|
+
# If 2+ matches share an aromatic ring at their alpha-C position, skip them
|
|
404
|
+
for ring_key, match_indices in ring_to_matches.items():
|
|
405
|
+
if len(match_indices) >= 2:
|
|
406
|
+
skip_indices.update(match_indices)
|
|
407
|
+
|
|
408
|
+
for i, match in enumerate(matches):
|
|
409
|
+
if len(match) >= 5 and i not in skip_indices:
|
|
309
410
|
c_atom = match[1] # Carbonyl carbon
|
|
310
411
|
n_atom = match[3] # Nitrogen
|
|
311
412
|
bonds.append((c_atom, n_atom))
|
|
@@ -386,6 +487,117 @@ class FragmentProcessor:
|
|
|
386
487
|
self.monomer_library = monomer_library
|
|
387
488
|
self.bond_detector = BondDetector()
|
|
388
489
|
|
|
490
|
+
|
|
491
|
+
def _find_staple_sidechain_bonds(self, mol, existing_bonds):
|
|
492
|
+
"""
|
|
493
|
+
Find non-backbone bonds to cleave in macrocycles.
|
|
494
|
+
|
|
495
|
+
Handles three types of macrocyclic cross-links:
|
|
496
|
+
1. RCMtrans/RCMcis (stapled peptides): C=C double bond in the linker.
|
|
497
|
+
Cleaves one hop away on each side to keep the correct R3 chain length.
|
|
498
|
+
2. FC01-type (thioether staples): C-S bonds in the linker.
|
|
499
|
+
3. Alkyl cross-links (bi-cyclic peptides): pure C-C chains connecting
|
|
500
|
+
two amino acid side chains (R3-R3). Detected by finding non-backbone
|
|
501
|
+
segments in large macrocycles and cleaving at their midpoint.
|
|
502
|
+
"""
|
|
503
|
+
ring_info = mol.GetRingInfo()
|
|
504
|
+
large_rings = [set(ring) for ring in ring_info.AtomRings() if len(ring) > 10]
|
|
505
|
+
if not large_rings:
|
|
506
|
+
return []
|
|
507
|
+
|
|
508
|
+
existing_atom_pairs = set()
|
|
509
|
+
for a1, a2, _ in existing_bonds:
|
|
510
|
+
existing_atom_pairs.add((min(a1, a2), max(a1, a2)))
|
|
511
|
+
# Also track existing bond atoms for backbone detection
|
|
512
|
+
existing_bond_atoms = set()
|
|
513
|
+
for a1, a2, _ in existing_bonds:
|
|
514
|
+
existing_bond_atoms.add(a1)
|
|
515
|
+
existing_bond_atoms.add(a2)
|
|
516
|
+
|
|
517
|
+
additional_bonds = []
|
|
518
|
+
seen = set()
|
|
519
|
+
|
|
520
|
+
for ring in large_rings:
|
|
521
|
+
ring_list = list(ring)
|
|
522
|
+
|
|
523
|
+
# --- Type 1: C=C double bonds (RCMtrans/RCMcis) ---
|
|
524
|
+
# Only if molecule has quaternary alpha-methyl C (staple monomer signature)
|
|
525
|
+
quat_alpha = Chem.MolFromSmarts('[N;X2,X3]-[C;X4;H0](-[C;X3](=[O;X1]))-[CH3]')
|
|
526
|
+
has_quat = quat_alpha and mol.HasSubstructMatch(quat_alpha)
|
|
527
|
+
if has_quat:
|
|
528
|
+
for bond in mol.GetBonds():
|
|
529
|
+
a1, a2 = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
|
|
530
|
+
if a1 not in ring or a2 not in ring:
|
|
531
|
+
continue
|
|
532
|
+
if (bond.GetBondTypeAsDouble() >= 2 and
|
|
533
|
+
mol.GetAtomWithIdx(a1).GetAtomicNum() == 6 and
|
|
534
|
+
mol.GetAtomWithIdx(a2).GetAtomicNum() == 6):
|
|
535
|
+
for cc_atom_idx in (a1, a2):
|
|
536
|
+
other_cc = a2 if cc_atom_idx == a1 else a1
|
|
537
|
+
cc_atom = mol.GetAtomWithIdx(cc_atom_idx)
|
|
538
|
+
for nbr in cc_atom.GetNeighbors():
|
|
539
|
+
nbr_idx = nbr.GetIdx()
|
|
540
|
+
if nbr_idx == other_cc or nbr_idx not in ring:
|
|
541
|
+
continue
|
|
542
|
+
for nbr2 in nbr.GetNeighbors():
|
|
543
|
+
nbr2_idx = nbr2.GetIdx()
|
|
544
|
+
if nbr2_idx == cc_atom_idx or nbr2_idx not in ring:
|
|
545
|
+
continue
|
|
546
|
+
pair = (min(nbr_idx, nbr2_idx), max(nbr_idx, nbr2_idx))
|
|
547
|
+
if pair not in existing_atom_pairs and pair not in seen:
|
|
548
|
+
seen.add(pair)
|
|
549
|
+
additional_bonds.append((nbr_idx, nbr2_idx, LinkageType.UNKNOWN))
|
|
550
|
+
|
|
551
|
+
# --- Type 2: C-S thioether bonds (FC01) ---
|
|
552
|
+
# Only C-S bonds whose sulfur has no sulfur neighbor (i.e. not part of a disulfide bridge)
|
|
553
|
+
for bond in mol.GetBonds():
|
|
554
|
+
a1, a2 = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
|
|
555
|
+
if a1 not in ring or a2 not in ring:
|
|
556
|
+
continue
|
|
557
|
+
at1, at2 = mol.GetAtomWithIdx(a1), mol.GetAtomWithIdx(a2)
|
|
558
|
+
if ((at1.GetAtomicNum() == 6 and at2.GetAtomicNum() == 16) or
|
|
559
|
+
(at1.GetAtomicNum() == 16 and at2.GetAtomicNum() == 6)):
|
|
560
|
+
s_atom = at2 if at2.GetAtomicNum() == 16 else at1
|
|
561
|
+
# Skip if S is bonded to another S (disulfide bridge path)
|
|
562
|
+
if any(n.GetAtomicNum() == 16 for n in s_atom.GetNeighbors()):
|
|
563
|
+
continue
|
|
564
|
+
pair = (min(a1, a2), max(a1, a2))
|
|
565
|
+
if pair not in existing_atom_pairs and pair not in seen:
|
|
566
|
+
seen.add(pair)
|
|
567
|
+
additional_bonds.append((a1, a2, LinkageType.UNKNOWN))
|
|
568
|
+
|
|
569
|
+
# --- Type 3: Alkyl cross-link paths (bi-cyclic R3-R3) ---
|
|
570
|
+
# Find pairs of alpha-C atoms connected by pure carbon chains (no N/O/S
|
|
571
|
+
# in the path). These are R3-R3 cross-links between different cycles.
|
|
572
|
+
# Cleave at the midpoint of each such chain.
|
|
573
|
+
alpha_c_pat = Chem.MolFromSmarts('[N]-[C;X4]-[C;X3](=[O])')
|
|
574
|
+
if alpha_c_pat:
|
|
575
|
+
ac_matches = mol.GetSubstructMatches(alpha_c_pat)
|
|
576
|
+
alpha_c_set = list(set(m[1] for m in ac_matches))
|
|
577
|
+
for i, ac1 in enumerate(alpha_c_set):
|
|
578
|
+
for ac2 in alpha_c_set[i + 1:]:
|
|
579
|
+
path = Chem.GetShortestPath(mol, ac1, ac2)
|
|
580
|
+
if not path or len(path) < 4 or len(path) > 12:
|
|
581
|
+
continue
|
|
582
|
+
# All middle atoms must be C with no N/O/S neighbors
|
|
583
|
+
middle_ok = True
|
|
584
|
+
for mid_idx in path[1:-1]:
|
|
585
|
+
atom = mol.GetAtomWithIdx(mid_idx)
|
|
586
|
+
if atom.GetAtomicNum() != 6:
|
|
587
|
+
middle_ok = False
|
|
588
|
+
break
|
|
589
|
+
if any(n.GetAtomicNum() in (7, 8, 16) for n in atom.GetNeighbors()):
|
|
590
|
+
middle_ok = False
|
|
591
|
+
break
|
|
592
|
+
if middle_ok:
|
|
593
|
+
mid = len(path) // 2
|
|
594
|
+
pair = (min(path[mid - 1], path[mid]), max(path[mid - 1], path[mid]))
|
|
595
|
+
if pair not in existing_atom_pairs and pair not in seen:
|
|
596
|
+
seen.add(pair)
|
|
597
|
+
additional_bonds.append((path[mid - 1], path[mid], LinkageType.UNKNOWN))
|
|
598
|
+
|
|
599
|
+
return additional_bonds
|
|
600
|
+
|
|
389
601
|
def process_molecule(self, mol: Chem.Mol) -> FragmentGraph:
|
|
390
602
|
"""
|
|
391
603
|
Process a molecule into a fragment graph.
|
|
@@ -403,6 +615,11 @@ class FragmentProcessor:
|
|
|
403
615
|
try:
|
|
404
616
|
bonds_to_cleave = self.bond_detector.find_cleavable_bonds(mol)
|
|
405
617
|
|
|
618
|
+
# Detect additional non-backbone bonds to cleave in macrocycles (C=C staples, C-S thioether links, alkyl cross-links)
|
|
619
|
+
r3_bonds = self._find_staple_sidechain_bonds(mol, bonds_to_cleave)
|
|
620
|
+
if r3_bonds:
|
|
621
|
+
bonds_to_cleave.extend(r3_bonds)
|
|
622
|
+
|
|
406
623
|
if not bonds_to_cleave:
|
|
407
624
|
# Single fragment (no cleavable bonds)
|
|
408
625
|
node = FragmentNode(0, mol)
|
|
@@ -448,6 +665,7 @@ class FragmentProcessor:
|
|
|
448
665
|
graph.cleaved_bond_indices = bond_indices
|
|
449
666
|
graph.bond_info = bond_info
|
|
450
667
|
graph.atom_mappings = atom_mappings
|
|
668
|
+
graph.uncleaned_fragments = fragments # Keep fragments with dummy atoms for R-group SMILES
|
|
451
669
|
|
|
452
670
|
# Create nodes for each fragment
|
|
453
671
|
fragment_nodes = []
|
|
@@ -735,7 +953,7 @@ class FragmentProcessor:
|
|
|
735
953
|
# Identify unmatched nodes
|
|
736
954
|
unmatched_nodes = []
|
|
737
955
|
for node_id, node in graph.nodes.items():
|
|
738
|
-
if node.monomer and node.monomer.
|
|
956
|
+
if node.monomer and node.monomer.is_unknown:
|
|
739
957
|
unmatched_nodes.append(node_id)
|
|
740
958
|
|
|
741
959
|
if not unmatched_nodes:
|
|
@@ -792,7 +1010,7 @@ class FragmentProcessor:
|
|
|
792
1010
|
|
|
793
1011
|
# Try to match the combined fragment (exact match only)
|
|
794
1012
|
monomer = matcher.find_exact_match(combined_mol, num_connections)
|
|
795
|
-
|
|
1013
|
+
|
|
796
1014
|
if monomer:
|
|
797
1015
|
# Success! Create new merged node
|
|
798
1016
|
new_node_id = min(nodes_to_merge)
|
|
@@ -830,8 +1048,7 @@ class FragmentProcessor:
|
|
|
830
1048
|
# Find all unmatched nodes (nodes with mock/unknown monomers)
|
|
831
1049
|
unmatched_nodes = []
|
|
832
1050
|
for node_id, node in graph.nodes.items():
|
|
833
|
-
if node.monomer and
|
|
834
|
-
node.monomer.name.startswith('Unknown')):
|
|
1051
|
+
if node.monomer and node.monomer.is_unknown:
|
|
835
1052
|
unmatched_nodes.append(node_id)
|
|
836
1053
|
|
|
837
1054
|
if not unmatched_nodes:
|
|
@@ -868,6 +1085,86 @@ class FragmentProcessor:
|
|
|
868
1085
|
|
|
869
1086
|
return matched_count
|
|
870
1087
|
|
|
1088
|
+
def recover_unmatched_by_merging_stereo_agnostic(self, graph: FragmentGraph, matcher) -> bool:
|
|
1089
|
+
"""
|
|
1090
|
+
Final recovery pass: merge pairs of BOTH-unmatched neighbor fragments and
|
|
1091
|
+
try stereo-agnostic matching on the combined result.
|
|
1092
|
+
|
|
1093
|
+
This handles monomers like Phe_4Sdihydroorotamido that have internal amide
|
|
1094
|
+
bonds which get incorrectly cleaved, producing two unmatched fragments.
|
|
1095
|
+
|
|
1096
|
+
Only merges when BOTH fragments in a pair are unmatched — never touches
|
|
1097
|
+
already-matched nodes to avoid regressions.
|
|
1098
|
+
|
|
1099
|
+
Returns True if any merges were successful.
|
|
1100
|
+
"""
|
|
1101
|
+
def _is_unmatched(node):
|
|
1102
|
+
return (node.monomer and
|
|
1103
|
+
node.monomer.is_unknown)
|
|
1104
|
+
|
|
1105
|
+
unmatched_ids = [nid for nid, node in graph.nodes.items() if _is_unmatched(node)]
|
|
1106
|
+
if not unmatched_ids:
|
|
1107
|
+
return False
|
|
1108
|
+
|
|
1109
|
+
had_changes = False
|
|
1110
|
+
|
|
1111
|
+
for node_id in unmatched_ids:
|
|
1112
|
+
if node_id not in graph.nodes:
|
|
1113
|
+
continue
|
|
1114
|
+
if not _is_unmatched(graph.nodes[node_id]):
|
|
1115
|
+
continue
|
|
1116
|
+
|
|
1117
|
+
neighbors = graph.get_neighbors(node_id)
|
|
1118
|
+
for neighbor_id, linkage_type in neighbors:
|
|
1119
|
+
if neighbor_id not in graph.nodes:
|
|
1120
|
+
continue
|
|
1121
|
+
# Only merge with another unmatched neighbor
|
|
1122
|
+
if not _is_unmatched(graph.nodes[neighbor_id]):
|
|
1123
|
+
continue
|
|
1124
|
+
|
|
1125
|
+
nodes_to_merge = sorted([node_id, neighbor_id])
|
|
1126
|
+
|
|
1127
|
+
# Find internal links between the merge candidates
|
|
1128
|
+
links_to_exclude = []
|
|
1129
|
+
for link in graph.links:
|
|
1130
|
+
if (link.from_node_id in nodes_to_merge and
|
|
1131
|
+
link.to_node_id in nodes_to_merge):
|
|
1132
|
+
links_to_exclude.append(link)
|
|
1133
|
+
|
|
1134
|
+
combined_mol = self._reconstruct_fragment_with_links(
|
|
1135
|
+
nodes_to_merge, graph, links_to_exclude)
|
|
1136
|
+
if not combined_mol:
|
|
1137
|
+
continue
|
|
1138
|
+
|
|
1139
|
+
# Count external connections for merged fragment
|
|
1140
|
+
all_neighbors = set()
|
|
1141
|
+
for nid in nodes_to_merge:
|
|
1142
|
+
if nid in graph.nodes:
|
|
1143
|
+
for nbr_id, _ in graph.get_neighbors(nid):
|
|
1144
|
+
if nbr_id not in nodes_to_merge:
|
|
1145
|
+
all_neighbors.add(nbr_id)
|
|
1146
|
+
num_connections = len(all_neighbors)
|
|
1147
|
+
|
|
1148
|
+
# Try exact match first, then stereo-agnostic
|
|
1149
|
+
monomer = matcher.find_exact_match(combined_mol, num_connections)
|
|
1150
|
+
if not monomer:
|
|
1151
|
+
combined_smiles = Chem.MolToSmiles(combined_mol, canonical=True)
|
|
1152
|
+
monomer = matcher.monomer_library.find_monomer_by_fragment_smiles_no_stereo(
|
|
1153
|
+
combined_smiles, num_connections)
|
|
1154
|
+
|
|
1155
|
+
if monomer:
|
|
1156
|
+
new_node_id = min(nodes_to_merge)
|
|
1157
|
+
new_node = FragmentNode(new_node_id, combined_mol)
|
|
1158
|
+
new_node.monomer = monomer
|
|
1159
|
+
self._merge_nodes_in_graph(graph, nodes_to_merge, new_node)
|
|
1160
|
+
had_changes = True
|
|
1161
|
+
break # Restart from outer loop
|
|
1162
|
+
|
|
1163
|
+
if had_changes:
|
|
1164
|
+
break
|
|
1165
|
+
|
|
1166
|
+
return had_changes
|
|
1167
|
+
|
|
871
1168
|
# ============================================================================
|
|
872
1169
|
# Content from: helm_generator.py
|
|
873
1170
|
# ============================================================================
|
|
@@ -879,6 +1176,7 @@ class HELMGenerator:
|
|
|
879
1176
|
Supports:
|
|
880
1177
|
- Linear peptides
|
|
881
1178
|
- Cyclic peptides
|
|
1179
|
+
- Multi-chain structures (BILN peptides)
|
|
882
1180
|
- Disulfide bridges
|
|
883
1181
|
- Custom linkages
|
|
884
1182
|
"""
|
|
@@ -896,6 +1194,12 @@ class HELMGenerator:
|
|
|
896
1194
|
"""
|
|
897
1195
|
Generate HELM notation from a FragmentGraph.
|
|
898
1196
|
|
|
1197
|
+
Supports multi-chain structures (BILN peptides):
|
|
1198
|
+
- Detects all cycles (rings) with a depth-first search over peptide links
|
|
1199
|
+
- Each cycle becomes a separate PEPTIDE chain
|
|
1200
|
+
- R1-R2 connections define backbone within each chain
|
|
1201
|
+
- R3 connections link chains together
|
|
1202
|
+
|
|
899
1203
|
Args:
|
|
900
1204
|
graph: FragmentGraph containing matched monomers and their connections
|
|
901
1205
|
|
|
@@ -905,6 +1209,30 @@ class HELMGenerator:
|
|
|
905
1209
|
if len(graph) == 0:
|
|
906
1210
|
return ""
|
|
907
1211
|
|
|
1212
|
+
# Find all cycles in the graph (each cycle will be a separate PEPTIDE chain)
|
|
1213
|
+
cycles = graph.find_all_cycles()
|
|
1214
|
+
|
|
1215
|
+
# Decision: use multi-chain HELM only when multiple cycles exist
|
|
1216
|
+
# (BILN-style structure). Graphs with no cycle, or with a single cycle
|
|
1217
|
+
# (with or without standalone branch nodes), use the simple generator.
|
|
1218
|
+
|
|
1219
|
+
if not cycles:
|
|
1220
|
+
# No cycles - simple linear peptide
|
|
1221
|
+
return self._generate_simple_helm(graph)
|
|
1222
|
+
|
|
1223
|
+
if len(cycles) == 1:
|
|
1224
|
+
# Single cycle (with or without standalone branch nodes like 'ac')
|
|
1225
|
+
# _generate_simple_helm handles branches correctly with proper R-group detection
|
|
1226
|
+
return self._generate_simple_helm(graph)
|
|
1227
|
+
|
|
1228
|
+
# Multi-chain structure detected (multiple cycles)
|
|
1229
|
+
return self._generate_multi_chain_helm(graph, cycles)
|
|
1230
|
+
|
|
1231
|
+
def _generate_simple_helm(self, graph: FragmentGraph) -> str:
|
|
1232
|
+
"""
|
|
1233
|
+
Generate HELM for simple linear or single-cycle peptides.
|
|
1234
|
+
This is the original implementation for backward compatibility.
|
|
1235
|
+
"""
|
|
908
1236
|
# Get ordered sequence of monomers (backbone)
|
|
909
1237
|
ordered_nodes_raw = graph.get_ordered_nodes()
|
|
910
1238
|
|
|
@@ -912,31 +1240,14 @@ class HELMGenerator:
|
|
|
912
1240
|
is_cyclic = graph.is_cyclic()
|
|
913
1241
|
|
|
914
1242
|
# Filter backbone: nodes that are part of R1-R2 chain are backbone
|
|
915
|
-
# Nodes
|
|
916
|
-
#
|
|
917
|
-
# Logic: A node at position 1 is a branch if:
|
|
918
|
-
# - It has no R1 (N-terminus) - meaning it's a cap like 'ac' that only has R2
|
|
919
|
-
# - It only has 1 peptide connection (to the real backbone)
|
|
920
|
-
#
|
|
921
|
-
# Example: [ac].K in cyclic peptide
|
|
922
|
-
# - 'ac' has only R2, no R1 → it's a cap
|
|
923
|
-
# - 'ac' connects to K's R3 (side chain), not K's R1 (backbone)
|
|
924
|
-
# - So 'ac' should be PEPTIDE2, not part of PEPTIDE1
|
|
925
|
-
|
|
1243
|
+
# Matched (non-unknown) monomers lacking R1 (like 'ac' acetyl cap) are branches regardless of position; only applies when there is more than one node
|
|
926
1244
|
backbone_nodes = []
|
|
927
|
-
for
|
|
1245
|
+
for node in ordered_nodes_raw:
|
|
928
1246
|
is_branch = False
|
|
929
|
-
|
|
930
|
-
if i == 0 and len(ordered_nodes_raw) > 1 and node.monomer:
|
|
931
|
-
# Check if this first node lacks R1 (N-terminus)
|
|
932
|
-
# If it has no R1, it's a cap that should be a branch
|
|
1247
|
+
if node.monomer and len(ordered_nodes_raw) > 1:
|
|
933
1248
|
has_r1 = 'R1' in node.monomer.r_groups
|
|
934
|
-
|
|
935
|
-
if not has_r1:
|
|
936
|
-
# This is an N-terminal cap (like 'ac') at position 1
|
|
937
|
-
# It should be a branch, not part of the main backbone
|
|
1249
|
+
if not has_r1 and not node.monomer.is_unknown:
|
|
938
1250
|
is_branch = True
|
|
939
|
-
|
|
940
1251
|
if not is_branch:
|
|
941
1252
|
backbone_nodes.append(node)
|
|
942
1253
|
|
|
@@ -948,21 +1259,16 @@ class HELMGenerator:
|
|
|
948
1259
|
branch_nodes = [(node_id, node) for node_id, node in graph.nodes.items()
|
|
949
1260
|
if node_id not in ordered_node_ids]
|
|
950
1261
|
|
|
951
|
-
# Generate sequence notation
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
sequence = ".".join(formatted_symbols)
|
|
956
|
-
else:
|
|
957
|
-
# Linear: no brackets
|
|
958
|
-
sequence = ".".join(sequence_symbols)
|
|
1262
|
+
# Generate sequence notation — always bracket multi-char symbols (HELM spec requirement,
|
|
1263
|
+
# also needed for inline SMILES like [*:1]NC(CC(=O)O)C(=O)[*:2])
|
|
1264
|
+
formatted_symbols = [f"[{symbol}]" if len(symbol) > 1 else symbol for symbol in sequence_symbols]
|
|
1265
|
+
sequence = ".".join(formatted_symbols)
|
|
959
1266
|
|
|
960
1267
|
# Collect non-sequential connections (disulfide bridges, cyclic bonds, etc.)
|
|
961
1268
|
connections = []
|
|
962
1269
|
|
|
963
1270
|
if is_cyclic:
|
|
964
1271
|
# Find the actual cyclic peptide bond (last residue connects back to beginning)
|
|
965
|
-
# This handles cases where N-terminal caps (like 'ac') are at position 1
|
|
966
1272
|
last_id = ordered_nodes[-1].id
|
|
967
1273
|
first_few_ids = [ordered_nodes[i].id for i in range(min(3, len(ordered_nodes)))]
|
|
968
1274
|
|
|
@@ -1012,14 +1318,14 @@ class HELMGenerator:
|
|
|
1012
1318
|
branch_chain_name = f"PEPTIDE{branch_idx}"
|
|
1013
1319
|
branch_symbol = branch_node.monomer.symbol if branch_node.monomer else f"X{branch_node_id}"
|
|
1014
1320
|
|
|
1015
|
-
# Format branch chain (single monomer
|
|
1016
|
-
|
|
1321
|
+
# Format branch chain (single monomer)
|
|
1322
|
+
# In cyclic peptides, always use brackets for consistency with reference HELM
|
|
1323
|
+
if is_cyclic:
|
|
1017
1324
|
branch_chains.append(f"{branch_chain_name}{{[{branch_symbol}]}}")
|
|
1018
1325
|
else:
|
|
1019
1326
|
branch_chains.append(f"{branch_chain_name}{{{branch_symbol}}}")
|
|
1020
1327
|
|
|
1021
1328
|
# Find which backbone node this branch connects to
|
|
1022
|
-
# Look for links connecting this branch to the main backbone
|
|
1023
1329
|
for link in graph.links:
|
|
1024
1330
|
backbone_node_id = None
|
|
1025
1331
|
if link.from_node_id == branch_node_id and link.to_node_id in ordered_node_ids:
|
|
@@ -1032,7 +1338,6 @@ class HELMGenerator:
|
|
|
1032
1338
|
backbone_pos = next((i + 1 for i, n in enumerate(ordered_nodes) if n.id == backbone_node_id), None)
|
|
1033
1339
|
if backbone_pos:
|
|
1034
1340
|
# Determine which R-group the branch uses
|
|
1035
|
-
# If branch has R1, connect to R1; if only R2, connect to R2
|
|
1036
1341
|
branch_r_group = "R1"
|
|
1037
1342
|
if branch_node.monomer:
|
|
1038
1343
|
if 'R1' in branch_node.monomer.r_groups:
|
|
@@ -1055,6 +1360,114 @@ class HELMGenerator:
|
|
|
1055
1360
|
helm = f"{helm_chains}$$$$V2.0"
|
|
1056
1361
|
|
|
1057
1362
|
return helm
|
|
1363
|
+
|
|
1364
|
+
def _generate_multi_chain_helm(self, graph: FragmentGraph, cycles: list) -> str:
|
|
1365
|
+
"""
|
|
1366
|
+
Generate HELM for multi-chain structures (BILN peptides).
|
|
1367
|
+
|
|
1368
|
+
Strategy:
|
|
1369
|
+
1. Each cycle becomes a separate PEPTIDE chain
|
|
1370
|
+
2. Nodes not in cycles become additional chains
|
|
1371
|
+
3. R3 connections between chains are added as cross-links
|
|
1372
|
+
"""
|
|
1373
|
+
# Identify which nodes belong to which cycles
|
|
1374
|
+
nodes_in_cycles = set()
|
|
1375
|
+
for cycle in cycles:
|
|
1376
|
+
nodes_in_cycles.update(cycle)
|
|
1377
|
+
|
|
1378
|
+
# Find standalone nodes (not in any cycle)
|
|
1379
|
+
standalone_nodes = [nid for nid in graph.nodes.keys() if nid not in nodes_in_cycles]
|
|
1380
|
+
|
|
1381
|
+
# Build PEPTIDE chains
|
|
1382
|
+
chains = []
|
|
1383
|
+
chain_node_map = {} # Maps node_id -> (chain_idx, position_in_chain)
|
|
1384
|
+
|
|
1385
|
+
# Add cyclic chains
|
|
1386
|
+
for cycle_idx, cycle in enumerate(cycles, start=1):
|
|
1387
|
+
chain_name = f"PEPTIDE{cycle_idx}"
|
|
1388
|
+
# Create sequence from cycle nodes
|
|
1389
|
+
sequence_symbols = []
|
|
1390
|
+
for pos, node_id in enumerate(cycle):
|
|
1391
|
+
node = graph.nodes[node_id]
|
|
1392
|
+
symbol = node.monomer.symbol if node.monomer else f"X{node_id}"
|
|
1393
|
+
sequence_symbols.append(symbol)
|
|
1394
|
+
chain_node_map[node_id] = (cycle_idx, pos + 1) # 1-indexed position
|
|
1395
|
+
|
|
1396
|
+
# Format with brackets for multi-letter symbols
|
|
1397
|
+
formatted = [f"[{s}]" if len(s) > 1 else s for s in sequence_symbols]
|
|
1398
|
+
sequence = ".".join(formatted)
|
|
1399
|
+
chains.append(f"{chain_name}{{{sequence}}}")
|
|
1400
|
+
|
|
1401
|
+
# Add standalone chains (linear fragments not in cycles)
|
|
1402
|
+
next_chain_idx = len(cycles) + 1
|
|
1403
|
+
for node_id in standalone_nodes:
|
|
1404
|
+
chain_name = f"PEPTIDE{next_chain_idx}"
|
|
1405
|
+
node = graph.nodes[node_id]
|
|
1406
|
+
symbol = node.monomer.symbol if node.monomer else f"X{node_id}"
|
|
1407
|
+
chains.append(f"{chain_name}{{{symbol}}}")
|
|
1408
|
+
chain_node_map[node_id] = (next_chain_idx, 1)
|
|
1409
|
+
next_chain_idx += 1
|
|
1410
|
+
|
|
1411
|
+
# Build connections
|
|
1412
|
+
connections = []
|
|
1413
|
+
|
|
1414
|
+
# Add cyclic connections (R1-R2 within each cycle)
|
|
1415
|
+
for cycle_idx, cycle in enumerate(cycles, start=1):
|
|
1416
|
+
if len(cycle) >= 3:
|
|
1417
|
+
# Connect last to first
|
|
1418
|
+
chain_name = f"PEPTIDE{cycle_idx}"
|
|
1419
|
+
last_pos = len(cycle)
|
|
1420
|
+
connections.append(f"{chain_name},{chain_name},{last_pos}:R2-1:R1")
|
|
1421
|
+
|
|
1422
|
+
# Add inter-chain connections (R3 links) and disulfide bridges
|
|
1423
|
+
processed_links = set()
|
|
1424
|
+
for link in graph.links:
|
|
1425
|
+
link_key = tuple(sorted([link.from_node_id, link.to_node_id]))
|
|
1426
|
+
if link_key in processed_links:
|
|
1427
|
+
continue
|
|
1428
|
+
|
|
1429
|
+
from_chain_info = chain_node_map.get(link.from_node_id)
|
|
1430
|
+
to_chain_info = chain_node_map.get(link.to_node_id)
|
|
1431
|
+
|
|
1432
|
+
if not from_chain_info or not to_chain_info:
|
|
1433
|
+
continue
|
|
1434
|
+
|
|
1435
|
+
from_chain, from_pos = from_chain_info
|
|
1436
|
+
to_chain, to_pos = to_chain_info
|
|
1437
|
+
|
|
1438
|
+
# Skip intra-cycle backbone peptide bonds (already handled by R1-R2 connection)
|
|
1439
|
+
if from_chain == to_chain and link.linkage_type == LinkageType.PEPTIDE:
|
|
1440
|
+
# Check if this is a sequential bond within the cycle
|
|
1441
|
+
cycle = cycles[from_chain - 1] if from_chain <= len(cycles) else []
|
|
1442
|
+
# Sequential bonds: adjacent positions or last-to-first
|
|
1443
|
+
if abs(from_pos - to_pos) == 1 or (from_pos == 1 and to_pos == len(cycle)) or (to_pos == 1 and from_pos == len(cycle)):
|
|
1444
|
+
processed_links.add(link_key)
|
|
1445
|
+
continue
|
|
1446
|
+
|
|
1447
|
+
# Add cross-chain connections or intra-chain disulfide bridges
|
|
1448
|
+
if link.linkage_type == LinkageType.DISULFIDE:
|
|
1449
|
+
# Disulfide uses R3 (side chain cysteine)
|
|
1450
|
+
r_group = "R3"
|
|
1451
|
+
elif link.linkage_type == LinkageType.PEPTIDE:
|
|
1452
|
+
# Cross-chain peptide bond (side chain R3 connection)
|
|
1453
|
+
r_group = "R3"
|
|
1454
|
+
else:
|
|
1455
|
+
r_group = "R3"
|
|
1456
|
+
|
|
1457
|
+
from_chain_name = f"PEPTIDE{from_chain}"
|
|
1458
|
+
to_chain_name = f"PEPTIDE{to_chain}"
|
|
1459
|
+
connections.append(f"{from_chain_name},{to_chain_name},{from_pos}:{r_group}-{to_pos}:{r_group}")
|
|
1460
|
+
processed_links.add(link_key)
|
|
1461
|
+
|
|
1462
|
+
# Generate final HELM
|
|
1463
|
+
helm_chains = "|".join(chains)
|
|
1464
|
+
if connections:
|
|
1465
|
+
connection_str = "|".join(connections)
|
|
1466
|
+
helm = f"{helm_chains}${connection_str}$$$V2.0"
|
|
1467
|
+
else:
|
|
1468
|
+
helm = f"{helm_chains}$$$$V2.0"
|
|
1469
|
+
|
|
1470
|
+
return helm
|
|
1058
1471
|
|
|
1059
1472
|
def generate_helm_notation(self, monomers) -> str:
|
|
1060
1473
|
"""
|
|
def remove_stereochemistry_from_smiles(smiles: str) -> str:
    """
    Strip stereochemistry markers from a SMILES string using text rewriting only.

    Every '@' character is dropped, then simple bracket atoms of the form
    [X] or [XH] are rewritten:
      - organic-subset elements (B, C, N, O, P, S, F, Cl, Br, I) lose both the
        brackets and the explicit H, e.g. [C@@H] -> [CH] -> C
      - any other element keeps its brackets (and its H), e.g. [SeH] stays
        [SeH], because such atoms are not valid SMILES without brackets

    Bracket atoms carrying anything else (charge, isotope, H count > 1,
    lowercase aromatic symbols) do not match the pattern and are left as-is.

    Used for matching when the input molecules have no stereochemistry defined.
    Falsy input (None or "") is returned unchanged.
    """
    if not smiles:
        return smiles

    # Elements that SMILES allows to be written without brackets
    bracket_free_elements = {'B', 'C', 'N', 'O', 'P', 'S', 'F', 'Cl', 'Br', 'I'}

    def _rewrite_bracket_atom(hit):
        element, hydrogen = hit.groups()  # e.g. ('C', 'H'), ('Se', 'H'), ('N', '')
        if element not in bracket_free_elements:
            # hydrogen is either 'H' or '', so this rebuilds the atom verbatim
            return f'[{element}{hydrogen}]'
        return element

    without_markers = re.sub(r'(@+)', '', smiles)
    return re.sub(r'\[([A-Z][a-z]?)(H?)\]', _rewrite_bracket_atom, without_markers)
|
|
1115
1539
|
|
|
1116
1540
|
class MonomerData:
|
|
@@ -1122,6 +1546,7 @@ class MonomerData:
|
|
|
1122
1546
|
self.r_groups = {} # R-group label -> cap SMILES
|
|
1123
1547
|
self.r_group_count = 0
|
|
1124
1548
|
self.capped_smiles_cache = {} # Cache: frozenset of removed R-groups -> canonical SMILES
|
|
1549
|
+
self.is_unknown = False # True for unmatched fragments with inline SMILES
|
|
1125
1550
|
|
|
1126
1551
|
def __repr__(self):
|
|
1127
1552
|
return f"Monomer({self.symbol}: {self.name}, R-groups: {self.r_group_count})"
|
|
@@ -1234,12 +1659,28 @@ class MonomerData:
|
|
|
1234
1659
|
return ""
|
|
1235
1660
|
|
|
1236
1661
|
|
|
1662
|
+
def _canonicalize_no_stereo(smiles: str) -> str:
    """
    Return a stereo-free SMILES that has been round-tripped through RDKit.

    The stereo markers are first removed textually with
    remove_stereochemistry_from_smiles(); the result is then parsed and written
    back out as canonical SMILES so that equal molecules yield equal strings
    no matter how the input string was written.

    If RDKit cannot parse the stripped string, the stripped string itself is
    returned as a fallback.
    """
    stripped = remove_stereochemistry_from_smiles(smiles)
    parsed = Chem.MolFromSmiles(stripped)
    if not parsed:
        # Parse failed: fall back to the text-only result
        return stripped
    return Chem.MolToSmiles(parsed, canonical=True)
|
|
1673
|
+
|
|
1674
|
+
|
|
1237
1675
|
class MonomerLibrary:
|
|
1238
1676
|
def __init__(self):
|
|
1239
1677
|
self.monomers = {}
|
|
1240
1678
|
self.smiles_to_monomer = {}
|
|
1241
1679
|
self.name_to_monomer = {}
|
|
1242
1680
|
self.symbol_to_monomer = {}
|
|
1681
|
+
# Hash indices for O(1) matching (built after loading)
|
|
1682
|
+
self._smiles_index = {} # canonical_smiles -> MonomerData
|
|
1683
|
+
self._smiles_no_stereo_index = {} # stereo-free_smiles -> MonomerData
|
|
1243
1684
|
|
|
1244
1685
|
def load_from_helm_json(self, json_path: str) -> None:
|
|
1245
1686
|
if not os.path.exists(json_path):
|
|
@@ -1266,6 +1707,9 @@ class MonomerLibrary:
|
|
|
1266
1707
|
except Exception:
|
|
1267
1708
|
continue
|
|
1268
1709
|
|
|
1710
|
+
# Build hash indices for O(1) matching
|
|
1711
|
+
self._build_smiles_indices()
|
|
1712
|
+
|
|
1269
1713
|
def _parse_monomer(self, monomer_dict: dict):
|
|
1270
1714
|
# IMPORTANT: Only load PEPTIDE monomers (amino acids)
|
|
1271
1715
|
# The library contains RNA, CHEM, etc. with overlapping symbols (A, C, G, T, U)
|
|
@@ -1313,101 +1757,85 @@ class MonomerLibrary:
|
|
|
1313
1757
|
|
|
1314
1758
|
return monomer
|
|
1315
1759
|
|
|
1316
|
-
def
|
|
1760
|
+
def _build_smiles_indices(self):
    """
    Pre-compute the capped SMILES of every monomer and build the lookup
    dictionaries used by find_monomer_by_fragment_smiles() and
    find_monomer_by_fragment_smiles_no_stereo(). Both indices are rebuilt
    from scratch on every call.

    For each monomer with M R-groups, a capped SMILES is generated for every
    non-empty combination of removed R-groups (up to 2^M - 1 entries) and
    stored under the key (smiles, number_of_removed_r_groups):
      - self._smiles_index uses the capped SMILES as produced by the monomer
      - self._smiles_no_stereo_index uses the stereo-free canonical form

    Both indices are first-wins: when several monomers produce the same key,
    the one that comes first in self.monomers is kept.

    Monomers whose (smiles, R-group items) pair was already expanded are
    skipped entirely. Because of the first-wins rule every key they could
    contribute is already present, so re-inserting them would change nothing
    and would only repeat the RDKit canonicalization for each entry.
    """
    self._smiles_index = {}
    self._smiles_no_stereo_index = {}

    # (smiles, frozenset of R-group items) of structures already expanded
    seen_structures = set()

    for monomer in self.monomers.values():
        if monomer.r_group_count == 0:
            continue

        struct_key = (monomer.smiles, frozenset(monomer.r_groups.items()))
        if struct_key in seen_structures:
            # Identical structure already indexed; nothing new can be added
            continue
        seen_structures.add(struct_key)

        r_group_labels = list(monomer.r_groups.keys())

        for n_removed in range(1, monomer.r_group_count + 1):
            for removed_combo in combinations(r_group_labels, n_removed):
                capped_smiles = monomer.get_capped_smiles_for_removed_rgroups(frozenset(removed_combo))
                if not capped_smiles:
                    continue

                # setdefault keeps the first monomer registered for a key
                self._smiles_index.setdefault((capped_smiles, n_removed), monomer)

                ns_canonical = _canonicalize_no_stereo(capped_smiles)
                if ns_canonical:
                    self._smiles_no_stereo_index.setdefault((ns_canonical, n_removed), monomer)
|
|
1822
|
+
|
|
1823
|
+
def find_monomer_by_fragment_smiles(self, fragment_smiles: str, num_connections: int):
|
|
1824
|
+
"""
|
|
1825
|
+
Find monomer by matching fragment SMILES. O(1) hash lookup.
|
|
1826
|
+
"""
|
|
1827
|
+
return self._smiles_index.get((fragment_smiles, num_connections))
|
|
1828
|
+
|
|
1359
1829
|
def find_monomer_by_fragment_smiles_no_stereo(self, fragment_smiles: str, num_connections: int):
    """
    Look up a monomer for a fragment SMILES while ignoring stereochemistry.

    The fragment is reduced to its stereo-free canonical form with
    _canonicalize_no_stereo() and the pair (that form, num_connections) is
    looked up in self._smiles_no_stereo_index. Returns None when the
    stereo-free form is empty or the pair is not indexed.
    """
    stereo_free = _canonicalize_no_stereo(fragment_smiles)
    if stereo_free:
        return self._smiles_no_stereo_index.get((stereo_free, num_connections))
    return None
|
|
1411
1839
|
|
|
1412
1840
|
def find_monomer_by_symbol(self, symbol: str):
|
|
1413
1841
|
return self.symbol_to_monomer.get(symbol)
|
|
@@ -1494,6 +1922,34 @@ from rdkit import Chem
|
|
|
1494
1922
|
import os
|
|
1495
1923
|
import json
|
|
1496
1924
|
|
|
1925
|
+
def _generate_rgroup_smiles(graph, node_id):
    """
    Build the inline SMILES for a fragment that matched no library monomer.

    Preferred path: take graph.uncleaned_fragments[node_id], copy it, and turn
    every dummy atom (atomic number 0) into an R-group marker by clearing its
    isotope and giving it atom-map numbers 1, 2, ... in atom order, which
    renders as [*:1], [*:2], ... in the output SMILES.

    Fallback path (no uncleaned fragment for this node, or the preferred path
    raised): the canonical SMILES of the node's cleaned mol, without R-group
    markers. Returns "?" when the node or its mol is missing.
    """
    _absent = object()
    fragments = getattr(graph, 'uncleaned_fragments', _absent)

    if fragments is not _absent and node_id < len(fragments):
        source_fragment = fragments[node_id]
        try:
            editable = Chem.RWMol(Chem.Mol(source_fragment))
            dummy_atoms = (a for a in editable.GetAtoms() if a.GetAtomicNum() == 0)
            for label, dummy in enumerate(dummy_atoms, start=1):
                dummy.SetIsotope(0)
                dummy.SetAtomMapNum(label)
            return Chem.MolToSmiles(editable)
        except Exception:
            # Best effort only: any failure drops through to the plain-SMILES fallback
            pass

    node = graph.nodes.get(node_id)
    if not node or not node.mol:
        return "?"
    return Chem.MolToSmiles(node.mol, canonical=True)
|
|
1951
|
+
|
|
1952
|
+
|
|
1497
1953
|
# Global variables for caching
|
|
1498
1954
|
_MONOMER_LIBRARY = None
|
|
1499
1955
|
_PROCESSOR = None
|
|
@@ -1617,7 +2073,10 @@ def convert_molecules_batch(molecules: list, library_json: str = None, input_typ
|
|
|
1617
2073
|
return [(False, "Library loading failed") for _ in molecules]
|
|
1618
2074
|
|
|
1619
2075
|
print(f"Custom library loaded: {len(library.monomers)} monomers")
|
|
1620
|
-
|
|
2076
|
+
|
|
2077
|
+
# Build hash indices for O(1) matching
|
|
2078
|
+
library._build_smiles_indices()
|
|
2079
|
+
|
|
1621
2080
|
# Create processor instances for this library
|
|
1622
2081
|
processor = FragmentProcessor(library)
|
|
1623
2082
|
matcher = MonomerMatcher(library)
|
|
@@ -1672,21 +2131,23 @@ def convert_molecules_batch(molecules: list, library_json: str = None, input_typ
|
|
|
1672
2131
|
graph = processor.process_molecule(mol)
|
|
1673
2132
|
|
|
1674
2133
|
# Match each fragment to a monomer using graph topology
|
|
1675
|
-
unknown_count = 0
|
|
1676
2134
|
for node_id, node in graph.nodes.items():
|
|
1677
2135
|
# Count connections for this node
|
|
1678
2136
|
neighbors = graph.get_neighbors(node_id)
|
|
1679
2137
|
num_connections = len(neighbors)
|
|
1680
|
-
|
|
2138
|
+
|
|
1681
2139
|
# Find matching monomer
|
|
1682
2140
|
monomer = matcher.find_exact_match(node.mol, num_connections)
|
|
1683
2141
|
if monomer:
|
|
1684
2142
|
node.monomer = monomer
|
|
1685
2143
|
else:
|
|
1686
|
-
|
|
2144
|
+
# Generate inline SMILES with R-group markers for unmatched fragments
|
|
1687
2145
|
mock_monomer = MonomerData()
|
|
1688
|
-
mock_monomer.
|
|
1689
|
-
mock_monomer.
|
|
2146
|
+
mock_monomer.is_unknown = True
|
|
2147
|
+
mock_monomer.symbol = _generate_rgroup_smiles(graph, node_id)
|
|
2148
|
+
mock_monomer.name = "Unknown"
|
|
2149
|
+
mock_monomer.r_groups = {f'R{j+1}': '' for j in range(num_connections)}
|
|
2150
|
+
mock_monomer.r_group_count = num_connections
|
|
1690
2151
|
node.monomer = mock_monomer
|
|
1691
2152
|
|
|
1692
2153
|
# Try to recover unmatched fragments by merging with neighbors
|
|
@@ -1701,7 +2162,14 @@ def convert_molecules_batch(molecules: list, library_json: str = None, input_typ
|
|
|
1701
2162
|
stereo_matched = processor.recover_unmatched_with_stereo_agnostic(graph, matcher)
|
|
1702
2163
|
if stereo_matched > 0:
|
|
1703
2164
|
print(f"DEBUG: Stereo-agnostic recovery matched {stereo_matched} additional fragments")
|
|
1704
|
-
|
|
2165
|
+
|
|
2166
|
+
# Final pass: merge pairs of both-unmatched neighbor fragments
|
|
2167
|
+
# with stereo-agnostic matching (handles split monomers like Phe_4Sdihydroorotamido)
|
|
2168
|
+
for attempt in range(max_recovery_attempts):
|
|
2169
|
+
had_changes = processor.recover_unmatched_by_merging_stereo_agnostic(graph, matcher)
|
|
2170
|
+
if not had_changes:
|
|
2171
|
+
break
|
|
2172
|
+
|
|
1705
2173
|
if len(graph.nodes) > 0:
|
|
1706
2174
|
helm_notation = helm_generator.generate_helm_from_graph(graph)
|
|
1707
2175
|
results.append((True, helm_notation))
|
|
@@ -1742,5 +2210,9 @@ def convert_smiles_to_helm(smiles_list: list, library_json: str = None) -> list:
|
|
|
1742
2210
|
"""
|
|
1743
2211
|
return convert_molecules_batch(smiles_list, library_json=library_json, input_type="smiles")
|
|
1744
2212
|
|
|
2213
|
+
# Load the monomer library JSON text from the `libraryFile` script input.
# NOTE(review): `global` has no effect at true module scope; presumably it is kept for
# hosts that execute this script inside a function body — confirm before removing.
global libraryJSON
# JSON text is UTF-8; read it explicitly so the result does not depend on the host locale.
with open(libraryFile, encoding="utf-8") as f:
    libraryJSON = f.read()

# Each batch result is a (success, text) tuple; only the text goes into the output column.
res_helm_list = convert_molecules_batch(molListToProcess, library_json=libraryJSON)
result_helm = pd.DataFrame(map(lambda x: x[1], res_helm_list), columns=["regenerated sequences"])
|