@datagrok/bio 2.25.3 → 2.25.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package-test.js +2 -2
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +2 -2
- package/dist/package.js.map +1 -1
- package/package.json +1 -1
- package/scripts/mol-to-helm.py +556 -89
- package/src/package.g.ts +1 -1
- package/src/package.ts +1 -1
- package/src/utils/monomer-lib/monomer-manager/const.ts +40 -0
- package/src/utils/monomer-lib/monomer-manager/monomer-manager.ts +13 -2
- package/test-console-output-1.log +319 -321
- package/test-record-1.mp4 +0 -0
package/scripts/mol-to-helm.py
CHANGED
|
@@ -25,6 +25,7 @@ from typing import Optional
|
|
|
25
25
|
from typing import Tuple
|
|
26
26
|
import json
|
|
27
27
|
import os
|
|
28
|
+
import re
|
|
28
29
|
|
|
29
30
|
# ============================================================================
|
|
30
31
|
# Content from: fragment_graph.py
|
|
@@ -179,6 +180,37 @@ class FragmentGraph:
|
|
|
179
180
|
for node in ordered_nodes
|
|
180
181
|
]
|
|
181
182
|
|
|
183
|
+
def is_cyclic(self) -> bool:
    """
    Detect if the peptide is cyclic.

    The peptide is reported as cyclic when a PEPTIDE link joins the last
    ordered node to one of the first (up to three) ordered nodes. Looking at
    more than the very first node tolerates an N-terminal cap (like 'ac' from
    Lys_Ac) that shows up as an extra fragment at position 0.

    The last node itself and its direct predecessor in the ordering are never
    treated as ring-closure partners. Without that restriction the ordinary
    sequential bond of a 3- or 4-residue linear chain would be mistaken for a
    ring closure.

    Returns:
        True if a ring-closing peptide link is found, False otherwise.
    """
    if len(self.nodes) < 3:
        return False

    ordered = self.get_ordered_nodes()
    if len(ordered) < 3:
        return False

    last_id = ordered[-1].id

    # Candidate ring-closure partners: the first few nodes, but never the
    # last node or the node right before it.
    # NOTE(review): assumes consecutive entries of get_ordered_nodes() are
    # neighbours along the backbone - confirm against that method.
    head_count = min(3, len(ordered) - 2)
    first_few_ids = {node.id for node in ordered[:head_count]}

    for link in self.links:
        if link.linkage_type != LinkageType.PEPTIDE:
            continue
        # The link may be stored in either direction.
        if link.from_node_id == last_id and link.to_node_id in first_few_ids:
            return True
        if link.to_node_id == last_id and link.from_node_id in first_few_ids:
            return True

    return False
|
|
213
|
+
|
|
182
214
|
def __len__(self):
    """Return how many fragment nodes the graph currently holds."""
    node_count = len(self.nodes)
    return node_count
|
|
184
216
|
|
|
@@ -221,8 +253,15 @@ class BondDetector:
|
|
|
221
253
|
#GENERALIZATION ITEM: BOND PATTERNS SHOULD BE DERIVED FROM LIBRARY
|
|
222
254
|
def __init__(self):
|
|
223
255
|
# True peptide bond: C and N both in backbone (each bonded to carbons)
|
|
224
|
-
#
|
|
225
|
-
|
|
256
|
+
# First carbon can be aliphatic or aromatic (for amino acids like NMe2Abz)
|
|
257
|
+
# Carbonyl carbon is sp2 (X3)
|
|
258
|
+
# Exclude if carbonyl is in a small ring (r5 or r6) to avoid cleaving lactams like Pyr
|
|
259
|
+
# !r5 = not in 5-membered ring, !r6 = not in 6-membered ring
|
|
260
|
+
# This preserves lactams but allows large macrocycles and proline (C=O outside ring)
|
|
261
|
+
# Nitrogen can be X2 (proline, imino) or X3 (standard amino, N-methyl)
|
|
262
|
+
# N-C bond can be single (-) or double (=) for imine bonds in dehydro amino acids
|
|
263
|
+
# Alpha carbon after N can be sp3 (X4) or sp2 (X3) for dehydroamino acids, or aromatic (#6 includes both)
|
|
264
|
+
self.peptide_bond = Chem.MolFromSmarts('[#6]-[C;X3;!r5;!r6](=[O;X1])-[N;X2,X3]~[#6;X3,X4]')
|
|
226
265
|
# True disulfide bond: S-S where each S is bonded to carbon (cysteine residues)
|
|
227
266
|
self.disulfide_bond = Chem.MolFromSmarts('[C;X4]-[S;X2]-[S;X2]-[C;X4]')
|
|
228
267
|
# Primary amine at N-terminus (can be NH2 or NH3+), alpha-C can be sp3 or sp2
|
|
@@ -265,7 +304,7 @@ class BondDetector:
|
|
|
265
304
|
matches = mol.GetSubstructMatches(self.peptide_bond)
|
|
266
305
|
for match in matches:
|
|
267
306
|
if len(match) >= 5:
|
|
268
|
-
# Pattern: [C;X3,X4]-[C;X3](=[O;X1])-[N;X3]
|
|
307
|
+
# Pattern: [C;X3,X4]-[C;X3](=[O;X1])-[N;X2,X3]~[C;X3,X4]
|
|
269
308
|
# match[0]=alpha-C (sp2 or sp3), match[1]=carbonyl-C, match[2]=O, match[3]=N, match[4]=next-alpha-C (sp2 or sp3)
|
|
270
309
|
c_atom = match[1] # Carbonyl carbon
|
|
271
310
|
n_atom = match[3] # Nitrogen
|
|
@@ -399,18 +438,16 @@ class FragmentProcessor:
|
|
|
399
438
|
# Fragment the molecule
|
|
400
439
|
fragmented_mol = Chem.FragmentOnBonds(mol, bond_indices, addDummies=True)
|
|
401
440
|
|
|
402
|
-
# Get fragments
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
)
|
|
408
|
-
fragments = list(fragments_tuple)
|
|
441
|
+
# Get fragments as molecules
|
|
442
|
+
fragments = Chem.GetMolFrags(fragmented_mol, asMols=True, sanitizeFrags=True)
|
|
443
|
+
|
|
444
|
+
# Get atom mappings separately (which original atoms are in which fragment)
|
|
445
|
+
atom_mappings = Chem.GetMolFrags(fragmented_mol, asMols=False, fragsMolAtomMapping=True)
|
|
409
446
|
|
|
410
447
|
# Store bond cleavage info for recovery - we'll use this to selectively re-fragment
|
|
411
448
|
graph.cleaved_bond_indices = bond_indices
|
|
412
449
|
graph.bond_info = bond_info
|
|
413
|
-
|
|
450
|
+
graph.atom_mappings = atom_mappings
|
|
414
451
|
|
|
415
452
|
# Create nodes for each fragment
|
|
416
453
|
fragment_nodes = []
|
|
@@ -426,20 +463,43 @@ class FragmentProcessor:
|
|
|
426
463
|
graph.add_node(node)
|
|
427
464
|
fragment_nodes.append((i, node))
|
|
428
465
|
|
|
429
|
-
# Create links between fragments based on cleaved bonds
|
|
430
|
-
#
|
|
431
|
-
|
|
432
|
-
for
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
466
|
+
# Create links between fragments based on the actual cleaved bonds
|
|
467
|
+
# Build mapping: original atom index → (fragment_idx, new_atom_idx_in_fragment)
|
|
468
|
+
atom_to_fragment_and_idx = {}
|
|
469
|
+
for frag_idx, original_atom_indices in enumerate(atom_mappings):
|
|
470
|
+
for new_idx_in_frag, original_atom_idx in enumerate(original_atom_indices):
|
|
471
|
+
atom_to_fragment_and_idx[original_atom_idx] = (frag_idx, new_idx_in_frag)
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
# For each cleaved bond, determine which fragments it connects
|
|
475
|
+
link_count = 0
|
|
476
|
+
for bond_idx, atom1_orig, atom2_orig, linkage_type in bond_info:
|
|
477
|
+
# Find which fragments contain these atoms and their new indices
|
|
478
|
+
frag1_info = atom_to_fragment_and_idx.get(atom1_orig)
|
|
479
|
+
frag2_info = atom_to_fragment_and_idx.get(atom2_orig)
|
|
480
|
+
|
|
481
|
+
if frag1_info is None or frag2_info is None:
|
|
482
|
+
print(f"DEBUG: Skipping bond atoms {atom1_orig}-{atom2_orig}: not found in fragments")
|
|
483
|
+
continue
|
|
484
|
+
|
|
485
|
+
frag1, atom1_new = frag1_info
|
|
486
|
+
frag2, atom2_new = frag2_info
|
|
487
|
+
|
|
488
|
+
# Create link even if both atoms are in same fragment (internal bond like in Phe_4Sdihydroorotamido)
|
|
489
|
+
# This creates a "self-link" that will be used during recovery to reconstruct the monomer
|
|
490
|
+
link = FragmentLink(frag1, frag2, linkage_type,
|
|
491
|
+
from_atom_idx=atom1_new, to_atom_idx=atom2_new)
|
|
436
492
|
graph.add_link(link)
|
|
493
|
+
link_count += 1
|
|
494
|
+
|
|
495
|
+
if frag1 == frag2:
|
|
496
|
+
print(f"DEBUG: Link {link_count}: {linkage_type.value.upper()} SELF-LINK frag{frag1} "
|
|
497
|
+
f"orig_atoms({atom1_orig}<->{atom2_orig}) frag_atoms({atom1_new}<->{atom2_new})")
|
|
498
|
+
else:
|
|
499
|
+
print(f"DEBUG: Link {link_count}: {linkage_type.value.upper()} frag{frag1}<->frag{frag2} "
|
|
500
|
+
f"orig_atoms({atom1_orig}<->{atom2_orig}) frag_atoms({atom1_new}<->{atom2_new})")
|
|
437
501
|
|
|
438
|
-
|
|
439
|
-
# TODO: Track which fragments contain the S atoms for proper linking
|
|
440
|
-
disulfide_links = [b for b in bond_info if b[3] == LinkageType.DISULFIDE]
|
|
441
|
-
# For now, disulfide bonds require more complex atom tracking
|
|
442
|
-
# This is a placeholder for future enhancement
|
|
502
|
+
print(f"DEBUG: Created {link_count} links total")
|
|
443
503
|
|
|
444
504
|
return graph
|
|
445
505
|
|
|
@@ -472,6 +532,94 @@ class FragmentProcessor:
|
|
|
472
532
|
except Exception:
|
|
473
533
|
return None
|
|
474
534
|
|
|
535
|
+
def _reconstruct_fragment_with_links(self, node_ids: list, graph: FragmentGraph,
                                     links_to_exclude: list) -> Chem.Mol:
    """
    Reconstruct a molecule by combining multiple fragment nodes, using link information.

    The original molecule is re-fragmented on every originally cleaved bond
    except the bonds that correspond to ``links_to_exclude``. The nodes being
    merged therefore stay connected. The fragment that contains atoms of the
    lowest-numbered node in ``node_ids`` is returned.

    Args:
        node_ids: List of node IDs to merge
        graph: The fragment graph; must carry ``original_mol``,
            ``cleaved_bond_indices`` and ``bond_info``
        links_to_exclude: List of FragmentLink objects connecting the nodes to merge

    Returns:
        Combined RDKit molecule (cleaned via ``_clean_fragment`` when that
        succeeds), the whole original molecule when no bond is left to
        cleave, or None if reconstruction fails
    """
    if not node_ids or not hasattr(graph, 'original_mol'):
        return None

    if not hasattr(graph, 'cleaved_bond_indices') or not hasattr(graph, 'bond_info'):
        return None

    try:
        # Positions (within graph.bond_info) of the bonds that must stay intact.
        bonds_to_exclude_indices = set()

        if hasattr(graph, 'atom_mappings'):
            # Build the atom -> fragment lookup once instead of scanning
            # every fragment for every (link, bond) pair.
            atom_to_fragment = {}
            for frag_idx, atom_indices in enumerate(graph.atom_mappings):
                for atom_idx in atom_indices:
                    atom_to_fragment[atom_idx] = frag_idx

            for link in links_to_exclude:
                for bond_list_idx, (bond_idx, atom1, atom2, linkage_type) in enumerate(graph.bond_info):
                    # A bond already claimed by an earlier link is skipped so that
                    # two links between the same pair of fragments restore two
                    # distinct bonds rather than the same one twice.
                    if bond_list_idx in bonds_to_exclude_indices:
                        continue

                    frag1 = atom_to_fragment.get(atom1)
                    frag2 = atom_to_fragment.get(atom2)

                    # If this bond connects the two fragments in the link, exclude it
                    if (frag1 == link.from_node_id and frag2 == link.to_node_id) or \
                       (frag1 == link.to_node_id and frag2 == link.from_node_id):
                        bonds_to_exclude_indices.add(bond_list_idx)
                        print(f"DEBUG: Excluding {linkage_type.value} bond at index {bond_list_idx} (atoms {atom1}<->{atom2})")
                        break

        # Bonds that are still cleaved after restoring the excluded ones.
        # NOTE(review): relies on graph.bond_info and graph.cleaved_bond_indices
        # being index-aligned - confirm where they are populated.
        new_bond_indices = [
            bond_idx for i, bond_idx in enumerate(graph.cleaved_bond_indices)
            if i not in bonds_to_exclude_indices
        ]

        print(f"DEBUG reconstruct: Original had {len(graph.cleaved_bond_indices)} cleaved bonds, "
              f"excluding {len(bonds_to_exclude_indices)} bonds, new list has {len(new_bond_indices)} bonds")

        # Re-fragment with the modified bond list
        if not new_bond_indices:
            # No bonds to cleave - return whole molecule
            return graph.original_mol

        fragmented_mol = Chem.FragmentOnBonds(graph.original_mol, new_bond_indices, addDummies=True)
        fragments = Chem.GetMolFrags(fragmented_mol, asMols=True, sanitizeFrags=True)
        new_atom_mappings = Chem.GetMolFrags(fragmented_mol, asMols=False, fragsMolAtomMapping=True)

        # Locate the new fragment through the atoms of the lowest-numbered node to merge.
        sorted_nodes = sorted(node_ids)
        first_node_atoms = set(graph.atom_mappings[sorted_nodes[0]])

        target_fragment_idx = None
        for new_frag_idx, new_atoms in enumerate(new_atom_mappings):
            if first_node_atoms & set(new_atoms):
                target_fragment_idx = new_frag_idx
                break

        print(f"DEBUG reconstruct: Got {len(fragments)} fragments after re-fragmentation, "
              f"target_fragment_idx={target_fragment_idx}")

        if target_fragment_idx is not None and target_fragment_idx < len(fragments):
            clean_frag = self._clean_fragment(fragments[target_fragment_idx])
            # Fall back to the uncleaned fragment when cleaning yields nothing.
            return clean_frag if clean_frag else fragments[target_fragment_idx]

        return None

    except Exception as e:
        # Best-effort recovery step: report the failure and let the caller move on.
        print(f"DEBUG reconstruct: Exception: {e}")
        return None
|
|
622
|
+
|
|
475
623
|
def _reconstruct_fragment(self, node_ids: list, graph: FragmentGraph) -> Chem.Mol:
|
|
476
624
|
"""
|
|
477
625
|
Reconstruct a molecule by combining multiple fragment nodes.
|
|
@@ -581,7 +729,7 @@ class FragmentProcessor:
|
|
|
581
729
|
|
|
582
730
|
def recover_unmatched_fragments(self, graph: FragmentGraph, matcher) -> bool:
|
|
583
731
|
"""
|
|
584
|
-
Try to recover unmatched fragments by merging with neighbors.
|
|
732
|
+
Try to recover unmatched fragments by merging with neighbors based on graph links.
|
|
585
733
|
Returns True if any merges were successful.
|
|
586
734
|
"""
|
|
587
735
|
# Identify unmatched nodes
|
|
@@ -593,8 +741,6 @@ class FragmentProcessor:
|
|
|
593
741
|
if not unmatched_nodes:
|
|
594
742
|
return False
|
|
595
743
|
|
|
596
|
-
print(f"DEBUG: Found {len(unmatched_nodes)} unmatched nodes: {unmatched_nodes}")
|
|
597
|
-
|
|
598
744
|
had_changes = False
|
|
599
745
|
|
|
600
746
|
# Try to recover each unmatched node
|
|
@@ -603,33 +749,29 @@ class FragmentProcessor:
|
|
|
603
749
|
if node_id not in graph.nodes:
|
|
604
750
|
continue
|
|
605
751
|
|
|
606
|
-
# Get neighbors
|
|
752
|
+
# Get neighbors from graph links (returns list of (neighbor_id, linkage_type))
|
|
607
753
|
neighbors = graph.get_neighbors(node_id)
|
|
608
|
-
neighbor_ids = [n[0] for n in neighbors]
|
|
609
754
|
|
|
610
|
-
if not
|
|
755
|
+
if not neighbors:
|
|
611
756
|
continue
|
|
612
757
|
|
|
613
|
-
#
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
# Try each merge combination
|
|
628
|
-
for nodes_to_merge in merge_attempts:
|
|
629
|
-
print(f"DEBUG: Trying to merge nodes {nodes_to_merge}")
|
|
758
|
+
# Try merging with each individual neighbor first
|
|
759
|
+
for neighbor_id, linkage_type in neighbors:
|
|
760
|
+
if neighbor_id not in graph.nodes:
|
|
761
|
+
continue
|
|
762
|
+
|
|
763
|
+
nodes_to_merge = sorted([node_id, neighbor_id])
|
|
764
|
+
|
|
765
|
+
# Find the links between nodes we're merging
|
|
766
|
+
links_to_exclude = []
|
|
767
|
+
for link in graph.links:
|
|
768
|
+
from_in = link.from_node_id in nodes_to_merge
|
|
769
|
+
to_in = link.to_node_id in nodes_to_merge
|
|
770
|
+
if from_in and to_in:
|
|
771
|
+
links_to_exclude.append(link)
|
|
630
772
|
|
|
631
773
|
# Reconstruct combined molecule
|
|
632
|
-
combined_mol = self.
|
|
774
|
+
combined_mol = self._reconstruct_fragment_with_links(nodes_to_merge, graph, links_to_exclude)
|
|
633
775
|
if not combined_mol:
|
|
634
776
|
print(f"DEBUG: Failed to reconstruct molecule for {nodes_to_merge}")
|
|
635
777
|
continue
|
|
@@ -637,7 +779,7 @@ class FragmentProcessor:
|
|
|
637
779
|
print(f"DEBUG: Reconstructed mol with {combined_mol.GetNumAtoms()} atoms")
|
|
638
780
|
|
|
639
781
|
# Count expected connections for this merged fragment
|
|
640
|
-
# Get all unique neighbors of the merged set
|
|
782
|
+
# Get all unique neighbors of the merged set (excluding internal connections)
|
|
641
783
|
all_neighbors = set()
|
|
642
784
|
for nid in nodes_to_merge:
|
|
643
785
|
if nid in graph.nodes:
|
|
@@ -647,15 +789,13 @@ class FragmentProcessor:
|
|
|
647
789
|
all_neighbors.add(neighbor_id)
|
|
648
790
|
|
|
649
791
|
num_connections = len(all_neighbors)
|
|
650
|
-
print(f"DEBUG: Expecting {num_connections} connections")
|
|
651
792
|
|
|
652
|
-
# Try to match the combined fragment
|
|
793
|
+
# Try to match the combined fragment (exact match only)
|
|
653
794
|
monomer = matcher.find_exact_match(combined_mol, num_connections)
|
|
654
795
|
|
|
655
796
|
if monomer:
|
|
656
|
-
print(f"DEBUG: SUCCESS! Matched to {monomer.symbol}")
|
|
657
797
|
# Success! Create new merged node
|
|
658
|
-
new_node_id = min(nodes_to_merge)
|
|
798
|
+
new_node_id = min(nodes_to_merge)
|
|
659
799
|
new_node = FragmentNode(new_node_id, combined_mol)
|
|
660
800
|
new_node.monomer = monomer
|
|
661
801
|
|
|
@@ -663,11 +803,70 @@ class FragmentProcessor:
|
|
|
663
803
|
self._merge_nodes_in_graph(graph, nodes_to_merge, new_node)
|
|
664
804
|
|
|
665
805
|
had_changes = True
|
|
666
|
-
break # Stop trying other
|
|
667
|
-
|
|
668
|
-
|
|
806
|
+
break # Stop trying other neighbors for this node
|
|
807
|
+
|
|
808
|
+
if had_changes:
|
|
809
|
+
break # Restart from beginning after a successful merge
|
|
669
810
|
|
|
670
811
|
return had_changes
|
|
812
|
+
|
|
813
|
+
def recover_unmatched_with_stereo_agnostic(self, graph: FragmentGraph, matcher) -> int:
    """
    Last-resort recovery pass that ignores stereochemistry.

    Every node whose monomer symbol starts with 'X' or whose monomer name
    starts with 'Unknown' is re-queried against the monomer library using a
    stereochemistry-agnostic lookup. A node that matches gets its monomer
    replaced in place.

    Intended for poor quality input where stereochemistry is not assigned,
    and meant to run only after the regular recovery attempts.

    Args:
        graph: FragmentGraph with some unmatched nodes
        matcher: MonomerMatcher instance (its ``monomer_library`` is used)

    Returns:
        Number of fragments that were successfully matched
    """
    from rdkit import Chem

    # Collect the ids first so the graph is not inspected while being updated.
    pending_ids = [
        nid
        for nid, candidate in graph.nodes.items()
        if candidate.monomer and (
            candidate.monomer.symbol.startswith('X')
            or candidate.monomer.name.startswith('Unknown')
        )
    ]

    if not pending_ids:
        return 0

    print(f"DEBUG: Attempting stereo-agnostic recovery for {len(pending_ids)} unmatched nodes")

    recovered = 0
    library = None
    for node_id in pending_ids:
        if node_id in graph.nodes:
            target = graph.nodes[node_id]

            # Canonical SMILES of the fragment and the number of graph neighbours
            # are the two inputs of the library lookup.
            smiles_text = Chem.MolToSmiles(target.mol, canonical=True)
            connection_total = len(graph.get_neighbors(node_id))

            library = matcher.monomer_library
            hit = library.find_monomer_by_fragment_smiles_no_stereo(
                smiles_text, connection_total
            )

            if not hit:
                print(f"DEBUG: No stereo-agnostic match for node {node_id}")
            else:
                print(f"DEBUG: Stereo-agnostic match for node {node_id}: {hit.symbol}")
                target.monomer = hit
                recovered += 1

    return recovered
|
|
671
870
|
|
|
672
871
|
# ============================================================================
|
|
673
872
|
# Content from: helm_generator.py
|
|
@@ -706,36 +905,154 @@ class HELMGenerator:
|
|
|
706
905
|
if len(graph) == 0:
|
|
707
906
|
return ""
|
|
708
907
|
|
|
709
|
-
# Get ordered sequence of monomers
|
|
710
|
-
|
|
908
|
+
# Get ordered sequence of monomers (backbone)
|
|
909
|
+
ordered_nodes_raw = graph.get_ordered_nodes()
|
|
910
|
+
|
|
911
|
+
# Check if cyclic
|
|
912
|
+
is_cyclic = graph.is_cyclic()
|
|
913
|
+
|
|
914
|
+
# Filter backbone: nodes that are part of R1-R2 chain are backbone
|
|
915
|
+
# Nodes connected only via R3 (side chain) are branches
|
|
916
|
+
#
|
|
917
|
+
# Logic: A node at position 1 is a branch if:
|
|
918
|
+
# - It has no R1 (N-terminus) - meaning it's a cap like 'ac' that only has R2
|
|
919
|
+
# - It only has 1 peptide connection (to the real backbone)
|
|
920
|
+
#
|
|
921
|
+
# Example: [ac].K in cyclic peptide
|
|
922
|
+
# - 'ac' has only R2, no R1 → it's a cap
|
|
923
|
+
# - 'ac' connects to K's R3 (side chain), not K's R1 (backbone)
|
|
924
|
+
# - So 'ac' should be PEPTIDE2, not part of PEPTIDE1
|
|
925
|
+
|
|
926
|
+
backbone_nodes = []
|
|
927
|
+
for i, node in enumerate(ordered_nodes_raw):
|
|
928
|
+
is_branch = False
|
|
929
|
+
|
|
930
|
+
if i == 0 and len(ordered_nodes_raw) > 1 and node.monomer:
|
|
931
|
+
# Check if this first node lacks R1 (N-terminus)
|
|
932
|
+
# If it has no R1, it's a cap that should be a branch
|
|
933
|
+
has_r1 = 'R1' in node.monomer.r_groups
|
|
934
|
+
|
|
935
|
+
if not has_r1:
|
|
936
|
+
# This is an N-terminal cap (like 'ac') at position 1
|
|
937
|
+
# It should be a branch, not part of the main backbone
|
|
938
|
+
is_branch = True
|
|
939
|
+
|
|
940
|
+
if not is_branch:
|
|
941
|
+
backbone_nodes.append(node)
|
|
942
|
+
|
|
943
|
+
ordered_nodes = backbone_nodes
|
|
711
944
|
sequence_symbols = [node.monomer.symbol if node.monomer else "X" for node in ordered_nodes]
|
|
712
945
|
|
|
713
|
-
#
|
|
714
|
-
|
|
946
|
+
# Detect branch nodes (nodes not in backbone)
|
|
947
|
+
ordered_node_ids = {node.id for node in ordered_nodes}
|
|
948
|
+
branch_nodes = [(node_id, node) for node_id, node in graph.nodes.items()
|
|
949
|
+
if node_id not in ordered_node_ids]
|
|
950
|
+
|
|
951
|
+
# Generate sequence notation
|
|
952
|
+
if is_cyclic:
|
|
953
|
+
# Cyclic: wrap multi-letter monomers in brackets, single-letter ones stay as-is
|
|
954
|
+
formatted_symbols = [f"[{symbol}]" if len(symbol) > 1 else symbol for symbol in sequence_symbols]
|
|
955
|
+
sequence = ".".join(formatted_symbols)
|
|
956
|
+
else:
|
|
957
|
+
# Linear: no brackets
|
|
958
|
+
sequence = ".".join(sequence_symbols)
|
|
715
959
|
|
|
716
|
-
#
|
|
717
|
-
|
|
718
|
-
link.linkage_type != LinkageType.PEPTIDE
|
|
719
|
-
for link in graph.links
|
|
720
|
-
)
|
|
960
|
+
# Collect non-sequential connections (disulfide bridges, cyclic bonds, etc.)
|
|
961
|
+
connections = []
|
|
721
962
|
|
|
722
|
-
if
|
|
723
|
-
#
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
# Format: PEPTIDE1,PEPTIDE1,from_idx:R3-to_idx:R3
|
|
728
|
-
connections.append(
|
|
729
|
-
f"PEPTIDE1,PEPTIDE1,{link.from_node_id + 1}:R3-{link.to_node_id + 1}:R3"
|
|
730
|
-
)
|
|
963
|
+
if is_cyclic:
|
|
964
|
+
# Find the actual cyclic peptide bond (last residue connects back to beginning)
|
|
965
|
+
# This handles cases where N-terminal caps (like 'ac') are at position 1
|
|
966
|
+
last_id = ordered_nodes[-1].id
|
|
967
|
+
first_few_ids = [ordered_nodes[i].id for i in range(min(3, len(ordered_nodes)))]
|
|
731
968
|
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
969
|
+
for link in graph.links:
|
|
970
|
+
if link.linkage_type == LinkageType.PEPTIDE:
|
|
971
|
+
# Check if this is the cyclic bond (last to one of first few)
|
|
972
|
+
is_cyclic_bond = False
|
|
973
|
+
from_id, to_id = None, None
|
|
974
|
+
|
|
975
|
+
if link.from_node_id == last_id and link.to_node_id in first_few_ids:
|
|
976
|
+
from_id, to_id = link.from_node_id, link.to_node_id
|
|
977
|
+
is_cyclic_bond = True
|
|
978
|
+
elif link.to_node_id == last_id and link.from_node_id in first_few_ids:
|
|
979
|
+
from_id, to_id = link.to_node_id, link.from_node_id
|
|
980
|
+
is_cyclic_bond = True
|
|
981
|
+
|
|
982
|
+
if is_cyclic_bond:
|
|
983
|
+
# Find positions (1-indexed)
|
|
984
|
+
from_pos = next((i + 1 for i, n in enumerate(ordered_nodes) if n.id == from_id), None)
|
|
985
|
+
to_pos = next((i + 1 for i, n in enumerate(ordered_nodes) if n.id == to_id), None)
|
|
986
|
+
|
|
987
|
+
if from_pos and to_pos:
|
|
988
|
+
connections.append(f"PEPTIDE1,PEPTIDE1,{from_pos}:R2-{to_pos}:R1")
|
|
989
|
+
break
|
|
990
|
+
|
|
991
|
+
# Add disulfide bridges
|
|
992
|
+
for link in graph.links:
|
|
993
|
+
if link.linkage_type == LinkageType.DISULFIDE:
|
|
994
|
+
# Get positions in ordered sequence (1-indexed)
|
|
995
|
+
from_pos = None
|
|
996
|
+
to_pos = None
|
|
997
|
+
for i, node in enumerate(ordered_nodes):
|
|
998
|
+
if node.id == link.from_node_id:
|
|
999
|
+
from_pos = i + 1
|
|
1000
|
+
if node.id == link.to_node_id:
|
|
1001
|
+
to_pos = i + 1
|
|
1002
|
+
|
|
1003
|
+
if from_pos and to_pos:
|
|
1004
|
+
# Format: PEPTIDE1,PEPTIDE1,from_pos:R3-to_pos:R3
|
|
1005
|
+
connections.append(f"PEPTIDE1,PEPTIDE1,{from_pos}:R3-{to_pos}:R3")
|
|
1006
|
+
|
|
1007
|
+
# Handle branch nodes (side chain modifications)
|
|
1008
|
+
# Create separate PEPTIDE chains for each branch
|
|
1009
|
+
branch_chains = []
|
|
1010
|
+
if branch_nodes:
|
|
1011
|
+
for branch_idx, (branch_node_id, branch_node) in enumerate(branch_nodes, start=2):
|
|
1012
|
+
branch_chain_name = f"PEPTIDE{branch_idx}"
|
|
1013
|
+
branch_symbol = branch_node.monomer.symbol if branch_node.monomer else f"X{branch_node_id}"
|
|
1014
|
+
|
|
1015
|
+
# Format branch chain (single monomer, so no dots needed)
|
|
1016
|
+
if is_cyclic and len(branch_symbol) > 1:
|
|
1017
|
+
branch_chains.append(f"{branch_chain_name}{{[{branch_symbol}]}}")
|
|
1018
|
+
else:
|
|
1019
|
+
branch_chains.append(f"{branch_chain_name}{{{branch_symbol}}}")
|
|
1020
|
+
|
|
1021
|
+
# Find which backbone node this branch connects to
|
|
1022
|
+
# Look for links connecting this branch to the main backbone
|
|
1023
|
+
for link in graph.links:
|
|
1024
|
+
backbone_node_id = None
|
|
1025
|
+
if link.from_node_id == branch_node_id and link.to_node_id in ordered_node_ids:
|
|
1026
|
+
backbone_node_id = link.to_node_id
|
|
1027
|
+
elif link.to_node_id == branch_node_id and link.from_node_id in ordered_node_ids:
|
|
1028
|
+
backbone_node_id = link.from_node_id
|
|
1029
|
+
|
|
1030
|
+
if backbone_node_id is not None:
|
|
1031
|
+
# Find position of backbone node (1-indexed)
|
|
1032
|
+
backbone_pos = next((i + 1 for i, n in enumerate(ordered_nodes) if n.id == backbone_node_id), None)
|
|
1033
|
+
if backbone_pos:
|
|
1034
|
+
# Determine which R-group the branch uses
|
|
1035
|
+
# If branch has R1, connect to R1; if only R2, connect to R2
|
|
1036
|
+
branch_r_group = "R1"
|
|
1037
|
+
if branch_node.monomer:
|
|
1038
|
+
if 'R1' in branch_node.monomer.r_groups:
|
|
1039
|
+
branch_r_group = "R1"
|
|
1040
|
+
elif 'R2' in branch_node.monomer.r_groups:
|
|
1041
|
+
branch_r_group = "R2"
|
|
1042
|
+
|
|
1043
|
+
# Connection: backbone position R3 (side chain) to branch position 1 R-group
|
|
1044
|
+
connections.append(f"PEPTIDE1,{branch_chain_name},{backbone_pos}:R3-1:{branch_r_group}")
|
|
1045
|
+
break
|
|
1046
|
+
|
|
1047
|
+
# Generate final HELM notation
|
|
1048
|
+
all_chains = [f"PEPTIDE1{{{sequence}}}"] + branch_chains
|
|
1049
|
+
helm_chains = "|".join(all_chains)
|
|
1050
|
+
|
|
1051
|
+
if connections:
|
|
1052
|
+
connection_str = "|".join(connections)
|
|
1053
|
+
helm = f"{helm_chains}${connection_str}$$$V2.0"
|
|
737
1054
|
else:
|
|
738
|
-
helm = f"
|
|
1055
|
+
helm = f"{helm_chains}$$$$V2.0"
|
|
739
1056
|
|
|
740
1057
|
return helm
|
|
741
1058
|
|
|
@@ -768,10 +1085,34 @@ from collections import defaultdict
|
|
|
768
1085
|
from itertools import combinations
|
|
769
1086
|
import json
|
|
770
1087
|
import os
|
|
1088
|
+
import re
|
|
771
1089
|
|
|
772
1090
|
# Suppress RDKit warnings
|
|
773
1091
|
RDLogger.DisableLog('rdApp.warning')
|
|
774
1092
|
|
|
1093
|
+
def remove_stereochemistry_from_smiles(smiles: str) -> str:
    """
    Remove tetrahedral stereochemistry markers from a SMILES string.

    Converts [C@@H], [C@H], [C@] to C, etc. This is used for matching when
    input molecules don't have stereochemistry defined.

    Only elements of the SMILES organic subset (B, C, N, O, P, S, F, Cl, Br, I)
    are taken out of their brackets. Any other bracket atom - explicit
    hydrogen ``[H]``, ``[Se]``/``[SeH]``, ``[Si]``, metals, charged or
    isotope-labelled atoms, aromatic ``[nH]`` - keeps its brackets, because
    those symbols are not valid SMILES without them.

    Args:
        smiles: SMILES string; may be empty or None.

    Returns:
        The SMILES string without '@' markers; empty/None input is returned
        unchanged.
    """
    if not smiles:
        return smiles

    # Drop the chirality tags (@ / @@) wherever they occur.
    smiles_no_stereo = re.sub(r'@+', '', smiles)

    # After the tags are gone, [CH] and [C] carry no extra information and can
    # be written as plain C. The two-letter halogens come first so that "Cl"
    # and "Br" are not read as C / B followed by something else.
    smiles_no_stereo = re.sub(r'\[(Cl|Br|[BCNOPSFI])H?\]', r'\1', smiles_no_stereo)

    return smiles_no_stereo
|
|
1115
|
+
|
|
775
1116
|
class MonomerData:
|
|
776
1117
|
def __init__(self):
|
|
777
1118
|
self.symbol = ""
|
|
@@ -1009,11 +1350,64 @@ class MonomerLibrary:
|
|
|
1009
1350
|
# Generate SMILES with these R-groups removed (lazy, cached)
|
|
1010
1351
|
candidate_smiles = monomer.get_capped_smiles_for_removed_rgroups(removed_set)
|
|
1011
1352
|
|
|
1012
|
-
# Check if it matches the fragment
|
|
1353
|
+
# Check if it matches the fragment (exact match only)
|
|
1013
1354
|
if candidate_smiles == fragment_smiles:
|
|
1014
1355
|
return monomer
|
|
1015
1356
|
|
|
1016
1357
|
return None
|
|
1358
|
+
|
|
1359
|
+
def find_monomer_by_fragment_smiles_no_stereo(self, fragment_smiles: str, num_connections: int):
    """
    Find monomer by matching fragment SMILES WITHOUT stereochemistry.
    Used only in recovery for handling poor quality input data.

    For every monomer with at least ``num_connections`` R-groups, each
    combination of ``num_connections`` removed R-groups is turned into a
    capped SMILES and compared with the fragment. The comparison is made
    first as stereo-stripped strings, then - if the strings differ - as
    molecular graphs (equal atom count and a substructure match in both
    directions), which covers molecules written with different SMILES.

    Args:
        fragment_smiles: Canonical SMILES of the fragment
        num_connections: Number of connections this fragment has in the graph

    Returns:
        MonomerData if match found, None otherwise
    """
    # Parse fragment molecule once (without stereochemistry)
    fragment_no_stereo_smiles = remove_stereochemistry_from_smiles(fragment_smiles)
    fragment_mol = Chem.MolFromSmiles(fragment_no_stereo_smiles)
    if not fragment_mol:
        return None
    fragment_atom_count = fragment_mol.GetNumAtoms()

    for monomer in self.monomers.values():
        # Skip if monomer doesn't have enough R-groups
        if monomer.r_group_count < num_connections:
            continue

        r_group_labels = list(monomer.r_groups.keys())

        # For each combination of R-groups that could have been removed
        for removed_combo in combinations(r_group_labels, num_connections):
            removed_set = frozenset(removed_combo)

            # Generate SMILES with these R-groups removed (lazy, cached)
            candidate_smiles = monomer.get_capped_smiles_for_removed_rgroups(removed_set)

            # Fast path: plain string comparison of the stereo-stripped SMILES
            candidate_no_stereo = remove_stereochemistry_from_smiles(candidate_smiles)
            if candidate_no_stereo == fragment_no_stereo_smiles:
                return monomer

            # Nothing to parse when no capped SMILES was produced for this combination.
            if not candidate_no_stereo:
                continue

            candidate_mol = Chem.MolFromSmiles(candidate_no_stereo)
            if not candidate_mol:
                continue

            # Cheap filter before the costlier substructure searches.
            if candidate_mol.GetNumAtoms() != fragment_atom_count:
                continue

            # Same atom count and each is a substructure of the other = same molecule
            if fragment_mol.HasSubstructMatch(candidate_mol) and candidate_mol.HasSubstructMatch(fragment_mol):
                return monomer

    return None
|
|
1017
1411
|
|
|
1018
1412
|
def find_monomer_by_symbol(self, symbol: str):
    """
    Look up a monomer by its symbol.

    Args:
        symbol: Monomer symbol used as the lookup key

    Returns:
        Whatever ``self.symbol_to_monomer.get(symbol)`` yields for the symbol
    """
    # NOTE(review): symbol_to_monomer is presumably a dict filled while the
    # library is loaded, so a missing symbol would yield None - confirm.
    registry = self.symbol_to_monomer
    return registry.get(symbol)
|
|
@@ -1163,14 +1557,16 @@ def preload_library():
|
|
|
1163
1557
|
return processor is not None
|
|
1164
1558
|
|
|
1165
1559
|
|
|
1166
|
-
def convert_molecules_batch(
|
|
1560
|
+
def convert_molecules_batch(molecules: list, library_json: str = None, input_type: str = "auto") -> list:
|
|
1167
1561
|
"""
|
|
1168
|
-
Convert a batch of molecules
|
|
1562
|
+
Convert a batch of molecules to HELM notation.
|
|
1169
1563
|
|
|
1170
1564
|
Args:
|
|
1171
|
-
|
|
1565
|
+
molecules: List of molecule strings (molfiles or SMILES)
|
|
1172
1566
|
library_json: Optional monomer library as JSON string.
|
|
1173
1567
|
If None, uses default cached library from HELMCoreLibrary.json
|
|
1568
|
+
input_type: Type of input molecules - "molfile", "smiles", or "auto" (default).
|
|
1569
|
+
"auto" will attempt to detect the format automatically.
|
|
1174
1570
|
|
|
1175
1571
|
Returns:
|
|
1176
1572
|
List of tuples: (success: bool, helm_notation: str)
|
|
@@ -1184,13 +1580,13 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
|
|
|
1184
1580
|
print("Initializing monomer library and processors...")
|
|
1185
1581
|
if not preload_library():
|
|
1186
1582
|
print("ERROR: Failed to load monomer library")
|
|
1187
|
-
return [(False, "Library initialization failed") for _ in
|
|
1583
|
+
return [(False, "Library initialization failed") for _ in molecules]
|
|
1188
1584
|
print()
|
|
1189
1585
|
|
|
1190
1586
|
# Use shared processor instances
|
|
1191
1587
|
processor, matcher, helm_generator = _get_processors()
|
|
1192
1588
|
if not processor:
|
|
1193
|
-
return [(False, "") for _ in
|
|
1589
|
+
return [(False, "") for _ in molecules]
|
|
1194
1590
|
else:
|
|
1195
1591
|
# Load custom library from provided JSON string (no caching)
|
|
1196
1592
|
try:
|
|
@@ -1218,7 +1614,7 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
|
|
|
1218
1614
|
|
|
1219
1615
|
if not library.monomers:
|
|
1220
1616
|
print("ERROR: No monomers loaded from custom library")
|
|
1221
|
-
return [(False, "Library loading failed") for _ in
|
|
1617
|
+
return [(False, "Library loading failed") for _ in molecules]
|
|
1222
1618
|
|
|
1223
1619
|
print(f"Custom library loaded: {len(library.monomers)} monomers")
|
|
1224
1620
|
|
|
@@ -1227,11 +1623,46 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
|
|
|
1227
1623
|
matcher = MonomerMatcher(library)
|
|
1228
1624
|
helm_generator = HELMGenerator()
|
|
1229
1625
|
|
|
1626
|
+
# Helper function to detect molecule format
|
|
1627
|
+
def _is_molfile(mol_string: str) -> bool:
    """
    Heuristically decide whether a string is an MDL molfile rather than SMILES.

    A string is treated as a molfile when it has more than three lines and
    either contains a 'V2000'/'V3000' version tag or its fourth line starts
    with a numeric counts field (atom/bond counts).

    Args:
        mol_string: Raw molecule text (molfile block or SMILES)

    Returns:
        True if the text looks like a molfile, False otherwise
        (including for empty or None input)
    """
    if not mol_string:
        return False

    # Only trailing whitespace is dropped. The molfile header is three lines
    # (title, program, comment) and the title line is very often blank, so
    # stripping leading whitespace would shift the counts line away from
    # index 3 and make the counts-line check look at an atom row instead.
    lines = mol_string.rstrip().splitlines()
    if len(lines) <= 3:
        return False

    # Explicit version tag is the strongest signal
    if 'V2000' in mol_string or 'V3000' in mol_string:
        return True

    # Fallback for blocks without a version tag: the counts line begins with
    # two 3-character integer fields (number of atoms, number of bonds)
    counts_line = lines[3]
    return len(counts_line) >= 6 and counts_line[:6].replace(' ', '').isdigit()
|
|
1644
|
+
|
|
1230
1645
|
results = []
|
|
1231
1646
|
|
|
1232
|
-
for i in range(len(
|
|
1233
|
-
|
|
1234
|
-
|
|
1647
|
+
for i in range(len(molecules)):
|
|
1648
|
+
mol_string = molecules[i]
|
|
1649
|
+
|
|
1650
|
+
# Determine input type and parse molecule
|
|
1651
|
+
if input_type == "auto":
|
|
1652
|
+
# Auto-detect format
|
|
1653
|
+
if _is_molfile(mol_string):
|
|
1654
|
+
mol = Chem.MolFromMolBlock(mol_string)
|
|
1655
|
+
else:
|
|
1656
|
+
# Assume SMILES if not molfile
|
|
1657
|
+
mol = Chem.MolFromSmiles(mol_string)
|
|
1658
|
+
elif input_type == "molfile":
|
|
1659
|
+
mol = Chem.MolFromMolBlock(mol_string)
|
|
1660
|
+
elif input_type == "smiles":
|
|
1661
|
+
mol = Chem.MolFromSmiles(mol_string)
|
|
1662
|
+
else:
|
|
1663
|
+
results.append((False, f"Invalid input_type: {input_type}"))
|
|
1664
|
+
continue
|
|
1665
|
+
|
|
1235
1666
|
if not mol:
|
|
1236
1667
|
results.append((False, ""))
|
|
1237
1668
|
continue
|
|
@@ -1265,6 +1696,12 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
|
|
|
1265
1696
|
if not had_changes:
|
|
1266
1697
|
break
|
|
1267
1698
|
|
|
1699
|
+
# After regular recovery, try stereo-agnostic matching for remaining unmatched fragments
|
|
1700
|
+
# This handles poor quality data with missing stereochemistry
|
|
1701
|
+
stereo_matched = processor.recover_unmatched_with_stereo_agnostic(graph, matcher)
|
|
1702
|
+
if stereo_matched > 0:
|
|
1703
|
+
print(f"DEBUG: Stereo-agnostic recovery matched {stereo_matched} additional fragments")
|
|
1704
|
+
|
|
1268
1705
|
if len(graph.nodes) > 0:
|
|
1269
1706
|
helm_notation = helm_generator.generate_helm_from_graph(graph)
|
|
1270
1707
|
results.append((True, helm_notation))
|
|
@@ -1275,5 +1712,35 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
|
|
|
1275
1712
|
|
|
1276
1713
|
return results
|
|
1277
1714
|
|
|
1715
|
+
|
|
1716
|
+
def convert_molfiles_to_helm(molfiles: list, library_json: str = None) -> list:
    """
    Translate a list of molfile blocks into HELM notation.

    Delegates to convert_molecules_batch with the input format pinned to
    "molfile", so no format auto-detection is performed.

    Args:
        molfiles: List of molfile strings
        library_json: Optional monomer library as JSON string

    Returns:
        List of tuples: (success: bool, helm_notation: str)
    """
    batch_options = {"library_json": library_json, "input_type": "molfile"}
    return convert_molecules_batch(molfiles, **batch_options)
|
|
1729
|
+
|
|
1730
|
+
|
|
1731
|
+
def convert_smiles_to_helm(smiles_list: list, library_json: str = None) -> list:
    """
    Translate a list of SMILES strings into HELM notation.

    Delegates to convert_molecules_batch with the input format pinned to
    "smiles", so no format auto-detection is performed.

    Args:
        smiles_list: List of SMILES strings
        library_json: Optional monomer library as JSON string

    Returns:
        List of tuples: (success: bool, helm_notation: str)
    """
    batch_options = {"library_json": library_json, "input_type": "smiles"}
    return convert_molecules_batch(smiles_list, **batch_options)
|
|
1744
|
+
|
|
1278
1745
|
res_helm_list = convert_molecules_batch(molListToProcess, library_json=libraryJSON)
|
|
1279
1746
|
result_helm = pd.DataFrame(map(lambda x: x[1], res_helm_list), columns=["regenerated sequences"])
|