@datagrok/bio 2.25.3 → 2.25.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,6 +25,7 @@ from typing import Optional
25
25
  from typing import Tuple
26
26
  import json
27
27
  import os
28
+ import re
28
29
 
29
30
  # ============================================================================
30
31
  # Content from: fragment_graph.py
@@ -179,6 +180,37 @@ class FragmentGraph:
179
180
  for node in ordered_nodes
180
181
  ]
181
182
 
183
def is_cyclic(self) -> bool:
    """
    Detect whether the peptide is head-to-tail cyclic.

    The peptide is reported as cyclic when a PEPTIDE link joins the last node
    returned by ``get_ordered_nodes()`` to one of the first (up to three)
    ordered nodes. Several leading nodes are considered so that an N-terminal
    cap (like 'ac' from Lys_Ac) sitting at position 0 does not hide the ring
    closure.

    The last node itself and the node directly before it are never accepted
    as ring-closure partners. Without that exclusion, graphs with only three
    or four ordered nodes would treat the ordinary backbone bond between the
    last two residues as a ring closure and report linear peptides as cyclic.

    Returns:
        True if a ring-closing peptide link is found, False otherwise
        (always False for graphs with fewer than three nodes).
    """
    if len(self.nodes) < 3:
        return False

    ordered = self.get_ordered_nodes()
    if len(ordered) < 3:
        return False

    last_id = ordered[-1].id

    # NOTE(review): assumes get_ordered_nodes() returns backbone order, so
    # ordered[-2] is the sequential neighbour of the last node - confirm.
    not_ring_partners = {last_id, ordered[-2].id}

    # Candidate ring-closure partners: the first few nodes (usually the first,
    # but possibly the second or third when an N-terminal cap is present).
    head_ids = {node.id for node in ordered[:3]} - not_ring_partners

    for link in self.links:
        if link.linkage_type != LinkageType.PEPTIDE:
            continue
        # Links are stored with an arbitrary direction, so test both ends.
        if (link.from_node_id == last_id and link.to_node_id in head_ids) or \
           (link.to_node_id == last_id and link.from_node_id in head_ids):
            return True

    return False
213
+
182
214
def __len__(self):
    """Return how many fragment nodes are stored in ``self.nodes``."""
    node_count = len(self.nodes)
    return node_count
184
216
 
@@ -221,8 +253,15 @@ class BondDetector:
221
253
  #GENERALIZATION ITEM: BOND PATTERNS SHOULD BE DERIVED FROM LIBRARY
222
254
  def __init__(self):
223
255
  # True peptide bond: C and N both in backbone (each bonded to carbons)
224
- # Alpha carbons can be sp3 (X4) or sp2 (X3) for dehydroamino acids
225
- self.peptide_bond = Chem.MolFromSmarts('[C;X3,X4]-[C;X3](=[O;X1])-[N;X3]-[C;X3,X4]')
256
+ # First carbon can be aliphatic or aromatic (for amino acids like NMe2Abz)
257
+ # Carbonyl carbon is sp2 (X3)
258
+ # Exclude if carbonyl is in a small ring (r5 or r6) to avoid cleaving lactams like Pyr
259
+ # !r5 = not in 5-membered ring, !r6 = not in 6-membered ring
260
+ # This preserves lactams but allows large macrocycles and proline (C=O outside ring)
261
+ # Nitrogen can be X2 (proline, imino) or X3 (standard amino, N-methyl)
262
+ # N-C bond can be single (-) or double (=) for imine bonds in dehydro amino acids
263
+ # Alpha carbon after N can be sp3 (X4) or sp2 (X3) for dehydroamino acids, or aromatic (#6 includes both)
264
+ self.peptide_bond = Chem.MolFromSmarts('[#6]-[C;X3;!r5;!r6](=[O;X1])-[N;X2,X3]~[#6;X3,X4]')
226
265
  # True disulfide bond: S-S where each S is bonded to carbon (cysteine residues)
227
266
  self.disulfide_bond = Chem.MolFromSmarts('[C;X4]-[S;X2]-[S;X2]-[C;X4]')
228
267
  # Primary amine at N-terminus (can be NH2 or NH3+), alpha-C can be sp3 or sp2
@@ -265,7 +304,7 @@ class BondDetector:
265
304
  matches = mol.GetSubstructMatches(self.peptide_bond)
266
305
  for match in matches:
267
306
  if len(match) >= 5:
268
- # Pattern: [C;X3,X4]-[C;X3](=[O;X1])-[N;X3]-[C;X3,X4]
307
+ # Pattern: [#6]-[C;X3;!r5;!r6](=[O;X1])-[N;X2,X3]~[#6;X3,X4]
269
308
  # match[0]=alpha-C (sp2 or sp3), match[1]=carbonyl-C, match[2]=O, match[3]=N, match[4]=next-alpha-C (sp2 or sp3)
270
309
  c_atom = match[1] # Carbonyl carbon
271
310
  n_atom = match[3] # Nitrogen
@@ -399,18 +438,16 @@ class FragmentProcessor:
399
438
  # Fragment the molecule
400
439
  fragmented_mol = Chem.FragmentOnBonds(mol, bond_indices, addDummies=True)
401
440
 
402
- # Get fragments AND their atom mappings separately
403
- fragments_tuple = Chem.GetMolFrags(
404
- fragmented_mol,
405
- asMols=True,
406
- sanitizeFrags=True
407
- )
408
- fragments = list(fragments_tuple)
441
+ # Get fragments as molecules
442
+ fragments = Chem.GetMolFrags(fragmented_mol, asMols=True, sanitizeFrags=True)
443
+
444
+ # Get atom mappings separately (which original atoms are in which fragment)
445
+ atom_mappings = Chem.GetMolFrags(fragmented_mol, asMols=False, fragsMolAtomMapping=True)
409
446
 
410
447
  # Store bond cleavage info for recovery - we'll use this to selectively re-fragment
411
448
  graph.cleaved_bond_indices = bond_indices
412
449
  graph.bond_info = bond_info
413
- print(f"DEBUG: Created {len(fragments)} fragments, cleaved {len(bond_indices)} bonds")
450
+ graph.atom_mappings = atom_mappings
414
451
 
415
452
  # Create nodes for each fragment
416
453
  fragment_nodes = []
@@ -426,20 +463,43 @@ class FragmentProcessor:
426
463
  graph.add_node(node)
427
464
  fragment_nodes.append((i, node))
428
465
 
429
- # Create links between fragments based on cleaved bonds
430
- # For sequential peptide bonds
431
- peptide_links = [b for b in bond_info if b[3] == LinkageType.PEPTIDE]
432
- for i in range(len(fragment_nodes) - 1):
433
- from_id, _ = fragment_nodes[i]
434
- to_id, _ = fragment_nodes[i + 1]
435
- link = FragmentLink(from_id, to_id, LinkageType.PEPTIDE)
466
+ # Create links between fragments based on the actual cleaved bonds
467
+ # Build mapping: original atom index → (fragment_idx, new_atom_idx_in_fragment)
468
+ atom_to_fragment_and_idx = {}
469
+ for frag_idx, original_atom_indices in enumerate(atom_mappings):
470
+ for new_idx_in_frag, original_atom_idx in enumerate(original_atom_indices):
471
+ atom_to_fragment_and_idx[original_atom_idx] = (frag_idx, new_idx_in_frag)
472
+
473
+
474
+ # For each cleaved bond, determine which fragments it connects
475
+ link_count = 0
476
+ for bond_idx, atom1_orig, atom2_orig, linkage_type in bond_info:
477
+ # Find which fragments contain these atoms and their new indices
478
+ frag1_info = atom_to_fragment_and_idx.get(atom1_orig)
479
+ frag2_info = atom_to_fragment_and_idx.get(atom2_orig)
480
+
481
+ if frag1_info is None or frag2_info is None:
482
+ print(f"DEBUG: Skipping bond atoms {atom1_orig}-{atom2_orig}: not found in fragments")
483
+ continue
484
+
485
+ frag1, atom1_new = frag1_info
486
+ frag2, atom2_new = frag2_info
487
+
488
+ # Create link even if both atoms are in same fragment (internal bond like in Phe_4Sdihydroorotamido)
489
+ # This creates a "self-link" that will be used during recovery to reconstruct the monomer
490
+ link = FragmentLink(frag1, frag2, linkage_type,
491
+ from_atom_idx=atom1_new, to_atom_idx=atom2_new)
436
492
  graph.add_link(link)
493
+ link_count += 1
494
+
495
+ if frag1 == frag2:
496
+ print(f"DEBUG: Link {link_count}: {linkage_type.value.upper()} SELF-LINK frag{frag1} "
497
+ f"orig_atoms({atom1_orig}<->{atom2_orig}) frag_atoms({atom1_new}<->{atom2_new})")
498
+ else:
499
+ print(f"DEBUG: Link {link_count}: {linkage_type.value.upper()} frag{frag1}<->frag{frag2} "
500
+ f"orig_atoms({atom1_orig}<->{atom2_orig}) frag_atoms({atom1_new}<->{atom2_new})")
437
501
 
438
- # Add disulfide bridges (if any)
439
- # TODO: Track which fragments contain the S atoms for proper linking
440
- disulfide_links = [b for b in bond_info if b[3] == LinkageType.DISULFIDE]
441
- # For now, disulfide bonds require more complex atom tracking
442
- # This is a placeholder for future enhancement
502
+ print(f"DEBUG: Created {link_count} links total")
443
503
 
444
504
  return graph
445
505
 
@@ -472,6 +532,94 @@ class FragmentProcessor:
472
532
  except Exception:
473
533
  return None
474
534
 
535
def _reconstruct_fragment_with_links(self, node_ids: list, graph: FragmentGraph,
                                     links_to_exclude: list) -> Chem.Mol:
    """
    Reconstruct a molecule by combining multiple fragment nodes, using link information.

    The original molecule is re-fragmented on every previously cleaved bond
    except those that correspond to ``links_to_exclude``; the resulting
    fragment that contains atoms of the lowest-numbered node in ``node_ids``
    is returned.

    Args:
        node_ids: List of node IDs to merge
        graph: The fragment graph; must carry ``original_mol``,
            ``cleaved_bond_indices`` and ``bond_info`` (``atom_mappings`` is
            used to map bonds back to fragments)
        links_to_exclude: List of FragmentLink objects connecting the nodes to merge

    Returns:
        Combined RDKit molecule, or None if reconstruction fails. When no
        cleaved bond remains, ``graph.original_mol`` itself is returned.
    """
    if not node_ids or not hasattr(graph, 'original_mol'):
        return None

    if not hasattr(graph, 'cleaved_bond_indices') or not hasattr(graph, 'bond_info'):
        return None

    try:
        # Build the atom -> initial-fragment lookup once instead of scanning
        # every fragment tuple for every (link, bond) pair.
        atom_to_fragment = {}
        for frag_idx, atom_indices in enumerate(getattr(graph, 'atom_mappings', ())):
            for atom_idx in atom_indices:
                atom_to_fragment[atom_idx] = frag_idx

        # Positions (in graph.bond_info) of the cleaved bonds that must be kept intact
        bonds_to_exclude_indices = set()

        for link in links_to_exclude:
            for bond_list_idx, (_bond_idx, atom1, atom2, linkage_type) in enumerate(graph.bond_info):
                # A bond already claimed by another link must not be claimed
                # again: when two links join the same pair of fragments, each
                # link has to resolve to its own bond, otherwise the second
                # bond would stay cleaved.
                if bond_list_idx in bonds_to_exclude_indices:
                    continue

                frag1 = atom_to_fragment.get(atom1)
                frag2 = atom_to_fragment.get(atom2)

                # If this bond connects the two fragments in the link, exclude it
                if (frag1 == link.from_node_id and frag2 == link.to_node_id) or \
                   (frag1 == link.to_node_id and frag2 == link.from_node_id):
                    bonds_to_exclude_indices.add(bond_list_idx)
                    print(f"DEBUG: Excluding {linkage_type.value} bond at index {bond_list_idx} (atoms {atom1}<->{atom2})")
                    break

        # Bonds that are still to be cleaved.
        # NOTE(review): relies on graph.cleaved_bond_indices being parallel to
        # graph.bond_info (same order, same length) - confirm at the producer.
        new_bond_indices = [
            bond_idx for i, bond_idx in enumerate(graph.cleaved_bond_indices)
            if i not in bonds_to_exclude_indices
        ]

        print(f"DEBUG reconstruct: Original had {len(graph.cleaved_bond_indices)} cleaved bonds, "
              f"excluding {len(bonds_to_exclude_indices)} bonds, new list has {len(new_bond_indices)} bonds")

        if not new_bond_indices:
            # No bonds to cleave - return whole molecule
            return graph.original_mol

        # Re-fragment with the modified bond list
        fragmented_mol = Chem.FragmentOnBonds(graph.original_mol, new_bond_indices, addDummies=True)
        fragments = Chem.GetMolFrags(fragmented_mol, asMols=True, sanitizeFrags=True)
        new_atom_mappings = Chem.GetMolFrags(fragmented_mol, asMols=False, fragsMolAtomMapping=True)

        # Locate the new fragment holding atoms of the first node we want to merge
        sorted_nodes = sorted(node_ids)
        first_node_atoms = set(graph.atom_mappings[sorted_nodes[0]])

        target_fragment_idx = None
        for new_frag_idx, new_atoms in enumerate(new_atom_mappings):
            if first_node_atoms & set(new_atoms):
                target_fragment_idx = new_frag_idx
                break

        print(f"DEBUG reconstruct: Got {len(fragments)} fragments after re-fragmentation, "
              f"target_fragment_idx={target_fragment_idx}")

        if target_fragment_idx is not None and target_fragment_idx < len(fragments):
            # Fall back to the raw fragment when cleaning yields nothing usable
            clean_frag = self._clean_fragment(fragments[target_fragment_idx])
            return clean_frag if clean_frag else fragments[target_fragment_idx]

        return None

    except Exception as e:
        # Best-effort recovery step: report the failure and let the caller try
        # another merge instead of aborting the whole conversion.
        print(f"DEBUG reconstruct: Exception: {e}")
        return None
+
475
623
  def _reconstruct_fragment(self, node_ids: list, graph: FragmentGraph) -> Chem.Mol:
476
624
  """
477
625
  Reconstruct a molecule by combining multiple fragment nodes.
@@ -581,7 +729,7 @@ class FragmentProcessor:
581
729
 
582
730
  def recover_unmatched_fragments(self, graph: FragmentGraph, matcher) -> bool:
583
731
  """
584
- Try to recover unmatched fragments by merging with neighbors.
732
+ Try to recover unmatched fragments by merging with neighbors based on graph links.
585
733
  Returns True if any merges were successful.
586
734
  """
587
735
  # Identify unmatched nodes
@@ -593,8 +741,6 @@ class FragmentProcessor:
593
741
  if not unmatched_nodes:
594
742
  return False
595
743
 
596
- print(f"DEBUG: Found {len(unmatched_nodes)} unmatched nodes: {unmatched_nodes}")
597
-
598
744
  had_changes = False
599
745
 
600
746
  # Try to recover each unmatched node
@@ -603,33 +749,29 @@ class FragmentProcessor:
603
749
  if node_id not in graph.nodes:
604
750
  continue
605
751
 
606
- # Get neighbors
752
+ # Get neighbors from graph links (returns list of (neighbor_id, linkage_type))
607
753
  neighbors = graph.get_neighbors(node_id)
608
- neighbor_ids = [n[0] for n in neighbors]
609
754
 
610
- if not neighbor_ids:
755
+ if not neighbors:
611
756
  continue
612
757
 
613
- # Separate left and right neighbors (assuming sequential order)
614
- left_neighbors = [n for n in neighbor_ids if n < node_id]
615
- right_neighbors = [n for n in neighbor_ids if n > node_id]
616
-
617
- # Try merge combinations: left only, right only, both
618
- merge_attempts = []
619
-
620
- if left_neighbors:
621
- merge_attempts.append([left_neighbors[0], node_id])
622
- if right_neighbors:
623
- merge_attempts.append([node_id, right_neighbors[0]])
624
- if left_neighbors and right_neighbors:
625
- merge_attempts.append([left_neighbors[0], node_id, right_neighbors[0]])
626
-
627
- # Try each merge combination
628
- for nodes_to_merge in merge_attempts:
629
- print(f"DEBUG: Trying to merge nodes {nodes_to_merge}")
758
+ # Try merging with each individual neighbor first
759
+ for neighbor_id, linkage_type in neighbors:
760
+ if neighbor_id not in graph.nodes:
761
+ continue
762
+
763
+ nodes_to_merge = sorted([node_id, neighbor_id])
764
+
765
+ # Find the links between nodes we're merging
766
+ links_to_exclude = []
767
+ for link in graph.links:
768
+ from_in = link.from_node_id in nodes_to_merge
769
+ to_in = link.to_node_id in nodes_to_merge
770
+ if from_in and to_in:
771
+ links_to_exclude.append(link)
630
772
 
631
773
  # Reconstruct combined molecule
632
- combined_mol = self._reconstruct_fragment(nodes_to_merge, graph)
774
+ combined_mol = self._reconstruct_fragment_with_links(nodes_to_merge, graph, links_to_exclude)
633
775
  if not combined_mol:
634
776
  print(f"DEBUG: Failed to reconstruct molecule for {nodes_to_merge}")
635
777
  continue
@@ -637,7 +779,7 @@ class FragmentProcessor:
637
779
  print(f"DEBUG: Reconstructed mol with {combined_mol.GetNumAtoms()} atoms")
638
780
 
639
781
  # Count expected connections for this merged fragment
640
- # Get all unique neighbors of the merged set
782
+ # Get all unique neighbors of the merged set (excluding internal connections)
641
783
  all_neighbors = set()
642
784
  for nid in nodes_to_merge:
643
785
  if nid in graph.nodes:
@@ -647,15 +789,13 @@ class FragmentProcessor:
647
789
  all_neighbors.add(neighbor_id)
648
790
 
649
791
  num_connections = len(all_neighbors)
650
- print(f"DEBUG: Expecting {num_connections} connections")
651
792
 
652
- # Try to match the combined fragment
793
+ # Try to match the combined fragment (exact match only)
653
794
  monomer = matcher.find_exact_match(combined_mol, num_connections)
654
795
 
655
796
  if monomer:
656
- print(f"DEBUG: SUCCESS! Matched to {monomer.symbol}")
657
797
  # Success! Create new merged node
658
- new_node_id = min(nodes_to_merge) # Use lowest ID
798
+ new_node_id = min(nodes_to_merge)
659
799
  new_node = FragmentNode(new_node_id, combined_mol)
660
800
  new_node.monomer = monomer
661
801
 
@@ -663,11 +803,70 @@ class FragmentProcessor:
663
803
  self._merge_nodes_in_graph(graph, nodes_to_merge, new_node)
664
804
 
665
805
  had_changes = True
666
- break # Stop trying other combinations for this node
667
- else:
668
- print(f"DEBUG: No match found for merge {nodes_to_merge}")
806
+ break # Stop trying other neighbors for this node
807
+
808
+ if had_changes:
809
+ break # Restart from beginning after a successful merge
669
810
 
670
811
  return had_changes
812
+
813
def recover_unmatched_with_stereo_agnostic(self, graph: FragmentGraph, matcher) -> int:
    """
    Last-resort recovery pass that ignores stereochemistry.

    Every node whose monomer is a placeholder (symbol starting with 'X' or
    name starting with 'Unknown') is looked up again through the library's
    stereo-agnostic search; on a hit the node's monomer is replaced in place.
    Intended for input where stereocentres were not assigned.

    Args:
        graph: FragmentGraph with some unmatched nodes
        matcher: MonomerMatcher instance (its ``monomer_library`` is queried)

    Returns:
        Number of fragments that were successfully matched
    """
    from rdkit import Chem

    def _is_placeholder(candidate) -> bool:
        assigned = candidate.monomer
        if not assigned:
            return False
        return assigned.symbol.startswith('X') or assigned.name.startswith('Unknown')

    pending_ids = [nid for nid, candidate in graph.nodes.items() if _is_placeholder(candidate)]

    if not pending_ids:
        return 0

    print(f"DEBUG: Attempting stereo-agnostic recovery for {len(pending_ids)} unmatched nodes")

    recovered = 0

    for node_id in pending_ids:
        if node_id not in graph.nodes:
            continue

        target = graph.nodes[node_id]

        # Canonical SMILES of the fragment and the number of graph neighbours
        # are the two inputs of the library search.
        fragment_smiles = Chem.MolToSmiles(target.mol, canonical=True)
        num_connections = len(graph.get_neighbors(node_id))

        monomer = matcher.monomer_library.find_monomer_by_fragment_smiles_no_stereo(
            fragment_smiles, num_connections
        )

        if not monomer:
            print(f"DEBUG: No stereo-agnostic match for node {node_id}")
            continue

        print(f"DEBUG: Stereo-agnostic match for node {node_id}: {monomer.symbol}")
        target.monomer = monomer
        recovered += 1

    return recovered
671
870
 
672
871
  # ============================================================================
673
872
  # Content from: helm_generator.py
@@ -706,36 +905,154 @@ class HELMGenerator:
706
905
  if len(graph) == 0:
707
906
  return ""
708
907
 
709
- # Get ordered sequence of monomers
710
- ordered_nodes = graph.get_ordered_nodes()
908
+ # Get ordered sequence of monomers (backbone)
909
+ ordered_nodes_raw = graph.get_ordered_nodes()
910
+
911
+ # Check if cyclic
912
+ is_cyclic = graph.is_cyclic()
913
+
914
+ # Filter backbone: nodes that are part of R1-R2 chain are backbone
915
+ # Nodes connected only via R3 (side chain) are branches
916
+ #
917
+ # Logic: A node at position 1 is a branch if:
918
+ # - It has no R1 (N-terminus) - meaning it's a cap like 'ac' that only has R2
919
+ # - It only has 1 peptide connection (to the real backbone)
920
+ #
921
+ # Example: [ac].K in cyclic peptide
922
+ # - 'ac' has only R2, no R1 → it's a cap
923
+ # - 'ac' connects to K's R3 (side chain), not K's R1 (backbone)
924
+ # - So 'ac' should be PEPTIDE2, not part of PEPTIDE1
925
+
926
+ backbone_nodes = []
927
+ for i, node in enumerate(ordered_nodes_raw):
928
+ is_branch = False
929
+
930
+ if i == 0 and len(ordered_nodes_raw) > 1 and node.monomer:
931
+ # Check if this first node lacks R1 (N-terminus)
932
+ # If it has no R1, it's a cap that should be a branch
933
+ has_r1 = 'R1' in node.monomer.r_groups
934
+
935
+ if not has_r1:
936
+ # This is an N-terminal cap (like 'ac') at position 1
937
+ # It should be a branch, not part of the main backbone
938
+ is_branch = True
939
+
940
+ if not is_branch:
941
+ backbone_nodes.append(node)
942
+
943
+ ordered_nodes = backbone_nodes
711
944
  sequence_symbols = [node.monomer.symbol if node.monomer else "X" for node in ordered_nodes]
712
945
 
713
- # Generate linear peptide notation
714
- sequence = ".".join(sequence_symbols)
946
+ # Detect branch nodes (nodes not in backbone)
947
+ ordered_node_ids = {node.id for node in ordered_nodes}
948
+ branch_nodes = [(node_id, node) for node_id, node in graph.nodes.items()
949
+ if node_id not in ordered_node_ids]
950
+
951
+ # Generate sequence notation
952
+ if is_cyclic:
953
+ # Cyclic: wrap multi-letter monomers in brackets, single-letter ones stay as-is
954
+ formatted_symbols = [f"[{symbol}]" if len(symbol) > 1 else symbol for symbol in sequence_symbols]
955
+ sequence = ".".join(formatted_symbols)
956
+ else:
957
+ # Linear: no brackets
958
+ sequence = ".".join(sequence_symbols)
715
959
 
716
- # Check for disulfide bridges or other non-peptide bonds
717
- has_special_bonds = any(
718
- link.linkage_type != LinkageType.PEPTIDE
719
- for link in graph.links
720
- )
960
+ # Collect non-sequential connections (disulfide bridges, cyclic bonds, etc.)
961
+ connections = []
721
962
 
722
- if has_special_bonds:
723
- # Add connection notation for disulfide bridges
724
- connections = []
725
- for link in graph.links:
726
- if link.linkage_type == LinkageType.DISULFIDE:
727
- # Format: PEPTIDE1,PEPTIDE1,from_idx:R3-to_idx:R3
728
- connections.append(
729
- f"PEPTIDE1,PEPTIDE1,{link.from_node_id + 1}:R3-{link.to_node_id + 1}:R3"
730
- )
963
+ if is_cyclic:
964
+ # Find the actual cyclic peptide bond (last residue connects back to beginning)
965
+ # This handles cases where N-terminal caps (like 'ac') are at position 1
966
+ last_id = ordered_nodes[-1].id
967
+ first_few_ids = [ordered_nodes[i].id for i in range(min(3, len(ordered_nodes)))]
731
968
 
732
- if connections:
733
- connection_str = "|".join(connections)
734
- helm = f"PEPTIDE1{{{sequence}}}${connection_str}$$$V2.0"
735
- else:
736
- helm = f"PEPTIDE1{{{sequence}}}$$$$"
969
+ for link in graph.links:
970
+ if link.linkage_type == LinkageType.PEPTIDE:
971
+ # Check if this is the cyclic bond (last to one of first few)
972
+ is_cyclic_bond = False
973
+ from_id, to_id = None, None
974
+
975
+ if link.from_node_id == last_id and link.to_node_id in first_few_ids:
976
+ from_id, to_id = link.from_node_id, link.to_node_id
977
+ is_cyclic_bond = True
978
+ elif link.to_node_id == last_id and link.from_node_id in first_few_ids:
979
+ from_id, to_id = link.to_node_id, link.from_node_id
980
+ is_cyclic_bond = True
981
+
982
+ if is_cyclic_bond:
983
+ # Find positions (1-indexed)
984
+ from_pos = next((i + 1 for i, n in enumerate(ordered_nodes) if n.id == from_id), None)
985
+ to_pos = next((i + 1 for i, n in enumerate(ordered_nodes) if n.id == to_id), None)
986
+
987
+ if from_pos and to_pos:
988
+ connections.append(f"PEPTIDE1,PEPTIDE1,{from_pos}:R2-{to_pos}:R1")
989
+ break
990
+
991
+ # Add disulfide bridges
992
+ for link in graph.links:
993
+ if link.linkage_type == LinkageType.DISULFIDE:
994
+ # Get positions in ordered sequence (1-indexed)
995
+ from_pos = None
996
+ to_pos = None
997
+ for i, node in enumerate(ordered_nodes):
998
+ if node.id == link.from_node_id:
999
+ from_pos = i + 1
1000
+ if node.id == link.to_node_id:
1001
+ to_pos = i + 1
1002
+
1003
+ if from_pos and to_pos:
1004
+ # Format: PEPTIDE1,PEPTIDE1,from_pos:R3-to_pos:R3
1005
+ connections.append(f"PEPTIDE1,PEPTIDE1,{from_pos}:R3-{to_pos}:R3")
1006
+
1007
+ # Handle branch nodes (side chain modifications)
1008
+ # Create separate PEPTIDE chains for each branch
1009
+ branch_chains = []
1010
+ if branch_nodes:
1011
+ for branch_idx, (branch_node_id, branch_node) in enumerate(branch_nodes, start=2):
1012
+ branch_chain_name = f"PEPTIDE{branch_idx}"
1013
+ branch_symbol = branch_node.monomer.symbol if branch_node.monomer else f"X{branch_node_id}"
1014
+
1015
+ # Format branch chain (single monomer, so no dots needed)
1016
+ if is_cyclic and len(branch_symbol) > 1:
1017
+ branch_chains.append(f"{branch_chain_name}{{[{branch_symbol}]}}")
1018
+ else:
1019
+ branch_chains.append(f"{branch_chain_name}{{{branch_symbol}}}")
1020
+
1021
+ # Find which backbone node this branch connects to
1022
+ # Look for links connecting this branch to the main backbone
1023
+ for link in graph.links:
1024
+ backbone_node_id = None
1025
+ if link.from_node_id == branch_node_id and link.to_node_id in ordered_node_ids:
1026
+ backbone_node_id = link.to_node_id
1027
+ elif link.to_node_id == branch_node_id and link.from_node_id in ordered_node_ids:
1028
+ backbone_node_id = link.from_node_id
1029
+
1030
+ if backbone_node_id is not None:
1031
+ # Find position of backbone node (1-indexed)
1032
+ backbone_pos = next((i + 1 for i, n in enumerate(ordered_nodes) if n.id == backbone_node_id), None)
1033
+ if backbone_pos:
1034
+ # Determine which R-group the branch uses
1035
+ # If branch has R1, connect to R1; if only R2, connect to R2
1036
+ branch_r_group = "R1"
1037
+ if branch_node.monomer:
1038
+ if 'R1' in branch_node.monomer.r_groups:
1039
+ branch_r_group = "R1"
1040
+ elif 'R2' in branch_node.monomer.r_groups:
1041
+ branch_r_group = "R2"
1042
+
1043
+ # Connection: backbone position R3 (side chain) to branch position 1 R-group
1044
+ connections.append(f"PEPTIDE1,{branch_chain_name},{backbone_pos}:R3-1:{branch_r_group}")
1045
+ break
1046
+
1047
+ # Generate final HELM notation
1048
+ all_chains = [f"PEPTIDE1{{{sequence}}}"] + branch_chains
1049
+ helm_chains = "|".join(all_chains)
1050
+
1051
+ if connections:
1052
+ connection_str = "|".join(connections)
1053
+ helm = f"{helm_chains}${connection_str}$$$V2.0"
737
1054
  else:
738
- helm = f"PEPTIDE1{{{sequence}}}$$$$"
1055
+ helm = f"{helm_chains}$$$$V2.0"
739
1056
 
740
1057
  return helm
741
1058
 
@@ -768,10 +1085,34 @@ from collections import defaultdict
768
1085
  from itertools import combinations
769
1086
  import json
770
1087
  import os
1088
+ import re
771
1089
 
772
1090
  # Suppress RDKit warnings
773
1091
  RDLogger.DisableLog('rdApp.warning')
774
1092
 
1093
# '@' / '@@' chirality tags (they only occur inside bracket atoms)
_CHIRALITY_TAG_RE = re.compile(r'@+')

# Bracket atom holding nothing but an organic-subset element and at most one H,
# e.g. [C], [CH], [N], [Cl]. Only these elements may be written without
# brackets in SMILES; two-letter symbols are listed first so 'Cl'/'Br' are not
# read as 'C'/'B'.
_PLAIN_ORGANIC_ATOM_RE = re.compile(r'\[(Cl|Br|B|C|N|O|P|S|F|I)H?\]')


def remove_stereochemistry_from_smiles(smiles: str) -> str:
    """
    Remove stereochemistry markers from SMILES string.
    Converts [C@@H], [C@H] to C, etc.

    This is used for matching when input molecules don't have stereochemistry defined.

    Only atoms of the SMILES organic subset (B, C, N, O, P, S, F, Cl, Br, I)
    are un-bracketed. Bracket atoms of any other element - [H], [SeH], [Na] -
    and bracket atoms carrying charge, isotope, hydrogen count > 1 or atom map
    are left bracketed, because they are not valid SMILES without brackets.

    Args:
        smiles: SMILES string; empty or None is returned unchanged

    Returns:
        The SMILES string without '@' chirality tags
    """
    if not smiles:
        return smiles

    # Remove @ symbols (stereochemistry markers)
    smiles_no_stereo = _CHIRALITY_TAG_RE.sub('', smiles)

    # [C@@H] is now [CH] and [C@] is now [C]; drop the brackets that are no
    # longer needed. Aromatic atoms such as [nH] are lowercase and untouched.
    return _PLAIN_ORGANIC_ATOM_RE.sub(r'\1', smiles_no_stereo)
1115
+
775
1116
  class MonomerData:
776
1117
  def __init__(self):
777
1118
  self.symbol = ""
@@ -1009,11 +1350,64 @@ class MonomerLibrary:
1009
1350
  # Generate SMILES with these R-groups removed (lazy, cached)
1010
1351
  candidate_smiles = monomer.get_capped_smiles_for_removed_rgroups(removed_set)
1011
1352
 
1012
- # Check if it matches the fragment
1353
+ # Check if it matches the fragment (exact match only)
1013
1354
  if candidate_smiles == fragment_smiles:
1014
1355
  return monomer
1015
1356
 
1016
1357
  return None
1358
+
1359
def find_monomer_by_fragment_smiles_no_stereo(self, fragment_smiles: str, num_connections: int):
    """
    Find monomer by matching fragment SMILES WITHOUT stereochemistry.
    Used only in recovery for handling poor quality input data.

    For every monomer, each way of removing ``num_connections`` R-groups is
    tried. A candidate matches when its stereo-stripped SMILES equals the
    stereo-stripped fragment SMILES, or - failing that - when the two parsed
    molecules have the same atom count and are substructures of each other.

    Args:
        fragment_smiles: Canonical SMILES of the fragment
        num_connections: Number of connections this fragment has in the graph

    Returns:
        MonomerData if match found, None otherwise (also for an empty or
        unparsable fragment SMILES)
    """
    # An empty/None fragment cannot identify a monomer; bail out before it
    # reaches the SMILES parser or compares equal to an empty candidate.
    if not fragment_smiles:
        return None

    # Parse fragment molecule once (without stereochemistry)
    fragment_no_stereo_smiles = remove_stereochemistry_from_smiles(fragment_smiles)
    fragment_mol = Chem.MolFromSmiles(fragment_no_stereo_smiles)
    if not fragment_mol:
        return None
    fragment_atom_count = fragment_mol.GetNumAtoms()

    for monomer in self.monomers.values():
        # Skip if monomer doesn't have enough R-groups
        if monomer.r_group_count < num_connections:
            continue

        r_group_labels = list(monomer.r_groups.keys())

        # For each combination of R-groups that could have been removed
        for removed_combo in combinations(r_group_labels, num_connections):
            # Generate SMILES with these R-groups removed (lazy, cached)
            candidate_smiles = monomer.get_capped_smiles_for_removed_rgroups(frozenset(removed_combo))

            # NOTE(review): presumably the helper can yield an empty value when
            # capping fails; skip it rather than hand it to the parser - confirm.
            if not candidate_smiles:
                continue

            # Try string comparison first (fast path)
            candidate_no_stereo = remove_stereochemistry_from_smiles(candidate_smiles)
            if candidate_no_stereo == fragment_no_stereo_smiles:
                return monomer

            # Slower fallback for molecules whose SMILES strings differ:
            # compare the molecular graphs. The cheap atom-count test runs
            # first so substructure searches are only done on plausible pairs.
            candidate_mol = Chem.MolFromSmiles(candidate_no_stereo)
            if not candidate_mol or candidate_mol.GetNumAtoms() != fragment_atom_count:
                continue

            # Both molecules are substructures of each other = they're the same
            if fragment_mol.HasSubstructMatch(candidate_mol) and candidate_mol.HasSubstructMatch(fragment_mol):
                return monomer

    return None
1017
1411
 
1018
1412
def find_monomer_by_symbol(self, symbol: str):
    """Look up ``symbol`` in ``self.symbol_to_monomer``; return None when absent."""
    lookup = self.symbol_to_monomer
    return lookup.get(symbol)
@@ -1163,14 +1557,16 @@ def preload_library():
1163
1557
  return processor is not None
1164
1558
 
1165
1559
 
1166
- def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
1560
+ def convert_molecules_batch(molecules: list, library_json: str = None, input_type: str = "auto") -> list:
1167
1561
  """
1168
- Convert a batch of molecules from molfile format to HELM notation.
1562
+ Convert a batch of molecules to HELM notation.
1169
1563
 
1170
1564
  Args:
1171
- molfiles: List of molfile strings
1565
+ molecules: List of molecule strings (molfiles or SMILES)
1172
1566
  library_json: Optional monomer library as JSON string.
1173
1567
  If None, uses default cached library from HELMCoreLibrary.json
1568
+ input_type: Type of input molecules - "molfile", "smiles", or "auto" (default).
1569
+ "auto" will attempt to detect the format automatically.
1174
1570
 
1175
1571
  Returns:
1176
1572
  List of tuples: (success: bool, helm_notation: str)
@@ -1184,13 +1580,13 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
1184
1580
  print("Initializing monomer library and processors...")
1185
1581
  if not preload_library():
1186
1582
  print("ERROR: Failed to load monomer library")
1187
- return [(False, "Library initialization failed") for _ in molfiles]
1583
+ return [(False, "Library initialization failed") for _ in molecules]
1188
1584
  print()
1189
1585
 
1190
1586
  # Use shared processor instances
1191
1587
  processor, matcher, helm_generator = _get_processors()
1192
1588
  if not processor:
1193
- return [(False, "") for _ in molfiles]
1589
+ return [(False, "") for _ in molecules]
1194
1590
  else:
1195
1591
  # Load custom library from provided JSON string (no caching)
1196
1592
  try:
@@ -1218,7 +1614,7 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
1218
1614
 
1219
1615
  if not library.monomers:
1220
1616
  print("ERROR: No monomers loaded from custom library")
1221
- return [(False, "Library loading failed") for _ in molfiles]
1617
+ return [(False, "Library loading failed") for _ in molecules]
1222
1618
 
1223
1619
  print(f"Custom library loaded: {len(library.monomers)} monomers")
1224
1620
 
@@ -1227,11 +1623,46 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
1227
1623
  matcher = MonomerMatcher(library)
1228
1624
  helm_generator = HELMGenerator()
1229
1625
 
1626
+ # Helper function to detect molecule format
1627
def _is_molfile(mol_string: str) -> bool:
    """
    Heuristically decide whether a string is a molfile block rather than SMILES.

    A string is treated as a molfile when it spans at least four lines
    (three header lines plus the counts line) and either contains a
    'V2000' / 'V3000' marker or has a fourth line whose first six
    characters hold only digits and spaces (the atom/bond counts).

    Args:
        mol_string: Candidate molecule text.

    Returns:
        True if the text looks like a molfile, False otherwise
        (including empty or non-string input).
    """
    if not isinstance(mol_string, str) or not mol_string:
        return False

    # Trim trailing whitespace only. The header lines of a molfile may be
    # blank, so stripping the front of the text would shift the counts line
    # away from index 3 and could drop small molfiles below four lines.
    lines = mol_string.rstrip().splitlines()
    if len(lines) < 4:
        return False

    # Explicit version markers are the strongest signal.
    if 'V2000' in mol_string or 'V3000' in mol_string:
        return True

    # Fallback for marker-less files: the counts line starts with the atom
    # and bond counts packed into the first six characters.
    counts_prefix = lines[3][:6]
    return len(counts_prefix) == 6 and counts_prefix.replace(' ', '').isdigit()
1644
+
1230
1645
  results = []
1231
1646
 
1232
- for i in range(len(molfiles)):
1233
- molfile = molfiles[i]
1234
- mol = Chem.MolFromMolBlock(molfile)
1647
+ for i in range(len(molecules)):
1648
+ mol_string = molecules[i]
1649
+
1650
+ # Determine input type and parse molecule
1651
+ if input_type == "auto":
1652
+ # Auto-detect format
1653
+ if _is_molfile(mol_string):
1654
+ mol = Chem.MolFromMolBlock(mol_string)
1655
+ else:
1656
+ # Assume SMILES if not molfile
1657
+ mol = Chem.MolFromSmiles(mol_string)
1658
+ elif input_type == "molfile":
1659
+ mol = Chem.MolFromMolBlock(mol_string)
1660
+ elif input_type == "smiles":
1661
+ mol = Chem.MolFromSmiles(mol_string)
1662
+ else:
1663
+ results.append((False, f"Invalid input_type: {input_type}"))
1664
+ continue
1665
+
1235
1666
  if not mol:
1236
1667
  results.append((False, ""))
1237
1668
  continue
@@ -1265,6 +1696,12 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
1265
1696
  if not had_changes:
1266
1697
  break
1267
1698
 
1699
+ # After regular recovery, try stereo-agnostic matching for remaining unmatched fragments
1700
+ # This handles poor quality data with missing stereochemistry
1701
+ stereo_matched = processor.recover_unmatched_with_stereo_agnostic(graph, matcher)
1702
+ if stereo_matched > 0:
1703
+ print(f"DEBUG: Stereo-agnostic recovery matched {stereo_matched} additional fragments")
1704
+
1268
1705
  if len(graph.nodes) > 0:
1269
1706
  helm_notation = helm_generator.generate_helm_from_graph(graph)
1270
1707
  results.append((True, helm_notation))
@@ -1275,5 +1712,35 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
1275
1712
 
1276
1713
  return results
1277
1714
 
1715
+
1716
def convert_molfiles_to_helm(molfiles: list, library_json: str = None) -> list:
    """
    Translate a list of molfile blocks into HELM notation.

    Shortcut that forwards to convert_molecules_batch with the input format
    pinned to "molfile", so every entry is parsed as a molfile.

    Args:
        molfiles: Molfile strings to convert, one molecule per entry.
        library_json: Optional monomer library as a JSON string; forwarded
            unchanged to convert_molecules_batch.

    Returns:
        The list produced by convert_molecules_batch:
        tuples of (success: bool, helm_notation: str).
    """
    helm_results = convert_molecules_batch(
        molfiles,
        library_json=library_json,
        input_type="molfile",
    )
    return helm_results
1729
+
1730
+
1731
def convert_smiles_to_helm(smiles_list: list, library_json: str = None) -> list:
    """
    Translate a list of SMILES strings into HELM notation.

    Shortcut that forwards to convert_molecules_batch with the input format
    pinned to "smiles", so every entry is parsed as SMILES.

    Args:
        smiles_list: SMILES strings to convert, one molecule per entry.
        library_json: Optional monomer library as a JSON string; forwarded
            unchanged to convert_molecules_batch.

    Returns:
        The list produced by convert_molecules_batch:
        tuples of (success: bool, helm_notation: str).
    """
    helm_results = convert_molecules_batch(
        smiles_list,
        library_json=library_json,
        input_type="smiles",
    )
    return helm_results
1744
+
1278
1745
  res_helm_list = convert_molecules_batch(molListToProcess, library_json=libraryJSON)
1279
1746
  result_helm = pd.DataFrame(map(lambda x: x[1], res_helm_list), columns=["regenerated sequences"])