npm - @datagrok/bio - Versions diffs - 2.25.3 → 2.25.5 - Mend

@datagrok/bio 2.25.3 → 2.25.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/dist/package-test.js +2 -2
package/dist/package-test.js.map +1 -1
package/dist/package.js +2 -2
package/dist/package.js.map +1 -1
package/package.json +1 -1
package/scripts/mol-to-helm.py +556 -89
package/src/package.g.ts +1 -1
package/src/package.ts +1 -1
package/src/utils/monomer-lib/monomer-manager/const.ts +40 -0
package/src/utils/monomer-lib/monomer-manager/monomer-manager.ts +13 -2
package/test-console-output-1.log +319 -321
package/test-record-1.mp4 +0 -0

package/scripts/mol-to-helm.py CHANGED Viewed

@@ -25,6 +25,7 @@ from typing import Optional
 from typing import Tuple
 import json
 import os
+import re
 # ============================================================================
 # Content from: fragment_graph.py
@@ -179,6 +180,37 @@ class FragmentGraph:
             for node in ordered_nodes
         ]
+    def is_cyclic(self) -> bool:
+        """
+        Detect if the peptide is cyclic.
+        A cyclic peptide has a peptide bond connecting the last residue back to near the beginning.
+        Handles cases where N-terminal caps (like 'ac' from Lys_Ac) create an extra fragment at position 0.
+        """
+        if len(self.nodes) < 3:
+            return False
+        # Get ordered nodes
+        ordered = self.get_ordered_nodes()
+        if len(ordered) < 3:
+            return False
+        # Get the last node ID
+        last_id = ordered[-1].id
+        # For a cyclic peptide, the last residue should connect back to one of the first few residues
+        # (usually first, but could be second if there's an N-terminal cap like 'ac')
+        # Check if last node has a peptide bond to any of the first 3 nodes
+        first_few_ids = [ordered[i].id for i in range(min(3, len(ordered)))]
+        for link in self.links:
+            if link.linkage_type == LinkageType.PEPTIDE:
+                # Check if link connects last node to one of the first few nodes
+                if (link.from_node_id == last_id and link.to_node_id in first_few_ids) or \
+                   (link.to_node_id == last_id and link.from_node_id in first_few_ids):
+                    return True
+        return False
     def __len__(self):
         return len(self.nodes)
@@ -221,8 +253,15 @@ class BondDetector:
     #GENERALIZATION ITEM: BOND PATTERNS SHOULD BE DERIVED FROM LIBRARY
     def __init__(self):
         # True peptide bond: C and N both in backbone (each bonded to carbons)
-        # Alpha carbons can be sp3 (X4) or sp2 (X3) for dehydroamino acids
-        self.peptide_bond = Chem.MolFromSmarts('[C;X3,X4]-[C;X3](=[O;X1])-[N;X3]-[C;X3,X4]')
+        # First carbon can be aliphatic or aromatic (for amino acids like NMe2Abz)
+        # Carbonyl carbon is sp2 (X3)
+        # Exclude if carbonyl is in a small ring (r5 or r6) to avoid cleaving lactams like Pyr
+        # !r5 = not in 5-membered ring, !r6 = not in 6-membered ring
+        # This preserves lactams but allows large macrocycles and proline (C=O outside ring)
+        # Nitrogen can be X2 (proline, imino) or X3 (standard amino, N-methyl)
+        # N-C bond can be single (-) or double (=) for imine bonds in dehydro amino acids
+        # Alpha carbon after N can be sp3 (X4) or sp2 (X3) for dehydroamino acids, or aromatic (#6 includes both)
+        self.peptide_bond = Chem.MolFromSmarts('[#6]-[C;X3;!r5;!r6](=[O;X1])-[N;X2,X3]~[#6;X3,X4]')
         # True disulfide bond: S-S where each S is bonded to carbon (cysteine residues)
         self.disulfide_bond = Chem.MolFromSmarts('[C;X4]-[S;X2]-[S;X2]-[C;X4]')
         # Primary amine at N-terminus (can be NH2 or NH3+), alpha-C can be sp3 or sp2
@@ -265,7 +304,7 @@ class BondDetector:
             matches = mol.GetSubstructMatches(self.peptide_bond)
             for match in matches:
                 if len(match) >= 5:
-                    # Pattern: [C;X3,X4]-[C;X3](=[O;X1])-[N;X3]-[C;X3,X4]
+                    # Pattern: [C;X3,X4]-[C;X3](=[O;X1])-[N;X2,X3]~[C;X3,X4]
                     # match[0]=alpha-C (sp2 or sp3), match[1]=carbonyl-C, match[2]=O, match[3]=N, match[4]=next-alpha-C (sp2 or sp3)
                     c_atom = match[1]  # Carbonyl carbon
                     n_atom = match[3]  # Nitrogen
@@ -399,18 +438,16 @@ class FragmentProcessor:
             # Fragment the molecule
             fragmented_mol = Chem.FragmentOnBonds(mol, bond_indices, addDummies=True)
-            # Get fragments AND their atom mappings separately
-            fragments_tuple = Chem.GetMolFrags(
-                fragmented_mol,
-                asMols=True,
-                sanitizeFrags=True
-            )
-            fragments = list(fragments_tuple)
+            # Get fragments as molecules
+            fragments = Chem.GetMolFrags(fragmented_mol, asMols=True, sanitizeFrags=True)
+            # Get atom mappings separately (which original atoms are in which fragment)
+            atom_mappings = Chem.GetMolFrags(fragmented_mol, asMols=False, fragsMolAtomMapping=True)
             # Store bond cleavage info for recovery - we'll use this to selectively re-fragment
             graph.cleaved_bond_indices = bond_indices
             graph.bond_info = bond_info
-            print(f"DEBUG: Created {len(fragments)} fragments, cleaved {len(bond_indices)} bonds")
+            graph.atom_mappings = atom_mappings
             # Create nodes for each fragment
             fragment_nodes = []
@@ -426,20 +463,43 @@ class FragmentProcessor:
                     graph.add_node(node)
                     fragment_nodes.append((i, node))
-            # Create links between fragments based on cleaved bonds
-            # For sequential peptide bonds
-            peptide_links = [b for b in bond_info if b[3] == LinkageType.PEPTIDE]
-            for i in range(len(fragment_nodes) - 1):
-                from_id, _ = fragment_nodes[i]
-                to_id, _ = fragment_nodes[i + 1]
-                link = FragmentLink(from_id, to_id, LinkageType.PEPTIDE)
+            # Create links between fragments based on the actual cleaved bonds
+            # Build mapping: original atom index → (fragment_idx, new_atom_idx_in_fragment)
+            atom_to_fragment_and_idx = {}
+            for frag_idx, original_atom_indices in enumerate(atom_mappings):
+                for new_idx_in_frag, original_atom_idx in enumerate(original_atom_indices):
+                    atom_to_fragment_and_idx[original_atom_idx] = (frag_idx, new_idx_in_frag)
+            # For each cleaved bond, determine which fragments it connects
+            link_count = 0
+            for bond_idx, atom1_orig, atom2_orig, linkage_type in bond_info:
+                # Find which fragments contain these atoms and their new indices
+                frag1_info = atom_to_fragment_and_idx.get(atom1_orig)
+                frag2_info = atom_to_fragment_and_idx.get(atom2_orig)
+                if frag1_info is None or frag2_info is None:
+                    print(f"DEBUG: Skipping bond atoms {atom1_orig}-{atom2_orig}: not found in fragments")
+                    continue
+                frag1, atom1_new = frag1_info
+                frag2, atom2_new = frag2_info
+                # Create link even if both atoms are in same fragment (internal bond like in Phe_4Sdihydroorotamido)
+                # This creates a "self-link" that will be used during recovery to reconstruct the monomer
+                link = FragmentLink(frag1, frag2, linkage_type,
+                                   from_atom_idx=atom1_new, to_atom_idx=atom2_new)
                 graph.add_link(link)
+                link_count += 1
+                if frag1 == frag2:
+                    print(f"DEBUG: Link {link_count}: {linkage_type.value.upper()} SELF-LINK frag{frag1} "
+                          f"orig_atoms({atom1_orig}<->{atom2_orig}) frag_atoms({atom1_new}<->{atom2_new})")
+                else:
+                    print(f"DEBUG: Link {link_count}: {linkage_type.value.upper()} frag{frag1}<->frag{frag2} "
+                          f"orig_atoms({atom1_orig}<->{atom2_orig}) frag_atoms({atom1_new}<->{atom2_new})")
-            # Add disulfide bridges (if any)
-            # TODO: Track which fragments contain the S atoms for proper linking
-            disulfide_links = [b for b in bond_info if b[3] == LinkageType.DISULFIDE]
-            # For now, disulfide bonds require more complex atom tracking
-            # This is a placeholder for future enhancement
+            print(f"DEBUG: Created {link_count} links total")
             return graph
@@ -472,6 +532,94 @@ class FragmentProcessor:
         except Exception:
             return None
+    def _reconstruct_fragment_with_links(self, node_ids: list, graph: FragmentGraph,
+                                         links_to_exclude: list) -> Chem.Mol:
+        """
+        Reconstruct a molecule by combining multiple fragment nodes, using link information.
+        Args:
+            node_ids: List of node IDs to merge
+            graph: The fragment graph
+            links_to_exclude: List of FragmentLink objects connecting the nodes to merge
+        Returns:
+            Combined RDKit molecule, or None if reconstruction fails
+        """
+        if not node_ids or not hasattr(graph, 'original_mol'):
+            return None
+        if not hasattr(graph, 'cleaved_bond_indices') or not hasattr(graph, 'bond_info'):
+            return None
+        try:
+            # Find which bond indices correspond to the links we want to exclude
+            bonds_to_exclude_indices = []
+            for link in links_to_exclude:
+                # Find the bond_info entry that matches this link's original atoms
+                # We need to find which bond connected these fragments
+                for bond_list_idx, (bond_idx, atom1, atom2, linkage_type) in enumerate(graph.bond_info):
+                    # Check if this bond connects the fragments in this link
+                    if hasattr(graph, 'atom_mappings'):
+                        # Find which fragments contain these atoms
+                        frag1 = None
+                        frag2 = None
+                        for frag_idx, atom_indices in enumerate(graph.atom_mappings):
+                            if atom1 in atom_indices:
+                                frag1 = frag_idx
+                            if atom2 in atom_indices:
+                                frag2 = frag_idx
+                        # If this bond connects the two fragments in the link, exclude it
+                        if (frag1 == link.from_node_id and frag2 == link.to_node_id) or \
+                           (frag1 == link.to_node_id and frag2 == link.from_node_id):
+                            bonds_to_exclude_indices.append(bond_list_idx)
+                            print(f"DEBUG: Excluding {linkage_type.value} bond at index {bond_list_idx} (atoms {atom1}<->{atom2})")
+                            break
+            # Create new bond list excluding the bonds we want to keep
+            new_bond_indices = [
+                bond_idx for i, bond_idx in enumerate(graph.cleaved_bond_indices)
+                if i not in bonds_to_exclude_indices
+            ]
+            print(f"DEBUG reconstruct: Original had {len(graph.cleaved_bond_indices)} cleaved bonds, "
+                  f"excluding {len(bonds_to_exclude_indices)} bonds, new list has {len(new_bond_indices)} bonds")
+            # Re-fragment with the modified bond list
+            if not new_bond_indices:
+                # No bonds to cleave - return whole molecule
+                return graph.original_mol
+            fragmented_mol = Chem.FragmentOnBonds(graph.original_mol, new_bond_indices, addDummies=True)
+            fragments = Chem.GetMolFrags(fragmented_mol, asMols=True, sanitizeFrags=True)
+            new_atom_mappings = Chem.GetMolFrags(fragmented_mol, asMols=False, fragsMolAtomMapping=True)
+            # Find which new fragment contains atoms from our target nodes
+            # Look for the fragment that contains atoms from the first node we want to merge
+            sorted_nodes = sorted(node_ids)
+            first_node_atoms = set(graph.atom_mappings[sorted_nodes[0]])
+            target_fragment_idx = None
+            for new_frag_idx, new_atoms in enumerate(new_atom_mappings):
+                # Check if this new fragment contains any atoms from our first target node
+                if first_node_atoms & set(new_atoms):
+                    target_fragment_idx = new_frag_idx
+                    break
+            print(f"DEBUG reconstruct: Got {len(fragments)} fragments after re-fragmentation, "
+                  f"target_fragment_idx={target_fragment_idx}")
+            if target_fragment_idx is not None and target_fragment_idx < len(fragments):
+                clean_frag = self._clean_fragment(fragments[target_fragment_idx])
+                return clean_frag if clean_frag else fragments[target_fragment_idx]
+            return None
+        except Exception as e:
+            print(f"DEBUG reconstruct: Exception: {e}")
+            return None
     def _reconstruct_fragment(self, node_ids: list, graph: FragmentGraph) -> Chem.Mol:
         """
         Reconstruct a molecule by combining multiple fragment nodes.
@@ -581,7 +729,7 @@ class FragmentProcessor:
     def recover_unmatched_fragments(self, graph: FragmentGraph, matcher) -> bool:
         """
-        Try to recover unmatched fragments by merging with neighbors.
+        Try to recover unmatched fragments by merging with neighbors based on graph links.
         Returns True if any merges were successful.
         """
         # Identify unmatched nodes
@@ -593,8 +741,6 @@ class FragmentProcessor:
         if not unmatched_nodes:
             return False
-        print(f"DEBUG: Found {len(unmatched_nodes)} unmatched nodes: {unmatched_nodes}")
         had_changes = False
         # Try to recover each unmatched node
@@ -603,33 +749,29 @@ class FragmentProcessor:
             if node_id not in graph.nodes:
                 continue
-            # Get neighbors
+            # Get neighbors from graph links (returns list of (neighbor_id, linkage_type))
             neighbors = graph.get_neighbors(node_id)
-            neighbor_ids = [n[0] for n in neighbors]
-            if not neighbor_ids:
+            if not neighbors:
                 continue
-            # Separate left and right neighbors (assuming sequential order)
-            left_neighbors = [n for n in neighbor_ids if n < node_id]
-            right_neighbors = [n for n in neighbor_ids if n > node_id]
-            # Try merge combinations: left only, right only, both
-            merge_attempts = []
-            if left_neighbors:
-                merge_attempts.append([left_neighbors[0], node_id])
-            if right_neighbors:
-                merge_attempts.append([node_id, right_neighbors[0]])
-            if left_neighbors and right_neighbors:
-                merge_attempts.append([left_neighbors[0], node_id, right_neighbors[0]])
-            # Try each merge combination
-            for nodes_to_merge in merge_attempts:
-                print(f"DEBUG: Trying to merge nodes {nodes_to_merge}")
+            # Try merging with each individual neighbor first
+            for neighbor_id, linkage_type in neighbors:
+                if neighbor_id not in graph.nodes:
+                    continue
+                nodes_to_merge = sorted([node_id, neighbor_id])
+                # Find the links between nodes we're merging
+                links_to_exclude = []
+                for link in graph.links:
+                    from_in = link.from_node_id in nodes_to_merge
+                    to_in = link.to_node_id in nodes_to_merge
+                    if from_in and to_in:
+                        links_to_exclude.append(link)
                 # Reconstruct combined molecule
-                combined_mol = self._reconstruct_fragment(nodes_to_merge, graph)
+                combined_mol = self._reconstruct_fragment_with_links(nodes_to_merge, graph, links_to_exclude)
                 if not combined_mol:
                     print(f"DEBUG: Failed to reconstruct molecule for {nodes_to_merge}")
                     continue
@@ -637,7 +779,7 @@ class FragmentProcessor:
                 print(f"DEBUG: Reconstructed mol with {combined_mol.GetNumAtoms()} atoms")
                 # Count expected connections for this merged fragment
-                # Get all unique neighbors of the merged set
+                # Get all unique neighbors of the merged set (excluding internal connections)
                 all_neighbors = set()
                 for nid in nodes_to_merge:
                     if nid in graph.nodes:
@@ -647,15 +789,13 @@ class FragmentProcessor:
                                 all_neighbors.add(neighbor_id)
                 num_connections = len(all_neighbors)
-                print(f"DEBUG: Expecting {num_connections} connections")
-                # Try to match the combined fragment
+                # Try to match the combined fragment (exact match only)
                 monomer = matcher.find_exact_match(combined_mol, num_connections)
                 if monomer:
-                    print(f"DEBUG: SUCCESS! Matched to {monomer.symbol}")
                     # Success! Create new merged node
-                    new_node_id = min(nodes_to_merge)  # Use lowest ID
+                    new_node_id = min(nodes_to_merge)
                     new_node = FragmentNode(new_node_id, combined_mol)
                     new_node.monomer = monomer
@@ -663,11 +803,70 @@ class FragmentProcessor:
                     self._merge_nodes_in_graph(graph, nodes_to_merge, new_node)
                     had_changes = True
-                    break  # Stop trying other combinations for this node
-                else:
-                    print(f"DEBUG: No match found for merge {nodes_to_merge}")
+                    break  # Stop trying other neighbors for this node
+            if had_changes:
+                break  # Restart from beginning after a successful merge
         return had_changes
+    def recover_unmatched_with_stereo_agnostic(self, graph: FragmentGraph, matcher) -> int:
+        """
+        Separate recovery procedure: Try to match remaining unmatched fragments
+        using stereochemistry-agnostic comparison.
+        This handles poor quality input data where stereochemistry is not assigned.
+        Only called after regular recovery attempts have finished.
+        Args:
+            graph: FragmentGraph with some unmatched nodes
+            matcher: MonomerMatcher instance
+        Returns:
+            Number of fragments that were successfully matched
+        """
+        from rdkit import Chem
+        # Find all unmatched nodes (nodes with mock/unknown monomers)
+        unmatched_nodes = []
+        for node_id, node in graph.nodes.items():
+            if node.monomer and (node.monomer.symbol.startswith('X') or
+                                 node.monomer.name.startswith('Unknown')):
+                unmatched_nodes.append(node_id)
+        if not unmatched_nodes:
+            return 0
+        print(f"DEBUG: Attempting stereo-agnostic recovery for {len(unmatched_nodes)} unmatched nodes")
+        matched_count = 0
+        for node_id in unmatched_nodes:
+            if node_id not in graph.nodes:
+                continue
+            node = graph.nodes[node_id]
+            # Get fragment SMILES
+            fragment_smiles = Chem.MolToSmiles(node.mol, canonical=True)
+            # Count connections
+            neighbors = graph.get_neighbors(node_id)
+            num_connections = len(neighbors)
+            # Try stereo-agnostic matching
+            monomer = matcher.monomer_library.find_monomer_by_fragment_smiles_no_stereo(
+                fragment_smiles, num_connections
+            )
+            if monomer:
+                print(f"DEBUG: Stereo-agnostic match for node {node_id}: {monomer.symbol}")
+                node.monomer = monomer
+                matched_count += 1
+            else:
+                print(f"DEBUG: No stereo-agnostic match for node {node_id}")
+        return matched_count
 # ============================================================================
 # Content from: helm_generator.py
@@ -706,36 +905,154 @@ class HELMGenerator:
         if len(graph) == 0:
             return ""
-        # Get ordered sequence of monomers
-        ordered_nodes = graph.get_ordered_nodes()
+        # Get ordered sequence of monomers (backbone)
+        ordered_nodes_raw = graph.get_ordered_nodes()
+        # Check if cyclic
+        is_cyclic = graph.is_cyclic()
+        # Filter backbone: nodes that are part of R1-R2 chain are backbone
+        # Nodes connected only via R3 (side chain) are branches
+        #
+        # Logic: A node at position 1 is a branch if:
+        # - It has no R1 (N-terminus) - meaning it's a cap like 'ac' that only has R2
+        # - It only has 1 peptide connection (to the real backbone)
+        #
+        # Example: [ac].K in cyclic peptide
+        # - 'ac' has only R2, no R1 → it's a cap
+        # - 'ac' connects to K's R3 (side chain), not K's R1 (backbone)
+        # - So 'ac' should be PEPTIDE2, not part of PEPTIDE1
+        backbone_nodes = []
+        for i, node in enumerate(ordered_nodes_raw):
+            is_branch = False
+            if i == 0 and len(ordered_nodes_raw) > 1 and node.monomer:
+                # Check if this first node lacks R1 (N-terminus)
+                # If it has no R1, it's a cap that should be a branch
+                has_r1 = 'R1' in node.monomer.r_groups
+                if not has_r1:
+                    # This is an N-terminal cap (like 'ac') at position 1
+                    # It should be a branch, not part of the main backbone
+                    is_branch = True
+            if not is_branch:
+                backbone_nodes.append(node)
+        ordered_nodes = backbone_nodes
         sequence_symbols = [node.monomer.symbol if node.monomer else "X" for node in ordered_nodes]
-        # Generate linear peptide notation
-        sequence = ".".join(sequence_symbols)
+        # Detect branch nodes (nodes not in backbone)
+        ordered_node_ids = {node.id for node in ordered_nodes}
+        branch_nodes = [(node_id, node) for node_id, node in graph.nodes.items()
+                       if node_id not in ordered_node_ids]
+        # Generate sequence notation
+        if is_cyclic:
+            # Cyclic: wrap multi-letter monomers in brackets, single-letter ones stay as-is
+            formatted_symbols = [f"[{symbol}]" if len(symbol) > 1 else symbol for symbol in sequence_symbols]
+            sequence = ".".join(formatted_symbols)
+        else:
+            # Linear: no brackets
+            sequence = ".".join(sequence_symbols)
-        # Check for disulfide bridges or other non-peptide bonds
-        has_special_bonds = any(
-            link.linkage_type != LinkageType.PEPTIDE
-            for link in graph.links
-        )
+        # Collect non-sequential connections (disulfide bridges, cyclic bonds, etc.)
+        connections = []
-        if has_special_bonds:
-            # Add connection notation for disulfide bridges
-            connections = []
-            for link in graph.links:
-                if link.linkage_type == LinkageType.DISULFIDE:
-                    # Format: PEPTIDE1,PEPTIDE1,from_idx:R3-to_idx:R3
-                    connections.append(
-                        f"PEPTIDE1,PEPTIDE1,{link.from_node_id + 1}:R3-{link.to_node_id + 1}:R3"
-                    )
+        if is_cyclic:
+            # Find the actual cyclic peptide bond (last residue connects back to beginning)
+            # This handles cases where N-terminal caps (like 'ac') are at position 1
+            last_id = ordered_nodes[-1].id
+            first_few_ids = [ordered_nodes[i].id for i in range(min(3, len(ordered_nodes)))]
-            if connections:
-                connection_str = "|".join(connections)
-                helm = f"PEPTIDE1{{{sequence}}}${connection_str}$$$V2.0"
-            else:
-                helm = f"PEPTIDE1{{{sequence}}}$$$$"
+            for link in graph.links:
+                if link.linkage_type == LinkageType.PEPTIDE:
+                    # Check if this is the cyclic bond (last to one of first few)
+                    is_cyclic_bond = False
+                    from_id, to_id = None, None
+                    if link.from_node_id == last_id and link.to_node_id in first_few_ids:
+                        from_id, to_id = link.from_node_id, link.to_node_id
+                        is_cyclic_bond = True
+                    elif link.to_node_id == last_id and link.from_node_id in first_few_ids:
+                        from_id, to_id = link.to_node_id, link.from_node_id
+                        is_cyclic_bond = True
+                    if is_cyclic_bond:
+                        # Find positions (1-indexed)
+                        from_pos = next((i + 1 for i, n in enumerate(ordered_nodes) if n.id == from_id), None)
+                        to_pos = next((i + 1 for i, n in enumerate(ordered_nodes) if n.id == to_id), None)
+                        if from_pos and to_pos:
+                            connections.append(f"PEPTIDE1,PEPTIDE1,{from_pos}:R2-{to_pos}:R1")
+                            break
+        # Add disulfide bridges
+        for link in graph.links:
+            if link.linkage_type == LinkageType.DISULFIDE:
+                # Get positions in ordered sequence (1-indexed)
+                from_pos = None
+                to_pos = None
+                for i, node in enumerate(ordered_nodes):
+                    if node.id == link.from_node_id:
+                        from_pos = i + 1
+                    if node.id == link.to_node_id:
+                        to_pos = i + 1
+                if from_pos and to_pos:
+                    # Format: PEPTIDE1,PEPTIDE1,from_pos:R3-to_pos:R3
+                    connections.append(f"PEPTIDE1,PEPTIDE1,{from_pos}:R3-{to_pos}:R3")
+        # Handle branch nodes (side chain modifications)
+        # Create separate PEPTIDE chains for each branch
+        branch_chains = []
+        if branch_nodes:
+            for branch_idx, (branch_node_id, branch_node) in enumerate(branch_nodes, start=2):
+                branch_chain_name = f"PEPTIDE{branch_idx}"
+                branch_symbol = branch_node.monomer.symbol if branch_node.monomer else f"X{branch_node_id}"
+                # Format branch chain (single monomer, so no dots needed)
+                if is_cyclic and len(branch_symbol) > 1:
+                    branch_chains.append(f"{branch_chain_name}{{[{branch_symbol}]}}")
+                else:
+                    branch_chains.append(f"{branch_chain_name}{{{branch_symbol}}}")
+                # Find which backbone node this branch connects to
+                # Look for links connecting this branch to the main backbone
+                for link in graph.links:
+                    backbone_node_id = None
+                    if link.from_node_id == branch_node_id and link.to_node_id in ordered_node_ids:
+                        backbone_node_id = link.to_node_id
+                    elif link.to_node_id == branch_node_id and link.from_node_id in ordered_node_ids:
+                        backbone_node_id = link.from_node_id
+                    if backbone_node_id is not None:
+                        # Find position of backbone node (1-indexed)
+                        backbone_pos = next((i + 1 for i, n in enumerate(ordered_nodes) if n.id == backbone_node_id), None)
+                        if backbone_pos:
+                            # Determine which R-group the branch uses
+                            # If branch has R1, connect to R1; if only R2, connect to R2
+                            branch_r_group = "R1"
+                            if branch_node.monomer:
+                                if 'R1' in branch_node.monomer.r_groups:
+                                    branch_r_group = "R1"
+                                elif 'R2' in branch_node.monomer.r_groups:
+                                    branch_r_group = "R2"
+                            # Connection: backbone position R3 (side chain) to branch position 1 R-group
+                            connections.append(f"PEPTIDE1,{branch_chain_name},{backbone_pos}:R3-1:{branch_r_group}")
+                            break
+        # Generate final HELM notation
+        all_chains = [f"PEPTIDE1{{{sequence}}}"] + branch_chains
+        helm_chains = "|".join(all_chains)
+        if connections:
+            connection_str = "|".join(connections)
+            helm = f"{helm_chains}${connection_str}$$$V2.0"
         else:
-            helm = f"PEPTIDE1{{{sequence}}}$$$$"
+            helm = f"{helm_chains}$$$$V2.0"
         return helm
@@ -768,10 +1085,34 @@ from collections import defaultdict
 from itertools import combinations
 import json
 import os
+import re
 # Suppress RDKit warnings
 RDLogger.DisableLog('rdApp.warning')
+def remove_stereochemistry_from_smiles(smiles: str) -> str:
+    """
+    Remove stereochemistry markers from SMILES string.
+    Converts [C@@H], [C@H] to C, etc.
+    This is used for matching when input molecules don't have stereochemistry defined.
+    """
+    if not smiles:
+        return smiles
+    # Remove @ symbols (stereochemistry markers)
+    # Pattern: [@]+ inside brackets
+    smiles_no_stereo = re.sub(r'(@+)', '', smiles)
+    # Also remove H when it's explicit in brackets like [C@@H] -> [C] -> C
+    # But we need to be careful not to remove H from [H] or CH3
+    # After removing @, we might have [CH] which should become C
+    smiles_no_stereo = re.sub(r'\[([A-Z][a-z]?)H\]', r'\1', smiles_no_stereo)
+    # Handle [C] -> C (single atoms in brackets with no other info)
+    smiles_no_stereo = re.sub(r'\[([A-Z][a-z]?)\]', r'\1', smiles_no_stereo)
+    return smiles_no_stereo
 class MonomerData:
     def __init__(self):
         self.symbol = ""
@@ -1009,11 +1350,64 @@ class MonomerLibrary:
                 # Generate SMILES with these R-groups removed (lazy, cached)
                 candidate_smiles = monomer.get_capped_smiles_for_removed_rgroups(removed_set)
-                # Check if it matches the fragment
+                # Check if it matches the fragment (exact match only)
                 if candidate_smiles == fragment_smiles:
                     return monomer
         return None
+    def find_monomer_by_fragment_smiles_no_stereo(self, fragment_smiles: str, num_connections: int):
+        """
+        Find monomer by matching fragment SMILES WITHOUT stereochemistry.
+        Used only in recovery for handling poor quality input data.
+        Uses molecular graph isomorphism to handle cases where RDKit generates
+        different canonical SMILES for the same molecule.
+        Args:
+            fragment_smiles: Canonical SMILES of the fragment
+            num_connections: Number of connections this fragment has in the graph
+        Returns:
+            MonomerData if match found, None otherwise
+        """
+        # Parse fragment molecule once (without stereochemistry)
+        fragment_no_stereo_smiles = remove_stereochemistry_from_smiles(fragment_smiles)
+        fragment_mol = Chem.MolFromSmiles(fragment_no_stereo_smiles)
+        if not fragment_mol:
+            return None
+        # Search through all monomers
+        for symbol, monomer in self.monomers.items():
+            # Skip if monomer doesn't have enough R-groups
+            if monomer.r_group_count < num_connections:
+                continue
+            # Generate all combinations of num_connections R-groups that could have been removed
+            r_group_labels = list(monomer.r_groups.keys())
+            # For each combination of R-groups that could have been removed
+            for removed_combo in combinations(r_group_labels, num_connections):
+                removed_set = frozenset(removed_combo)
+                # Generate SMILES with these R-groups removed (lazy, cached)
+                candidate_smiles = monomer.get_capped_smiles_for_removed_rgroups(removed_set)
+                # Try string comparison first (fast path)
+                candidate_no_stereo = remove_stereochemistry_from_smiles(candidate_smiles)
+                if candidate_no_stereo == fragment_no_stereo_smiles:
+                    return monomer
+                # If string comparison fails, try molecular graph isomorphism (slower but more robust)
+                # This handles cases where RDKit generates different canonical SMILES for same molecule
+                candidate_mol = Chem.MolFromSmiles(candidate_no_stereo)
+                if candidate_mol and fragment_mol.HasSubstructMatch(candidate_mol) and candidate_mol.HasSubstructMatch(fragment_mol):
+                    # Both molecules are substructures of each other = they're the same
+                    if fragment_mol.GetNumAtoms() == candidate_mol.GetNumAtoms():
+                        return monomer
+        return None
     def find_monomer_by_symbol(self, symbol: str):
         return self.symbol_to_monomer.get(symbol)
@@ -1163,14 +1557,16 @@ def preload_library():
     return processor is not None
-def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
+def convert_molecules_batch(molecules: list, library_json: str = None, input_type: str = "auto") -> list:
     """
-    Convert a batch of molecules from molfile format to HELM notation.
+    Convert a batch of molecules to HELM notation.
     Args:
-        molfiles: List of molfile strings
+        molecules: List of molecule strings (molfiles or SMILES)
         library_json: Optional monomer library as JSON string.
                      If None, uses default cached library from HELMCoreLibrary.json
+        input_type: Type of input molecules - "molfile", "smiles", or "auto" (default).
+                   "auto" will attempt to detect the format automatically.
     Returns:
         List of tuples: (success: bool, helm_notation: str)
@@ -1184,13 +1580,13 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
             print("Initializing monomer library and processors...")
             if not preload_library():
                 print("ERROR: Failed to load monomer library")
-                return [(False, "Library initialization failed") for _ in molfiles]
+                return [(False, "Library initialization failed") for _ in molecules]
             print()
         # Use shared processor instances
         processor, matcher, helm_generator = _get_processors()
         if not processor:
-            return [(False, "") for _ in molfiles]
+            return [(False, "") for _ in molecules]
     else:
         # Load custom library from provided JSON string (no caching)
         try:
@@ -1218,7 +1614,7 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
         if not library.monomers:
             print("ERROR: No monomers loaded from custom library")
-            return [(False, "Library loading failed") for _ in molfiles]
+            return [(False, "Library loading failed") for _ in molecules]
         print(f"Custom library loaded: {len(library.monomers)} monomers")
@@ -1227,11 +1623,46 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
         matcher = MonomerMatcher(library)
         helm_generator = HELMGenerator()
+    # Helper function to detect molecule format
+    def _is_molfile(mol_string: str) -> bool:
+        """Check if string is a molfile (starts with RDKit molfile markers or has multiple lines)"""
+        if not mol_string:
+            return False
+        lines = mol_string.strip().split('\n')
+        # Molfiles typically have multiple lines and specific format
+        if len(lines) > 3:
+            # Check for V2000 or V3000 molfile markers
+            if 'V2000' in mol_string or 'V3000' in mol_string:
+                return True
+            # Check for typical molfile structure (counts line format)
+            if len(lines) > 3:
+                counts_line = lines[3] if len(lines) > 3 else ""
+                # Molfile counts line has specific format with atom/bond counts
+                if len(counts_line) >= 6 and counts_line[:6].replace(' ', '').isdigit():
+                    return True
+        return False
     results = []
-    for i in range(len(molfiles)):
-        molfile = molfiles[i]
-        mol = Chem.MolFromMolBlock(molfile)
+    for i in range(len(molecules)):
+        mol_string = molecules[i]
+        # Determine input type and parse molecule
+        if input_type == "auto":
+            # Auto-detect format
+            if _is_molfile(mol_string):
+                mol = Chem.MolFromMolBlock(mol_string)
+            else:
+                # Assume SMILES if not molfile
+                mol = Chem.MolFromSmiles(mol_string)
+        elif input_type == "molfile":
+            mol = Chem.MolFromMolBlock(mol_string)
+        elif input_type == "smiles":
+            mol = Chem.MolFromSmiles(mol_string)
+        else:
+            results.append((False, f"Invalid input_type: {input_type}"))
+            continue
         if not mol:
             results.append((False, ""))
             continue
@@ -1265,6 +1696,12 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
                 if not had_changes:
                     break
+            # After regular recovery, try stereo-agnostic matching for remaining unmatched fragments
+            # This handles poor quality data with missing stereochemistry
+            stereo_matched = processor.recover_unmatched_with_stereo_agnostic(graph, matcher)
+            if stereo_matched > 0:
+                print(f"DEBUG: Stereo-agnostic recovery matched {stereo_matched} additional fragments")
             if len(graph.nodes) > 0:
                 helm_notation = helm_generator.generate_helm_from_graph(graph)
                 results.append((True, helm_notation))
@@ -1275,5 +1712,35 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
     return results
+def convert_molfiles_to_helm(molfiles: list, library_json: str = None) -> list:
+    """
+    Convert a batch of molfiles to HELM notation.
+    Convenience wrapper for convert_molecules_batch with input_type="molfile".
+    Args:
+        molfiles: List of molfile strings
+        library_json: Optional monomer library as JSON string
+    Returns:
+        List of tuples: (success: bool, helm_notation: str)
+    """
+    return convert_molecules_batch(molfiles, library_json=library_json, input_type="molfile")
+def convert_smiles_to_helm(smiles_list: list, library_json: str = None) -> list:
+    """
+    Convert a batch of SMILES to HELM notation.
+    Convenience wrapper for convert_molecules_batch with input_type="smiles".
+    Args:
+        smiles_list: List of SMILES strings
+        library_json: Optional monomer library as JSON string
+    Returns:
+        List of tuples: (success: bool, helm_notation: str)
+    """
+    return convert_molecules_batch(smiles_list, library_json=library_json, input_type="smiles")
 res_helm_list = convert_molecules_batch(molListToProcess, library_json=libraryJSON)
 result_helm = pd.DataFrame(map(lambda x: x[1], res_helm_list), columns=["regenerated sequences"])