@datagrok/bio 2.26.8 → 2.27.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@
3
3
  #description: Converts molecules to HELM notation based on monomer library
4
4
  #input: dataframe moleculesDataframe
5
5
  #input: column moleculesColumn {semType: Molecule}
6
- #input: string libraryJSON
6
+ #input: file libraryFile
7
7
  #output: dataframe result_helm {action:join(moleculesDataframe)} [Sequences, in HELM format]
8
8
  molListToProcess = moleculesDataframe[moleculesColumn].tolist()
9
9
  import pandas as pd
@@ -150,26 +150,35 @@ class FragmentGraph:
150
150
  return ordered
151
151
 
152
152
  def _traverse_from_node(self, node_id: int, visited: set, ordered: list):
153
- """Helper for depth-first traversal"""
153
+ """Helper for depth-first traversal with bidirectional link support"""
154
154
  if node_id in visited:
155
155
  return
156
-
156
+
157
157
  visited.add(node_id)
158
158
  ordered.append(self.nodes[node_id])
159
-
160
- # Get peptide bond neighbors first (to maintain chain order)
161
- peptide_neighbors = []
162
- other_neighbors = []
163
-
159
+
160
+ # Follow links in BOTH directions but prefer the canonical (from→to)
161
+ # direction. Link direction depends on bond detection order and is not
162
+ # guaranteed to match backbone direction (e.g. FC01 stapled peptides).
163
+ peptide_fwd = []
164
+ peptide_bwd = []
165
+ other_fwd = []
166
+ other_bwd = []
167
+
164
168
  for link in self.links:
165
169
  if link.from_node_id == node_id and link.to_node_id not in visited:
166
170
  if link.linkage_type == LinkageType.PEPTIDE:
167
- peptide_neighbors.append(link.to_node_id)
171
+ peptide_fwd.append(link.to_node_id)
168
172
  else:
169
- other_neighbors.append(link.to_node_id)
170
-
171
- # Visit peptide bonds first, then others
172
- for neighbor_id in peptide_neighbors + other_neighbors:
173
+ other_fwd.append(link.to_node_id)
174
+ elif link.to_node_id == node_id and link.from_node_id not in visited:
175
+ if link.linkage_type == LinkageType.PEPTIDE:
176
+ peptide_bwd.append(link.from_node_id)
177
+ else:
178
+ other_bwd.append(link.from_node_id)
179
+
180
+ # Forward first, backward as fallback
181
+ for neighbor_id in peptide_fwd + peptide_bwd + other_fwd + other_bwd:
173
182
  self._traverse_from_node(neighbor_id, visited, ordered)
174
183
 
175
184
  def get_fragment_sequence(self) -> List[str]:
@@ -194,23 +203,87 @@ class FragmentGraph:
194
203
  if len(ordered) < 3:
195
204
  return False
196
205
 
197
- # Get the last node ID
198
- last_id = ordered[-1].id
199
-
200
- # For a cyclic peptide, the last residue should connect back to one of the first few residues
201
- # (usually first, but could be second if there's an N-terminal cap like 'ac')
202
- # Check if last node has a peptide bond to any of the first 3 nodes
203
- first_few_ids = [ordered[i].id for i in range(min(3, len(ordered)))]
204
-
206
+ # Check if any of the last few residues connect back to any of the first few.
207
+ # Checking multiple positions on each end handles branch nodes (like 'ac')
208
+ # that the bidirectional traversal may place at the edges.
209
+ first_few_ids = set(ordered[i].id for i in range(min(3, len(ordered))))
210
+ last_few_ids = set(ordered[-i - 1].id for i in range(min(3, len(ordered))))
211
+
205
212
  for link in self.links:
206
213
  if link.linkage_type == LinkageType.PEPTIDE:
207
- # Check if link connects last node to one of the first few nodes
208
- if (link.from_node_id == last_id and link.to_node_id in first_few_ids) or \
209
- (link.to_node_id == last_id and link.from_node_id in first_few_ids):
214
+ if (link.from_node_id in last_few_ids and link.to_node_id in first_few_ids) or \
215
+ (link.to_node_id in last_few_ids and link.from_node_id in first_few_ids):
210
216
  return True
211
-
217
+
212
218
  return False
213
219
 
220
+ def find_all_cycles(self) -> List[List[int]]:
221
+ """
222
+ Find all cycles in the graph using DFS.
223
+ Returns list of cycles, where each cycle is a list of node IDs.
224
+ """
225
+ cycles = []
226
+ visited = set()
227
+ rec_stack = set()
228
+ parent = {}
229
+
230
+ def dfs(node_id: int, path: List[int]):
231
+ visited.add(node_id)
232
+ rec_stack.add(node_id)
233
+ path.append(node_id)
234
+
235
+ # Get peptide bond neighbors
236
+ neighbors = [n_id for n_id, link_type in self.get_neighbors(node_id)
237
+ if link_type == LinkageType.PEPTIDE]
238
+
239
+ for neighbor_id in neighbors:
240
+ if neighbor_id not in visited:
241
+ parent[neighbor_id] = node_id
242
+ dfs(neighbor_id, path[:])
243
+ elif neighbor_id in rec_stack and neighbor_id != parent.get(node_id):
244
+ # Found a cycle - extract it from path
245
+ cycle_start_idx = path.index(neighbor_id)
246
+ cycle = path[cycle_start_idx:] + [neighbor_id]
247
+ # Normalize cycle (start from smallest ID)
248
+ min_idx = cycle.index(min(cycle[:-1])) # Don't include duplicate last element
249
+ normalized = cycle[min_idx:-1] + cycle[:min_idx]
250
+ if normalized not in cycles:
251
+ cycles.append(normalized)
252
+
253
+ rec_stack.remove(node_id)
254
+
255
+ # Try starting DFS from each unvisited node
256
+ for node_id in self.nodes.keys():
257
+ if node_id not in visited:
258
+ parent[node_id] = None
259
+ dfs(node_id, [])
260
+
261
+ return cycles
262
+
263
+ def get_connected_components(self) -> List[List[int]]:
264
+ """
265
+ Find all connected components in the graph.
266
+ Returns list of components, where each component is a list of node IDs.
267
+ """
268
+ visited = set()
269
+ components = []
270
+
271
+ def dfs_component(node_id: int, component: List[int]):
272
+ visited.add(node_id)
273
+ component.append(node_id)
274
+ neighbors = self.get_neighbors(node_id)
275
+ for neighbor_id, _ in neighbors:
276
+ if neighbor_id not in visited:
277
+ dfs_component(neighbor_id, component)
278
+
279
+ for node_id in self.nodes.keys():
280
+ if node_id not in visited:
281
+ component = []
282
+ dfs_component(node_id, component)
283
+ components.append(sorted(component))
284
+
285
+ return components
286
+
214
287
  def __len__(self):
215
288
  return len(self.nodes)
216
289
 
@@ -302,10 +375,38 @@ class BondDetector:
302
375
  bonds = []
303
376
  try:
304
377
  matches = mol.GetSubstructMatches(self.peptide_bond)
305
- for match in matches:
306
- if len(match) >= 5:
307
- # Pattern: [C;X3,X4]-[C;X3](=[O;X1])-[N;X2,X3]~[C;X3,X4]
308
- # match[0]=alpha-C (sp2 or sp3), match[1]=carbonyl-C, match[2]=O, match[3]=N, match[4]=next-alpha-C (sp2 or sp3)
378
+
379
+ # Filter out internal amide bonds in CHEM linkers like FC01.
380
+ # FC01 pattern: C(=O)-N-ArRing-N-C(=O) — two amide bonds connect to the
381
+ # same aromatic ring via the alpha-C position (match[4]).
382
+ # Real aromatic amino acids (3Abz) only have ONE such bond per ring.
383
+ skip_indices = set()
384
+ ring_info = mol.GetRingInfo()
385
+ rings = ring_info.AtomRings()
386
+
387
+ # Map: ring_frozenset -> list of match indices where match[4] is aromatic on that ring
388
+ # Only consider small rings (5-6 atoms) — large macrocycles should not be filtered
389
+ ring_to_matches = {}
390
+ for i, match in enumerate(matches):
391
+ if len(match) < 5:
392
+ continue
393
+ alpha_c_atom = mol.GetAtomWithIdx(match[4])
394
+ if alpha_c_atom.GetIsAromatic():
395
+ for ring in rings:
396
+ if match[4] in ring and len(ring) <= 6:
397
+ ring_key = frozenset(ring)
398
+ if ring_key not in ring_to_matches:
399
+ ring_to_matches[ring_key] = []
400
+ ring_to_matches[ring_key].append(i)
401
+ break
402
+
403
+ # If 2+ matches share an aromatic ring at their alpha-C position, skip them
404
+ for ring_key, match_indices in ring_to_matches.items():
405
+ if len(match_indices) >= 2:
406
+ skip_indices.update(match_indices)
407
+
408
+ for i, match in enumerate(matches):
409
+ if len(match) >= 5 and i not in skip_indices:
309
410
  c_atom = match[1] # Carbonyl carbon
310
411
  n_atom = match[3] # Nitrogen
311
412
  bonds.append((c_atom, n_atom))
@@ -386,6 +487,117 @@ class FragmentProcessor:
386
487
  self.monomer_library = monomer_library
387
488
  self.bond_detector = BondDetector()
388
489
 
490
+
491
+ def _find_staple_sidechain_bonds(self, mol, existing_bonds):
492
+ """
493
+ Find non-backbone bonds to cleave in macrocycles.
494
+
495
+ Handles three types of macrocyclic cross-links:
496
+ 1. RCMtrans/RCMcis (stapled peptides): C=C double bond in the linker.
497
+ Cleaves one hop away on each side to keep the correct R3 chain length.
498
+ 2. FC01-type (thioether staples): C-S bonds in the linker.
499
+ 3. Alkyl cross-links (bi-cyclic peptides): pure C-C chains connecting
500
+ two amino acid side chains (R3-R3). Detected by finding non-backbone
501
+ segments in large macrocycles and cleaving at their midpoint.
502
+ """
503
+ ring_info = mol.GetRingInfo()
504
+ large_rings = [set(ring) for ring in ring_info.AtomRings() if len(ring) > 10]
505
+ if not large_rings:
506
+ return []
507
+
508
+ existing_atom_pairs = set()
509
+ for a1, a2, _ in existing_bonds:
510
+ existing_atom_pairs.add((min(a1, a2), max(a1, a2)))
511
+ # Also track existing bond atoms for backbone detection
512
+ existing_bond_atoms = set()
513
+ for a1, a2, _ in existing_bonds:
514
+ existing_bond_atoms.add(a1)
515
+ existing_bond_atoms.add(a2)
516
+
517
+ additional_bonds = []
518
+ seen = set()
519
+
520
+ for ring in large_rings:
521
+ ring_list = list(ring)
522
+
523
+ # --- Type 1: C=C double bonds (RCMtrans/RCMcis) ---
524
+ # Only if molecule has quaternary alpha-methyl C (staple monomer signature)
525
+ quat_alpha = Chem.MolFromSmarts('[N;X2,X3]-[C;X4;H0](-[C;X3](=[O;X1]))-[CH3]')
526
+ has_quat = quat_alpha and mol.HasSubstructMatch(quat_alpha)
527
+ if has_quat:
528
+ for bond in mol.GetBonds():
529
+ a1, a2 = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
530
+ if a1 not in ring or a2 not in ring:
531
+ continue
532
+ if (bond.GetBondTypeAsDouble() >= 2 and
533
+ mol.GetAtomWithIdx(a1).GetAtomicNum() == 6 and
534
+ mol.GetAtomWithIdx(a2).GetAtomicNum() == 6):
535
+ for cc_atom_idx in (a1, a2):
536
+ other_cc = a2 if cc_atom_idx == a1 else a1
537
+ cc_atom = mol.GetAtomWithIdx(cc_atom_idx)
538
+ for nbr in cc_atom.GetNeighbors():
539
+ nbr_idx = nbr.GetIdx()
540
+ if nbr_idx == other_cc or nbr_idx not in ring:
541
+ continue
542
+ for nbr2 in nbr.GetNeighbors():
543
+ nbr2_idx = nbr2.GetIdx()
544
+ if nbr2_idx == cc_atom_idx or nbr2_idx not in ring:
545
+ continue
546
+ pair = (min(nbr_idx, nbr2_idx), max(nbr_idx, nbr2_idx))
547
+ if pair not in existing_atom_pairs and pair not in seen:
548
+ seen.add(pair)
549
+ additional_bonds.append((nbr_idx, nbr2_idx, LinkageType.UNKNOWN))
550
+
551
+ # --- Type 2: C-S thioether bonds (FC01) ---
552
+ # Only true thioethers (S bonded to C on both sides), NOT disulfide-adjacent
553
+ for bond in mol.GetBonds():
554
+ a1, a2 = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
555
+ if a1 not in ring or a2 not in ring:
556
+ continue
557
+ at1, at2 = mol.GetAtomWithIdx(a1), mol.GetAtomWithIdx(a2)
558
+ if ((at1.GetAtomicNum() == 6 and at2.GetAtomicNum() == 16) or
559
+ (at1.GetAtomicNum() == 16 and at2.GetAtomicNum() == 6)):
560
+ s_atom = at2 if at2.GetAtomicNum() == 16 else at1
561
+ # Skip if S is bonded to another S (disulfide bridge path)
562
+ if any(n.GetAtomicNum() == 16 for n in s_atom.GetNeighbors()):
563
+ continue
564
+ pair = (min(a1, a2), max(a1, a2))
565
+ if pair not in existing_atom_pairs and pair not in seen:
566
+ seen.add(pair)
567
+ additional_bonds.append((a1, a2, LinkageType.UNKNOWN))
568
+
569
+ # --- Type 3: Alkyl cross-link paths (bi-cyclic R3-R3) ---
570
+ # Find pairs of alpha-C atoms connected by pure carbon chains (no N/O/S
571
+ # in the path). These are R3-R3 cross-links between different cycles.
572
+ # Cleave at the midpoint of each such chain.
573
+ alpha_c_pat = Chem.MolFromSmarts('[N]-[C;X4]-[C;X3](=[O])')
574
+ if alpha_c_pat:
575
+ ac_matches = mol.GetSubstructMatches(alpha_c_pat)
576
+ alpha_c_set = list(set(m[1] for m in ac_matches))
577
+ for i, ac1 in enumerate(alpha_c_set):
578
+ for ac2 in alpha_c_set[i + 1:]:
579
+ path = Chem.GetShortestPath(mol, ac1, ac2)
580
+ if not path or len(path) < 4 or len(path) > 12:
581
+ continue
582
+ # All middle atoms must be C with no N/O/S neighbors
583
+ middle_ok = True
584
+ for mid_idx in path[1:-1]:
585
+ atom = mol.GetAtomWithIdx(mid_idx)
586
+ if atom.GetAtomicNum() != 6:
587
+ middle_ok = False
588
+ break
589
+ if any(n.GetAtomicNum() in (7, 8, 16) for n in atom.GetNeighbors()):
590
+ middle_ok = False
591
+ break
592
+ if middle_ok:
593
+ mid = len(path) // 2
594
+ pair = (min(path[mid - 1], path[mid]), max(path[mid - 1], path[mid]))
595
+ if pair not in existing_atom_pairs and pair not in seen:
596
+ seen.add(pair)
597
+ additional_bonds.append((path[mid - 1], path[mid], LinkageType.UNKNOWN))
598
+
599
+ return additional_bonds
600
+
389
601
  def process_molecule(self, mol: Chem.Mol) -> FragmentGraph:
390
602
  """
391
603
  Process a molecule into a fragment graph.
@@ -403,6 +615,11 @@ class FragmentProcessor:
403
615
  try:
404
616
  bonds_to_cleave = self.bond_detector.find_cleavable_bonds(mol)
405
617
 
618
+ # Detect R3 side-chain bonds for staple monomers (R8, S5, etc.)
619
+ r3_bonds = self._find_staple_sidechain_bonds(mol, bonds_to_cleave)
620
+ if r3_bonds:
621
+ bonds_to_cleave.extend(r3_bonds)
622
+
406
623
  if not bonds_to_cleave:
407
624
  # Single fragment (no cleavable bonds)
408
625
  node = FragmentNode(0, mol)
@@ -448,6 +665,7 @@ class FragmentProcessor:
448
665
  graph.cleaved_bond_indices = bond_indices
449
666
  graph.bond_info = bond_info
450
667
  graph.atom_mappings = atom_mappings
668
+ graph.uncleaned_fragments = fragments # Keep fragments with dummy atoms for R-group SMILES
451
669
 
452
670
  # Create nodes for each fragment
453
671
  fragment_nodes = []
@@ -735,7 +953,7 @@ class FragmentProcessor:
735
953
  # Identify unmatched nodes
736
954
  unmatched_nodes = []
737
955
  for node_id, node in graph.nodes.items():
738
- if node.monomer and node.monomer.symbol.startswith("X"):
956
+ if node.monomer and node.monomer.is_unknown:
739
957
  unmatched_nodes.append(node_id)
740
958
 
741
959
  if not unmatched_nodes:
@@ -792,7 +1010,7 @@ class FragmentProcessor:
792
1010
 
793
1011
  # Try to match the combined fragment (exact match only)
794
1012
  monomer = matcher.find_exact_match(combined_mol, num_connections)
795
-
1013
+
796
1014
  if monomer:
797
1015
  # Success! Create new merged node
798
1016
  new_node_id = min(nodes_to_merge)
@@ -830,8 +1048,7 @@ class FragmentProcessor:
830
1048
  # Find all unmatched nodes (nodes with mock/unknown monomers)
831
1049
  unmatched_nodes = []
832
1050
  for node_id, node in graph.nodes.items():
833
- if node.monomer and (node.monomer.symbol.startswith('X') or
834
- node.monomer.name.startswith('Unknown')):
1051
+ if node.monomer and node.monomer.is_unknown:
835
1052
  unmatched_nodes.append(node_id)
836
1053
 
837
1054
  if not unmatched_nodes:
@@ -868,6 +1085,86 @@ class FragmentProcessor:
868
1085
 
869
1086
  return matched_count
870
1087
 
1088
+ def recover_unmatched_by_merging_stereo_agnostic(self, graph: FragmentGraph, matcher) -> bool:
1089
+ """
1090
+ Final recovery pass: merge pairs of BOTH-unmatched neighbor fragments and
1091
+ try stereo-agnostic matching on the combined result.
1092
+
1093
+ This handles monomers like Phe_4Sdihydroorotamido that have internal amide
1094
+ bonds which get incorrectly cleaved, producing two unmatched fragments.
1095
+
1096
+ Only merges when BOTH fragments in a pair are unmatched — never touches
1097
+ already-matched nodes to avoid regressions.
1098
+
1099
+ Returns True if any merges were successful.
1100
+ """
1101
+ def _is_unmatched(node):
1102
+ return (node.monomer and
1103
+ node.monomer.is_unknown)
1104
+
1105
+ unmatched_ids = [nid for nid, node in graph.nodes.items() if _is_unmatched(node)]
1106
+ if not unmatched_ids:
1107
+ return False
1108
+
1109
+ had_changes = False
1110
+
1111
+ for node_id in unmatched_ids:
1112
+ if node_id not in graph.nodes:
1113
+ continue
1114
+ if not _is_unmatched(graph.nodes[node_id]):
1115
+ continue
1116
+
1117
+ neighbors = graph.get_neighbors(node_id)
1118
+ for neighbor_id, linkage_type in neighbors:
1119
+ if neighbor_id not in graph.nodes:
1120
+ continue
1121
+ # Only merge with another unmatched neighbor
1122
+ if not _is_unmatched(graph.nodes[neighbor_id]):
1123
+ continue
1124
+
1125
+ nodes_to_merge = sorted([node_id, neighbor_id])
1126
+
1127
+ # Find internal links between the merge candidates
1128
+ links_to_exclude = []
1129
+ for link in graph.links:
1130
+ if (link.from_node_id in nodes_to_merge and
1131
+ link.to_node_id in nodes_to_merge):
1132
+ links_to_exclude.append(link)
1133
+
1134
+ combined_mol = self._reconstruct_fragment_with_links(
1135
+ nodes_to_merge, graph, links_to_exclude)
1136
+ if not combined_mol:
1137
+ continue
1138
+
1139
+ # Count external connections for merged fragment
1140
+ all_neighbors = set()
1141
+ for nid in nodes_to_merge:
1142
+ if nid in graph.nodes:
1143
+ for nbr_id, _ in graph.get_neighbors(nid):
1144
+ if nbr_id not in nodes_to_merge:
1145
+ all_neighbors.add(nbr_id)
1146
+ num_connections = len(all_neighbors)
1147
+
1148
+ # Try exact match first, then stereo-agnostic
1149
+ monomer = matcher.find_exact_match(combined_mol, num_connections)
1150
+ if not monomer:
1151
+ combined_smiles = Chem.MolToSmiles(combined_mol, canonical=True)
1152
+ monomer = matcher.monomer_library.find_monomer_by_fragment_smiles_no_stereo(
1153
+ combined_smiles, num_connections)
1154
+
1155
+ if monomer:
1156
+ new_node_id = min(nodes_to_merge)
1157
+ new_node = FragmentNode(new_node_id, combined_mol)
1158
+ new_node.monomer = monomer
1159
+ self._merge_nodes_in_graph(graph, nodes_to_merge, new_node)
1160
+ had_changes = True
1161
+ break # Restart from outer loop
1162
+
1163
+ if had_changes:
1164
+ break
1165
+
1166
+ return had_changes
1167
+
871
1168
  # ============================================================================
872
1169
  # Content from: helm_generator.py
873
1170
  # ============================================================================
@@ -879,6 +1176,7 @@ class HELMGenerator:
879
1176
  Supports:
880
1177
  - Linear peptides
881
1178
  - Cyclic peptides
1179
+ - Multi-chain structures (BILN peptides)
882
1180
  - Disulfide bridges
883
1181
  - Custom linkages
884
1182
  """
@@ -896,6 +1194,12 @@ class HELMGenerator:
896
1194
  """
897
1195
  Generate HELM notation from a FragmentGraph.
898
1196
 
1197
+ Supports multi-chain structures (BILN peptides):
1198
+ - Detects all cycles (rings) using SSSR-like algorithm
1199
+ - Each cycle becomes a separate PEPTIDE chain
1200
+ - R1-R2 connections define backbone within each chain
1201
+ - R3 connections link chains together
1202
+
899
1203
  Args:
900
1204
  graph: FragmentGraph containing matched monomers and their connections
901
1205
 
@@ -905,6 +1209,30 @@ class HELMGenerator:
905
1209
  if len(graph) == 0:
906
1210
  return ""
907
1211
 
1212
+ # Find all cycles in the graph (each cycle will be a separate PEPTIDE chain)
1213
+ cycles = graph.find_all_cycles()
1214
+
1215
+ # Decision: Use multi-chain HELM only if:
1216
+ # 1. Multiple cycles exist (BILN-style structure), OR
1217
+ # 2. There are standalone nodes not in any cycle (attached fragments)
1218
+
1219
+ if not cycles:
1220
+ # No cycles - simple linear peptide
1221
+ return self._generate_simple_helm(graph)
1222
+
1223
+ if len(cycles) == 1:
1224
+ # Single cycle (with or without standalone branch nodes like 'ac')
1225
+ # _generate_simple_helm handles branches correctly with proper R-group detection
1226
+ return self._generate_simple_helm(graph)
1227
+
1228
+ # Multi-chain structure detected (multiple cycles)
1229
+ return self._generate_multi_chain_helm(graph, cycles)
1230
+
1231
+ def _generate_simple_helm(self, graph: FragmentGraph) -> str:
1232
+ """
1233
+ Generate HELM for simple linear or single-cycle peptides.
1234
+ This is the original implementation for backward compatibility.
1235
+ """
908
1236
  # Get ordered sequence of monomers (backbone)
909
1237
  ordered_nodes_raw = graph.get_ordered_nodes()
910
1238
 
@@ -912,31 +1240,14 @@ class HELMGenerator:
912
1240
  is_cyclic = graph.is_cyclic()
913
1241
 
914
1242
  # Filter backbone: nodes that are part of R1-R2 chain are backbone
915
- # Nodes connected only via R3 (side chain) are branches
916
- #
917
- # Logic: A node at position 1 is a branch if:
918
- # - It has no R1 (N-terminus) - meaning it's a cap like 'ac' that only has R2
919
- # - It only has 1 peptide connection (to the real backbone)
920
- #
921
- # Example: [ac].K in cyclic peptide
922
- # - 'ac' has only R2, no R1 → it's a cap
923
- # - 'ac' connects to K's R3 (side chain), not K's R1 (backbone)
924
- # - So 'ac' should be PEPTIDE2, not part of PEPTIDE1
925
-
1243
+ # Nodes lacking R1 (like 'ac' acetyl cap) are branches regardless of position
926
1244
  backbone_nodes = []
927
- for i, node in enumerate(ordered_nodes_raw):
1245
+ for node in ordered_nodes_raw:
928
1246
  is_branch = False
929
-
930
- if i == 0 and len(ordered_nodes_raw) > 1 and node.monomer:
931
- # Check if this first node lacks R1 (N-terminus)
932
- # If it has no R1, it's a cap that should be a branch
1247
+ if node.monomer and len(ordered_nodes_raw) > 1:
933
1248
  has_r1 = 'R1' in node.monomer.r_groups
934
-
935
- if not has_r1:
936
- # This is an N-terminal cap (like 'ac') at position 1
937
- # It should be a branch, not part of the main backbone
1249
+ if not has_r1 and not node.monomer.is_unknown:
938
1250
  is_branch = True
939
-
940
1251
  if not is_branch:
941
1252
  backbone_nodes.append(node)
942
1253
 
@@ -948,21 +1259,16 @@ class HELMGenerator:
948
1259
  branch_nodes = [(node_id, node) for node_id, node in graph.nodes.items()
949
1260
  if node_id not in ordered_node_ids]
950
1261
 
951
- # Generate sequence notation
952
- if is_cyclic:
953
- # Cyclic: wrap multi-letter monomers in brackets, single-letter ones stay as-is
954
- formatted_symbols = [f"[{symbol}]" if len(symbol) > 1 else symbol for symbol in sequence_symbols]
955
- sequence = ".".join(formatted_symbols)
956
- else:
957
- # Linear: no brackets
958
- sequence = ".".join(sequence_symbols)
1262
+ # Generate sequence notation — always bracket multi-char symbols (HELM spec requirement,
1263
+ # also needed for inline SMILES like [*:1]NC(CC(=O)O)C(=O)[*:2])
1264
+ formatted_symbols = [f"[{symbol}]" if len(symbol) > 1 else symbol for symbol in sequence_symbols]
1265
+ sequence = ".".join(formatted_symbols)
959
1266
 
960
1267
  # Collect non-sequential connections (disulfide bridges, cyclic bonds, etc.)
961
1268
  connections = []
962
1269
 
963
1270
  if is_cyclic:
964
1271
  # Find the actual cyclic peptide bond (last residue connects back to beginning)
965
- # This handles cases where N-terminal caps (like 'ac') are at position 1
966
1272
  last_id = ordered_nodes[-1].id
967
1273
  first_few_ids = [ordered_nodes[i].id for i in range(min(3, len(ordered_nodes)))]
968
1274
 
@@ -1012,14 +1318,14 @@ class HELMGenerator:
1012
1318
  branch_chain_name = f"PEPTIDE{branch_idx}"
1013
1319
  branch_symbol = branch_node.monomer.symbol if branch_node.monomer else f"X{branch_node_id}"
1014
1320
 
1015
- # Format branch chain (single monomer, so no dots needed)
1016
- if is_cyclic and len(branch_symbol) > 1:
1321
+ # Format branch chain (single monomer)
1322
+ # In cyclic peptides, always use brackets for consistency with reference HELM
1323
+ if is_cyclic:
1017
1324
  branch_chains.append(f"{branch_chain_name}{{[{branch_symbol}]}}")
1018
1325
  else:
1019
1326
  branch_chains.append(f"{branch_chain_name}{{{branch_symbol}}}")
1020
1327
 
1021
1328
  # Find which backbone node this branch connects to
1022
- # Look for links connecting this branch to the main backbone
1023
1329
  for link in graph.links:
1024
1330
  backbone_node_id = None
1025
1331
  if link.from_node_id == branch_node_id and link.to_node_id in ordered_node_ids:
@@ -1032,7 +1338,6 @@ class HELMGenerator:
1032
1338
  backbone_pos = next((i + 1 for i, n in enumerate(ordered_nodes) if n.id == backbone_node_id), None)
1033
1339
  if backbone_pos:
1034
1340
  # Determine which R-group the branch uses
1035
- # If branch has R1, connect to R1; if only R2, connect to R2
1036
1341
  branch_r_group = "R1"
1037
1342
  if branch_node.monomer:
1038
1343
  if 'R1' in branch_node.monomer.r_groups:
@@ -1055,6 +1360,114 @@ class HELMGenerator:
1055
1360
  helm = f"{helm_chains}$$$$V2.0"
1056
1361
 
1057
1362
  return helm
1363
+
1364
+ def _generate_multi_chain_helm(self, graph: FragmentGraph, cycles: list) -> str:
1365
+ """
1366
+ Generate HELM for multi-chain structures (BILN peptides).
1367
+
1368
+ Strategy:
1369
+ 1. Each cycle becomes a separate PEPTIDE chain
1370
+ 2. Nodes not in cycles become additional chains
1371
+ 3. R3 connections between chains are added as cross-links
1372
+ """
1373
+ # Identify which nodes belong to which cycles
1374
+ nodes_in_cycles = set()
1375
+ for cycle in cycles:
1376
+ nodes_in_cycles.update(cycle)
1377
+
1378
+ # Find standalone nodes (not in any cycle)
1379
+ standalone_nodes = [nid for nid in graph.nodes.keys() if nid not in nodes_in_cycles]
1380
+
1381
+ # Build PEPTIDE chains
1382
+ chains = []
1383
+ chain_node_map = {} # Maps node_id -> (chain_idx, position_in_chain)
1384
+
1385
+ # Add cyclic chains
1386
+ for cycle_idx, cycle in enumerate(cycles, start=1):
1387
+ chain_name = f"PEPTIDE{cycle_idx}"
1388
+ # Create sequence from cycle nodes
1389
+ sequence_symbols = []
1390
+ for pos, node_id in enumerate(cycle):
1391
+ node = graph.nodes[node_id]
1392
+ symbol = node.monomer.symbol if node.monomer else f"X{node_id}"
1393
+ sequence_symbols.append(symbol)
1394
+ chain_node_map[node_id] = (cycle_idx, pos + 1) # 1-indexed position
1395
+
1396
+ # Format with brackets for multi-letter symbols
1397
+ formatted = [f"[{s}]" if len(s) > 1 else s for s in sequence_symbols]
1398
+ sequence = ".".join(formatted)
1399
+ chains.append(f"{chain_name}{{{sequence}}}")
1400
+
1401
+ # Add standalone chains (linear fragments not in cycles)
1402
+ next_chain_idx = len(cycles) + 1
1403
+ for node_id in standalone_nodes:
1404
+ chain_name = f"PEPTIDE{next_chain_idx}"
1405
+ node = graph.nodes[node_id]
1406
+ symbol = node.monomer.symbol if node.monomer else f"X{node_id}"
1407
+ chains.append(f"{chain_name}{{{symbol}}}")
1408
+ chain_node_map[node_id] = (next_chain_idx, 1)
1409
+ next_chain_idx += 1
1410
+
1411
+ # Build connections
1412
+ connections = []
1413
+
1414
+ # Add cyclic connections (R1-R2 within each cycle)
1415
+ for cycle_idx, cycle in enumerate(cycles, start=1):
1416
+ if len(cycle) >= 3:
1417
+ # Connect last to first
1418
+ chain_name = f"PEPTIDE{cycle_idx}"
1419
+ last_pos = len(cycle)
1420
+ connections.append(f"{chain_name},{chain_name},{last_pos}:R2-1:R1")
1421
+
1422
+ # Add inter-chain connections (R3 links) and disulfide bridges
1423
+ processed_links = set()
1424
+ for link in graph.links:
1425
+ link_key = tuple(sorted([link.from_node_id, link.to_node_id]))
1426
+ if link_key in processed_links:
1427
+ continue
1428
+
1429
+ from_chain_info = chain_node_map.get(link.from_node_id)
1430
+ to_chain_info = chain_node_map.get(link.to_node_id)
1431
+
1432
+ if not from_chain_info or not to_chain_info:
1433
+ continue
1434
+
1435
+ from_chain, from_pos = from_chain_info
1436
+ to_chain, to_pos = to_chain_info
1437
+
1438
+ # Skip intra-cycle backbone peptide bonds (already handled by R1-R2 connection)
1439
+ if from_chain == to_chain and link.linkage_type == LinkageType.PEPTIDE:
1440
+ # Check if this is a sequential bond within the cycle
1441
+ cycle = cycles[from_chain - 1] if from_chain <= len(cycles) else []
1442
+ # Sequential bonds: adjacent positions or last-to-first
1443
+ if abs(from_pos - to_pos) == 1 or (from_pos == 1 and to_pos == len(cycle)) or (to_pos == 1 and from_pos == len(cycle)):
1444
+ processed_links.add(link_key)
1445
+ continue
1446
+
1447
+ # Add cross-chain connections or intra-chain disulfide bridges
1448
+ if link.linkage_type == LinkageType.DISULFIDE:
1449
+ # Disulfide uses R3 (side chain cysteine)
1450
+ r_group = "R3"
1451
+ elif link.linkage_type == LinkageType.PEPTIDE:
1452
+ # Cross-chain peptide bond (side chain R3 connection)
1453
+ r_group = "R3"
1454
+ else:
1455
+ r_group = "R3"
1456
+
1457
+ from_chain_name = f"PEPTIDE{from_chain}"
1458
+ to_chain_name = f"PEPTIDE{to_chain}"
1459
+ connections.append(f"{from_chain_name},{to_chain_name},{from_pos}:{r_group}-{to_pos}:{r_group}")
1460
+ processed_links.add(link_key)
1461
+
1462
+ # Generate final HELM
1463
+ helm_chains = "|".join(chains)
1464
+ if connections:
1465
+ connection_str = "|".join(connections)
1466
+ helm = f"{helm_chains}${connection_str}$$$V2.0"
1467
+ else:
1468
+ helm = f"{helm_chains}$$$$V2.0"
1469
+
1470
+ return helm
1058
1471
 
1059
1472
  def generate_helm_notation(self, monomers) -> str:
1060
1473
  """
@@ -1094,23 +1507,34 @@ def remove_stereochemistry_from_smiles(smiles: str) -> str:
1094
1507
  """
1095
1508
  Remove stereochemistry markers from SMILES string.
1096
1509
  Converts [C@@H], [C@H] to C, etc.
1097
-
1510
+
1098
1511
  This is used for matching when input molecules don't have stereochemistry defined.
1512
+ Only strips brackets from SMILES organic subset atoms (B,C,N,O,P,S,F,Cl,Br,I).
1513
+ Atoms like Se, Te, etc. must keep their brackets to remain valid SMILES.
1099
1514
  """
1100
1515
  if not smiles:
1101
1516
  return smiles
1102
-
1517
+
1518
+ # SMILES organic subset: atoms that can appear without brackets
1519
+ organic_subset = {'B', 'C', 'N', 'O', 'P', 'S', 'F', 'Cl', 'Br', 'I'}
1520
+
1103
1521
  # Remove @ symbols (stereochemistry markers)
1104
- # Pattern: [@]+ inside brackets
1105
1522
  smiles_no_stereo = re.sub(r'(@+)', '', smiles)
1106
-
1107
- # Also remove H when it's explicit in brackets like [C@@H] -> [C] -> C
1108
- # But we need to be careful not to remove H from [H] or CH3
1109
- # After removing @, we might have [CH] which should become C
1110
- smiles_no_stereo = re.sub(r'\[([A-Z][a-z]?)H\]', r'\1', smiles_no_stereo)
1111
- # Handle [C] -> C (single atoms in brackets with no other info)
1112
- smiles_no_stereo = re.sub(r'\[([A-Z][a-z]?)\]', r'\1', smiles_no_stereo)
1113
-
1523
+
1524
+ # Remove explicit H and brackets only for organic subset atoms
1525
+ # [C@@H] -> [CH] -> C, but [SeH] must stay as [SeH]
1526
+ def _simplify_bracket(match):
1527
+ atom = match.group(1) # e.g. 'C', 'Se', 'N'
1528
+ has_h = match.group(2) # 'H' or ''
1529
+ if atom in organic_subset:
1530
+ return atom # Strip brackets (and H) for organic subset
1531
+ elif has_h:
1532
+ return f'[{atom}H]' # Keep brackets and H for non-organic atoms
1533
+ else:
1534
+ return f'[{atom}]' # Keep brackets for non-organic atoms
1535
+
1536
+ smiles_no_stereo = re.sub(r'\[([A-Z][a-z]?)(H?)\]', _simplify_bracket, smiles_no_stereo)
1537
+
1114
1538
  return smiles_no_stereo
1115
1539
 
1116
1540
  class MonomerData:
@@ -1122,6 +1546,7 @@ class MonomerData:
1122
1546
  self.r_groups = {} # R-group label -> cap SMILES
1123
1547
  self.r_group_count = 0
1124
1548
  self.capped_smiles_cache = {} # Cache: frozenset of removed R-groups -> canonical SMILES
1549
+ self.is_unknown = False # True for unmatched fragments with inline SMILES
1125
1550
 
1126
1551
  def __repr__(self):
1127
1552
  return f"Monomer({self.symbol}: {self.name}, R-groups: {self.r_group_count})"
@@ -1234,12 +1659,28 @@ class MonomerData:
1234
1659
  return ""
1235
1660
 
1236
1661
 
1662
+ def _canonicalize_no_stereo(smiles: str) -> str:
1663
+ """
1664
+ Remove stereochemistry and re-canonicalize through RDKit.
1665
+ This ensures consistent canonical SMILES regardless of how the molecule was constructed.
1666
+ String-only stereo removal can produce non-canonical SMILES.
1667
+ """
1668
+ no_stereo = remove_stereochemistry_from_smiles(smiles)
1669
+ mol = Chem.MolFromSmiles(no_stereo)
1670
+ if mol:
1671
+ return Chem.MolToSmiles(mol, canonical=True)
1672
+ return no_stereo # Fallback to string version if parse fails
1673
+
1674
+
1237
1675
  class MonomerLibrary:
1238
1676
  def __init__(self):
1239
1677
  self.monomers = {}
1240
1678
  self.smiles_to_monomer = {}
1241
1679
  self.name_to_monomer = {}
1242
1680
  self.symbol_to_monomer = {}
1681
+ # Hash indices for O(1) matching (built after loading)
1682
+ self._smiles_index = {} # canonical_smiles -> MonomerData
1683
+ self._smiles_no_stereo_index = {} # stereo-free_smiles -> MonomerData
1243
1684
 
1244
1685
  def load_from_helm_json(self, json_path: str) -> None:
1245
1686
  if not os.path.exists(json_path):
@@ -1266,6 +1707,9 @@ class MonomerLibrary:
1266
1707
  except Exception:
1267
1708
  continue
1268
1709
 
1710
+ # Build hash indices for O(1) matching
1711
+ self._build_smiles_indices()
1712
+
1269
1713
  def _parse_monomer(self, monomer_dict: dict):
1270
1714
  # IMPORTANT: Only load PEPTIDE monomers (amino acids)
1271
1715
  # The library contains RNA, CHEM, etc. with overlapping symbols (A, C, G, T, U)
@@ -1313,101 +1757,85 @@ class MonomerLibrary:
1313
1757
 
1314
1758
  return monomer
1315
1759
 
1316
- def find_monomer_by_fragment_smiles(self, fragment_smiles: str, num_connections: int):
1760
+ def _build_smiles_indices(self):
1317
1761
  """
1318
- Find monomer by matching fragment SMILES with on-demand R-group removal.
1319
-
1320
- Args:
1321
- fragment_smiles: Canonical SMILES of the fragment
1322
- num_connections: Number of connections this fragment has in the graph
1323
-
1324
- Returns:
1325
- MonomerData if match found, None otherwise
1326
-
1327
- Logic:
1328
- - Fragment with N connections → N R-groups were removed during fragmentation
1329
- - For monomer with M R-groups, try all C(M,N) combinations of which N R-groups were removed
1330
- - Generate SMILES for each combination on-demand (with caching)
1331
-
1332
- Example:
1333
- Fragment has 1 connection, monomer has R1, R2:
1334
- - Try removing R1 → check if SMILES matches
1335
- - Try removing R2 → check if SMILES matches
1762
+ Pre-compute all possible capped SMILES for every monomer and build
1763
+ hash indices for O(1) lookup. Called once after loading all monomers.
1764
+
1765
+ For each monomer with M R-groups, generates capped SMILES for all
1766
+ possible R-group removal combinations (up to 2^M - 1 entries, typically 1-7).
1767
+
1768
+ Deduplicates monomers with identical SMILES+R-groups to avoid redundant
1769
+ capping computations (important for large libraries with variants).
1336
1770
  """
1337
- # Search through all monomers
1771
+ self._smiles_index = {}
1772
+ self._smiles_no_stereo_index = {}
1773
+
1774
+ # Dedup: group monomers by (smiles, r_group_keys) to avoid recomputing
1775
+ # identical capped forms for monomers with the same structure
1776
+ seen_structures = {} # (smiles, r_group_frozenset) -> list of capped entries
1777
+
1338
1778
  for symbol, monomer in self.monomers.items():
1339
- # Skip if monomer doesn't have enough R-groups
1340
- if monomer.r_group_count < num_connections:
1779
+ if monomer.r_group_count == 0:
1341
1780
  continue
1342
-
1343
- # Generate all combinations of num_connections R-groups that could have been removed
1781
+
1344
1782
  r_group_labels = list(monomer.r_groups.keys())
1345
-
1346
- # For each combination of R-groups that could have been removed
1347
- for removed_combo in combinations(r_group_labels, num_connections):
1348
- removed_set = frozenset(removed_combo)
1349
-
1350
- # Generate SMILES with these R-groups removed (lazy, cached)
1351
- candidate_smiles = monomer.get_capped_smiles_for_removed_rgroups(removed_set)
1352
-
1353
- # Check if it matches the fragment (exact match only)
1354
- if candidate_smiles == fragment_smiles:
1355
- return monomer
1356
-
1357
- return None
1358
-
1783
+ struct_key = (monomer.smiles, frozenset(monomer.r_groups.items()))
1784
+
1785
+ if struct_key in seen_structures:
1786
+ # Reuse cached capped SMILES from an identical monomer
1787
+ for capped_smiles, n_removed in seen_structures[struct_key]:
1788
+ key = (capped_smiles, n_removed)
1789
+ if key not in self._smiles_index:
1790
+ self._smiles_index[key] = monomer
1791
+ ns_canonical = _canonicalize_no_stereo(capped_smiles)
1792
+ if ns_canonical:
1793
+ ns_key = (ns_canonical, n_removed)
1794
+ if ns_key not in self._smiles_no_stereo_index:
1795
+ self._smiles_no_stereo_index[ns_key] = monomer
1796
+ continue
1797
+
1798
+ # First time seeing this structure — compute capped SMILES
1799
+ cached_entries = []
1800
+
1801
+ for n_removed in range(1, monomer.r_group_count + 1):
1802
+ for removed_combo in combinations(r_group_labels, n_removed):
1803
+ removed_set = frozenset(removed_combo)
1804
+ capped_smiles = monomer.get_capped_smiles_for_removed_rgroups(removed_set)
1805
+
1806
+ if not capped_smiles:
1807
+ continue
1808
+
1809
+ cached_entries.append((capped_smiles, n_removed))
1810
+
1811
+ key = (capped_smiles, n_removed)
1812
+ if key not in self._smiles_index:
1813
+ self._smiles_index[key] = monomer
1814
+
1815
+ ns_canonical = _canonicalize_no_stereo(capped_smiles)
1816
+ if ns_canonical:
1817
+ ns_key = (ns_canonical, n_removed)
1818
+ if ns_key not in self._smiles_no_stereo_index:
1819
+ self._smiles_no_stereo_index[ns_key] = monomer
1820
+
1821
+ seen_structures[struct_key] = cached_entries
1822
+
1823
+ def find_monomer_by_fragment_smiles(self, fragment_smiles: str, num_connections: int):
1824
+ """
1825
+ Find monomer by matching fragment SMILES. O(1) hash lookup.
1826
+ """
1827
+ return self._smiles_index.get((fragment_smiles, num_connections))
1828
+
1359
1829
  def find_monomer_by_fragment_smiles_no_stereo(self, fragment_smiles: str, num_connections: int):
1360
1830
  """
1361
1831
  Find monomer by matching fragment SMILES WITHOUT stereochemistry.
1362
- Used only in recovery for handling poor quality input data.
1363
-
1364
- Uses molecular graph isomorphism to handle cases where RDKit generates
1365
- different canonical SMILES for the same molecule.
1366
-
1367
- Args:
1368
- fragment_smiles: Canonical SMILES of the fragment
1369
- num_connections: Number of connections this fragment has in the graph
1370
-
1371
- Returns:
1372
- MonomerData if match found, None otherwise
1832
+ Used in recovery for handling poor quality input data. O(1) hash lookup.
1373
1833
  """
1374
- # Parse fragment molecule once (without stereochemistry)
1375
- fragment_no_stereo_smiles = remove_stereochemistry_from_smiles(fragment_smiles)
1376
- fragment_mol = Chem.MolFromSmiles(fragment_no_stereo_smiles)
1377
- if not fragment_mol:
1834
+ ns_canonical = _canonicalize_no_stereo(fragment_smiles)
1835
+ if not ns_canonical:
1378
1836
  return None
1379
-
1380
- # Search through all monomers
1381
- for symbol, monomer in self.monomers.items():
1382
- # Skip if monomer doesn't have enough R-groups
1383
- if monomer.r_group_count < num_connections:
1384
- continue
1385
-
1386
- # Generate all combinations of num_connections R-groups that could have been removed
1387
- r_group_labels = list(monomer.r_groups.keys())
1388
-
1389
- # For each combination of R-groups that could have been removed
1390
- for removed_combo in combinations(r_group_labels, num_connections):
1391
- removed_set = frozenset(removed_combo)
1392
-
1393
- # Generate SMILES with these R-groups removed (lazy, cached)
1394
- candidate_smiles = monomer.get_capped_smiles_for_removed_rgroups(removed_set)
1395
-
1396
- # Try string comparison first (fast path)
1397
- candidate_no_stereo = remove_stereochemistry_from_smiles(candidate_smiles)
1398
-
1399
- if candidate_no_stereo == fragment_no_stereo_smiles:
1400
- return monomer
1401
-
1402
- # If string comparison fails, try molecular graph isomorphism (slower but more robust)
1403
- # This handles cases where RDKit generates different canonical SMILES for same molecule
1404
- candidate_mol = Chem.MolFromSmiles(candidate_no_stereo)
1405
- if candidate_mol and fragment_mol.HasSubstructMatch(candidate_mol) and candidate_mol.HasSubstructMatch(fragment_mol):
1406
- # Both molecules are substructures of each other = they're the same
1407
- if fragment_mol.GetNumAtoms() == candidate_mol.GetNumAtoms():
1408
- return monomer
1409
-
1410
- return None
1837
+
1838
+ return self._smiles_no_stereo_index.get((ns_canonical, num_connections))
1411
1839
 
1412
1840
  def find_monomer_by_symbol(self, symbol: str):
1413
1841
  return self.symbol_to_monomer.get(symbol)
@@ -1494,6 +1922,34 @@ from rdkit import Chem
1494
1922
  import os
1495
1923
  import json
1496
1924
 
1925
+ def _generate_rgroup_smiles(graph, node_id):
1926
+ """
1927
+ Generate SMILES with R-group markers ([*:1], [*:2], ...) for an unmatched fragment.
1928
+ Uses the uncleaned fragment (with dummy atoms from FragmentOnBonds) stored in the graph.
1929
+ Falls back to plain SMILES from the cleaned mol if uncleaned data isn't available.
1930
+ """
1931
+ # Try to use uncleaned fragment with dummy atoms
1932
+ if hasattr(graph, 'uncleaned_fragments') and node_id < len(graph.uncleaned_fragments):
1933
+ uncleaned = graph.uncleaned_fragments[node_id]
1934
+ try:
1935
+ mol = Chem.RWMol(Chem.Mol(uncleaned))
1936
+ r_num = 1
1937
+ for atom in mol.GetAtoms():
1938
+ if atom.GetAtomicNum() == 0:
1939
+ atom.SetIsotope(0)
1940
+ atom.SetAtomMapNum(r_num)
1941
+ r_num += 1
1942
+ return Chem.MolToSmiles(mol)
1943
+ except Exception:
1944
+ pass
1945
+
1946
+ # Fallback: plain SMILES from cleaned mol (no R-groups)
1947
+ node = graph.nodes.get(node_id)
1948
+ if node and node.mol:
1949
+ return Chem.MolToSmiles(node.mol, canonical=True)
1950
+ return "?"
1951
+
1952
+
1497
1953
  # Global variables for caching
1498
1954
  _MONOMER_LIBRARY = None
1499
1955
  _PROCESSOR = None
@@ -1617,7 +2073,10 @@ def convert_molecules_batch(molecules: list, library_json: str = None, input_typ
1617
2073
  return [(False, "Library loading failed") for _ in molecules]
1618
2074
 
1619
2075
  print(f"Custom library loaded: {len(library.monomers)} monomers")
1620
-
2076
+
2077
+ # Build hash indices for O(1) matching
2078
+ library._build_smiles_indices()
2079
+
1621
2080
  # Create processor instances for this library
1622
2081
  processor = FragmentProcessor(library)
1623
2082
  matcher = MonomerMatcher(library)
@@ -1672,21 +2131,23 @@ def convert_molecules_batch(molecules: list, library_json: str = None, input_typ
1672
2131
  graph = processor.process_molecule(mol)
1673
2132
 
1674
2133
  # Match each fragment to a monomer using graph topology
1675
- unknown_count = 0
1676
2134
  for node_id, node in graph.nodes.items():
1677
2135
  # Count connections for this node
1678
2136
  neighbors = graph.get_neighbors(node_id)
1679
2137
  num_connections = len(neighbors)
1680
-
2138
+
1681
2139
  # Find matching monomer
1682
2140
  monomer = matcher.find_exact_match(node.mol, num_connections)
1683
2141
  if monomer:
1684
2142
  node.monomer = monomer
1685
2143
  else:
1686
- unknown_count += 1
2144
+ # Generate inline SMILES with R-group markers for unmatched fragments
1687
2145
  mock_monomer = MonomerData()
1688
- mock_monomer.symbol = f"X{unknown_count}"
1689
- mock_monomer.name = f"Unknown_{unknown_count}"
2146
+ mock_monomer.is_unknown = True
2147
+ mock_monomer.symbol = _generate_rgroup_smiles(graph, node_id)
2148
+ mock_monomer.name = "Unknown"
2149
+ mock_monomer.r_groups = {f'R{j+1}': '' for j in range(num_connections)}
2150
+ mock_monomer.r_group_count = num_connections
1690
2151
  node.monomer = mock_monomer
1691
2152
 
1692
2153
  # Try to recover unmatched fragments by merging with neighbors
@@ -1701,7 +2162,14 @@ def convert_molecules_batch(molecules: list, library_json: str = None, input_typ
1701
2162
  stereo_matched = processor.recover_unmatched_with_stereo_agnostic(graph, matcher)
1702
2163
  if stereo_matched > 0:
1703
2164
  print(f"DEBUG: Stereo-agnostic recovery matched {stereo_matched} additional fragments")
1704
-
2165
+
2166
+ # Final pass: merge pairs of both-unmatched neighbor fragments
2167
+ # with stereo-agnostic matching (handles split monomers like Phe_4Sdihydroorotamido)
2168
+ for attempt in range(max_recovery_attempts):
2169
+ had_changes = processor.recover_unmatched_by_merging_stereo_agnostic(graph, matcher)
2170
+ if not had_changes:
2171
+ break
2172
+
1705
2173
  if len(graph.nodes) > 0:
1706
2174
  helm_notation = helm_generator.generate_helm_from_graph(graph)
1707
2175
  results.append((True, helm_notation))
@@ -1742,5 +2210,9 @@ def convert_smiles_to_helm(smiles_list: list, library_json: str = None) -> list:
1742
2210
  """
1743
2211
  return convert_molecules_batch(smiles_list, library_json=library_json, input_type="smiles")
1744
2212
 
2213
+ global libraryJSON
2214
+ with open(libraryFile) as f:
2215
+ libraryJSON = f.read()
2216
+
1745
2217
  res_helm_list = convert_molecules_batch(molListToProcess, library_json=libraryJSON)
1746
2218
  result_helm = pd.DataFrame(map(lambda x: x[1], res_helm_list), columns=["regenerated sequences"])