@datagrok/bio 2.25.0 → 2.25.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1279 @@
1
+ #language: python
2
+ #name: molToHelmConverterPy
3
+ #description: Converts molecules to HELM notation based on monomer library
4
+ #input: dataframe moleculesDataframe
5
+ #input: column moleculesColumn {semType: Molecule}
6
+ #input: string libraryJSON
7
+ #output: dataframe result_helm {action:join(moleculesDataframe)} [Sequences, in HELM format]
8
# Pull the selected column out of the input dataframe as a plain Python list.
# NOTE(review): moleculesDataframe and moleculesColumn are not defined in this
# file; presumably the script runner injects them from the "#input" header
# lines above - confirm.
molListToProcess = moleculesDataframe[moleculesColumn].tolist()
9
+ import pandas as pd
10
+ import numpy as np
11
+ """
12
+ Aggregated file combining all modules from the logics folder.
13
+ Generated automatically - do not edit manually.
14
+ """
15
+
16
+ # External library imports
17
+ from collections import defaultdict
18
+ from enum import Enum
19
+ from itertools import combinations
20
+ from rdkit import Chem
21
+ from rdkit import RDLogger
22
+ from typing import Dict
23
+ from typing import List
24
+ from typing import Optional
25
+ from typing import Tuple
26
+ import json
27
+ import os
28
+
29
+ # ============================================================================
30
+ # Content from: fragment_graph.py
31
+ # ============================================================================
32
+
33
+ from rdkit import Chem
34
+ from typing import Optional, List, Dict, Tuple
35
+ from enum import Enum
36
+
37
+
38
class LinkageType(Enum):
    """Closed set of linkage kinds that can connect two fragments.

    The value of each member is the lowercase string that shows up when a
    link is printed or serialized (both use ``linkage_type.value``).
    """

    PEPTIDE = "peptide"
    DISULFIDE = "disulfide"
    ESTER = "ester"
    ETHER = "ether"
    THIOETHER = "thioether"
    # Catch-all member for linkages that fit none of the names above.
    UNKNOWN = "unknown"
46
+
47
+
48
class FragmentNode:
    """A single fragment of a molecule (one amino acid / monomer candidate)."""

    def __init__(self, fragment_id: int, mol: Chem.Mol):
        """Store the fragment and derive its canonical SMILES.

        Args:
            fragment_id: Identifier of this node inside its graph.
            mol: RDKit molecule for the fragment; a falsy value is accepted
                and results in an empty SMILES string.
        """
        self.id = fragment_id
        self.mol = mol  # RDKit molecule object
        # Computed once, eagerly; never refreshed if ``mol`` is replaced later.
        self.smiles = "" if not mol else Chem.MolToSmiles(mol, canonical=True)
        self.monomer = None  # MonomerData object, filled in later by the matcher
        # Terminal flags default to False; the code that builds the graph sets them.
        self.is_c_terminal = False
        self.is_n_terminal = False

    def __repr__(self):
        """Return a short debug string; the SMILES is cut to its first 20 characters."""
        label = "Unknown" if not self.monomer else self.monomer.symbol
        preview = self.smiles[:20]
        return f"FragmentNode(id={self.id}, monomer={label}, smiles={preview}...)"
62
+
63
+
64
class FragmentLink:
    """Directed connection from one fragment node to another."""

    def __init__(
        self,
        from_node_id: int,
        to_node_id: int,
        linkage_type: LinkageType,
        from_atom_idx: Optional[int] = None,
        to_atom_idx: Optional[int] = None,
    ):
        """Record the two endpoints, the linkage kind and optional atom anchors.

        Args:
            from_node_id: Id of the node the link starts at.
            to_node_id: Id of the node the link ends at.
            linkage_type: Kind of linkage joining the two nodes.
            from_atom_idx: Atom index inside the start node's molecule, if known.
            to_atom_idx: Atom index inside the end node's molecule, if known.
        """
        self.from_node_id, self.to_node_id = from_node_id, to_node_id
        self.linkage_type = linkage_type
        # Atom anchors are optional and stay None when the caller omits them.
        self.from_atom_idx, self.to_atom_idx = from_atom_idx, to_atom_idx

    def __repr__(self):
        """Return ``FragmentLink(<from> --<type>--> <to>)`` for debugging."""
        kind = self.linkage_type.value
        return f"FragmentLink({self.from_node_id} --{kind}--> {self.to_node_id})"
83
+
84
+
85
class FragmentGraph:
    """
    Graph structure representing a molecule as fragments and their connections.

    Nodes are stored by id in ``self.nodes``; links are kept in insertion
    order in ``self.links``.

    Supports:
    - Linear peptides (chain of peptide bonds)
    - Cyclic peptides (peptide bond from last to first)
    - Disulfide bridges (additional S-S links)
    - Branched structures (multiple links per fragment)
    """

    def __init__(self):
        self.nodes: Dict[int, FragmentNode] = {}  # node_id -> FragmentNode
        self.links: List[FragmentLink] = []

    def add_node(self, node: FragmentNode) -> int:
        """Register ``node`` under its id (replacing any node with that id) and return the id."""
        self.nodes[node.id] = node
        return node.id

    def add_link(self, link: FragmentLink):
        """Append ``link`` to the graph.

        Raises:
            ValueError: if either endpoint id is not a node of this graph.
        """
        if link.from_node_id not in self.nodes or link.to_node_id not in self.nodes:
            raise ValueError(f"Cannot add link: nodes {link.from_node_id} or {link.to_node_id} not in graph")
        self.links.append(link)

    def get_node(self, node_id: int) -> Optional[FragmentNode]:
        """Return the node with ``node_id``, or None when there is none."""
        return self.nodes.get(node_id)

    def get_neighbors(self, node_id: int) -> List[Tuple[int, LinkageType]]:
        """Return ``(neighbor_id, linkage_type)`` for every link touching ``node_id``.

        Links are treated as undirected here. A self-link is reported once
        (only the ``from`` side is inspected first).
        """
        neighbors = []
        for link in self.links:
            if link.from_node_id == node_id:
                neighbors.append((link.to_node_id, link.linkage_type))
            elif link.to_node_id == node_id:
                neighbors.append((link.from_node_id, link.linkage_type))
        return neighbors

    def get_ordered_nodes(self) -> List[FragmentNode]:
        """
        Get nodes in sequential order (for linear/cyclic peptides).
        For branched structures, returns a depth-first traversal.

        The walk starts at the first node flagged ``is_n_terminal`` (in
        ``self.nodes`` iteration order) or, failing that, at the smallest
        node id. Only nodes reachable by following links in their
        from -> to direction are returned.
        """
        if not self.nodes:
            return []

        # Prefer the N-terminal node as the start of the sequence.
        start_node_id = next(
            (node_id for node_id, node in self.nodes.items() if node.is_n_terminal),
            None,
        )
        if start_node_id is None:
            start_node_id = min(self.nodes)

        ordered = []
        self._traverse_from_node(start_node_id, set(), ordered)
        return ordered

    def _traverse_from_node(self, node_id: int, visited: set, ordered: list):
        """Depth-first pre-order walk from ``node_id``, appending nodes to ``ordered``.

        ``visited`` and ``ordered`` are mutated in place. The walk uses an
        explicit stack instead of recursion so that chains longer than the
        interpreter's recursion limit do not raise RecursionError; the
        visiting order is the same as the recursive formulation.
        """
        pending = [node_id]
        while pending:
            current = pending.pop()
            if current in visited:
                continue

            visited.add(current)
            ordered.append(self.nodes[current])

            # Peptide-bond successors go first so the backbone order is kept.
            peptide_next = []
            other_next = []
            for link in self.links:
                if link.from_node_id != current or link.to_node_id in visited:
                    continue
                if link.linkage_type == LinkageType.PEPTIDE:
                    peptide_next.append(link.to_node_id)
                else:
                    other_next.append(link.to_node_id)

            # Reversed push => the first successor is popped (visited) first.
            pending.extend(reversed(peptide_next + other_next))

    def get_fragment_sequence(self) -> List[str]:
        """Return monomer symbols in traversal order; unmatched nodes become ``X<id>``."""
        return [
            node.monomer.symbol if node.monomer else f"X{node.id}"
            for node in self.get_ordered_nodes()
        ]

    def __len__(self):
        """Number of nodes in the graph."""
        return len(self.nodes)

    def __repr__(self):
        return f"FragmentGraph(nodes={len(self.nodes)}, links={len(self.links)})"

    def to_dict(self) -> dict:
        """Convert graph to a plain dictionary (``nodes`` and ``links`` lists) for serialization."""
        return {
            "nodes": [
                {
                    "id": node.id,
                    "smiles": node.smiles,
                    "monomer": node.monomer.symbol if node.monomer else None,
                    "is_n_terminal": node.is_n_terminal,
                    "is_c_terminal": node.is_c_terminal,
                }
                for node in self.nodes.values()
            ],
            "links": [
                {
                    "from": link.from_node_id,
                    "to": link.to_node_id,
                    "type": link.linkage_type.value,
                    "from_atom": link.from_atom_idx,
                    "to_atom": link.to_atom_idx,
                }
                for link in self.links
            ],
        }
212
+
213
+ # ============================================================================
214
+ # Content from: fragment_processor.py
215
+ # ============================================================================
216
+
217
+ from rdkit import Chem
218
+
219
+
220
class BondDetector:
    """Finds the bonds at which a molecule is cut into fragments.

    Detection is purely SMARTS-based: one pattern for peptide bonds, one for
    disulfide bridges, plus a primary-amine pattern used to guess where the
    chain starts.
    """

    #GENERALIZATION ITEM: BOND PATTERNS SHOULD BE DERIVED FROM LIBRARY
    def __init__(self):
        # True peptide bond: C and N both in backbone (each bonded to carbons).
        # Alpha carbons can be sp3 (X4) or sp2 (X3) for dehydroamino acids.
        self.peptide_bond = Chem.MolFromSmarts('[C;X3,X4]-[C;X3](=[O;X1])-[N;X3]-[C;X3,X4]')
        # True disulfide bond: S-S where each S is bonded to carbon (cysteine residues).
        self.disulfide_bond = Chem.MolFromSmarts('[C;X4]-[S;X2]-[S;X2]-[C;X4]')
        # Primary amine at N-terminus (can be NH2 or NH3+), alpha-C can be sp3 or sp2.
        self.primary_amine = Chem.MolFromSmarts('[N;H2,H3;X3,X4]-[C;X3,X4]')

    def find_cleavable_bonds(self, mol: Chem.Mol):
        """
        Collect every cleavable bond of ``mol``.

        Peptide bonds come first (after the N-to-C ordering pass), followed by
        disulfide bonds in match order. Any exception yields an empty list.

        Returns:
            List of tuples: (atom1_idx, atom2_idx, LinkageType)
        """
        try:
            peptide_pairs = [(hit[0], hit[1]) for hit in self._find_peptide_bonds(mol)]
            disulfide_pairs = self._find_disulfide_bonds(mol)

            # Only the peptide bonds are reordered; disulfides keep match order.
            chain_order = self._order_bonds_from_n_to_c(mol, peptide_pairs)

            result = [(pair[0], pair[1], LinkageType.PEPTIDE) for pair in chain_order]
            result.extend((pair[0], pair[1], LinkageType.DISULFIDE) for pair in disulfide_pairs)
            return result
        except Exception:
            return []

    def _find_peptide_bonds(self, mol: Chem.Mol):
        """Return ``(carbonyl_C_idx, N_idx)`` for each peptide-bond match.

        Best effort: if matching fails part-way, the pairs gathered so far
        are returned.
        """
        found = []
        try:
            for match in mol.GetSubstructMatches(self.peptide_bond):
                if len(match) < 5:
                    continue
                # Pattern: [C;X3,X4]-[C;X3](=[O;X1])-[N;X3]-[C;X3,X4]
                # match[0]=alpha-C, match[1]=carbonyl-C, match[2]=O,
                # match[3]=N, match[4]=next alpha-C
                found.append((match[1], match[3]))
        except Exception:
            pass
        return found

    def _find_disulfide_bonds(self, mol: Chem.Mol):
        """Return ``(S_idx, S_idx)`` for each disulfide match (S-S linkages).

        Best effort: if matching fails part-way, the pairs gathered so far
        are returned.
        """
        found = []
        try:
            for match in mol.GetSubstructMatches(self.disulfide_bond):
                if len(match) < 4:
                    continue
                # Pattern: [C;X4]-[S;X2]-[S;X2]-[C;X4]
                # match[0]=C, match[1]=S, match[2]=S, match[3]=C
                found.append((match[1], match[2]))
        except Exception:
            pass
        return found

    def _order_bonds_from_n_to_c(self, mol: Chem.Mol, bonds):
        """Reorder ``bonds`` by chaining from the detected N-terminal atom.

        Starting at the N-terminal atom index, repeatedly picks the first
        unused bond whose second atom equals the current atom, then continues
        from that bond's first atom. Bonds never reached are appended in
        their original order. With no bonds or no N-terminal the input is
        returned unchanged.

        NOTE(review): the chain step compares a bond's N index with the
        previous bond's carbonyl-C index, so it looks like at most one bond
        can ever be moved to the front - confirm whether that is intended.
        """
        if not bonds:
            return bonds

        cursor = self._find_n_terminal(mol)
        if cursor is None:
            return bonds

        chained = []
        used = set()
        while cursor is not None and len(chained) < len(bonds):
            step = next(
                (bond for bond in bonds if bond not in used and bond[1] == cursor),
                None,
            )
            if step is None:
                break
            chained.append(step)
            used.add(step)
            cursor = step[0]

        # Everything the chain walk did not reach keeps its relative order.
        chained.extend(bond for bond in bonds if bond not in used)
        return chained

    def _find_n_terminal(self, mol: Chem.Mol):
        """Return the atom index taken as the N-terminus, or None.

        The first primary-amine match wins. Otherwise the first nitrogen with
        the highest total hydrogen count is used. Any exception gives None.
        """
        try:
            amine_hits = mol.GetSubstructMatches(self.primary_amine)
            if amine_hits:
                return amine_hits[0][0]

            best_idx = None
            best_h = -1
            for atom in mol.GetAtoms():
                if atom.GetAtomicNum() != 7:
                    continue
                hydrogens = atom.GetTotalNumHs()
                if hydrogens > best_h:
                    best_h = hydrogens
                    best_idx = atom.GetIdx()
            return best_idx
        except Exception:
            return None
343
+
344
+
345
class FragmentProcessor:
    """Cuts a molecule into fragments and repairs fragments that did not match.

    ``process_molecule`` builds a FragmentGraph by cleaving the bonds reported
    by BondDetector. ``recover_unmatched_fragments`` later tries to merge
    unmatched nodes with their neighbours and re-match the merged fragment.
    """

    def __init__(self, monomer_library):
        self.monomer_library = monomer_library
        self.bond_detector = BondDetector()

    def process_molecule(self, mol: Chem.Mol) -> FragmentGraph:
        """
        Process a molecule into a fragment graph.

        The graph additionally gets ``original_mol`` and, when fragmentation
        happened, ``cleaved_bond_indices`` and ``bond_info`` attributes that
        fragment recovery relies on. Fragments with fewer than three atoms
        (after dummy removal) are dropped. If anything fails, the graph is
        reset to a single node holding the whole molecule.

        Args:
            mol: RDKit molecule object

        Returns:
            FragmentGraph object containing fragments and their connections
        """
        graph = FragmentGraph()
        # Store original molecule for fragment recovery
        graph.original_mol = mol

        try:
            bonds_to_cleave = self.bond_detector.find_cleavable_bonds(mol)
            if not bonds_to_cleave:
                # Single fragment (no cleavable bonds)
                return self._add_whole_molecule_node(graph, mol)

            # Resolve atom pairs to bond indices, silently skipping atom pairs
            # that are not bonded and bonds that were already listed.
            bond_indices = []
            bond_info = []  # (bond_idx, atom1, atom2, linkage_type)
            seen_bonds = set()
            for atom1, atom2, linkage_type in bonds_to_cleave:
                bond = mol.GetBondBetweenAtoms(atom1, atom2)
                if not bond:
                    continue
                bond_idx = bond.GetIdx()
                if bond_idx in seen_bonds:
                    continue
                seen_bonds.add(bond_idx)
                bond_indices.append(bond_idx)
                bond_info.append((bond_idx, atom1, atom2, linkage_type))

            if not bond_indices:
                # No valid bonds found
                return self._add_whole_molecule_node(graph, mol)

            fragmented_mol = Chem.FragmentOnBonds(mol, bond_indices, addDummies=True)
            fragments = list(Chem.GetMolFrags(
                fragmented_mol,
                asMols=True,
                sanitizeFrags=True
            ))

            # Store bond cleavage info for recovery - used to selectively re-fragment
            graph.cleaved_bond_indices = bond_indices
            graph.bond_info = bond_info
            print(f"DEBUG: Created {len(fragments)} fragments, cleaved {len(bond_indices)} bonds")

            # One node per usable fragment; the node id is the fragment's position.
            kept_ids = []
            last_position = len(fragments) - 1
            for i, frag in enumerate(fragments):
                clean_frag = self._clean_fragment(frag)
                if clean_frag and clean_frag.GetNumAtoms() >= 3:
                    # No normalization! Use fragment as-is
                    node = FragmentNode(i, clean_frag)
                    node.is_c_terminal = (i == last_position)
                    node.is_n_terminal = (i == 0)
                    graph.add_node(node)
                    kept_ids.append(i)

            # Consecutive kept fragments are assumed to be joined by a peptide bond.
            for from_id, to_id in zip(kept_ids, kept_ids[1:]):
                graph.add_link(FragmentLink(from_id, to_id, LinkageType.PEPTIDE))

            # TODO: Track which fragments contain the S atoms so that disulfide
            # bridges can be added as links; no disulfide links are created yet.

            return graph

        except Exception:
            # Fallback: single node with original molecule. Drop whatever was
            # built before the failure so partial fragments do not end up
            # next to the whole-molecule node.
            graph.nodes.clear()
            graph.links.clear()
            return self._add_whole_molecule_node(graph, mol)

    def _add_whole_molecule_node(self, graph: FragmentGraph, mol: Chem.Mol) -> FragmentGraph:
        """Add ``mol`` as node 0, flagged as both termini, and return ``graph``."""
        node = FragmentNode(0, mol)
        node.is_n_terminal = True
        node.is_c_terminal = True
        graph.add_node(node)
        return graph

    def _clean_fragment(self, mol: Chem.Mol):
        """Return a copy of ``mol`` without dummy atoms (atomic number 0).

        Returns None if copying or editing raises.
        """
        try:
            mol_copy = Chem.Mol(mol)
            dummy_indices = [
                atom.GetIdx() for atom in mol_copy.GetAtoms() if atom.GetAtomicNum() == 0
            ]
            if not dummy_indices:
                return mol_copy

            # Remove from the highest index down so earlier indices stay valid.
            emol = Chem.EditableMol(mol_copy)
            for atom_idx in sorted(dummy_indices, reverse=True):
                emol.RemoveAtom(atom_idx)
            return emol.GetMol()

        except Exception:
            return None

    def _reconstruct_fragment(self, node_ids: list, graph: FragmentGraph) -> Chem.Mol:
        """
        Reconstruct a molecule by combining multiple fragment nodes.
        Re-fragments the original molecule, excluding bonds between the nodes to merge.

        Returns None when ``node_ids`` is empty, when the graph lacks
        ``original_mol`` / ``cleaved_bond_indices``, when the target fragment
        index is out of range, or on any exception.

        NOTE(review): this treats node id ``k`` as both the position of the
        fragment in the re-fragmented molecule and the position of the bond
        to its successor in ``cleaved_bond_indices`` - verify that both hold
        for all inputs (e.g. after an earlier merge).
        """
        if not node_ids or not hasattr(graph, 'original_mol') or not hasattr(graph, 'cleaved_bond_indices'):
            return None

        try:
            # Sort node IDs to ensure consistent ordering
            sorted_nodes = sorted(node_ids)

            # Positions (in cleaved_bond_indices) of bonds that must stay
            # intact: the bond after node k, for every consecutive pair k, k+1.
            bonds_to_exclude = {
                left
                for left, right in zip(sorted_nodes, sorted_nodes[1:])
                if left + 1 == right and left < len(graph.cleaved_bond_indices)
            }

            new_bond_indices = [
                bond_idx for i, bond_idx in enumerate(graph.cleaved_bond_indices)
                if i not in bonds_to_exclude
            ]

            print(f"DEBUG reconstruct: Original had {len(graph.cleaved_bond_indices)} cleaved bonds, "
                  f"excluding {len(bonds_to_exclude)} bonds, new list has {len(new_bond_indices)} bonds")

            if not new_bond_indices:
                # No bonds to cleave - return whole molecule
                return graph.original_mol

            fragmented_mol = Chem.FragmentOnBonds(graph.original_mol, new_bond_indices, addDummies=True)
            fragments = list(Chem.GetMolFrags(fragmented_mol, asMols=True, sanitizeFrags=True))

            # The merged fragment is expected at the position of the lowest node id,
            # shifted left by the number of kept bonds that precede it.
            target_idx = sorted_nodes[0]
            adjusted_idx = target_idx - sum(1 for excluded_idx in bonds_to_exclude if excluded_idx < target_idx)

            print(f"DEBUG reconstruct: Got {len(fragments)} fragments after re-fragmentation, "
                  f"target_idx={target_idx}, adjusted_idx={adjusted_idx}")

            if adjusted_idx < len(fragments):
                clean_frag = self._clean_fragment(fragments[adjusted_idx])
                return clean_frag if clean_frag else fragments[adjusted_idx]

            return None

        except Exception as e:
            print(f"DEBUG reconstruct: Exception: {e}")
            return None

    def _merge_nodes_in_graph(self, graph: FragmentGraph, nodes_to_merge: list,
                              new_node: FragmentNode) -> None:
        """
        Remove old nodes, add new merged node, update all links.
        Preserves terminal flags from edge nodes.

        Links between two merged nodes are dropped. Links that touch no
        merged node are kept untouched. Links with one merged end are
        re-pointed at ``new_node``; the atom index of the merged end is reset
        to None (it referred to the old molecule) while the other end keeps
        its atom index.
        """
        if not nodes_to_merge:
            return

        # The lowest / highest ids are the edges of the merged stretch.
        leftmost = min(nodes_to_merge)
        rightmost = max(nodes_to_merge)
        if leftmost in graph.nodes:
            new_node.is_n_terminal = graph.nodes[leftmost].is_n_terminal
        if rightmost in graph.nodes:
            new_node.is_c_terminal = graph.nodes[rightmost].is_c_terminal

        merged_ids = set(nodes_to_merge)
        updated_links = []
        for link in graph.links:
            from_in = link.from_node_id in merged_ids
            to_in = link.to_node_id in merged_ids

            if from_in and to_in:
                # Internal link of the merged stretch - no longer needed.
                continue
            if not from_in and not to_in:
                # Unrelated link: keep it exactly as it is.
                updated_links.append(link)
                continue

            updated_links.append(FragmentLink(
                new_node.id if from_in else link.from_node_id,
                new_node.id if to_in else link.to_node_id,
                link.linkage_type,
                None if from_in else link.from_atom_idx,
                None if to_in else link.to_atom_idx,
            ))

        for node_id in nodes_to_merge:
            graph.nodes.pop(node_id, None)

        graph.add_node(new_node)
        graph.links = updated_links

    def recover_unmatched_fragments(self, graph: FragmentGraph, matcher) -> bool:
        """
        Try to recover unmatched fragments by merging with neighbors.
        Returns True if any merges were successful.

        For each unmatched node the candidates are tried in this order:
        left neighbour + node, node + right neighbour, all three. The first
        candidate that ``matcher.find_exact_match`` accepts replaces the
        merged nodes.

        NOTE(review): a node counts as unmatched when its monomer symbol
        starts with "X"; presumably the matcher assigns such placeholder
        symbols - confirm that no real library symbol starts with "X".
        """
        unmatched_nodes = [
            node_id for node_id, node in graph.nodes.items()
            if node.monomer and node.monomer.symbol.startswith("X")
        ]
        if not unmatched_nodes:
            return False

        print(f"DEBUG: Found {len(unmatched_nodes)} unmatched nodes: {unmatched_nodes}")

        had_changes = False
        for node_id in unmatched_nodes:
            # The node may already have been consumed by an earlier merge.
            if node_id not in graph.nodes:
                continue

            neighbor_ids = [n[0] for n in graph.get_neighbors(node_id)]
            if not neighbor_ids:
                continue

            # Separate left and right neighbors (assuming sequential order)
            left_neighbors = [n for n in neighbor_ids if n < node_id]
            right_neighbors = [n for n in neighbor_ids if n > node_id]

            merge_attempts = []
            if left_neighbors:
                merge_attempts.append([left_neighbors[0], node_id])
            if right_neighbors:
                merge_attempts.append([node_id, right_neighbors[0]])
            if left_neighbors and right_neighbors:
                merge_attempts.append([left_neighbors[0], node_id, right_neighbors[0]])

            for nodes_to_merge in merge_attempts:
                print(f"DEBUG: Trying to merge nodes {nodes_to_merge}")

                combined_mol = self._reconstruct_fragment(nodes_to_merge, graph)
                if not combined_mol:
                    print(f"DEBUG: Failed to reconstruct molecule for {nodes_to_merge}")
                    continue

                print(f"DEBUG: Reconstructed mol with {combined_mol.GetNumAtoms()} atoms")

                # Connections of the merged fragment = distinct neighbours
                # outside the merged set.
                all_neighbors = set()
                for nid in nodes_to_merge:
                    if nid in graph.nodes:
                        for neighbor_id, _ in graph.get_neighbors(nid):
                            if neighbor_id not in nodes_to_merge:
                                all_neighbors.add(neighbor_id)

                num_connections = len(all_neighbors)
                print(f"DEBUG: Expecting {num_connections} connections")

                monomer = matcher.find_exact_match(combined_mol, num_connections)
                if not monomer:
                    print(f"DEBUG: No match found for merge {nodes_to_merge}")
                    continue

                print(f"DEBUG: SUCCESS! Matched to {monomer.symbol}")
                # The merged node takes over the lowest id of the merged set.
                new_node = FragmentNode(min(nodes_to_merge), combined_mol)
                new_node.monomer = monomer
                self._merge_nodes_in_graph(graph, nodes_to_merge, new_node)

                had_changes = True
                break  # Stop trying other combinations for this node

        return had_changes
671
+
672
+ # ============================================================================
673
+ # Content from: helm_generator.py
674
+ # ============================================================================
675
+
676
class HELMGenerator:
    """
    Generates HELM notation from fragment graphs or monomer lists.

    Everything is emitted as a single ``PEPTIDE1`` polymer. Disulfide links
    are written into the connection section; other non-peptide linkage types
    are currently not written out.

    NOTE(review): monomer symbols are joined as-is. HELM normally wraps
    multi-character monomer ids in square brackets - confirm whether callers
    take care of that.
    """

    def __init__(self):
        #GENERALIZATION ITEM: POLYMER TYPES SHOULD BE DERIVED FROM LIBRARY
        self.polymer_types = {
            "peptide": "PEPTIDE",
            "rna": "RNA",
            "dna": "DNA",
            "chemical": "CHEM"
        }

    def generate_helm_from_graph(self, graph: FragmentGraph) -> str:
        """
        Generate HELM notation from a FragmentGraph.

        Unmatched nodes are written as "X". Connection entries refer to the
        1-based position of each monomer in the emitted sequence, so they stay
        correct when node ids are not contiguous (e.g. after fragments were
        dropped or merged). An endpoint that is missing from the sequence
        falls back to ``node_id + 1``.

        Args:
            graph: FragmentGraph containing matched monomers and their connections

        Returns:
            HELM notation string; "" for an empty graph. With disulfide links
            the string ends in "$$$V2.0", otherwise in "$$$$".
        """
        if len(graph) == 0:
            return ""

        ordered_nodes = graph.get_ordered_nodes()
        sequence = ".".join(
            node.monomer.symbol if node.monomer else "X" for node in ordered_nodes
        )

        # node id -> 1-based position in the emitted sequence
        position_of = {node.id: pos for pos, node in enumerate(ordered_nodes, start=1)}

        # Format: PEPTIDE1,PEPTIDE1,from_pos:R3-to_pos:R3
        connections = []
        for link in graph.links:
            if link.linkage_type != LinkageType.DISULFIDE:
                continue
            from_pos = position_of.get(link.from_node_id, link.from_node_id + 1)
            to_pos = position_of.get(link.to_node_id, link.to_node_id + 1)
            connections.append(f"PEPTIDE1,PEPTIDE1,{from_pos}:R3-{to_pos}:R3")

        if connections:
            connection_str = "|".join(connections)
            return f"PEPTIDE1{{{sequence}}}${connection_str}$$$V2.0"
        return f"PEPTIDE1{{{sequence}}}$$$$"

    def generate_helm_notation(self, monomers) -> str:
        """
        Legacy method: Generate HELM notation from a list of monomers.
        Kept for backward compatibility.

        Args:
            monomers: List of MonomerData objects (only ``symbol`` is read)

        Returns:
            HELM notation string; "" when ``monomers`` is empty or None
        """
        if not monomers:
            return ""

        sequence = ".".join(monomer.symbol for monomer in monomers)
        return f"PEPTIDE1{{{sequence}}}$$$$"
760
+
761
+ # ============================================================================
762
+ # Content from: monomer_library.py
763
+ # ============================================================================
764
+
765
+ from rdkit import Chem
766
+ from rdkit import RDLogger
767
+ from collections import defaultdict
768
+ from itertools import combinations
769
+ import json
770
+ import os
771
+
772
# Suppress RDKit warnings.
# This is a module-level call: the 'rdApp.warning' log channel is switched off
# as soon as this file is executed, before any molecule is parsed.
RDLogger.DisableLog('rdApp.warning')
774
+
775
class MonomerData:
    """One monomer of the library plus lazily derived capped SMILES variants."""

    def __init__(self):
        self.symbol = ""
        self.name = ""
        self.mol = None
        self.smiles = ""  # Original SMILES with R-groups
        self.r_groups = {}  # R-group label -> cap SMILES
        self.r_group_count = 0
        # frozenset of removed R-group labels -> canonical SMILES ("" on failure)
        self.capped_smiles_cache = {}

    def __repr__(self):
        return f"Monomer({self.symbol}: {self.name}, R-groups: {self.r_group_count})"

    def get_capped_smiles_for_removed_rgroups(self, removed_rgroups: frozenset) -> str:
        """
        Return the canonical SMILES for this monomer with the given R-groups
        removed, computing it on first use and caching the result (failures,
        i.e. empty strings, are cached as well).

        Args:
            removed_rgroups: frozenset of R-group labels that were removed (e.g., {'R1', 'R2'})

        Returns:
            Canonical SMILES with those R-groups removed, or empty string on error

        Example:
            For monomer with R1, R2:
            - {'R1'} -> SMILES with R1 removed, R2 capped
            - {'R2'} -> SMILES with R2 removed, R1 capped
            - {'R1', 'R2'} -> SMILES with both removed
        """
        try:
            return self.capped_smiles_cache[removed_rgroups]
        except KeyError:
            pass

        generated = self._get_smiles_with_rgroups_removed(removed_rgroups)
        self.capped_smiles_cache[removed_rgroups] = generated
        return generated

    def _get_smiles_with_rgroups_removed(self, removed_rgroups: frozenset) -> str:
        """
        Build the canonical SMILES with some R-groups removed and the rest capped.

        Args:
            removed_rgroups: Set of R-group labels where bonds were broken (e.g., {'R1', 'R2'})

        Returns:
            Canonical SMILES string without [*:X] markers, or "" if anything fails

        Logic:
            - Label in removed_rgroups: the dummy atom is simply deleted.
            - Label known to the monomer but not removed: the dummy atom is
              replaced by its cap. A cap containing both 'O' and '[*:' is
              treated as a hydroxyl (an oxygen is attached to the dummy's
              first neighbour); any other cap leaves an implicit hydrogen.
            - Dummy atoms without an atom map number, or with an unknown
              label, are left alone.
        """
        try:
            mol_copy = Chem.Mol(self.mol)
            kept_rgroups = set(self.r_groups.keys()) - removed_rgroups

            # IMPORTANT: SMILES [*:1] uses atom map numbers, not isotopes!
            pending = []
            for atom in mol_copy.GetAtoms():
                if atom.GetAtomicNum() != 0:
                    continue  # not an R-group placeholder
                map_num = atom.GetAtomMapNum()
                if map_num <= 0:
                    continue
                r_label = f"R{map_num}"
                if r_label in removed_rgroups:
                    pending.append((atom.GetIdx(), 'remove', r_label))
                elif r_label in kept_rgroups:
                    pending.append((atom.GetIdx(), 'cap', self.r_groups.get(r_label, '')))

            # Highest atom index first, so deletions never shift an index
            # that is still waiting to be processed.
            for atom_idx, action, data in sorted(pending, reverse=True):
                if action == 'cap' and 'O' in data and '[*:' in data:
                    # Hydroxyl-like cap (e.g. 'O[*:2]'): attach O, drop the dummy.
                    attached = mol_copy.GetAtomWithIdx(atom_idx).GetNeighbors()
                    if not attached:
                        continue
                    editor = Chem.EditableMol(mol_copy)
                    oxygen_idx = editor.AddAtom(Chem.Atom(8))
                    editor.AddBond(attached[0].GetIdx(), oxygen_idx, Chem.BondType.SINGLE)
                    editor.RemoveAtom(atom_idx)
                    mol_copy = editor.GetMol()
                elif action in ('cap', 'remove'):
                    # Hydrogen-like cap (e.g. '[*:1][H]') or a broken bond:
                    # deleting the dummy is enough.
                    editor = Chem.EditableMol(mol_copy)
                    editor.RemoveAtom(atom_idx)
                    mol_copy = editor.GetMol()

            if not mol_copy:
                return ""
            # Sanitize to add implicit hydrogens where needed
            Chem.SanitizeMol(mol_copy)
            return Chem.MolToSmiles(mol_copy, canonical=True)
        except Exception:
            return ""
894
+
895
+
896
+ class MonomerLibrary:
897
+ def __init__(self):
898
+ self.monomers = {}
899
+ self.smiles_to_monomer = {}
900
+ self.name_to_monomer = {}
901
+ self.symbol_to_monomer = {}
902
+
903
def load_from_helm_json(self, json_path: str) -> None:
    """Load monomers from a JSON file into the lookup tables.

    Best effort throughout: a missing file, an unreadable file or invalid
    JSON makes the call return without changes, and an entry that fails to
    parse or register is skipped. Each accepted monomer is stored under its
    symbol and under its name lowercased with spaces, '-' and '_' removed.

    Args:
        json_path: Path of the JSON file; its top level is iterated entry by
            entry and every entry is handed to ``_parse_monomer``.
    """
    if not os.path.exists(json_path):
        return

    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            entries = json.load(handle)
    except Exception:
        return

    for entry in entries:
        try:
            parsed = self._parse_monomer(entry)
            if not parsed or parsed.mol is None:
                continue

            self.monomers[parsed.symbol] = parsed
            self.symbol_to_monomer[parsed.symbol] = parsed

            # Normalise the display name into a lookup key.
            name_key = parsed.name.lower()
            for separator in (" ", "-", "_"):
                name_key = name_key.replace(separator, "")
            self.name_to_monomer[name_key] = parsed
        except Exception:
            continue
927
+
928
+ def _parse_monomer(self, monomer_dict: dict):
929
+ # IMPORTANT: Only load PEPTIDE monomers (amino acids)
930
+ # The library contains RNA, CHEM, etc. with overlapping symbols (A, C, G, T, U)
931
+ polymer_type = monomer_dict.get('polymerType', '')
932
+ if polymer_type != 'PEPTIDE':
933
+ return None
934
+
935
+ monomer = MonomerData()
936
+ monomer.symbol = monomer_dict.get('symbol', '')
937
+ monomer.name = monomer_dict.get('name', '')
938
+
939
+ if not monomer.symbol:
940
+ return None
941
+
942
+ smiles = monomer_dict.get('smiles', '')
943
+ molfile = monomer_dict.get('molfile', '')
944
+
945
+ if smiles:
946
+ try:
947
+ monomer.mol = Chem.MolFromSmiles(smiles)
948
+ monomer.smiles = smiles
949
+ except Exception:
950
+ monomer.mol = None
951
+
952
+ if monomer.mol is None and molfile:
953
+ try:
954
+ monomer.mol = Chem.MolFromMolBlock(molfile)
955
+ if monomer.mol:
956
+ monomer.smiles = Chem.MolToSmiles(monomer.mol)
957
+ except Exception:
958
+ monomer.mol = None
959
+
960
+ if monomer.mol is None:
961
+ return None
962
+
963
+ # Parse R-groups
964
+ rgroups_list = monomer_dict.get('rgroups', [])
965
+ for rgroup in rgroups_list:
966
+ label = rgroup.get('label', '')
967
+ cap_smiles = rgroup.get('capGroupSMILES', '')
968
+ if label and cap_smiles:
969
+ monomer.r_groups[label] = cap_smiles
970
+
971
+ monomer.r_group_count = len(monomer.r_groups)
972
+
973
+ return monomer
974
+
975
+ def find_monomer_by_fragment_smiles(self, fragment_smiles: str, num_connections: int):
976
+ """
977
+ Find monomer by matching fragment SMILES with on-demand R-group removal.
978
+
979
+ Args:
980
+ fragment_smiles: Canonical SMILES of the fragment
981
+ num_connections: Number of connections this fragment has in the graph
982
+
983
+ Returns:
984
+ MonomerData if match found, None otherwise
985
+
986
+ Logic:
987
+ - Fragment with N connections → N R-groups were removed during fragmentation
988
+ - For monomer with M R-groups, try all C(M,N) combinations of which N R-groups were removed
989
+ - Generate SMILES for each combination on-demand (with caching)
990
+
991
+ Example:
992
+ Fragment has 1 connection, monomer has R1, R2:
993
+ - Try removing R1 → check if SMILES matches
994
+ - Try removing R2 → check if SMILES matches
995
+ """
996
+ # Search through all monomers
997
+ for symbol, monomer in self.monomers.items():
998
+ # Skip if monomer doesn't have enough R-groups
999
+ if monomer.r_group_count < num_connections:
1000
+ continue
1001
+
1002
+ # Generate all combinations of num_connections R-groups that could have been removed
1003
+ r_group_labels = list(monomer.r_groups.keys())
1004
+
1005
+ # For each combination of R-groups that could have been removed
1006
+ for removed_combo in combinations(r_group_labels, num_connections):
1007
+ removed_set = frozenset(removed_combo)
1008
+
1009
+ # Generate SMILES with these R-groups removed (lazy, cached)
1010
+ candidate_smiles = monomer.get_capped_smiles_for_removed_rgroups(removed_set)
1011
+
1012
+ # Check if it matches the fragment
1013
+ if candidate_smiles == fragment_smiles:
1014
+ return monomer
1015
+
1016
+ return None
1017
+
1018
+ def find_monomer_by_symbol(self, symbol: str):
1019
+ return self.symbol_to_monomer.get(symbol)
1020
+
1021
+ # ============================================================================
1022
+ # Content from: monomer_matcher.py
1023
+ # ============================================================================
1024
+
1025
+ from rdkit import Chem
1026
+
1027
+
1028
class MonomerMatcher:
    """
    Assigns library monomers to the fragments of a FragmentGraph.

    A fragment is identified by its canonical SMILES; the number of graph
    neighbours of the fragment is handed to the library lookup together
    with that SMILES.
    """

    def __init__(self, monomer_library: MonomerLibrary):
        self.monomer_library = monomer_library

    def find_exact_match(self, fragment: Chem.Mol, num_connections: int = 0):
        """
        Look up the monomer corresponding to a single fragment.

        Args:
            fragment: RDKit molecule of the fragment
            num_connections: Number of graph connections of the fragment

        Returns:
            The library's match, or None when there is no match, the
            fragment yields an empty SMILES, or any step raises
        """
        try:
            canonical = Chem.MolToSmiles(fragment, canonical=True)
            return (
                self.monomer_library.find_monomer_by_fragment_smiles(canonical, num_connections)
                if canonical
                else None
            )
        except Exception:
            return None

    def match_graph(self, graph: FragmentGraph):
        """
        Try to assign a monomer to every node of the graph.

        Nodes for which a monomer is found get it stored in ``node.monomer``;
        other nodes are left as they are.

        Args:
            graph: FragmentGraph whose nodes should be matched

        Returns:
            Number of nodes that received a monomer
        """
        hits = 0

        for node_id, node in graph.nodes.items():
            # The node's degree in the graph is its connection count.
            degree = len(graph.get_neighbors(node_id))
            found = self.find_exact_match(node.mol, degree)
            if not found:
                continue
            node.monomer = found
            hits += 1

        return hits
1094
+
1095
+ # ============================================================================
1096
+ # Content from: pipeline.py
1097
+ # ============================================================================
1098
+
1099
+ from rdkit import Chem
1100
+ import os
1101
+ import json
1102
+
1103
# Module-level cache slots; all start empty and are filled on first use by
# the loader/factory functions that declare them ``global``.
_MONOMER_LIBRARY = _PROCESSOR = _MATCHER = _HELM_GENERATOR = None
1108
+
1109
+
1110
def _load_monomer_library():
    """Return the cached default monomer library, loading it on first use.

    The library file is looked up at ``<parent of this file's directory>/
    libraries/HELMCoreLibrary.json``.

    Returns:
        The loaded MonomerLibrary, or None when the file does not exist or
        no monomer could be loaded from it.
    """
    global _MONOMER_LIBRARY
    if _MONOMER_LIBRARY is not None:
        return _MONOMER_LIBRARY

    # Define path to library relative to this file's directory
    current_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(current_dir)
    library_path = os.path.join(project_root, "libraries", "HELMCoreLibrary.json")

    if not os.path.exists(library_path):
        return None

    print("Loading monomer library...")
    library = MonomerLibrary()
    library.load_from_helm_json(library_path)

    if not library.monomers:
        # Do not cache an empty library: otherwise this call would report
        # failure while every later call would hand out the empty instance
        # as if loading had succeeded.
        return None

    _MONOMER_LIBRARY = library
    print(f"Monomer library loaded: {len(_MONOMER_LIBRARY.monomers)} monomers")

    return _MONOMER_LIBRARY
1131
+
1132
+
1133
def _get_processors():
    """
    Hand out the shared processor, matcher and HELM generator.

    The three objects are created together the first time they are needed
    (or whenever any of them is still unset) from the default monomer library.

    Returns tuple: (processor, matcher, helm_generator); all three are None
    when the monomer library is not available.
    """
    global _PROCESSOR, _MATCHER, _HELM_GENERATOR

    needs_init = any(
        cached is None for cached in (_PROCESSOR, _MATCHER, _HELM_GENERATOR)
    )
    if needs_init:
        library = _load_monomer_library()
        if not library:
            return None, None, None

        _PROCESSOR = FragmentProcessor(library)
        _MATCHER = MonomerMatcher(library)
        _HELM_GENERATOR = HELMGenerator()

    return _PROCESSOR, _MATCHER, _HELM_GENERATOR
1150
+
1151
+
1152
def preload_library():
    """
    Warm up the default monomer library and the shared processor objects.

    Returns True when the library loaded and a processor is available,
    False otherwise.
    """
    if _load_monomer_library() is None:
        return False

    # Only the processor slot decides the outcome.
    return _get_processors()[0] is not None
1164
+
1165
+
1166
def _library_from_json_string(library_json):
    """Build a MonomerLibrary from a JSON string.

    Returns a ``(library, error_message)`` pair: ``(library, None)`` on
    success, ``(None, message)`` when the string is not valid JSON or yields
    no monomers.
    """
    try:
        library_data = json.loads(library_json)
    except Exception as e:
        print(f"ERROR: Failed to parse library JSON: {str(e)}")
        return None, "Invalid JSON"

    print("Loading custom library from JSON string...")

    library = None
    # Valid JSON may still be ``null``, a number or an object; only a list
    # of records can contain monomers, so anything else counts as empty.
    if isinstance(library_data, list):
        library = MonomerLibrary()
        for monomer_dict in library_data:
            try:
                monomer = library._parse_monomer(monomer_dict)
                if monomer and monomer.mol is not None:
                    library.monomers[monomer.symbol] = monomer
                    library.symbol_to_monomer[monomer.symbol] = monomer
                    clean_name = monomer.name.lower().replace(" ", "").replace("-", "").replace("_", "")
                    library.name_to_monomer[clean_name] = monomer
            except Exception:
                # Best-effort load: skip records that cannot be processed.
                continue

    if library is None or not library.monomers:
        print("ERROR: No monomers loaded from custom library")
        return None, "Library loading failed"

    print(f"Custom library loaded: {len(library.monomers)} monomers")
    return library, None


def _convert_one_molfile(molfile, processor, matcher, helm_generator, max_recovery_attempts=3):
    """Convert a single molfile into a ``(success, helm_notation)`` tuple."""
    try:
        mol = Chem.MolFromMolBlock(molfile)
    except Exception:
        # A non-string entry (e.g. a missing cell of the input column) can
        # make the parser raise; report it like any unparsable molfile
        # instead of aborting the whole batch.
        mol = None
    if not mol:
        return (False, "")

    try:
        # Process molecule into fragment graph
        graph = processor.process_molecule(mol)

        # Match each fragment to a monomer; unmatched fragments receive a
        # numbered placeholder monomer (X1, X2, ...).
        unknown_count = 0
        for node_id, node in graph.nodes.items():
            num_connections = len(graph.get_neighbors(node_id))
            monomer = matcher.find_exact_match(node.mol, num_connections)
            if not monomer:
                unknown_count += 1
                monomer = MonomerData()
                monomer.symbol = f"X{unknown_count}"
                monomer.name = f"Unknown_{unknown_count}"
            node.monomer = monomer

        # Try to recover unmatched fragments by merging with neighbors;
        # the attempt cap prevents infinite loops.
        for _ in range(max_recovery_attempts):
            if not processor.recover_unmatched_fragments(graph, matcher):
                break

        if len(graph.nodes) > 0:
            return (True, helm_generator.generate_helm_from_graph(graph))
        return (False, "")
    except Exception as e:
        return (False, f"Error: {str(e)}")


def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
    """
    Convert a batch of molecules from molfile format to HELM notation.

    Args:
        molfiles: List of molfile strings
        library_json: Optional monomer library as JSON string.
                      If None, uses default cached library from HELMCoreLibrary.json

    Returns:
        List of tuples: (success: bool, helm_notation: str), one per input
        and in input order. On failure the second element is either empty or
        a short error message. When the library itself cannot be prepared,
        every input gets the same failure tuple.
    """
    if library_json is None:
        # Use the cached global library and the shared processor instances.
        if _PROCESSOR is None:
            print("Initializing monomer library and processors...")
            if not preload_library():
                print("ERROR: Failed to load monomer library")
                return [(False, "Library initialization failed") for _ in molfiles]
            print()

        processor, matcher, helm_generator = _get_processors()
        if not processor:
            return [(False, "") for _ in molfiles]
    else:
        # Load custom library from provided JSON string (no caching)
        library, error_message = _library_from_json_string(library_json)
        if library is None:
            return [(False, error_message) for _ in molfiles]

        # Create processor instances for this library
        processor = FragmentProcessor(library)
        matcher = MonomerMatcher(library)
        helm_generator = HELMGenerator()

    return [
        _convert_one_molfile(molfile, processor, matcher, helm_generator)
        for molfile in molfiles
    ]
1277
+
1278
# Script entry point: convert the input molecules and expose the second
# element of every result tuple (the HELM string or error text) as the
# single-column output dataframe.
res_helm_list = convert_molecules_batch(molListToProcess, library_json=libraryJSON)
result_helm = pd.DataFrame(
    [entry[1] for entry in res_helm_list],
    columns=["regenerated sequences"],
)