@datagrok/bio 2.25.4 → 2.25.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/package.json +1 -1
- package/scripts/mol-to-helm.py +306 -31
- package/src/package.g.ts +1 -1
- package/src/package.ts +1 -1
- package/test-console-output-1.log +323 -320
- package/test-record-1.mp4 +0 -0
package/package.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"name": "Davit Rizhinashvili",
|
|
6
6
|
"email": "drizhinashvili@datagrok.ai"
|
|
7
7
|
},
|
|
8
|
-
"version": "2.25.4",
|
|
8
|
+
"version": "2.25.5",
|
|
9
9
|
"description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
package/scripts/mol-to-helm.py
CHANGED
|
@@ -25,6 +25,7 @@ from typing import Optional
|
|
|
25
25
|
from typing import Tuple
|
|
26
26
|
import json
|
|
27
27
|
import os
|
|
28
|
+
import re
|
|
28
29
|
|
|
29
30
|
# ============================================================================
|
|
30
31
|
# Content from: fragment_graph.py
|
|
@@ -259,8 +260,8 @@ class BondDetector:
|
|
|
259
260
|
# This preserves lactams but allows large macrocycles and proline (C=O outside ring)
|
|
260
261
|
# Nitrogen can be X2 (proline, imino) or X3 (standard amino, N-methyl)
|
|
261
262
|
# N-C bond can be single (-) or double (=) for imine bonds in dehydro amino acids
|
|
262
|
-
# Alpha carbon after N can be sp3 (X4) or sp2 (X3) for dehydroamino acids
|
|
263
|
-
self.peptide_bond = Chem.MolFromSmarts('[#6]-[C;X3;!r5;!r6](=[O;X1])-[N;X2,X3]~[C;X3,X4]')
|
|
263
|
+
# Alpha carbon after N can be sp3 (X4) or sp2 (X3) for dehydroamino acids, or aromatic (#6 includes both)
|
|
264
|
+
self.peptide_bond = Chem.MolFromSmarts('[#6]-[C;X3;!r5;!r6](=[O;X1])-[N;X2,X3]~[#6;X3,X4]')
|
|
264
265
|
# True disulfide bond: S-S where each S is bonded to carbon (cysteine residues)
|
|
265
266
|
self.disulfide_bond = Chem.MolFromSmarts('[C;X4]-[S;X2]-[S;X2]-[C;X4]')
|
|
266
267
|
# Primary amine at N-terminus (can be NH2 or NH3+), alpha-C can be sp3 or sp2
|
|
@@ -447,7 +448,6 @@ class FragmentProcessor:
|
|
|
447
448
|
graph.cleaved_bond_indices = bond_indices
|
|
448
449
|
graph.bond_info = bond_info
|
|
449
450
|
graph.atom_mappings = atom_mappings
|
|
450
|
-
print(f"DEBUG: Created {len(fragments)} fragments, cleaved {len(bond_indices)} bonds")
|
|
451
451
|
|
|
452
452
|
# Create nodes for each fragment
|
|
453
453
|
fragment_nodes = []
|
|
@@ -470,8 +470,6 @@ class FragmentProcessor:
|
|
|
470
470
|
for new_idx_in_frag, original_atom_idx in enumerate(original_atom_indices):
|
|
471
471
|
atom_to_fragment_and_idx[original_atom_idx] = (frag_idx, new_idx_in_frag)
|
|
472
472
|
|
|
473
|
-
print(f"DEBUG: Processing {len(bond_info)} cleaved bonds to create links")
|
|
474
|
-
print(f"DEBUG: atom_to_fragment_and_idx has {len(atom_to_fragment_and_idx)} entries")
|
|
475
473
|
|
|
476
474
|
# For each cleaved bond, determine which fragments it connects
|
|
477
475
|
link_count = 0
|
|
@@ -743,8 +741,6 @@ class FragmentProcessor:
|
|
|
743
741
|
if not unmatched_nodes:
|
|
744
742
|
return False
|
|
745
743
|
|
|
746
|
-
print(f"DEBUG: Found {len(unmatched_nodes)} unmatched nodes: {unmatched_nodes}")
|
|
747
|
-
|
|
748
744
|
had_changes = False
|
|
749
745
|
|
|
750
746
|
# Try to recover each unmatched node
|
|
@@ -757,18 +753,14 @@ class FragmentProcessor:
|
|
|
757
753
|
neighbors = graph.get_neighbors(node_id)
|
|
758
754
|
|
|
759
755
|
if not neighbors:
|
|
760
|
-
print(f"DEBUG: Node {node_id} has no neighbors")
|
|
761
756
|
continue
|
|
762
757
|
|
|
763
|
-
print(f"DEBUG: Node {node_id} neighbors: {[(n[0], n[1].value) for n in neighbors]}")
|
|
764
|
-
|
|
765
758
|
# Try merging with each individual neighbor first
|
|
766
759
|
for neighbor_id, linkage_type in neighbors:
|
|
767
760
|
if neighbor_id not in graph.nodes:
|
|
768
761
|
continue
|
|
769
762
|
|
|
770
763
|
nodes_to_merge = sorted([node_id, neighbor_id])
|
|
771
|
-
print(f"DEBUG: Trying to merge nodes {nodes_to_merge} (via {linkage_type.value} bond)")
|
|
772
764
|
|
|
773
765
|
# Find the links between nodes we're merging
|
|
774
766
|
links_to_exclude = []
|
|
@@ -797,13 +789,11 @@ class FragmentProcessor:
|
|
|
797
789
|
all_neighbors.add(neighbor_id)
|
|
798
790
|
|
|
799
791
|
num_connections = len(all_neighbors)
|
|
800
|
-
print(f"DEBUG: Expecting {num_connections} connections")
|
|
801
792
|
|
|
802
|
-
# Try to match the combined fragment
|
|
793
|
+
# Try to match the combined fragment (exact match only)
|
|
803
794
|
monomer = matcher.find_exact_match(combined_mol, num_connections)
|
|
804
795
|
|
|
805
796
|
if monomer:
|
|
806
|
-
print(f"DEBUG: SUCCESS! Matched to {monomer.symbol}")
|
|
807
797
|
# Success! Create new merged node
|
|
808
798
|
new_node_id = min(nodes_to_merge)
|
|
809
799
|
new_node = FragmentNode(new_node_id, combined_mol)
|
|
@@ -814,13 +804,69 @@ class FragmentProcessor:
|
|
|
814
804
|
|
|
815
805
|
had_changes = True
|
|
816
806
|
break # Stop trying other neighbors for this node
|
|
817
|
-
else:
|
|
818
|
-
print(f"DEBUG: No match found for merge {nodes_to_merge}")
|
|
819
807
|
|
|
820
808
|
if had_changes:
|
|
821
809
|
break # Restart from beginning after a successful merge
|
|
822
810
|
|
|
823
811
|
return had_changes
|
|
812
|
+
|
|
813
|
+
def recover_unmatched_with_stereo_agnostic(self, graph: FragmentGraph, matcher) -> int:
|
|
814
|
+
"""
|
|
815
|
+
Separate recovery procedure: Try to match remaining unmatched fragments
|
|
816
|
+
using stereochemistry-agnostic comparison.
|
|
817
|
+
|
|
818
|
+
This handles poor quality input data where stereochemistry is not assigned.
|
|
819
|
+
Only called after regular recovery attempts have finished.
|
|
820
|
+
|
|
821
|
+
Args:
|
|
822
|
+
graph: FragmentGraph with some unmatched nodes
|
|
823
|
+
matcher: MonomerMatcher instance
|
|
824
|
+
|
|
825
|
+
Returns:
|
|
826
|
+
Number of fragments that were successfully matched
|
|
827
|
+
"""
|
|
828
|
+
from rdkit import Chem
|
|
829
|
+
|
|
830
|
+
# Find all unmatched nodes (nodes with mock/unknown monomers)
|
|
831
|
+
unmatched_nodes = []
|
|
832
|
+
for node_id, node in graph.nodes.items():
|
|
833
|
+
if node.monomer and (node.monomer.symbol.startswith('X') or
|
|
834
|
+
node.monomer.name.startswith('Unknown')):
|
|
835
|
+
unmatched_nodes.append(node_id)
|
|
836
|
+
|
|
837
|
+
if not unmatched_nodes:
|
|
838
|
+
return 0
|
|
839
|
+
|
|
840
|
+
print(f"DEBUG: Attempting stereo-agnostic recovery for {len(unmatched_nodes)} unmatched nodes")
|
|
841
|
+
|
|
842
|
+
matched_count = 0
|
|
843
|
+
|
|
844
|
+
for node_id in unmatched_nodes:
|
|
845
|
+
if node_id not in graph.nodes:
|
|
846
|
+
continue
|
|
847
|
+
|
|
848
|
+
node = graph.nodes[node_id]
|
|
849
|
+
|
|
850
|
+
# Get fragment SMILES
|
|
851
|
+
fragment_smiles = Chem.MolToSmiles(node.mol, canonical=True)
|
|
852
|
+
|
|
853
|
+
# Count connections
|
|
854
|
+
neighbors = graph.get_neighbors(node_id)
|
|
855
|
+
num_connections = len(neighbors)
|
|
856
|
+
|
|
857
|
+
# Try stereo-agnostic matching
|
|
858
|
+
monomer = matcher.monomer_library.find_monomer_by_fragment_smiles_no_stereo(
|
|
859
|
+
fragment_smiles, num_connections
|
|
860
|
+
)
|
|
861
|
+
|
|
862
|
+
if monomer:
|
|
863
|
+
print(f"DEBUG: Stereo-agnostic match for node {node_id}: {monomer.symbol}")
|
|
864
|
+
node.monomer = monomer
|
|
865
|
+
matched_count += 1
|
|
866
|
+
else:
|
|
867
|
+
print(f"DEBUG: No stereo-agnostic match for node {node_id}")
|
|
868
|
+
|
|
869
|
+
return matched_count
|
|
824
870
|
|
|
825
871
|
# ============================================================================
|
|
826
872
|
# Content from: helm_generator.py
|
|
@@ -859,13 +905,49 @@ class HELMGenerator:
|
|
|
859
905
|
if len(graph) == 0:
|
|
860
906
|
return ""
|
|
861
907
|
|
|
862
|
-
# Get ordered sequence of monomers
|
|
863
|
-
|
|
864
|
-
sequence_symbols = [node.monomer.symbol if node.monomer else "X" for node in ordered_nodes]
|
|
908
|
+
# Get ordered sequence of monomers (backbone)
|
|
909
|
+
ordered_nodes_raw = graph.get_ordered_nodes()
|
|
865
910
|
|
|
866
911
|
# Check if cyclic
|
|
867
912
|
is_cyclic = graph.is_cyclic()
|
|
868
913
|
|
|
914
|
+
# Filter backbone: nodes that are part of R1-R2 chain are backbone
|
|
915
|
+
# Nodes connected only via R3 (side chain) are branches
|
|
916
|
+
#
|
|
917
|
+
# Logic: A node at position 1 is a branch if:
|
|
918
|
+
# - It has no R1 (N-terminus) - meaning it's a cap like 'ac' that only has R2
|
|
919
|
+
# - It only has 1 peptide connection (to the real backbone)
|
|
920
|
+
#
|
|
921
|
+
# Example: [ac].K in cyclic peptide
|
|
922
|
+
# - 'ac' has only R2, no R1 → it's a cap
|
|
923
|
+
# - 'ac' connects to K's R3 (side chain), not K's R1 (backbone)
|
|
924
|
+
# - So 'ac' should be PEPTIDE2, not part of PEPTIDE1
|
|
925
|
+
|
|
926
|
+
backbone_nodes = []
|
|
927
|
+
for i, node in enumerate(ordered_nodes_raw):
|
|
928
|
+
is_branch = False
|
|
929
|
+
|
|
930
|
+
if i == 0 and len(ordered_nodes_raw) > 1 and node.monomer:
|
|
931
|
+
# Check if this first node lacks R1 (N-terminus)
|
|
932
|
+
# If it has no R1, it's a cap that should be a branch
|
|
933
|
+
has_r1 = 'R1' in node.monomer.r_groups
|
|
934
|
+
|
|
935
|
+
if not has_r1:
|
|
936
|
+
# This is an N-terminal cap (like 'ac') at position 1
|
|
937
|
+
# It should be a branch, not part of the main backbone
|
|
938
|
+
is_branch = True
|
|
939
|
+
|
|
940
|
+
if not is_branch:
|
|
941
|
+
backbone_nodes.append(node)
|
|
942
|
+
|
|
943
|
+
ordered_nodes = backbone_nodes
|
|
944
|
+
sequence_symbols = [node.monomer.symbol if node.monomer else "X" for node in ordered_nodes]
|
|
945
|
+
|
|
946
|
+
# Detect branch nodes (nodes not in backbone)
|
|
947
|
+
ordered_node_ids = {node.id for node in ordered_nodes}
|
|
948
|
+
branch_nodes = [(node_id, node) for node_id, node in graph.nodes.items()
|
|
949
|
+
if node_id not in ordered_node_ids]
|
|
950
|
+
|
|
869
951
|
# Generate sequence notation
|
|
870
952
|
if is_cyclic:
|
|
871
953
|
# Cyclic: wrap multi-letter monomers in brackets, single-letter ones stay as-is
|
|
@@ -922,12 +1004,55 @@ class HELMGenerator:
|
|
|
922
1004
|
# Format: PEPTIDE1,PEPTIDE1,from_pos:R3-to_pos:R3
|
|
923
1005
|
connections.append(f"PEPTIDE1,PEPTIDE1,{from_pos}:R3-{to_pos}:R3")
|
|
924
1006
|
|
|
1007
|
+
# Handle branch nodes (side chain modifications)
|
|
1008
|
+
# Create separate PEPTIDE chains for each branch
|
|
1009
|
+
branch_chains = []
|
|
1010
|
+
if branch_nodes:
|
|
1011
|
+
for branch_idx, (branch_node_id, branch_node) in enumerate(branch_nodes, start=2):
|
|
1012
|
+
branch_chain_name = f"PEPTIDE{branch_idx}"
|
|
1013
|
+
branch_symbol = branch_node.monomer.symbol if branch_node.monomer else f"X{branch_node_id}"
|
|
1014
|
+
|
|
1015
|
+
# Format branch chain (single monomer, so no dots needed)
|
|
1016
|
+
if is_cyclic and len(branch_symbol) > 1:
|
|
1017
|
+
branch_chains.append(f"{branch_chain_name}{{[{branch_symbol}]}}")
|
|
1018
|
+
else:
|
|
1019
|
+
branch_chains.append(f"{branch_chain_name}{{{branch_symbol}}}")
|
|
1020
|
+
|
|
1021
|
+
# Find which backbone node this branch connects to
|
|
1022
|
+
# Look for links connecting this branch to the main backbone
|
|
1023
|
+
for link in graph.links:
|
|
1024
|
+
backbone_node_id = None
|
|
1025
|
+
if link.from_node_id == branch_node_id and link.to_node_id in ordered_node_ids:
|
|
1026
|
+
backbone_node_id = link.to_node_id
|
|
1027
|
+
elif link.to_node_id == branch_node_id and link.from_node_id in ordered_node_ids:
|
|
1028
|
+
backbone_node_id = link.from_node_id
|
|
1029
|
+
|
|
1030
|
+
if backbone_node_id is not None:
|
|
1031
|
+
# Find position of backbone node (1-indexed)
|
|
1032
|
+
backbone_pos = next((i + 1 for i, n in enumerate(ordered_nodes) if n.id == backbone_node_id), None)
|
|
1033
|
+
if backbone_pos:
|
|
1034
|
+
# Determine which R-group the branch uses
|
|
1035
|
+
# If branch has R1, connect to R1; if only R2, connect to R2
|
|
1036
|
+
branch_r_group = "R1"
|
|
1037
|
+
if branch_node.monomer:
|
|
1038
|
+
if 'R1' in branch_node.monomer.r_groups:
|
|
1039
|
+
branch_r_group = "R1"
|
|
1040
|
+
elif 'R2' in branch_node.monomer.r_groups:
|
|
1041
|
+
branch_r_group = "R2"
|
|
1042
|
+
|
|
1043
|
+
# Connection: backbone position R3 (side chain) to branch position 1 R-group
|
|
1044
|
+
connections.append(f"PEPTIDE1,{branch_chain_name},{backbone_pos}:R3-1:{branch_r_group}")
|
|
1045
|
+
break
|
|
1046
|
+
|
|
925
1047
|
# Generate final HELM notation
|
|
1048
|
+
all_chains = [f"PEPTIDE1{{{sequence}}}"] + branch_chains
|
|
1049
|
+
helm_chains = "|".join(all_chains)
|
|
1050
|
+
|
|
926
1051
|
if connections:
|
|
927
1052
|
connection_str = "|".join(connections)
|
|
928
|
-
helm = f"PEPTIDE1{{{sequence}}}${connection_str}$$$V2.0"
|
|
1053
|
+
helm = f"{helm_chains}${connection_str}$$$V2.0"
|
|
929
1054
|
else:
|
|
930
|
-
helm = f"PEPTIDE1{{{sequence}}}$$$$V2.0"
|
|
1055
|
+
helm = f"{helm_chains}$$$$V2.0"
|
|
931
1056
|
|
|
932
1057
|
return helm
|
|
933
1058
|
|
|
@@ -960,10 +1085,34 @@ from collections import defaultdict
|
|
|
960
1085
|
from itertools import combinations
|
|
961
1086
|
import json
|
|
962
1087
|
import os
|
|
1088
|
+
import re
|
|
963
1089
|
|
|
964
1090
|
# Suppress RDKit warnings
|
|
965
1091
|
RDLogger.DisableLog('rdApp.warning')
|
|
966
1092
|
|
|
1093
|
+
def remove_stereochemistry_from_smiles(smiles: str) -> str:
|
|
1094
|
+
"""
|
|
1095
|
+
Remove stereochemistry markers from SMILES string.
|
|
1096
|
+
Converts [C@@H], [C@H] to C, etc.
|
|
1097
|
+
|
|
1098
|
+
This is used for matching when input molecules don't have stereochemistry defined.
|
|
1099
|
+
"""
|
|
1100
|
+
if not smiles:
|
|
1101
|
+
return smiles
|
|
1102
|
+
|
|
1103
|
+
# Remove @ symbols (stereochemistry markers)
|
|
1104
|
+
# Pattern: [@]+ inside brackets
|
|
1105
|
+
smiles_no_stereo = re.sub(r'(@+)', '', smiles)
|
|
1106
|
+
|
|
1107
|
+
# Also remove H when it's explicit in brackets like [C@@H] -> [C] -> C
|
|
1108
|
+
# But we need to be careful not to remove H from [H] or CH3
|
|
1109
|
+
# After removing @, we might have [CH] which should become C
|
|
1110
|
+
smiles_no_stereo = re.sub(r'\[([A-Z][a-z]?)H\]', r'\1', smiles_no_stereo)
|
|
1111
|
+
# Handle [C] -> C (single atoms in brackets with no other info)
|
|
1112
|
+
smiles_no_stereo = re.sub(r'\[([A-Z][a-z]?)\]', r'\1', smiles_no_stereo)
|
|
1113
|
+
|
|
1114
|
+
return smiles_no_stereo
|
|
1115
|
+
|
|
967
1116
|
class MonomerData:
|
|
968
1117
|
def __init__(self):
|
|
969
1118
|
self.symbol = ""
|
|
@@ -1201,11 +1350,64 @@ class MonomerLibrary:
|
|
|
1201
1350
|
# Generate SMILES with these R-groups removed (lazy, cached)
|
|
1202
1351
|
candidate_smiles = monomer.get_capped_smiles_for_removed_rgroups(removed_set)
|
|
1203
1352
|
|
|
1204
|
-
# Check if it matches the fragment
|
|
1353
|
+
# Check if it matches the fragment (exact match only)
|
|
1205
1354
|
if candidate_smiles == fragment_smiles:
|
|
1206
1355
|
return monomer
|
|
1207
1356
|
|
|
1208
1357
|
return None
|
|
1358
|
+
|
|
1359
|
+
def find_monomer_by_fragment_smiles_no_stereo(self, fragment_smiles: str, num_connections: int):
|
|
1360
|
+
"""
|
|
1361
|
+
Find monomer by matching fragment SMILES WITHOUT stereochemistry.
|
|
1362
|
+
Used only in recovery for handling poor quality input data.
|
|
1363
|
+
|
|
1364
|
+
Uses molecular graph isomorphism to handle cases where RDKit generates
|
|
1365
|
+
different canonical SMILES for the same molecule.
|
|
1366
|
+
|
|
1367
|
+
Args:
|
|
1368
|
+
fragment_smiles: Canonical SMILES of the fragment
|
|
1369
|
+
num_connections: Number of connections this fragment has in the graph
|
|
1370
|
+
|
|
1371
|
+
Returns:
|
|
1372
|
+
MonomerData if match found, None otherwise
|
|
1373
|
+
"""
|
|
1374
|
+
# Parse fragment molecule once (without stereochemistry)
|
|
1375
|
+
fragment_no_stereo_smiles = remove_stereochemistry_from_smiles(fragment_smiles)
|
|
1376
|
+
fragment_mol = Chem.MolFromSmiles(fragment_no_stereo_smiles)
|
|
1377
|
+
if not fragment_mol:
|
|
1378
|
+
return None
|
|
1379
|
+
|
|
1380
|
+
# Search through all monomers
|
|
1381
|
+
for symbol, monomer in self.monomers.items():
|
|
1382
|
+
# Skip if monomer doesn't have enough R-groups
|
|
1383
|
+
if monomer.r_group_count < num_connections:
|
|
1384
|
+
continue
|
|
1385
|
+
|
|
1386
|
+
# Generate all combinations of num_connections R-groups that could have been removed
|
|
1387
|
+
r_group_labels = list(monomer.r_groups.keys())
|
|
1388
|
+
|
|
1389
|
+
# For each combination of R-groups that could have been removed
|
|
1390
|
+
for removed_combo in combinations(r_group_labels, num_connections):
|
|
1391
|
+
removed_set = frozenset(removed_combo)
|
|
1392
|
+
|
|
1393
|
+
# Generate SMILES with these R-groups removed (lazy, cached)
|
|
1394
|
+
candidate_smiles = monomer.get_capped_smiles_for_removed_rgroups(removed_set)
|
|
1395
|
+
|
|
1396
|
+
# Try string comparison first (fast path)
|
|
1397
|
+
candidate_no_stereo = remove_stereochemistry_from_smiles(candidate_smiles)
|
|
1398
|
+
|
|
1399
|
+
if candidate_no_stereo == fragment_no_stereo_smiles:
|
|
1400
|
+
return monomer
|
|
1401
|
+
|
|
1402
|
+
# If string comparison fails, try molecular graph isomorphism (slower but more robust)
|
|
1403
|
+
# This handles cases where RDKit generates different canonical SMILES for same molecule
|
|
1404
|
+
candidate_mol = Chem.MolFromSmiles(candidate_no_stereo)
|
|
1405
|
+
if candidate_mol and fragment_mol.HasSubstructMatch(candidate_mol) and candidate_mol.HasSubstructMatch(fragment_mol):
|
|
1406
|
+
# Both molecules are substructures of each other = they're the same
|
|
1407
|
+
if fragment_mol.GetNumAtoms() == candidate_mol.GetNumAtoms():
|
|
1408
|
+
return monomer
|
|
1409
|
+
|
|
1410
|
+
return None
|
|
1209
1411
|
|
|
1210
1412
|
def find_monomer_by_symbol(self, symbol: str):
|
|
1211
1413
|
return self.symbol_to_monomer.get(symbol)
|
|
@@ -1355,14 +1557,16 @@ def preload_library():
|
|
|
1355
1557
|
return processor is not None
|
|
1356
1558
|
|
|
1357
1559
|
|
|
1358
|
-
def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
|
|
1560
|
+
def convert_molecules_batch(molecules: list, library_json: str = None, input_type: str = "auto") -> list:
|
|
1359
1561
|
"""
|
|
1360
|
-
Convert a batch of molecules
|
|
1562
|
+
Convert a batch of molecules to HELM notation.
|
|
1361
1563
|
|
|
1362
1564
|
Args:
|
|
1363
|
-
|
|
1565
|
+
molecules: List of molecule strings (molfiles or SMILES)
|
|
1364
1566
|
library_json: Optional monomer library as JSON string.
|
|
1365
1567
|
If None, uses default cached library from HELMCoreLibrary.json
|
|
1568
|
+
input_type: Type of input molecules - "molfile", "smiles", or "auto" (default).
|
|
1569
|
+
"auto" will attempt to detect the format automatically.
|
|
1366
1570
|
|
|
1367
1571
|
Returns:
|
|
1368
1572
|
List of tuples: (success: bool, helm_notation: str)
|
|
@@ -1376,13 +1580,13 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
|
|
|
1376
1580
|
print("Initializing monomer library and processors...")
|
|
1377
1581
|
if not preload_library():
|
|
1378
1582
|
print("ERROR: Failed to load monomer library")
|
|
1379
|
-
return [(False, "Library initialization failed") for _ in molfiles]
|
|
1583
|
+
return [(False, "Library initialization failed") for _ in molecules]
|
|
1380
1584
|
print()
|
|
1381
1585
|
|
|
1382
1586
|
# Use shared processor instances
|
|
1383
1587
|
processor, matcher, helm_generator = _get_processors()
|
|
1384
1588
|
if not processor:
|
|
1385
|
-
return [(False, "") for _ in molfiles]
|
|
1589
|
+
return [(False, "") for _ in molecules]
|
|
1386
1590
|
else:
|
|
1387
1591
|
# Load custom library from provided JSON string (no caching)
|
|
1388
1592
|
try:
|
|
@@ -1410,7 +1614,7 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
|
|
|
1410
1614
|
|
|
1411
1615
|
if not library.monomers:
|
|
1412
1616
|
print("ERROR: No monomers loaded from custom library")
|
|
1413
|
-
return [(False, "Library loading failed") for _ in molfiles]
|
|
1617
|
+
return [(False, "Library loading failed") for _ in molecules]
|
|
1414
1618
|
|
|
1415
1619
|
print(f"Custom library loaded: {len(library.monomers)} monomers")
|
|
1416
1620
|
|
|
@@ -1419,11 +1623,46 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
|
|
|
1419
1623
|
matcher = MonomerMatcher(library)
|
|
1420
1624
|
helm_generator = HELMGenerator()
|
|
1421
1625
|
|
|
1626
|
+
# Helper function to detect molecule format
|
|
1627
|
+
def _is_molfile(mol_string: str) -> bool:
|
|
1628
|
+
"""Check if string is a molfile (starts with RDKit molfile markers or has multiple lines)"""
|
|
1629
|
+
if not mol_string:
|
|
1630
|
+
return False
|
|
1631
|
+
lines = mol_string.strip().split('\n')
|
|
1632
|
+
# Molfiles typically have multiple lines and specific format
|
|
1633
|
+
if len(lines) > 3:
|
|
1634
|
+
# Check for V2000 or V3000 molfile markers
|
|
1635
|
+
if 'V2000' in mol_string or 'V3000' in mol_string:
|
|
1636
|
+
return True
|
|
1637
|
+
# Check for typical molfile structure (counts line format)
|
|
1638
|
+
if len(lines) > 3:
|
|
1639
|
+
counts_line = lines[3] if len(lines) > 3 else ""
|
|
1640
|
+
# Molfile counts line has specific format with atom/bond counts
|
|
1641
|
+
if len(counts_line) >= 6 and counts_line[:6].replace(' ', '').isdigit():
|
|
1642
|
+
return True
|
|
1643
|
+
return False
|
|
1644
|
+
|
|
1422
1645
|
results = []
|
|
1423
1646
|
|
|
1424
|
-
for i in range(len(molfiles)):
|
|
1425
|
-
|
|
1426
|
-
|
|
1647
|
+
for i in range(len(molecules)):
|
|
1648
|
+
mol_string = molecules[i]
|
|
1649
|
+
|
|
1650
|
+
# Determine input type and parse molecule
|
|
1651
|
+
if input_type == "auto":
|
|
1652
|
+
# Auto-detect format
|
|
1653
|
+
if _is_molfile(mol_string):
|
|
1654
|
+
mol = Chem.MolFromMolBlock(mol_string)
|
|
1655
|
+
else:
|
|
1656
|
+
# Assume SMILES if not molfile
|
|
1657
|
+
mol = Chem.MolFromSmiles(mol_string)
|
|
1658
|
+
elif input_type == "molfile":
|
|
1659
|
+
mol = Chem.MolFromMolBlock(mol_string)
|
|
1660
|
+
elif input_type == "smiles":
|
|
1661
|
+
mol = Chem.MolFromSmiles(mol_string)
|
|
1662
|
+
else:
|
|
1663
|
+
results.append((False, f"Invalid input_type: {input_type}"))
|
|
1664
|
+
continue
|
|
1665
|
+
|
|
1427
1666
|
if not mol:
|
|
1428
1667
|
results.append((False, ""))
|
|
1429
1668
|
continue
|
|
@@ -1457,6 +1696,12 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
|
|
|
1457
1696
|
if not had_changes:
|
|
1458
1697
|
break
|
|
1459
1698
|
|
|
1699
|
+
# After regular recovery, try stereo-agnostic matching for remaining unmatched fragments
|
|
1700
|
+
# This handles poor quality data with missing stereochemistry
|
|
1701
|
+
stereo_matched = processor.recover_unmatched_with_stereo_agnostic(graph, matcher)
|
|
1702
|
+
if stereo_matched > 0:
|
|
1703
|
+
print(f"DEBUG: Stereo-agnostic recovery matched {stereo_matched} additional fragments")
|
|
1704
|
+
|
|
1460
1705
|
if len(graph.nodes) > 0:
|
|
1461
1706
|
helm_notation = helm_generator.generate_helm_from_graph(graph)
|
|
1462
1707
|
results.append((True, helm_notation))
|
|
@@ -1467,5 +1712,35 @@ def convert_molecules_batch(molfiles: list, library_json: str = None) -> list:
|
|
|
1467
1712
|
|
|
1468
1713
|
return results
|
|
1469
1714
|
|
|
1715
|
+
|
|
1716
|
+
def convert_molfiles_to_helm(molfiles: list, library_json: str = None) -> list:
|
|
1717
|
+
"""
|
|
1718
|
+
Convert a batch of molfiles to HELM notation.
|
|
1719
|
+
Convenience wrapper for convert_molecules_batch with input_type="molfile".
|
|
1720
|
+
|
|
1721
|
+
Args:
|
|
1722
|
+
molfiles: List of molfile strings
|
|
1723
|
+
library_json: Optional monomer library as JSON string
|
|
1724
|
+
|
|
1725
|
+
Returns:
|
|
1726
|
+
List of tuples: (success: bool, helm_notation: str)
|
|
1727
|
+
"""
|
|
1728
|
+
return convert_molecules_batch(molfiles, library_json=library_json, input_type="molfile")
|
|
1729
|
+
|
|
1730
|
+
|
|
1731
|
+
def convert_smiles_to_helm(smiles_list: list, library_json: str = None) -> list:
|
|
1732
|
+
"""
|
|
1733
|
+
Convert a batch of SMILES to HELM notation.
|
|
1734
|
+
Convenience wrapper for convert_molecules_batch with input_type="smiles".
|
|
1735
|
+
|
|
1736
|
+
Args:
|
|
1737
|
+
smiles_list: List of SMILES strings
|
|
1738
|
+
library_json: Optional monomer library as JSON string
|
|
1739
|
+
|
|
1740
|
+
Returns:
|
|
1741
|
+
List of tuples: (success: bool, helm_notation: str)
|
|
1742
|
+
"""
|
|
1743
|
+
return convert_molecules_batch(smiles_list, library_json=library_json, input_type="smiles")
|
|
1744
|
+
|
|
1470
1745
|
res_helm_list = convert_molecules_batch(molListToProcess, library_json=libraryJSON)
|
|
1471
1746
|
result_helm = pd.DataFrame(map(lambda x: x[1], res_helm_list), columns=["regenerated sequences"])
|
package/src/package.g.ts
CHANGED
|
@@ -277,7 +277,7 @@ export async function moleculesToHelmTopMenu(table: DG.DataFrame, molecules: DG.
|
|
|
277
277
|
//description: Converts sequences to molblocks
|
|
278
278
|
//input: dataframe table { description: Input data table }
|
|
279
279
|
//input: column seqCol { semType: Macromolecule; caption: Sequence }
|
|
280
|
-
//input: bool nonlinear = false { caption: Non-linear; description: Slower mode for cycling/branching HELM structures }
|
|
280
|
+
//input: bool nonlinear = true { caption: Non-linear; description: Slower mode for cycling/branching HELM structures }
|
|
281
281
|
//input: bool highlight = false { caption: Highlight monomers; description: Highlight monomers' substructures of the molecule }
|
|
282
282
|
//top-menu: Bio | Transform | To Atomic Level...
|
|
283
283
|
export async function toAtomicLevel(table: DG.DataFrame, seqCol: DG.Column, nonlinear: boolean, highlight: boolean) : Promise<void> {
|
package/src/package.ts
CHANGED
|
@@ -651,7 +651,7 @@ export class PackageFunctions {
|
|
|
651
651
|
static async toAtomicLevel(
|
|
652
652
|
@grok.decorators.param({options: {description: 'Input data table'}})table: DG.DataFrame,
|
|
653
653
|
@grok.decorators.param({options: {semType: 'Macromolecule', caption: 'Sequence'}})seqCol: DG.Column,
|
|
654
|
-
@grok.decorators.param({options: {initialValue: '
|
|
654
|
+
@grok.decorators.param({options: {initialValue: 'true', caption: 'Non-linear', description: 'Slower mode for cycling/branching HELM structures'}}) nonlinear: boolean = true,
|
|
655
655
|
@grok.decorators.param({options: {initialValue: 'false', caption: 'Highlight monomers', description: 'Highlight monomers\' substructures of the molecule'}}) highlight: boolean = false
|
|
656
656
|
): Promise<void> {
|
|
657
657
|
const pi = DG.TaskBarProgressIndicator.create('Converting to atomic level ...');
|