RNApolis 0.10.4__py3-none-any.whl → 0.10.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rnapolis/adapter.py CHANGED
@@ -8,7 +8,7 @@ from collections import defaultdict
8
8
  from dataclasses import dataclass
9
9
  from enum import Enum
10
10
  from tempfile import NamedTemporaryFile
11
- from typing import DefaultDict, Dict, List, Optional, Set, Tuple
11
+ from typing import Any, DefaultDict, Dict, List, Optional, Set, Tuple, Union
12
12
 
13
13
  import orjson
14
14
 
@@ -49,6 +49,7 @@ class ExternalTool(Enum):
49
49
  BPNET = "bpnet"
50
50
  MAXIT = "maxit"
51
51
  BARNABA = "barnaba"
52
+ MCANNOTATE = "mc-annotate"
52
53
 
53
54
 
54
55
  logging.basicConfig(level=os.getenv("LOGLEVEL", "INFO").upper())
@@ -68,25 +69,30 @@ def auto_detect_tool(external_files: List[str]) -> ExternalTool:
68
69
  return ExternalTool.MAXIT
69
70
 
70
71
  for file_path in external_files:
72
+ basename = os.path.basename(file_path)
73
+
71
74
  # Check for FR3D pattern
72
- if file_path.endswith("basepair_detail.txt"):
75
+ if basename.endswith("basepair_detail.txt"):
73
76
  return ExternalTool.FR3D
74
77
 
75
78
  # Check for RNAView pattern
76
- if file_path.endswith(".out"):
79
+ if basename.endswith(".out"):
77
80
  return ExternalTool.RNAVIEW
78
81
 
79
82
  # Check for BPNet pattern
80
- if file_path.endswith("basepair.json"):
83
+ if basename.endswith("basepair.json"):
81
84
  return ExternalTool.BPNET
82
85
 
86
+ # Check for MC-Annotate pattern
87
+ if basename.endswith("stdout.txt"):
88
+ return ExternalTool.MCANNOTATE
89
+
83
90
  # Check for Barnaba pattern
84
- basename = os.path.basename(file_path)
85
91
  if "pairing" in basename or "stacking" in basename:
86
92
  return ExternalTool.BARNABA
87
93
 
88
94
  # Check for JSON files (DSSR)
89
- if file_path.endswith(".json"):
95
+ if basename.endswith(".json"):
90
96
  return ExternalTool.DSSR
91
97
 
92
98
  # Default to MAXIT if no patterns match
@@ -317,10 +323,14 @@ def parse_dssr_output(
317
323
  if nt1 is not None and nt2 is not None:
318
324
  stackings.append(Stacking(nt1, nt2, None))
319
325
 
320
- return BaseInteractions(base_pairs, stackings, [], [], [])
326
+ return BaseInteractions.from_structure3d(
327
+ structure3d, base_pairs, stackings, [], [], []
328
+ )
321
329
 
322
330
 
323
- def parse_maxit_output(file_paths: List[str]) -> BaseInteractions:
331
+ def parse_maxit_output(
332
+ file_paths: List[str], structure3d: Structure3D
333
+ ) -> BaseInteractions:
324
334
  """
325
335
  Parse MAXIT output files and convert to BaseInteractions.
326
336
 
@@ -448,10 +458,14 @@ def parse_maxit_output(file_paths: List[str]) -> BaseInteractions:
448
458
  except Exception as e:
449
459
  logging.warning(f"Error processing MAXIT file {cif_file}: {e}", exc_info=True)
450
460
 
451
- return BaseInteractions(all_base_pairs, [], [], [], all_other_interactions)
461
+ return BaseInteractions.from_structure3d(
462
+ structure3d, all_base_pairs, [], [], [], all_other_interactions
463
+ )
452
464
 
453
465
 
454
- def parse_bpnet_output(file_paths: List[str]) -> BaseInteractions:
466
+ def parse_bpnet_output(
467
+ file_paths: List[str], structure3d: Structure3D
468
+ ) -> BaseInteractions:
455
469
  """
456
470
  Parse BPNet output files and convert to BaseInteractions.
457
471
 
@@ -649,7 +663,8 @@ def parse_bpnet_output(file_paths: List[str]) -> BaseInteractions:
649
663
  f"Error processing BPNet rob file {rob_file}: {e}", exc_info=True
650
664
  )
651
665
 
652
- return BaseInteractions(
666
+ return BaseInteractions.from_structure3d(
667
+ structure3d,
653
668
  base_pairs,
654
669
  stackings,
655
670
  base_ribose_interactions,
@@ -986,7 +1001,8 @@ def parse_rnaview_output(
986
1001
  except Exception as e:
987
1002
  logging.warning(f"Error processing RNAView file {out_file}: {e}", exc_info=True)
988
1003
 
989
- return BaseInteractions(
1004
+ return BaseInteractions.from_structure3d(
1005
+ structure3d,
990
1006
  base_pairs,
991
1007
  stackings,
992
1008
  base_ribose_interactions,
@@ -1142,7 +1158,349 @@ def parse_barnaba_output(
1142
1158
  f"Unknown barnaba stacking topology: {interaction_str}"
1143
1159
  )
1144
1160
 
1145
- return BaseInteractions(base_pairs, stackings, [], [], other_interactions)
1161
+ return BaseInteractions.from_structure3d(
1162
+ structure3d, base_pairs, stackings, [], [], other_interactions
1163
+ )
1164
+
1165
+
1166
class MCAnnotateAdapter:
    """Convert an MC-Annotate text report into interaction objects.

    The report is split into sections, each opened by a line starting with
    one of the ``ParseState`` values.  Residue names are not part of the
    stacking sections, so they are read beforehand from the PDB file
    content and kept in ``self.names``.

    An instance accumulates results across calls; create a fresh adapter
    for every report.
    """

    class ParseState(str, Enum):
        """Report section; each value is the prefix of its header line."""

        RESIDUES_INFORMATION = "Residue conformations"
        ADJACENT_STACKINGS = "Adjacent stackings"
        NON_ADJACENT_STACKINGS = "Non-Adjacent stackings"
        BASE_PAIRS_SECTION = "Base-pairs"
        SUMMARY_SECTION = "Number of"

    # Maps the edges of our model to the edge names used by MC-Annotate.
    EDGES: Dict[str, Tuple[str, ...]] = {
        "H": ("Hh", "Hw", "Bh", "C8"),
        "W": ("Wh", "Ww", "Ws"),
        "S": ("Ss", "Sw", "Bs"),
    }

    # All EDGES values flattened into a single tuple.
    ALL_EDGES = sum(EDGES.values(), ())

    # Tokens containing these atoms produce BaseRibose / BasePhosphate.
    RIBOSE_ATOM = "O2'"
    PHOSPHATE_ATOM = "O2P"

    # Single hydrogen bond - reported by us as OtherInteraction.
    ONE_HBOND = "one_hbond"

    # Cis/trans tokens used by MC-Annotate.
    CIS = "cis"
    TRANS = "trans"

    # Record names used in PDB files.
    ATOM = "ATOM"
    HETATM = "HETATM"

    # Captures 6 groups describing a pair of residues:
    #   (1) (2) (3) - (4) (5) (6)
    #   1, 4 - chain IDs
    #   2, 5 - residue numbers
    #   3, 6 - insertion codes (empty string when absent)
    # Example: "A-100.X-B200" -> ('A', '-100', 'X', 'B', '200', '')
    RESIDUE_REGEX = re.compile(
        r"'?(.)'?(-?[0-9]+)\.?([a-zA-Z]?)-'?(.)'?(-?[0-9]+)\.?([a-zA-Z]?)"
    )

    # Characters of the Roman numerals used for Saenger classes.
    ROMAN_NUMERALS = ("I", "V", "X")

    # Positions of residue information inside a PDB ATOM/HETATM record.
    CHAIN_INDEX = 21
    NUMBER_INDEX = slice(22, 26)
    ICODE_INDEX = 26
    NAME_INDEX = slice(17, 20)

    def __init__(self) -> None:
        # Residue names keyed by "<chain><number>" or
        # "<chain><number>.<icode>"; filled by append_names().
        self.names: Dict[str, str] = {}
        self.base_pairs: List[BasePair] = []
        self.stackings: List[Stacking] = []
        self.base_ribose_interactions: List[BaseRibose] = []
        self.base_phosphate_interactions: List[BasePhosphate] = []
        self.other_interactions: List[OtherInteraction] = []

    def classify_edge(self, edge_type: str) -> Optional[str]:
        """Return "H", "W" or "S" for an MC-Annotate edge name, else None."""
        for edge, edges in self.EDGES.items():
            if edge_type in edges:
                return edge
        logging.warning(f'Edge type "{edge_type}" unknown')
        return None

    def get_residue(
        self, residue_info_list: Tuple[Union[str, Any], ...]
    ) -> "Residue":
        """Build a Residue from a ``(chain, number, icode)`` triple.

        Raises:
            KeyError: the residue was not registered by append_names().
        """
        chain = residue_info_list[0]
        number = int(residue_info_list[1])

        if residue_info_list[2] == "":
            icode = None
            residue_info = f"{chain}{number}"
        else:
            icode = residue_info_list[2]
            residue_info = f"{chain}{number}.{icode}"

        # Look the name up first so that a missing residue is reported as
        # a plain KeyError before any object is constructed.
        name = self.names[residue_info]
        return Residue(None, ResidueAuth(chain, number, icode, name))

    def get_residues(
        self, residues_info: str
    ) -> Tuple[Optional["Residue"], Optional["Residue"]]:
        """Parse a ``<residue>-<residue>`` field into two Residue objects.

        Returns ``(None, None)`` when the field does not match
        RESIDUE_REGEX or when either residue is unknown to ``self.names``.
        """
        match = self.RESIDUE_REGEX.search(residues_info)
        if match is None:
            logging.error(f"MC-Annotate regex failed: {residues_info}")
            return None, None

        # (chain1, number1, icode1, chain2, number2, icode2)
        groups = match.groups()
        try:
            return self.get_residue(groups[:3]), self.get_residue(groups[3:])
        except KeyError as error:
            logging.warning(
                f"Residue {error} not found in PDB file: {residues_info}"
            )
            return None, None

    def append_stacking(self, line: str, topology_position: int) -> None:
        """Parse one stacking line and store the resulting Stacking.

        ``topology_position`` is the index of the whitespace-separated
        field holding the topology name.
        """
        fields = line.split()
        if len(fields) <= topology_position:
            # Blank lines are silently ignored, truncated ones are reported.
            if fields:
                logging.warning(f"Could not parse stacking in line: {line}")
            return

        residue_left, residue_right = self.get_residues(fields[0])
        if residue_left is None or residue_right is None:
            logging.warning(f"Could not parse residues in line: {line}")
            return

        try:
            topology = StackingTopology[fields[topology_position]]
        except KeyError:
            logging.warning(f"Unknown stacking topology in line: {line}")
            return
        self.stackings.append(Stacking(residue_left, residue_right, topology))

    @staticmethod
    def _base_first(
        residues: Tuple["Residue", "Residue"], token: str, partner_atom: str
    ) -> Tuple["Residue", "Residue"]:
        """Order residues so that the one contributing the base edge is first."""
        if token.split("/", 1)[0] == partner_atom:
            return residues[1], residues[0]
        return residues[0], residues[1]

    def get_ribose_interaction(
        self, residues: Tuple["Residue", "Residue"], token: str
    ) -> "BaseRibose":
        """Build a BaseRibose; residues are swapped if the token starts with O2'."""
        base, ribose = self._base_first(residues, token, self.RIBOSE_ATOM)
        return BaseRibose(base, ribose, None)

    def get_phosphate_interaction(
        self, residues: Tuple["Residue", "Residue"], token: str
    ) -> "BasePhosphate":
        """Build a BasePhosphate; residues are swapped if the token starts with O2P."""
        base, phosphate = self._base_first(residues, token, self.PHOSPHATE_ATOM)
        return BasePhosphate(base, phosphate, None)

    def get_base_interaction(
        self,
        residues: Tuple["Residue", "Residue"],
        token: str,
        tokens: List[str],
    ) -> Optional["BasePair"]:
        """Build a BasePair from an ``<edge>/<edge>`` token.

        ``tokens`` are all annotation tokens of the line; they provide the
        cis/trans flag and, optionally, the Saenger class.  Returns None
        when cis/trans is missing or an edge cannot be classified.
        """
        if self.CIS in tokens:
            cis_trans = "c"
        elif self.TRANS in tokens:
            cis_trans = "t"
        else:
            logging.warning(f"Cis/trans expected, but not present in {tokens}")
            return None

        # Example Saenger tokens: "XIX" or "XII,XIII" (first one is used).
        saenger = None
        for candidate in tokens:
            numeral = candidate.split(",")[0]
            if numeral and all(char in self.ROMAN_NUMERALS for char in numeral):
                try:
                    saenger = Saenger[numeral]
                except KeyError:
                    # Looks Roman but is not a Saenger class; keep looking.
                    continue
                break

        left_edge, right_edge = token.split("/", 1)
        leontis_westhof_left = self.classify_edge(left_edge)
        leontis_westhof_right = self.classify_edge(right_edge)
        if leontis_westhof_left is None or leontis_westhof_right is None:
            return None

        leontis_westhof = LeontisWesthof[
            f"{cis_trans}{leontis_westhof_left}{leontis_westhof_right}"
        ]
        residue_left, residue_right = residues
        return BasePair(residue_left, residue_right, leontis_westhof, saenger)

    def get_other_interaction(
        self, residues: Tuple["Residue", "Residue"]
    ) -> "OtherInteraction":
        """Build an OtherInteraction between the two residues."""
        return OtherInteraction(residues[0], residues[1])

    def append_interactions(self, line: str) -> None:
        """Parse one line of the base-pairs section and store what it describes.

        A residue pair contributes at most one interaction of each kind
        (base pair, base-ribose, base-phosphate).
        """
        fields = line.split()
        if not fields:
            return

        residues = self.get_residues(fields[0])
        if residues[0] is None or residues[1] is None:
            logging.warning(f"Could not parse residues in line: {line}")
            return

        # Example tokens: Ww/Ww pairing antiparallel cis XX
        tokens: List[str] = fields[3:]

        # Special case: a single hydrogen bond that involves bases only
        # (no ribose or phosphate atom) is stored as OtherInteraction.
        if self.ONE_HBOND in tokens and not any(
            self.RIBOSE_ATOM in token or self.PHOSPHATE_ATOM in token
            for token in tokens
        ):
            self.other_interactions.append(self.get_other_interaction(residues))
            return

        base_added, ribose_added, phosphate_added = False, False, False
        for token in tokens:
            if self.RIBOSE_ATOM in token and not ribose_added:
                # Example token: Ss/O2'
                self.base_ribose_interactions.append(
                    self.get_ribose_interaction(residues, token)
                )
                ribose_added = True

            elif self.PHOSPHATE_ATOM in token and not phosphate_added:
                # Example token: O2P/Bh
                self.base_phosphate_interactions.append(
                    self.get_phosphate_interaction(residues, token)
                )
                phosphate_added = True

            elif "/" in token:
                token_left, token_right = token.split("/", 1)
                tokens_in_edges = (
                    token_left in self.ALL_EDGES and token_right in self.ALL_EDGES
                )
                if tokens_in_edges and not base_added:
                    # Example token_left: Ww | example token_right: Ws
                    base_pair = self.get_base_interaction(residues, token, tokens)
                    if base_pair is not None:
                        self.base_pairs.append(base_pair)
                        base_added = True

    def append_names(self, file_content: str) -> None:
        """Record the residue name of every ATOM/HETATM record in a PDB text."""
        for line in file_content.splitlines():
            if not line.startswith((self.ATOM, self.HETATM)):
                continue
            if len(line) <= self.ICODE_INDEX:
                # Truncated record: the fixed columns read below are missing.
                continue
            chain = line[self.CHAIN_INDEX].strip()
            number = line[self.NUMBER_INDEX].strip()
            icode = line[self.ICODE_INDEX].strip()
            name = line[self.NAME_INDEX].strip()
            residue_info = (
                f"{chain}{number}" if icode == "" else f"{chain}{number}.{icode}"
            )
            self.names[residue_info] = name

    def analyze_by_mc_annotate(
        self, pdb_content: str, mc_result: str, **_: Any
    ) -> Tuple[
        List["BasePair"],
        List["Stacking"],
        List["BaseRibose"],
        List["BasePhosphate"],
        List["OtherInteraction"],
    ]:
        """Parse an MC-Annotate report.

        Args:
            pdb_content: text of the PDB file, used for residue names.
            mc_result: text printed by MC-Annotate.

        Returns:
            A tuple ``(base_pairs, stackings, base_ribose_interactions,
            base_phosphate_interactions, other_interactions)``.
        """
        self.append_names(pdb_content)
        current_state = None

        for line in mc_result.splitlines():
            header = next(
                (state for state in self.ParseState if line.startswith(state.value)),
                None,
            )
            if header is not None:
                current_state = header
                continue

            if current_state == self.ParseState.ADJACENT_STACKINGS:
                # Example line: X4.E-X5.F : adjacent_5p upward
                self.append_stacking(line, 3)
            elif current_state == self.ParseState.NON_ADJACENT_STACKINGS:
                # Example line: Y40.M-Y67.N : inward pairing
                self.append_stacking(line, 2)
            elif current_state == self.ParseState.BASE_PAIRS_SECTION:
                # Example line: Y38.K-Y51.X : A-U Ww/Ww pairing antiparallel cis XX
                self.append_interactions(line)
            # Residue conformations (e.g. "X7.H : G C3p_endo anti") and the
            # summary (e.g. "Number of non adjacent stackings = 26") carry
            # nothing we need, so lines in those sections are skipped.

        return (
            self.base_pairs,
            self.stackings,
            self.base_ribose_interactions,
            self.base_phosphate_interactions,
            self.other_interactions,
        )
1450
+
1451
+
1452
def parse_mcannotate_output(
    file_paths: List[str], structure3d: Structure3D
) -> BaseInteractions:
    """
    Turn MC-Annotate results into BaseInteractions.

    Two inputs are picked from ``file_paths``: the captured stdout (a file
    whose basename ends with ``stdout.txt``) and the PDB file (a path ending
    with ``.pdb``).  If several candidates exist, the last one wins.  When
    either file is missing or unreadable, a warning is logged and an empty
    BaseInteractions is returned.
    """
    report_path = None
    pdb_path = None
    for candidate in file_paths:
        if os.path.basename(candidate).endswith("stdout.txt"):
            report_path = candidate
            continue
        if candidate.endswith(".pdb"):
            pdb_path = candidate

    # The stdout file is checked first, so its warning takes precedence.
    required = (
        (report_path, "No stdout.txt file found for mc-annotate."),
        (pdb_path, "No PDB file found for mc-annotate."),
    )
    for found_path, complaint in required:
        if not found_path:
            logging.warning(complaint)
            return BaseInteractions([], [], [], [], [])

    logging.info(f"Processing mc-annotate stdout file: {report_path}")
    logging.info(f"Using structure file for residue names: {pdb_path}")

    contents = []
    try:
        for path in (report_path, pdb_path):
            with open(path, "r") as handle:
                contents.append(handle.read())
    except Exception as e:
        logging.warning(f"Could not read input files for mc-annotate: {e}")
        return BaseInteractions([], [], [], [], [])
    mc_result, pdb_content = contents

    # The adapter returns the five interaction lists in the order expected
    # by BaseInteractions.from_structure3d.
    interactions = MCAnnotateAdapter().analyze_by_mc_annotate(
        pdb_content, mc_result
    )
    return BaseInteractions.from_structure3d(structure3d, *interactions)
1146
1504
 
1147
1505
 
1148
1506
  def parse_external_output(
@@ -1160,22 +1518,26 @@ def parse_external_output(
1160
1518
  BaseInteractions object containing the interactions found by the external tool
1161
1519
  """
1162
1520
  if tool == ExternalTool.FR3D:
1163
- return parse_fr3d_output(file_paths)
1521
+ return parse_fr3d_output(file_paths, structure3d)
1164
1522
  elif tool == ExternalTool.DSSR:
1165
1523
  return parse_dssr_output(file_paths, structure3d)
1166
1524
  elif tool == ExternalTool.MAXIT:
1167
- return parse_maxit_output(file_paths)
1525
+ return parse_maxit_output(file_paths, structure3d)
1168
1526
  elif tool == ExternalTool.BPNET:
1169
- return parse_bpnet_output(file_paths)
1527
+ return parse_bpnet_output(file_paths, structure3d)
1170
1528
  elif tool == ExternalTool.RNAVIEW:
1171
1529
  return parse_rnaview_output(file_paths, structure3d)
1172
1530
  elif tool == ExternalTool.BARNABA:
1173
1531
  return parse_barnaba_output(file_paths, structure3d)
1532
+ elif tool == ExternalTool.MCANNOTATE:
1533
+ return parse_mcannotate_output(file_paths, structure3d)
1174
1534
  else:
1175
1535
  raise ValueError(f"Unsupported external tool: {tool}")
1176
1536
 
1177
1537
 
1178
- def parse_fr3d_output(file_paths: List[str]) -> BaseInteractions:
1538
+ def parse_fr3d_output(
1539
+ file_paths: List[str], structure3d: Structure3D
1540
+ ) -> BaseInteractions:
1179
1541
  """
1180
1542
  Parse FR3D output files and convert to BaseInteractions.
1181
1543
 
@@ -1208,7 +1570,8 @@ def parse_fr3d_output(file_paths: List[str]) -> BaseInteractions:
1208
1570
  _process_interaction_line(line, interactions_data)
1209
1571
 
1210
1572
  # Return a BaseInteractions object with all the processed interactions
1211
- return BaseInteractions(
1573
+ return BaseInteractions.from_structure3d(
1574
+ structure3d,
1212
1575
  interactions_data["base_pairs"],
1213
1576
  interactions_data["stackings"],
1214
1577
  interactions_data["base_ribose_interactions"],
@@ -1244,6 +1607,9 @@ def process_external_tool_output(
1244
1607
  if not external_file_paths:
1245
1608
  # For MAXIT or when no external files are provided, use the input file
1246
1609
  file_paths_to_process = [input_file_path]
1610
+ elif tool == ExternalTool.MCANNOTATE:
1611
+ # MC-Annotate requires both the stdout and the PDB file
1612
+ file_paths_to_process = external_file_paths + [input_file_path]
1247
1613
  else:
1248
1614
  # Process all external files
1249
1615
  file_paths_to_process = external_file_paths
rnapolis/annotator.py CHANGED
@@ -85,15 +85,6 @@ def detect_cis_trans(residue_i: Residue3D, residue_j: Residue3D) -> Optional[str
85
85
  return "c" if -90.0 < torsion < 90.0 else "t"
86
86
 
87
87
 
88
- def detect_saenger(
89
- residue_i: Residue3D, residue_j: Residue3D, lw: LeontisWesthof
90
- ) -> Optional[Saenger]:
91
- key = (f"{residue_i.one_letter_name}{residue_j.one_letter_name}", lw.value)
92
- if key in Saenger.table():
93
- return Saenger[Saenger.table()[key]]
94
- return None
95
-
96
-
97
88
  def detect_bph_br_classification(
98
89
  donor_residue: Residue3D, donor: Atom, acceptor: Atom
99
90
  ) -> Optional[int]:
@@ -367,7 +358,9 @@ def find_pairs(
367
358
  Residue(residue_i.label, residue_i.auth),
368
359
  Residue(residue_j.label, residue_j.auth),
369
360
  lw,
370
- detect_saenger(residue_i, residue_j, lw),
361
+ Saenger.from_leontis_westhof(
362
+ residue_i.one_letter_name, residue_j.one_letter_name, lw
363
+ ),
371
364
  )
372
365
  )
373
366
 
@@ -483,7 +476,9 @@ def extract_base_interactions(
483
476
  ) -> BaseInteractions:
484
477
  base_pairs, base_phosphate, base_ribose = find_pairs(tertiary_structure, model)
485
478
  stackings = find_stackings(tertiary_structure, model)
486
- return BaseInteractions(base_pairs, stackings, base_ribose, base_phosphate, [])
479
+ return BaseInteractions.from_structure3d(
480
+ tertiary_structure, base_pairs, stackings, base_ribose, base_phosphate, []
481
+ )
487
482
 
488
483
 
489
484
  def generate_pymol_script(mapping: Mapping2D3D, stems: List[Stem]) -> str:
@@ -688,91 +683,6 @@ def add_common_output_arguments(parser: argparse.ArgumentParser):
688
683
  )
689
684
 
690
685
 
691
- def unify_structure_data(structure2d: Structure2D, mapping: Mapping2D3D) -> Structure2D:
692
- """
693
- Unify structure data by:
694
- 1. Adding missing Saenger classifications to base pairs
695
- 2. Filling in empty residue labels from Structure3D
696
- """
697
- # Create a mapping from residue to residue3d for label filling
698
- residue_to_residue3d = {}
699
- for residue3d in mapping.structure3d.residues:
700
- residue_key = Residue(residue3d.label, residue3d.auth)
701
- residue_to_residue3d[residue_key] = residue3d
702
-
703
- def fill_residue_label(residue: Residue) -> Residue:
704
- """Fill empty label from Structure3D if available."""
705
- if residue.label is not None:
706
- return residue
707
-
708
- # Try to find matching residue3d by auth
709
- for residue3d in mapping.structure3d.residues:
710
- if residue.auth == residue3d.auth:
711
- return Residue(residue3d.label, residue.auth)
712
-
713
- return residue
714
-
715
- # Process base pairs
716
- unified_base_pairs = []
717
- for base_pair in structure2d.base_pairs:
718
- # Fill in missing labels
719
- nt1 = fill_residue_label(base_pair.nt1)
720
- nt2 = fill_residue_label(base_pair.nt2)
721
-
722
- # Detect missing Saenger classification
723
- saenger = base_pair.saenger
724
- if saenger is None:
725
- # Find corresponding 3D residues for Saenger detection
726
- residue3d_1 = residue_to_residue3d.get(Residue(nt1.label, nt1.auth))
727
- residue3d_2 = residue_to_residue3d.get(Residue(nt2.label, nt2.auth))
728
-
729
- if residue3d_1 is not None and residue3d_2 is not None:
730
- saenger = detect_saenger(residue3d_1, residue3d_2, base_pair.lw)
731
-
732
- unified_base_pairs.append(BasePair(nt1, nt2, base_pair.lw, saenger))
733
-
734
- # Process other interaction types (fill labels only)
735
- unified_stackings = []
736
- for stacking in structure2d.stackings:
737
- nt1 = fill_residue_label(stacking.nt1)
738
- nt2 = fill_residue_label(stacking.nt2)
739
- unified_stackings.append(Stacking(nt1, nt2, stacking.topology))
740
-
741
- unified_base_ribose = []
742
- for base_ribose in structure2d.base_ribose_interactions:
743
- nt1 = fill_residue_label(base_ribose.nt1)
744
- nt2 = fill_residue_label(base_ribose.nt2)
745
- unified_base_ribose.append(BaseRibose(nt1, nt2, base_ribose.br))
746
-
747
- unified_base_phosphate = []
748
- for base_phosphate in structure2d.base_phosphate_interactions:
749
- nt1 = fill_residue_label(base_phosphate.nt1)
750
- nt2 = fill_residue_label(base_phosphate.nt2)
751
- unified_base_phosphate.append(BasePhosphate(nt1, nt2, base_phosphate.bph))
752
-
753
- unified_other = []
754
- for other in structure2d.other_interactions:
755
- nt1 = fill_residue_label(other.nt1)
756
- nt2 = fill_residue_label(other.nt2)
757
- unified_other.append(OtherInteraction(nt1, nt2))
758
-
759
- # Create new Structure2D with unified data
760
- unified_base_interactions = BaseInteractions(
761
- unified_base_pairs,
762
- unified_stackings,
763
- unified_base_ribose,
764
- unified_base_phosphate,
765
- unified_other,
766
- )
767
-
768
- # Recreate Structure2D with unified interactions
769
- unified_structure2d, _ = mapping.structure3d.extract_secondary_structure(
770
- unified_base_interactions, False
771
- )
772
-
773
- return unified_structure2d
774
-
775
-
776
686
  def handle_output_arguments(
777
687
  args: argparse.Namespace,
778
688
  structure2d: Structure2D,
@@ -780,34 +690,31 @@ def handle_output_arguments(
780
690
  input_filename: str,
781
691
  ):
782
692
  """Handles writing output based on provided arguments."""
783
- # Unify the structure data before processing outputs
784
- unified_structure2d = unify_structure_data(structure2d, mapping)
785
-
786
693
  input_basename = os.path.basename(input_filename)
787
694
  if args.csv:
788
- write_csv(args.csv, unified_structure2d)
695
+ write_csv(args.csv, structure2d)
789
696
 
790
697
  if args.json:
791
- write_json(args.json, unified_structure2d)
698
+ write_json(args.json, structure2d)
792
699
 
793
700
  if args.bpseq:
794
- write_bpseq(args.bpseq, unified_structure2d.bpseq)
701
+ write_bpseq(args.bpseq, structure2d.bpseq)
795
702
 
796
703
  if args.extended:
797
- print(unified_structure2d.extended_dot_bracket)
704
+ print(structure2d.extended_dot_bracket)
798
705
  else:
799
- print(unified_structure2d.dot_bracket)
706
+ print(structure2d.dot_bracket)
800
707
 
801
708
  if args.dot:
802
- print(BpSeq.from_string(unified_structure2d.bpseq).graphviz)
709
+ print(BpSeq.from_string(structure2d.bpseq).graphviz)
803
710
 
804
711
  if args.pml:
805
- pml_script = generate_pymol_script(mapping, unified_structure2d.stems)
712
+ pml_script = generate_pymol_script(mapping, structure2d.stems)
806
713
  with open(args.pml, "w") as f:
807
714
  f.write(pml_script)
808
715
 
809
716
  if args.inter_stem_csv:
810
- if unified_structure2d.inter_stem_parameters:
717
+ if structure2d.inter_stem_parameters:
811
718
  # Convert list of dataclasses to list of dicts
812
719
  params_list = [
813
720
  {
@@ -820,7 +727,7 @@ def handle_output_arguments(
820
727
  "min_endpoint_distance_pdf": p.min_endpoint_distance_pdf,
821
728
  "coaxial_probability": p.coaxial_probability,
822
729
  }
823
- for p in unified_structure2d.interStemParameters
730
+ for p in structure2d.interStemParameters
824
731
  ]
825
732
  df = pd.DataFrame(params_list)
826
733
  df["input_basename"] = input_basename
@@ -838,9 +745,9 @@ def handle_output_arguments(
838
745
  # pd.DataFrame(columns=['input_basename', 'stem1_idx', ...]).to_csv(args.inter_stem_csv, index=False)
839
746
 
840
747
  if args.stems_csv:
841
- if unified_structure2d.stems:
748
+ if structure2d.stems:
842
749
  stems_data = []
843
- for i, stem in enumerate(unified_structure2d.stems):
750
+ for i, stem in enumerate(structure2d.stems):
844
751
  try:
845
752
  res5p_first = mapping.bpseq_index_to_residue_map.get(
846
753
  stem.strand5p.first
rnapolis/common.py CHANGED
@@ -5,7 +5,7 @@ import re
5
5
  import string
6
6
  from collections import defaultdict
7
7
  from collections.abc import Sequence
8
- from dataclasses import dataclass
8
+ from dataclasses import InitVar, dataclass
9
9
  from enum import Enum
10
10
  from functools import cache, cached_property, total_ordering
11
11
  from typing import Dict, List, Optional, Tuple
@@ -152,6 +152,18 @@ class Saenger(Enum):
152
152
  ("TG", "cWW"): "XXVIII",
153
153
  }
154
154
 
155
@classmethod
def from_leontis_westhof(
    cls,
    residue_i_one_letter_name: str,
    residue_j_one_letter_name: str,
    lw: LeontisWesthof,
) -> Optional["Saenger"]:
    """Look up the Saenger class for a pair of residues and an LW class.

    The lookup key is the two one-letter names joined together plus the
    value of ``lw``; None is returned when the table has no such entry.
    """
    sequence = f"{residue_i_one_letter_name}{residue_j_one_letter_name}"
    member_name = cls.table().get((sequence, lw.value))
    if member_name is None:
        return None
    return cls[member_name]
166
+
155
167
  @property
156
168
  def is_canonical(self) -> bool:
157
169
  return self == Saenger.XIX or self == Saenger.XX or self == Saenger.XXVIII
@@ -1062,6 +1074,91 @@ class BaseInteractions:
1062
1074
  base_phosphate_interactions: List[BasePhosphate]
1063
1075
  other_interactions: List[OtherInteraction]
1064
1076
 
1077
@classmethod
def from_structure3d(
    cls,
    structure3d: "Structure3D",
    base_pairs: List[BasePair],
    stackings: List[Stacking],
    base_ribose_interactions: List[BaseRibose],
    base_phosphate_interactions: List[BasePhosphate],
    other_interactions: List[OtherInteraction],
) -> "BaseInteractions":
    """Build BaseInteractions with residues unified against a 3D structure.

    For every interaction, a residue that has only ``auth`` or only
    ``label`` gets the missing part filled in from ``structure3d`` (when a
    matching residue exists).  Base pairs without a Saenger class get one
    derived from the residue names and the Leontis-Westhof class, provided
    both residues are present in the structure.  Interactions that need no
    change are passed through as the same objects.
    """
    auth2residue3d = {}
    auth2label = {}
    label2auth = {}

    for residue3d in structure3d.residues:
        auth2residue3d[residue3d.auth] = residue3d
        auth2label[residue3d.auth] = residue3d.label
        label2auth[residue3d.label] = residue3d.auth

    def unify_nt(nt: Residue) -> Residue:
        if nt.auth is not None and nt.label is not None:
            return nt
        if nt.auth is not None:
            return Residue(label=auth2label.get(nt.auth, None), auth=nt.auth)
        if nt.label is not None:
            return Residue(label=nt.label, auth=label2auth.get(nt.label, None))
        return nt

    base_pairs_new = []
    for base_pair in base_pairs:
        nt1 = unify_nt(base_pair.nt1)
        nt2 = unify_nt(base_pair.nt2)
        saenger = base_pair.saenger
        if saenger is None:
            # External tools may report residues that are absent from the
            # structure; in that case the Saenger class stays unknown
            # instead of failing with KeyError.
            residue3d_1 = auth2residue3d.get(nt1.auth)
            residue3d_2 = auth2residue3d.get(nt2.auth)
            if residue3d_1 is not None and residue3d_2 is not None:
                saenger = Saenger.from_leontis_westhof(
                    residue3d_1.one_letter_name,
                    residue3d_2.one_letter_name,
                    base_pair.lw,
                )
        if (
            nt1 != base_pair.nt1
            or nt2 != base_pair.nt2
            or saenger != base_pair.saenger
        ):
            base_pair = BasePair(nt1=nt1, nt2=nt2, lw=base_pair.lw, saenger=saenger)
        base_pairs_new.append(base_pair)

    stackings_new = []
    for stacking in stackings:
        nt1 = unify_nt(stacking.nt1)
        nt2 = unify_nt(stacking.nt2)
        if nt1 != stacking.nt1 or nt2 != stacking.nt2:
            stacking = Stacking(nt1=nt1, nt2=nt2, topology=stacking.topology)
        stackings_new.append(stacking)

    base_ribose_interactions_new = []
    for base_ribose in base_ribose_interactions:
        nt1 = unify_nt(base_ribose.nt1)
        nt2 = unify_nt(base_ribose.nt2)
        if nt1 != base_ribose.nt1 or nt2 != base_ribose.nt2:
            base_ribose = BaseRibose(nt1=nt1, nt2=nt2, br=base_ribose.br)
        base_ribose_interactions_new.append(base_ribose)

    base_phosphate_interactions_new = []
    for base_phosphate in base_phosphate_interactions:
        nt1 = unify_nt(base_phosphate.nt1)
        nt2 = unify_nt(base_phosphate.nt2)
        if nt1 != base_phosphate.nt1 or nt2 != base_phosphate.nt2:
            base_phosphate = BasePhosphate(nt1=nt1, nt2=nt2, bph=base_phosphate.bph)
        base_phosphate_interactions_new.append(base_phosphate)

    other_interactions_new = []
    for other_interaction in other_interactions:
        nt1 = unify_nt(other_interaction.nt1)
        nt2 = unify_nt(other_interaction.nt2)
        if nt1 != other_interaction.nt1 or nt2 != other_interaction.nt2:
            other_interaction = OtherInteraction(nt1=nt1, nt2=nt2)
        other_interactions_new.append(other_interaction)

    return cls(
        base_pairs=base_pairs_new,
        stackings=stackings_new,
        base_ribose_interactions=base_ribose_interactions_new,
        base_phosphate_interactions=base_phosphate_interactions_new,
        other_interactions=other_interactions_new,
    )
1161
+
1065
1162
 
1066
1163
  @dataclass(frozen=True, order=True)
1067
1164
  class InterStemParameters:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: RNApolis
3
- Version: 0.10.4
3
+ Version: 0.10.6
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -1,8 +1,8 @@
1
- rnapolis/adapter.py,sha256=6b4m1EzD1BA3D7GgXaV4iYKOnmzgz0AKwXHc2svQj3w,48132
1
+ rnapolis/adapter.py,sha256=6hJTweIqUXH8CEGvi8oupFzk5etkIt8Q2bqRvgsqako,62169
2
2
  rnapolis/aligner.py,sha256=o7rQyjAZ3n4VXcnSPY3HVB8nLNRkVbl552O3NVh0mfg,3429
3
- rnapolis/annotator.py,sha256=OkqFVuxOtb-mySmw3bc5NF9ETu4BWq4ImtBecWJikrY,33899
3
+ rnapolis/annotator.py,sha256=HA2hfEUXdmBElObqRlASAB1FgkysjiHgwMTjEhsDiDE,30277
4
4
  rnapolis/clashfinder.py,sha256=AC9_tIx7QIk57sELq_aKfU1u3UMOXbgcccQeGHhMR6c,8517
5
- rnapolis/common.py,sha256=HTe-RSZa_9hEIi-j4-1afxdqt7zAD-BpZ7JxRZGX170,32390
5
+ rnapolis/common.py,sha256=hamlW892ZF5A0dSWsl7cOCZqOpbVQMgXjVPYDFzk3pE,36347
6
6
  rnapolis/component_A.csv,sha256=koirS-AwUZwoYGItT8yn3wS6Idvmh2FANfTQcOS_xh8,2897
7
7
  rnapolis/component_C.csv,sha256=NtvsAu_YrUgTjzZm3j4poW4IZ99x3dPARB09XVIiMCc,2803
8
8
  rnapolis/component_G.csv,sha256=Z5wl8OnHRyx4XhTyBiWgRZiEvmZXhoxtVRH8bn6Vxf0,2898
@@ -22,9 +22,9 @@ rnapolis/tertiary_v2.py,sha256=SgijTv0bPqMJwsMqyQk0O8QAnS2Ozk45vk8igxt9hRs,38001
22
22
  rnapolis/transformer.py,sha256=aC0nBmHHJf5TyLvBIV57Jj3tlwpvHbPo347opfAOlQA,3844
23
23
  rnapolis/unifier.py,sha256=2ge7IB9FdRgzSAiVD39U_ciwtdDJ2fGzf8mUIudbrqY,5820
24
24
  rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
25
- rnapolis-0.10.4.dist-info/licenses/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
26
- rnapolis-0.10.4.dist-info/METADATA,sha256=VKy39unD-Kyqzg7J7ADgFlseV3FftWCyBtjn-vnYbEU,54611
27
- rnapolis-0.10.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
28
- rnapolis-0.10.4.dist-info/entry_points.txt,sha256=MZMWnYBUYnis-zWDmFfuA5yXtU3W5YdQrm5HA5LrkeM,474
29
- rnapolis-0.10.4.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
30
- rnapolis-0.10.4.dist-info/RECORD,,
25
+ rnapolis-0.10.6.dist-info/licenses/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
26
+ rnapolis-0.10.6.dist-info/METADATA,sha256=Q2OY_Y3PZgVNaob7Xk8vruYNZ13HyFfdiRD7giJqJ_I,54611
27
+ rnapolis-0.10.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
28
+ rnapolis-0.10.6.dist-info/entry_points.txt,sha256=MZMWnYBUYnis-zWDmFfuA5yXtU3W5YdQrm5HA5LrkeM,474
29
+ rnapolis-0.10.6.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
30
+ rnapolis-0.10.6.dist-info/RECORD,,