RNApolis 0.9.0-py3-none-any.whl → 0.9.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnapolis/adapter.py +791 -37
- rnapolis/annotator.py +103 -24
- {rnapolis-0.9.0.dist-info → rnapolis-0.9.2.dist-info}/METADATA +1 -1
- {rnapolis-0.9.0.dist-info → rnapolis-0.9.2.dist-info}/RECORD +8 -8
- {rnapolis-0.9.0.dist-info → rnapolis-0.9.2.dist-info}/WHEEL +0 -0
- {rnapolis-0.9.0.dist-info → rnapolis-0.9.2.dist-info}/entry_points.txt +0 -0
- {rnapolis-0.9.0.dist-info → rnapolis-0.9.2.dist-info}/licenses/LICENSE +0 -0
- {rnapolis-0.9.0.dist-info → rnapolis-0.9.2.dist-info}/top_level.txt +0 -0
rnapolis/adapter.py
CHANGED
@@ -1,8 +1,12 @@
 #! /usr/bin/env python
 import argparse
 import logging
+import math
 import os
+import re
+from dataclasses import dataclass
 from enum import Enum
+from tempfile import NamedTemporaryFile
 from typing import Dict, List, Optional, Tuple

 import orjson
@@ -22,10 +26,13 @@ from rnapolis.common import (
     OtherInteraction,
     Residue,
     ResidueAuth,
+    ResidueLabel,
+    Saenger,
     Stacking,
     StackingTopology,
     Structure2D,
 )
+from rnapolis.metareader import read_metadata
 from rnapolis.parser import read_3d_structure
 from rnapolis.tertiary import (
     Mapping2D3D,
@@ -37,11 +44,48 @@ from rnapolis.util import handle_input_file
 class ExternalTool(Enum):
     FR3D = "fr3d"
     DSSR = "dssr"
+    RNAVIEW = "rnaview"
+    BPNET = "bpnet"
+    MAXIT = "maxit"


 logging.basicConfig(level=os.getenv("LOGLEVEL", "INFO").upper())


+def auto_detect_tool(external_files: List[str]) -> ExternalTool:
+    """
+    Auto-detect the external tool based on file patterns.
+
+    Args:
+        external_files: List of external tool output file paths
+
+    Returns:
+        ExternalTool enum value based on detected patterns
+    """
+    if not external_files:
+        return ExternalTool.MAXIT
+
+    for file_path in external_files:
+        # Check for FR3D pattern
+        if file_path.endswith("basepair_detail.txt"):
+            return ExternalTool.FR3D
+
+        # Check for RNAView pattern
+        if file_path.endswith(".out"):
+            return ExternalTool.RNAVIEW
+
+        # Check for BPNet pattern
+        if file_path.endswith("basepair.json"):
+            return ExternalTool.BPNET
+
+        # Check for JSON files (DSSR)
+        if file_path.endswith(".json"):
+            return ExternalTool.DSSR
+
+    # Default to MAXIT if no patterns match
+    return ExternalTool.MAXIT
+
+
 def parse_unit_id(nt: str) -> Residue:
     """Parse FR3D unit ID format into a Residue object."""
     fields = nt.split("|")
@@ -200,13 +244,13 @@ def match_dssr_lw(lw: Optional[str]) -> Optional[LeontisWesthof]:


 def parse_dssr_output(
-
+    file_paths: List[str], structure3d: Structure3D, model: Optional[int] = None
 ) -> BaseInteractions:
     """
     Parse DSSR JSON output and convert to BaseInteractions.

     Args:
-
+        file_paths: List of paths to DSSR output files
         structure3d: The 3D structure parsed from PDB/mmCIF
         model: Model number to use (if None, use first model)

@@ -216,7 +260,23 @@ def parse_dssr_output(
     base_pairs: List[BasePair] = []
     stackings: List[Stacking] = []

-
+    # Find the first .json file in the list
+    json_file = None
+    for file_path in file_paths:
+        if file_path.endswith(".json"):
+            json_file = file_path
+            break
+
+    if json_file is None:
+        logging.warning("No .json file found in DSSR file list")
+        return BaseInteractions([], [], [], [], [])
+
+    # Log unused files
+    unused_files = [f for f in file_paths if f != json_file]
+    if unused_files:
+        logging.info(f"DSSR: Using {json_file}, ignoring unused files: {unused_files}")
+
+    with open(json_file) as f:
         dssr = orjson.loads(f.read())

     # Handle multi-model files
@@ -253,14 +313,689 @@ def parse_dssr_output(
     return BaseInteractions(base_pairs, stackings, [], [], [])


+def parse_maxit_output(file_paths: List[str]) -> BaseInteractions:
+    """
+    Parse MAXIT output files and convert to BaseInteractions.
+
+    MAXIT analysis is embedded in mmCIF files as ndb_struct_na_base_pair category.
+
+    Args:
+        file_paths: List of paths to mmCIF files containing MAXIT analysis
+
+    Returns:
+        BaseInteractions object containing the interactions found by MAXIT
+    """
+
+    def convert_saenger(hbond_type_28: str) -> Optional[Saenger]:
+        if hbond_type_28 == "?":
+            return None
+        try:
+            index = int(hbond_type_28)
+            if 1 <= index <= 28:
+                return list(Saenger)[index - 1]
+        except ValueError:
+            pass
+        return None
+
+    def convert_lw(hbond_type_12) -> Optional[LeontisWesthof]:
+        if hbond_type_12 == "?":
+            return None
+        try:
+            index = int(hbond_type_12)
+            if index == 1:
+                return LeontisWesthof.cWW
+            if index == 2:
+                return LeontisWesthof.tWW
+            if index == 3:
+                return LeontisWesthof.cWH
+            if index == 4:
+                return LeontisWesthof.tWH
+            if index == 5:
+                return LeontisWesthof.cWS
+            if index == 6:
+                return LeontisWesthof.tWS
+            if index == 7:
+                return LeontisWesthof.cHH
+            if index == 8:
+                return LeontisWesthof.tHH
+            if index == 9:
+                return LeontisWesthof.cHS
+            if index == 10:
+                return LeontisWesthof.tHS
+            if index == 11:
+                return LeontisWesthof.cSS
+            if index == 12:
+                return LeontisWesthof.tSS
+        except ValueError:
+            pass
+        return None
+
+    all_base_pairs = []
+    all_other_interactions = []
+
+    # Find the first .cif file in the list
+    cif_file = None
+    for file_path in file_paths:
+        if file_path.endswith(".cif"):
+            cif_file = file_path
+            break
+
+    if cif_file is None:
+        logging.warning("No .cif file found in MAXIT file list")
+        return BaseInteractions([], [], [], [], [])
+
+    # Log unused files
+    unused_files = [f for f in file_paths if f != cif_file]
+    if unused_files:
+        logging.info(f"MAXIT: Using {cif_file}, ignoring unused files: {unused_files}")
+
+    # Process only the first .cif file
+    logging.info(f"Processing MAXIT file: {cif_file}")
+
+    try:
+        with open(cif_file, "r") as f:
+            file_content = f.read()
+
+        with NamedTemporaryFile("w+", suffix=".cif") as mmcif:
+            mmcif.write(file_content)
+            mmcif.seek(0)
+            metadata = read_metadata(mmcif, ["ndb_struct_na_base_pair"])
+
+        # Parse base pairs from this file
+        for entry in metadata.get("ndb_struct_na_base_pair", []):
+            auth_chain_i = entry["i_auth_asym_id"]
+            auth_number_i = int(entry["i_auth_seq_id"])
+            auth_icode_i = (
+                entry["i_PDB_ins_code"] if entry["i_PDB_ins_code"] != "?" else None
+            )
+            name_i = entry["i_label_comp_id"]
+            auth_i = ResidueAuth(auth_chain_i, auth_number_i, auth_icode_i, name_i)
+
+            auth_chain_j = entry["j_auth_asym_id"]
+            auth_number_j = int(entry["j_auth_seq_id"])
+            auth_icode_j = (
+                entry["j_PDB_ins_code"] if entry["j_PDB_ins_code"] != "?" else None
+            )
+            name_j = entry["j_label_comp_id"]
+            auth_j = ResidueAuth(auth_chain_j, auth_number_j, auth_icode_j, name_j)
+
+            label_chain_i = entry["i_label_asym_id"]
+            label_number_i = int(entry["i_label_seq_id"])
+            label_i = ResidueLabel(label_chain_i, label_number_i, name_i)
+
+            label_chain_j = entry["j_label_asym_id"]
+            label_number_j = int(entry["j_label_seq_id"])
+            label_j = ResidueLabel(label_chain_j, label_number_j, name_j)
+
+            residue_i = Residue(label_i, auth_i)
+            residue_j = Residue(label_j, auth_j)
+
+            saenger = convert_saenger(entry["hbond_type_28"])
+            lw = convert_lw(entry["hbond_type_12"])
+
+            if lw is not None:
+                all_base_pairs.append(BasePair(residue_i, residue_j, lw, saenger))
+            else:
+                all_other_interactions.append(OtherInteraction(residue_i, residue_j))
+
+    except Exception as e:
+        logging.warning(f"Error processing MAXIT file {cif_file}: {e}", exc_info=True)
+
+    return BaseInteractions(all_base_pairs, [], [], [], all_other_interactions)
+
+
+def parse_bpnet_output(file_paths: List[str]) -> BaseInteractions:
+    """
+    Parse BPNet output files and convert to BaseInteractions.
+
+    Args:
+        file_paths: List of paths to BPNet output files (basepair.json and .rob files)
+
+    Returns:
+        BaseInteractions object containing the interactions found by BPNet
+    """
+
+    def convert_lw(bpnet_lw) -> LeontisWesthof:
+        """Convert BPNet LW notation to LeontisWesthof enum."""
+        if len(bpnet_lw) != 4:
+            raise ValueError(f"bpnet lw invalid length: {bpnet_lw}")
+        bpnet_lw = bpnet_lw.replace("+", "W").replace("z", "S").replace("g", "H")
+        edge5 = bpnet_lw[0].upper()
+        edge3 = bpnet_lw[2].upper()
+        stericity = bpnet_lw[3].lower()
+        return LeontisWesthof[f"{stericity}{edge5}{edge3}"]
+
+    def residues_from_overlap_info(fields):
+        """Parse residue information from overlap line fields."""
+        chains = fields[6].split("^")
+        numbers = list(map(int, fields[3].split(":")))
+        icode1, icode2 = fields[2], fields[4]
+        names = fields[5].split(":")
+
+        if icode1 in " ?":
+            icode1 = None
+        if icode2 in " ?":
+            icode2 = None
+
+        nt1 = Residue(None, ResidueAuth(chains[0], numbers[0], icode1, names[0]))
+        nt2 = Residue(None, ResidueAuth(chains[1], numbers[1], icode2, names[1]))
+        return nt1, nt2
+
+    # Find required files
+    basepair_json = None
+    rob_file = None
+
+    for file_path in file_paths:
+        if file_path.endswith("basepair.json"):
+            basepair_json = file_path
+        elif file_path.endswith(".rob"):
+            rob_file = file_path
+
+    # Log unused files
+    used_files = [f for f in [basepair_json, rob_file] if f is not None]
+    unused_files = [f for f in file_paths if f not in used_files]
+    if unused_files:
+        logging.info(
+            f"BPNet: Using {used_files}, ignoring unused files: {unused_files}"
+        )
+
+    base_pairs = []
+    stackings = []
+    base_ribose_interactions = []
+    base_phosphate_interactions = []
+    other_interactions = []
+
+    # Parse base pairs from JSON file
+    if basepair_json:
+        logging.info(f"Processing BPNet basepair file: {basepair_json}")
+        try:
+            with open(basepair_json, encoding="utf-8") as f:
+                data = orjson.loads(f.read())
+
+            for entry in data["basepairs"]:
+                nt1 = Residue(
+                    None,
+                    ResidueAuth(
+                        entry["chain1"],
+                        entry["resnum1"],
+                        entry["ins1"],
+                        entry["resname1"],
+                    ),
+                )
+                nt2 = Residue(
+                    None,
+                    ResidueAuth(
+                        entry["chain2"],
+                        entry["resnum2"],
+                        entry["ins2"],
+                        entry["resname2"],
+                    ),
+                )
+                lw = convert_lw(entry["basepair"])
+                base_pairs.append(BasePair(nt1, nt2, lw, None))
+        except Exception as e:
+            logging.warning(
+                f"Error processing BPNet basepair file {basepair_json}: {e}",
+                exc_info=True,
+            )
+
+    # Parse overlaps from ROB file
+    if rob_file:
+        logging.info(f"Processing BPNet rob file: {rob_file}")
+        try:
+            with open(rob_file, encoding="utf-8") as f:
+                rob_content = f.read()
+
+            for line in rob_content.splitlines():
+                if line.startswith("OVLP"):
+                    fields = line.strip().split()
+                    if len(fields) == 13:
+                        # ASTK means Adjacent Stacking, OSTK means Non-Adjacent Stacking
+                        # ADJA means Adjacent contact but not proper stacking
+                        if fields[7] in ["ASTK", "OSTK", "ADJA"]:
+                            nt1, nt2 = residues_from_overlap_info(fields)
+                            stackings.append(Stacking(nt1, nt2, None))
+                    else:
+                        logging.warning(f"Failed to parse OVLP line: {line}")
+                elif line.startswith("PROX"):
+                    fields = line.strip().split()
+                    if len(fields) == 11:
+                        nt1, nt2 = residues_from_overlap_info(fields)
+                        atom1, atom2 = fields[7].split(":")
+
+                        # Determine element types based on atom names
+                        phosphate_atoms = frozenset(
+                            (
+                                "P",
+                                "OP1",
+                                "OP2",
+                                "O5'",
+                                "C5'",
+                                "C4'",
+                                "C3'",
+                                "O3'",
+                                "O5*",
+                                "C5*",
+                                "C4*",
+                                "C3*",
+                                "O3*",
+                            )
+                        )
+                        ribose_atoms = frozenset(
+                            ("C1'", "C2'", "O2'", "O4'", "C1*", "C2*", "O2*", "O4*")
+                        )
+                        base_atoms = frozenset(
+                            (
+                                "C2",
+                                "C4",
+                                "C5",
+                                "C6",
+                                "C8",
+                                "N1",
+                                "N2",
+                                "N3",
+                                "N4",
+                                "N6",
+                                "N7",
+                                "N9",
+                                "O2",
+                                "O4",
+                                "O6",
+                            )
+                        )
+
+                        def assign_element(atom_name):
+                            if atom_name in phosphate_atoms:
+                                return "PHOSPHATE"
+                            elif atom_name in ribose_atoms:
+                                return "RIBOSE"
+                            elif atom_name in base_atoms:
+                                return "BASE"
+                            else:
+                                return "UNKNOWN"
+
+                        element1 = assign_element(atom1)
+                        element2 = assign_element(atom2)
+
+                        # Base-ribose interactions
+                        if element1 == "BASE" and element2 == "RIBOSE":
+                            base_ribose_interactions.append(BaseRibose(nt1, nt2, None))
+                        elif element1 == "RIBOSE" and element2 == "BASE":
+                            base_ribose_interactions.append(BaseRibose(nt2, nt1, None))
+
+                        # Base-phosphate interactions
+                        elif element1 == "BASE" and element2 == "PHOSPHATE":
+                            base_phosphate_interactions.append(
+                                BasePhosphate(nt1, nt2, None)
+                            )
+                        elif element1 == "PHOSPHATE" and element2 == "BASE":
+                            base_phosphate_interactions.append(
+                                BasePhosphate(nt2, nt1, None)
+                            )
+
+                        # Other interactions
+                        other_interactions.append(OtherInteraction(nt1, nt2))
+                    else:
+                        logging.warning(f"Failed to parse PROX line: {line}")
+        except Exception as e:
+            logging.warning(
+                f"Error processing BPNet rob file {rob_file}: {e}", exc_info=True
+            )
+
+    return BaseInteractions(
+        base_pairs,
+        stackings,
+        base_ribose_interactions,
+        base_phosphate_interactions,
+        other_interactions,
+    )
+
+
+def parse_rnaview_output(
+    file_paths: List[str], structure3d: Structure3D
+) -> BaseInteractions:
+    """
+    Parse RNAView output files and convert to BaseInteractions.
+
+    Args:
+        file_paths: List of paths to RNAView output files (.out files)
+        structure3d: The 3D structure parsed from PDB/mmCIF
+
+    Returns:
+        BaseInteractions object containing the interactions found by RNAView
+    """
+
+    @dataclass
+    class PotentialResidue:
+        residue: Residue
+        position_c2: Optional[Tuple[float, float, float]]
+        position_c6: Optional[Tuple[float, float, float]]
+        position_n1: Optional[Tuple[float, float, float]]
+
+        def is_correct_according_to_rnaview(self) -> bool:
+            """
+            This is a reimplementation of residue_ident() function from fpair_sub.c from RNAView source code.
+            """
+            if any(
+                (
+                    self.position_c2 is None,
+                    self.position_c6 is None,
+                    self.position_n1 is None,
+                )
+            ):
+                return False
+
+            distance_n1_c2 = math.dist(self.position_n1, self.position_c2)  # type: ignore
+            distance_n1_c6 = math.dist(self.position_n1, self.position_c6)  # type: ignore
+            distance_c2_c6 = math.dist(self.position_c2, self.position_c6)  # type: ignore
+            return all(
+                (distance_n1_c2 <= 2.0, distance_n1_c6 <= 2.0, distance_c2_c6 <= 3.0)
+            )
+
+    # RNAView regex pattern from the reference implementation
+    RNAVIEW_REGEX = re.compile(
+        r"\s*(\d+)_(\d+),\s+(\w):\s+(-?\d+)\s+(\w+)-(\w+)\s+(-?\d+)\s+(\w):\s+(syn|\s+)*((./.)\s+(cis|tran)(syn|\s+)*([IVX,]+|n/a|![^.]+)|stacked)\.?"
+    )
+
+    # Positions of residues info in PDB files
+    ATOM_NAME_INDEX = slice(12, 16)
+    CHAIN_INDEX = 21
+    NUMBER_INDEX = slice(22, 26)
+    ICODE_INDEX = 26
+    NAME_INDEX = slice(17, 20)
+    X_INDEX, Y_INDEX, Z_INDEX = slice(30, 38), slice(38, 46), slice(46, 54)
+
+    # Tokens used in PDB files
+    ATOM = "ATOM"
+    HETATM = "HETATM"
+    ATOM_C6 = "C6"
+    ATOM_C2 = "C2"
+    ATOM_N1 = "N1"
+
+    # RNAView tokens
+    BEGIN_BASE_PAIR = "BEGIN_base-pair"
+    END_BASE_PAIR = "END_base-pair"
+    STACKING = "stacked"
+    BASE_RIBOSE = "!(b_s)"
+    BASE_PHOSPHATE = "!b_(O1P,O2P)"
+    OTHER_INTERACTION = "!(s_s)"
+    SAENGER_UNKNOWN = "n/a"
+    PLUS_INTERACTION = "+/+"  # For us - cWW
+    MINUS_INTERACTION = "-/-"  # For us - cWW
+    X_INTERACTION = "X/X"  # For us - cWW
+    ONE_HBOND = "!1H(b_b)"  # For us - OtherInteraction
+    DOUBLE_SAENGER = ("XIV,XV", "XII,XIII")
+    UNKNOWN_LW_CHARS = (".", "?")
+    ROMAN_NUMERALS = ("I", "V", "X")
+
+    def get_leontis_westhof(
+        lw_info: str, trans_cis_info: str
+    ) -> Optional[LeontisWesthof]:
+        """Convert RNAView LW notation to LeontisWesthof enum."""
+        trans_cis = trans_cis_info[0]
+        if any(char in lw_info for char in UNKNOWN_LW_CHARS):
+            return None
+        if lw_info in (PLUS_INTERACTION, MINUS_INTERACTION, X_INTERACTION):
+            return LeontisWesthof[f"{trans_cis}WW"]
+        return LeontisWesthof[f"{trans_cis}{lw_info[0].upper()}{lw_info[2].upper()}"]
+
+    def append_residues_from_pdb_using_rnaview_indexing(
+        pdb_content: str,
+    ) -> Dict[int, Residue]:
+        """Parse PDB content and create RNAView-style residue mapping."""
+        potential_residues: Dict[str, PotentialResidue] = {}
+
+        for line in pdb_content.splitlines():
+            if line.startswith(ATOM) or line.startswith(HETATM):
+                atom_name = line[ATOM_NAME_INDEX].strip()
+
+                number = int(line[NUMBER_INDEX].strip())
+                icode = None if line[ICODE_INDEX].strip() == "" else line[ICODE_INDEX]
+                chain = line[CHAIN_INDEX].strip()
+                name = line[NAME_INDEX].strip()
+
+                residue = Residue(None, ResidueAuth(chain, number, icode, name))
+
+                if str(residue) not in potential_residues:
+                    potential_residues[str(residue)] = PotentialResidue(
+                        residue, None, None, None
+                    )
+                potential_residue = potential_residues[str(residue)]
+
+                atom_position = (
+                    float(line[X_INDEX].strip()),
+                    float(line[Y_INDEX].strip()),
+                    float(line[Z_INDEX].strip()),
+                )
+
+                if atom_name == ATOM_C6:
+                    potential_residue.position_c6 = atom_position
+                elif atom_name == ATOM_C2:
+                    potential_residue.position_c2 = atom_position
+                elif atom_name == ATOM_N1:
+                    potential_residue.position_n1 = atom_position
+
+        residues_from_pdb: Dict[int, Residue] = {}
+        counter = 1
+        for potential_residue in potential_residues.values():
+            if potential_residue.is_correct_according_to_rnaview():
+                residues_from_pdb[counter] = potential_residue.residue
+                counter += 1
+
+        logging.debug("RNAView residues mapping:")
+        for idx, residue in sorted(residues_from_pdb.items()):
+            logging.debug(f"  {idx}: {residue}")
+
+        return residues_from_pdb
+
+    def check_indexing_correctness(
+        regex_result: Tuple[str, ...], line: str, residues_from_pdb: Dict[int, Residue]
+    ) -> None:
+        """Check if RNAView internal indexing matches PDB residue information."""
+        residue_left = residues_from_pdb[int(regex_result[0])]
+
+        if residue_left.auth.chain.lower() != regex_result[
+            2
+        ].lower() or residue_left.auth.number != int(regex_result[3]):
+            raise ValueError(
+                f"Wrong internal index for {residue_left}. Fix RNAView internal index mapping. Line: {line}"
+            )
+
+        residue_right = residues_from_pdb[int(regex_result[1])]
+
+        if residue_right.auth.chain.lower() != regex_result[
+            7
+        ].lower() or residue_right.auth.number != int(regex_result[6]):
+            raise ValueError(
+                f"Wrong internal index for {residue_right}. Fix RNAView internal index mapping. Line: {line}"
+            )
+
+    # Find the first .out file in the list
+    out_file = None
+    pdb_file = None
+    for file_path in file_paths:
+        if file_path.endswith(".out"):
+            out_file = file_path
+        elif file_path.endswith(".pdb"):
+            pdb_file = file_path
+
+    if out_file is None:
+        logging.warning("No .out file found in RNAView file list")
+        return BaseInteractions([], [], [], [], [])
+
+    # Log unused files
+    used_files = [f for f in [out_file, pdb_file] if f is not None]
+    unused_files = [f for f in file_paths if f not in used_files]
+    if unused_files:
+        logging.info(
+            f"RNAView: Using {used_files}, ignoring unused files: {unused_files}"
+        )
+
+    base_pairs = []
+    stackings = []
+    base_ribose_interactions = []
+    base_phosphate_interactions = []
+    other_interactions = []
+
+    # Parse PDB content to build residue mapping if PDB file is available
+    residues_from_pdb: Dict[int, Residue] = {}
+    if pdb_file:
+        logging.info(f"Processing RNAView PDB file: {pdb_file}")
+        try:
+            with open(pdb_file, "r", encoding="utf-8") as f:
+                pdb_content = f.read()
+            residues_from_pdb = append_residues_from_pdb_using_rnaview_indexing(
+                pdb_content
+            )
+        except Exception as e:
+            logging.warning(
+                f"Error processing RNAView PDB file {pdb_file}: {e}", exc_info=True
+            )
+
+    # Process the RNAView output file
+    logging.info(f"Processing RNAView file: {out_file}")
+
+    try:
+        with open(out_file, "r", encoding="utf-8") as f:
+            rnaview_result = f.read()
+
+        base_pair_section = False
+        for line in rnaview_result.splitlines():
+            if line.startswith(BEGIN_BASE_PAIR):
+                base_pair_section = True
+            elif line.startswith(END_BASE_PAIR):
+                base_pair_section = False
+            elif base_pair_section:
+                rnaview_regex_result = re.search(RNAVIEW_REGEX, line)
+                if rnaview_regex_result is None:
+                    logging.warning(f"RNAView regex failed for line: {line}")
+                    continue
+
+                rnaview_regex_groups = rnaview_regex_result.groups()
+
+                # Log parsed groups with their meanings
+                logging.debug("RNAView regex parsed:")
+                logging.debug(
+                    f"  First residue: idx={rnaview_regex_groups[0]}, chain={rnaview_regex_groups[2]}, num={rnaview_regex_groups[3]}, name={rnaview_regex_groups[4]}"
+                )
+                logging.debug(
+                    f"  Second residue: idx={rnaview_regex_groups[1]}, chain={rnaview_regex_groups[7]}, num={rnaview_regex_groups[6]}, name={rnaview_regex_groups[5]}"
+                )
+                if rnaview_regex_groups[9] == "stacked":
+                    logging.debug("  Interaction: stacking")
+                else:
+                    logging.debug(f"  LW edges: {rnaview_regex_groups[10]}")
+                    logging.debug(f"  LW orientation: {rnaview_regex_groups[11]}")
+                    logging.debug(f"  Classification: {rnaview_regex_groups[13]}")
+
+                # Use residue mapping if available, otherwise create residues from regex
+                if residues_from_pdb:
+                    try:
+                        check_indexing_correctness(
+                            rnaview_regex_groups, line, residues_from_pdb
+                        )
+                        residue_left = residues_from_pdb[int(rnaview_regex_groups[0])]
+                        residue_right = residues_from_pdb[int(rnaview_regex_groups[1])]
+                    except (KeyError, ValueError) as e:
+                        logging.warning(f"RNAView indexing error: {e}")
+                        continue
+                else:
+                    # Fallback: create residues from regex groups
+                    chain_left = rnaview_regex_groups[2]
+                    number_left = int(rnaview_regex_groups[3])
+                    name_left = rnaview_regex_groups[4]
+
+                    chain_right = rnaview_regex_groups[7]
+                    number_right = int(rnaview_regex_groups[6])
+                    name_right = rnaview_regex_groups[5]
+
+                    residue_left = Residue(
+                        None, ResidueAuth(chain_left, number_left, None, name_left)
+                    )
+                    residue_right = Residue(
+                        None, ResidueAuth(chain_right, number_right, None, name_right)
+                    )
+
+                # Interaction OR Saenger OR n/a OR empty string
+                token = rnaview_regex_groups[13]
+
+                if rnaview_regex_groups[9] == STACKING:
+                    stackings.append(Stacking(residue_left, residue_right, None))
+
+                elif token == BASE_RIBOSE:
+                    base_ribose_interactions.append(
+                        BaseRibose(residue_left, residue_right, None)
+                    )
+
+                elif token == BASE_PHOSPHATE:
+                    base_phosphate_interactions.append(
+                        BasePhosphate(residue_left, residue_right, None)
+                    )
+
+                elif token in (OTHER_INTERACTION, ONE_HBOND):
+                    other_interactions.append(
+                        OtherInteraction(residue_left, residue_right)
+                    )
+
+                elif token == SAENGER_UNKNOWN:
+                    leontis_westhof = get_leontis_westhof(
+                        rnaview_regex_groups[10], rnaview_regex_groups[11]
+                    )
+                    if leontis_westhof is None:
+                        other_interactions.append(
+                            OtherInteraction(residue_left, residue_right)
+                        )
+                    else:
+                        base_pairs.append(
+                            BasePair(residue_left, residue_right, leontis_westhof, None)
+                        )
+
+                elif (
+                    all(char in ROMAN_NUMERALS for char in token)
+                    or token in DOUBLE_SAENGER
+                ):
+                    leontis_westhof = get_leontis_westhof(
+                        rnaview_regex_groups[10], rnaview_regex_groups[11]
+                    )
+                    if leontis_westhof is None:
+                        other_interactions.append(
+                            OtherInteraction(residue_left, residue_right)
+                        )
+                    else:
+                        saenger = (
+                            Saenger[token.split(",", 1)[0]]
+                            if token in DOUBLE_SAENGER
+                            else Saenger[token]
+                        )
+                        base_pairs.append(
+                            BasePair(
+                                residue_left, residue_right, leontis_westhof, saenger
+                            )
+                        )
+
+                else:
+                    logging.warning(f"Unknown RNAView interaction: {token}")
+
+    except Exception as e:
+        logging.warning(f"Error processing RNAView file {out_file}: {e}", exc_info=True)
+
+    return BaseInteractions(
+        base_pairs,
+        stackings,
+        base_ribose_interactions,
+        base_phosphate_interactions,
+        other_interactions,
+    )
+
+
 def parse_external_output(
-
+    file_paths: List[str], tool: ExternalTool, structure3d: Structure3D
 ) -> BaseInteractions:
     """
     Parse the output from an external tool (FR3D, DSSR, etc.) and convert it to BaseInteractions.

     Args:
-
+        file_paths: List of paths to external tool output files
         tool: The external tool that generated the output
         structure3d: The 3D structure parsed from PDB/mmCIF

@@ -268,20 +1003,26 @@ def parse_external_output(
         BaseInteractions object containing the interactions found by the external tool
     """
     if tool == ExternalTool.FR3D:
-        return parse_fr3d_output(
+        return parse_fr3d_output(file_paths)
     elif tool == ExternalTool.DSSR:
-        return parse_dssr_output(
+        return parse_dssr_output(file_paths, structure3d)
+    elif tool == ExternalTool.MAXIT:
+        return parse_maxit_output(file_paths)
+    elif tool == ExternalTool.BPNET:
+        return parse_bpnet_output(file_paths)
+    elif tool == ExternalTool.RNAVIEW:
+        return parse_rnaview_output(file_paths, structure3d)
     else:
         raise ValueError(f"Unsupported external tool: {tool}")


-def parse_fr3d_output(
+def parse_fr3d_output(file_paths: List[str]) -> BaseInteractions:
     """
-    Parse FR3D output
+    Parse FR3D output files and convert to BaseInteractions.

     Args:
-
-
+        file_paths: List of paths to FR3D output files containing basepair, stacking,
+            and backbone interactions

     Returns:
         BaseInteractions object containing the interactions found by FR3D
@@ -295,15 +1036,17 @@ def parse_fr3d_output(file_path: str) -> BaseInteractions:
         "other_interactions": [],
     }

-    # Process
-
-
-
-
-
+    # Process each input file
+    for file_path in file_paths:
+        logging.info(f"Processing FR3D file: {file_path}")
+        with open(file_path, "r") as f:
+            for line in f:
+                line = line.strip()
+                if not line or line.startswith("#"):
+                    continue

-
-
+                # Process every non-empty, non-comment line
+                _process_interaction_line(line, interactions_data)

     # Return a BaseInteractions object with all the processed interactions
     return BaseInteractions(
@@ -317,8 +1060,9 @@ def parse_fr3d_output(file_path: str) -> BaseInteractions:

 def process_external_tool_output(
     structure3d: Structure3D,
-
+    external_file_paths: List[str],
     tool: ExternalTool,
+    input_file_path: str,
     find_gaps: bool = False,
 ) -> Tuple[Structure2D, Mapping2D3D]:  # Added Mapping2D3D to return tuple
     """
@@ -329,16 +1073,23 @@ def process_external_tool_output(

     Args:
         structure3d: The 3D structure parsed from PDB/mmCIF
-
+        external_file_paths: List of paths to external tool output files (empty for MAXIT)
         tool: The external tool that generated the output (FR3D, DSSR, etc.)
-
+        input_file_path: Path to the input file (used when external_file_paths is empty)
        find_gaps: Whether to detect gaps in the structure

     Returns:
         A tuple containing the Structure2D object and the Mapping2D3D object.
     """
     # Parse external tool output
-
+    if not external_file_paths:
+        # For MAXIT or when no external files are provided, use the input file
+        file_paths_to_process = [input_file_path]
+    else:
+        # Process all external files
+        file_paths_to_process = external_file_paths
+
+    base_interactions = parse_external_output(file_paths_to_process, tool, structure3d)

     # Extract secondary structure using the external tool's interactions
     return structure3d.extract_secondary_structure(base_interactions, find_gaps)
@@ -348,15 +1099,14 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("input", help="Path to PDB or mmCIF file")
     parser.add_argument(
-        "
-
-        help="Path to external tool output file (FR3D, DSSR, etc.)",
+        "external_files",
+        nargs="*",
+        help="Path(s) to external tool output file(s) (FR3D, DSSR, etc.)",
     )
     parser.add_argument(
         "--tool",
         choices=[t.value for t in ExternalTool],
-
-        help="External tool that generated the output file",
+        help="External tool that generated the output file (auto-detected if not specified)",
     )
     parser.add_argument(
         "-f",
@@ -371,20 +1121,24 @@ def main():
     file = handle_input_file(args.input)
     structure3d = read_3d_structure(file, None)

-    #
+    # Auto-detect tool if not specified
+    if args.tool is not None:
+        tool = ExternalTool(args.tool)
+    else:
+        tool = auto_detect_tool(args.external_files)
+        logging.info(f"Auto-detected tool: {tool.value}")
+
+    # Process external tool output files and get secondary structure
+    # Always call process_external_tool_output, even for MAXIT (empty external files)
     structure2d, mapping = process_external_tool_output(
         structure3d,
-        args.
-
+        args.external_files,
+        tool,
+        args.input,
         args.find_gaps,
     )

-
-        dot_brackets = mapping.all_dot_brackets
-    else:
-        dot_brackets = [mapping.dot_bracket]
-
-    handle_output_arguments(args, structure2d, dot_brackets, mapping, args.input)
+    handle_output_arguments(args, structure2d, mapping, args.input)


 if __name__ == "__main__":
rnapolis/annotator.py
CHANGED
@@ -24,6 +24,7 @@ from rnapolis.common import (
     BPh,
     BpSeq,
     LeontisWesthof,
+    OtherInteraction,
     Residue,
     Saenger,
     Stacking,
@@ -660,12 +661,6 @@ def write_bpseq(path: str, bpseq: BpSeq):

 def add_common_output_arguments(parser: argparse.ArgumentParser):
     """Adds common output and processing arguments to the parser."""
-    parser.add_argument(
-        "-a",
-        "--all-dot-brackets",
-        action="store_true",
-        help="(optional) print all dot-brackets, not only optimal one (exclusive with -e/--extended)",
-    )
     parser.add_argument("-b", "--bpseq", help="(optional) path to output BPSEQ file")
     parser.add_argument("-c", "--csv", help="(optional) path to output CSV file")
     parser.add_argument(
@@ -693,42 +688,126 @@ def add_common_output_arguments(parser: argparse.ArgumentParser):
 )


+def unify_structure_data(structure2d: Structure2D, mapping: Mapping2D3D) -> Structure2D:
+    """
+    Unify structure data by:
+    1. Adding missing Saenger classifications to base pairs
+    2. Filling in empty residue labels from Structure3D
+    """
+    # Create a mapping from residue to residue3d for label filling
+    residue_to_residue3d = {}
+    for residue3d in mapping.structure3d.residues:
+        residue_key = Residue(residue3d.label, residue3d.auth)
+        residue_to_residue3d[residue_key] = residue3d
+
+    def fill_residue_label(residue: Residue) -> Residue:
+        """Fill empty label from Structure3D if available."""
+        if residue.label is not None:
+            return residue
+
+        # Try to find matching residue3d by auth
+        for residue3d in mapping.structure3d.residues:
+            if residue.auth == residue3d.auth:
+                return Residue(residue3d.label, residue.auth)
+
+        return residue
+
+    # Process base pairs
+    unified_base_pairs = []
+    for base_pair in structure2d.base_pairs:
+        # Fill in missing labels
+        nt1 = fill_residue_label(base_pair.nt1)
+        nt2 = fill_residue_label(base_pair.nt2)
+
+        # Detect missing Saenger classification
+        saenger = base_pair.saenger
+        if saenger is None:
+            # Find corresponding 3D residues for Saenger detection
+            residue3d_1 = residue_to_residue3d.get(Residue(nt1.label, nt1.auth))
+            residue3d_2 = residue_to_residue3d.get(Residue(nt2.label, nt2.auth))
+
+            if residue3d_1 is not None and residue3d_2 is not None:
+                saenger = detect_saenger(residue3d_1, residue3d_2, base_pair.lw)
+
+        unified_base_pairs.append(BasePair(nt1, nt2, base_pair.lw, saenger))
+
+    # Process other interaction types (fill labels only)
+    unified_stackings = []
+    for stacking in structure2d.stackings:
+        nt1 = fill_residue_label(stacking.nt1)
+        nt2 = fill_residue_label(stacking.nt2)
+        unified_stackings.append(Stacking(nt1, nt2, stacking.topology))
+
+    unified_base_ribose = []
+    for base_ribose in structure2d.base_ribose_interactions:
+        nt1 = fill_residue_label(base_ribose.nt1)
+        nt2 = fill_residue_label(base_ribose.nt2)
+        unified_base_ribose.append(BaseRibose(nt1, nt2, base_ribose.br))
+
+    unified_base_phosphate = []
+    for base_phosphate in structure2d.base_phosphate_interactions:
+        nt1 = fill_residue_label(base_phosphate.nt1)
+        nt2 = fill_residue_label(base_phosphate.nt2)
+        unified_base_phosphate.append(BasePhosphate(nt1, nt2, base_phosphate.bph))
+
+    unified_other = []
+    for other in structure2d.other_interactions:
+        nt1 = fill_residue_label(other.nt1)
+        nt2 = fill_residue_label(other.nt2)
+        unified_other.append(OtherInteraction(nt1, nt2))
+
+    # Create new Structure2D with unified data
+    unified_base_interactions = BaseInteractions(
+        unified_base_pairs,
+        unified_stackings,
+        unified_base_ribose,
+        unified_base_phosphate,
+        unified_other,
+    )
+
+    # Recreate Structure2D with unified interactions
+    unified_structure2d, _ = mapping.structure3d.extract_secondary_structure(
+        unified_base_interactions, False
+    )
+
+    return unified_structure2d
+
+
 def handle_output_arguments(
     args: argparse.Namespace,
     structure2d: Structure2D,
-    dot_brackets: List[str],
     mapping: Mapping2D3D,
     input_filename: str,
 ):
     """Handles writing output based on provided arguments."""
+    # Unify the structure data before processing outputs
+    unified_structure2d = unify_structure_data(structure2d, mapping)
+
     input_basename = os.path.basename(input_filename)
     if args.csv:
-        write_csv(args.csv,
+        write_csv(args.csv, unified_structure2d)

     if args.json:
-        write_json(args.json,
+        write_json(args.json, unified_structure2d)

     if args.bpseq:
-        write_bpseq(args.bpseq,
+        write_bpseq(args.bpseq, unified_structure2d.bpseq)

     if args.extended:
-        print(
-    elif args.all_dot_brackets:
-        for dot_bracket in dot_brackets:
-            print(dot_bracket)
+        print(unified_structure2d.extended_dot_bracket)
     else:
-        print(
+        print(unified_structure2d.dot_bracket)

     if args.dot:
-        print(BpSeq.from_string(
+        print(BpSeq.from_string(unified_structure2d.bpseq).graphviz)

     if args.pml:
-        pml_script = generate_pymol_script(mapping,
+        pml_script = generate_pymol_script(mapping, unified_structure2d.stems)
         with open(args.pml, "w") as f:
             f.write(pml_script)

     if args.inter_stem_csv:
-        if
+        if unified_structure2d.inter_stem_parameters:
             # Convert list of dataclasses to list of dicts
             params_list = [
                 {
@@ -741,7 +820,7 @@ def handle_output_arguments(
                     "min_endpoint_distance_pdf": p.min_endpoint_distance_pdf,
                     "coaxial_probability": p.coaxial_probability,
                 }
-                for p in
+                for p in unified_structure2d.interStemParameters
             ]
             df = pd.DataFrame(params_list)
             df["input_basename"] = input_basename
@@ -759,9 +838,9 @@ def handle_output_arguments(
         # pd.DataFrame(columns=['input_basename', 'stem1_idx', ...]).to_csv(args.inter_stem_csv, index=False)

     if args.stems_csv:
-        if
+        if unified_structure2d.stems:
             stems_data = []
-            for i, stem in enumerate(
+            for i, stem in enumerate(unified_structure2d.stems):
                 try:
                     res5p_first = mapping.bpseq_index_to_residue_map.get(
                         stem.strand5p.first
@@ -838,11 +917,11 @@ def main():
     file = handle_input_file(args.input)
     structure3d = read_3d_structure(file, None)
     base_interactions = extract_base_interactions(structure3d)
-    structure2d,
-        base_interactions, args.find_gaps
+    structure2d, mapping = structure3d.extract_secondary_structure(
+        base_interactions, args.find_gaps
     )

-    handle_output_arguments(args, structure2d,
+    handle_output_arguments(args, structure2d, mapping, args.input)


 if __name__ == "__main__":
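
A hedged sketch of how the new `unify_structure_data` step fits into the annotator flow, assuming the helpers keep the import paths used in these two files; `example.cif` is a placeholder input path:

```python
from rnapolis.annotator import extract_base_interactions, unify_structure_data
from rnapolis.parser import read_3d_structure
from rnapolis.util import handle_input_file

file = handle_input_file("example.cif")  # placeholder input
structure3d = read_3d_structure(file, None)
base_interactions = extract_base_interactions(structure3d)
structure2d, mapping = structure3d.extract_secondary_structure(base_interactions, False)

# handle_output_arguments() now runs this unification before writing any output:
unified = unify_structure_data(structure2d, mapping)  # fills labels, adds missing Saenger classes
print(unified.dot_bracket)
```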
{rnapolis-0.9.0.dist-info → rnapolis-0.9.2.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
-rnapolis/adapter.py,sha256=
+rnapolis/adapter.py,sha256=apDxyftg9NnlsN9ieVk07dFzsxmJTTilJ2gyHV0_HX8,42239
 rnapolis/aligner.py,sha256=o7rQyjAZ3n4VXcnSPY3HVB8nLNRkVbl552O3NVh0mfg,3429
-rnapolis/annotator.py,sha256=
+rnapolis/annotator.py,sha256=OkqFVuxOtb-mySmw3bc5NF9ETu4BWq4ImtBecWJikrY,33899
 rnapolis/clashfinder.py,sha256=AC9_tIx7QIk57sELq_aKfU1u3UMOXbgcccQeGHhMR6c,8517
 rnapolis/common.py,sha256=HTe-RSZa_9hEIi-j4-1afxdqt7zAD-BpZ7JxRZGX170,32390
 rnapolis/component_A.csv,sha256=koirS-AwUZwoYGItT8yn3wS6Idvmh2FANfTQcOS_xh8,2897
@@ -21,9 +21,9 @@ rnapolis/tertiary_v2.py,sha256=y7Rh43Jzt9QU6wCa1wAHIcO3BcNQY83PbbWNTmqI8zM,23424
 rnapolis/transformer.py,sha256=aC0nBmHHJf5TyLvBIV57Jj3tlwpvHbPo347opfAOlQA,3844
 rnapolis/unifier.py,sha256=2ge7IB9FdRgzSAiVD39U_ciwtdDJ2fGzf8mUIudbrqY,5820
 rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
-rnapolis-0.9.
-rnapolis-0.9.
-rnapolis-0.9.
-rnapolis-0.9.
-rnapolis-0.9.
-rnapolis-0.9.
+rnapolis-0.9.2.dist-info/licenses/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
+rnapolis-0.9.2.dist-info/METADATA,sha256=im-tdbK04EmFGO4O7ZGUCMWp5rimzW6_NZ5YQlrKJ0U,54537
+rnapolis-0.9.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+rnapolis-0.9.2.dist-info/entry_points.txt,sha256=H00KoN54wU3dFOofAu3H_3PADmZOBTB1hXf5TUU2uzo,438
+rnapolis-0.9.2.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
+rnapolis-0.9.2.dist-info/RECORD,,