PyPI - RNApolis - Versions diffs - 0.3.18__tar.gz → 0.4.0__tar.gz - Mend

RNApolis 0.3.18tar.gz → 0.4.0tar.gz

Files changed (31) hide show

{rnapolis-0.3.18/src/RNApolis.egg-info → rnapolis-0.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: RNApolis
-Version: 0.3.18
+Version: 0.4.0
 Summary: A Python library containing RNA-related bioinformatics functions and classes
 Home-page: https://github.com/tzok/rnapolis-py
 Author: Tomasz Zok

{rnapolis-0.3.18 → rnapolis-0.4.0}/setup.py RENAMED Viewed

@@ -5,7 +5,7 @@ with open("README.md") as f:
 setup(
     name="RNApolis",
-    version="0.3.18",
+    version="0.4.0",
     packages=["rnapolis"],
     package_dir={"": "src"},
     author="Tomasz Zok",

{rnapolis-0.3.18 → rnapolis-0.4.0/src/RNApolis.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: RNApolis
-Version: 0.3.18
+Version: 0.4.0
 Summary: A Python library containing RNA-related bioinformatics functions and classes
 Home-page: https://github.com/tzok/rnapolis-py
 Author: Tomasz Zok

{rnapolis-0.3.18 → rnapolis-0.4.0}/src/rnapolis/parser.py RENAMED Viewed

@@ -12,7 +12,7 @@ logger = logging.getLogger(__name__)
 def read_3d_structure(
     cif_or_pdb: IO[str], model: Optional[int] = None, nucleic_acid_only: bool = False
 ) -> Structure3D:
-    atoms, modified, sequence = (
+    atoms, modified, sequence_by_entity, is_nucleic_acid_by_entity = (
         parse_cif(cif_or_pdb) if is_cif(cif_or_pdb) else parse_pdb(cif_or_pdb)
     )
     available_models = {atom.model: None for atom in atoms}
@@ -24,7 +24,13 @@ def read_3d_structure(
         atoms = atoms_by_model[model]
     else:
         atoms = atoms_by_model[list(available_models.keys())[0]]
-    return group_atoms(atoms, modified, sequence, nucleic_acid_only)
+    return group_atoms(
+        atoms,
+        modified,
+        sequence_by_entity,
+        is_nucleic_acid_by_entity,
+        nucleic_acid_only,
+    )
 def is_cif(cif_or_pdb: IO[str]) -> bool:
@@ -40,7 +46,8 @@ def parse_cif(
 ) -> Tuple[
     List[Atom],
     Dict[Union[ResidueLabel, ResidueAuth], str],
-    Dict[Tuple[str, int], str],
+    Dict[str, str],
+    Dict[str, bool],
 ]:
     cif.seek(0)
@@ -48,7 +55,8 @@ def parse_cif(
     data = io_adapter.readFile(cif.name)
     atoms: List[Atom] = []
     modified: Dict[Union[ResidueLabel, ResidueAuth], str] = {}
-    sequence = {}
+    sequence_by_entity = {}
+    is_nucleic_acid_by_entity = {}
     if data:
         atom_site = data[0].getObj("atom_site")
@@ -59,6 +67,7 @@ def parse_cif(
             for row in atom_site.getRowList():
                 row_dict = dict(zip(atom_site.getAttributeList(), row))
+                label_entity_id = row_dict.get("label_entity_id", None)
                 label_chain_name = row_dict.get("label_asym_id", None)
                 label_residue_number = try_parse_int(row_dict.get("label_seq_id", None))
                 label_residue_name = row_dict.get("label_comp_id", None)
@@ -127,7 +136,19 @@ def parse_cif(
                     else None
                 )
-                atoms.append(Atom(label, auth, model, atom_name, x, y, z, occupancy))
+                atoms.append(
+                    Atom(
+                        label_entity_id,
+                        label,
+                        auth,
+                        model,
+                        atom_name,
+                        x,
+                        y,
+                        z,
+                        occupancy,
+                    )
+                )
         if mod_residue:
             for row in mod_residue.getRowList():
@@ -178,17 +199,24 @@ def parse_cif(
             for row in entity_poly.getRowList():
                 row_dict = dict(zip(entity_poly.getAttributeList(), row))
-                pdbx_strand_id = row_dict.get("pdbx_strand_id", None)
+                entity_id = row_dict.get("entity_id", None)
+                type_ = row_dict.get("type", None)
                 pdbx_seq_one_letter_code_can = row_dict.get(
                     "pdbx_seq_one_letter_code_can", None
                 )
-                if pdbx_strand_id and pdbx_seq_one_letter_code_can:
-                    for strand in pdbx_strand_id.split(","):
-                        for i, letter in enumerate(pdbx_seq_one_letter_code_can):
-                            sequence[(strand, i + 1)] = letter
+                if entity_id and type_:
+                    is_nucleic_acid_by_entity[entity_id] = type_ in (
+                        "peptide nucleic acid",
+                        "polydeoxyribonucleotide",
+                        "polydeoxyribonucleotide/polyribonucleotide hybrid",
+                        "polyribonucleotide",
+                    )
+                if entity_id and pdbx_seq_one_letter_code_can:
+                    sequence_by_entity[entity_id] = pdbx_seq_one_letter_code_can
-    return atoms, modified, sequence
+    return atoms, modified, sequence_by_entity, is_nucleic_acid_by_entity
 def parse_pdb(
@@ -196,7 +224,8 @@ def parse_pdb(
 ) -> Tuple[
     List[Atom],
     Dict[Union[ResidueLabel, ResidueAuth], str],
-    Dict[Tuple[str, int], str],
+    Dict[str, str],
+    Dict[str, bool],
 ]:
     pdb.seek(0)
     atoms: List[Atom] = []
@@ -222,7 +251,7 @@ def parse_pdb(
             auth = ResidueAuth(
                 chain_identifier, residue_number, insertion_code, residue_name
             )
-            atoms.append(Atom(None, auth, model, atom_name, x, y, z, occupancy))
+            atoms.append(Atom(None, None, auth, model, atom_name, x, y, z, occupancy))
         elif line.startswith("MODRES"):
             original_name = line[12:15]
             chain_identifier = line[16]
@@ -234,13 +263,14 @@ def parse_pdb(
             )
             modified[auth] = standard_residue_name
-    return atoms, modified, {}
+    return atoms, modified, {}, {}
 def group_atoms(
     atoms: List[Atom],
     modified: Dict[Union[ResidueLabel, ResidueAuth], str],
-    sequence: Dict[Tuple[str, int], str],
+    sequence_by_entity: Dict[str, str],
+    is_nucleic_acid_by_entity: Dict[str, bool],
     nucleic_acid_only: bool,
 ) -> Structure3D:
     if not atoms:
@@ -258,28 +288,45 @@ def group_atoms(
             label = key_previous[0]
             auth = key_previous[1]
             model = key_previous[2]
+            entity_id = residue_atoms[-1].entity_id
             name = get_residue_name(auth, label, modified)
-            one_letter_name = get_one_letter_name(label, sequence, name)
-            if one_letter_name not in "ACGUT":
+            one_letter_name = get_one_letter_name(
+                entity_id, label, sequence_by_entity, name
+            )
+            if one_letter_name not in "ACGUTN":
                 one_letter_name = detect_one_letter_name(residue_atoms)
-            residue = Residue3D(
-                label, auth, model, one_letter_name, tuple(residue_atoms)
+            residues.append(
+                Residue3D(label, auth, model, one_letter_name, tuple(residue_atoms))
             )
-            if not nucleic_acid_only or (nucleic_acid_only and residue.is_nucleotide):
-                residues.append(residue)
             key_previous = key
             residue_atoms = [atom]
     label = key_previous[0]
     auth = key_previous[1]
     model = key_previous[2]
+    entity_id = residue_atoms[-1].entity_id
     name = get_residue_name(auth, label, modified)
-    one_letter_name = get_one_letter_name(label, sequence, name)
-    if one_letter_name not in "ACGUT":
+    one_letter_name = get_one_letter_name(entity_id, label, sequence_by_entity, name)
+    if one_letter_name not in "ACGUTN":
         one_letter_name = detect_one_letter_name(residue_atoms)
-    residue = Residue3D(label, auth, model, one_letter_name, tuple(residue_atoms))
-    if not nucleic_acid_only or (nucleic_acid_only and residue.is_nucleotide):
-        residues.append(residue)
+    residues.append(
+        Residue3D(label, auth, model, one_letter_name, tuple(residue_atoms))
+    )
+    if nucleic_acid_only:
+        if is_nucleic_acid_by_entity:
+            residues = [
+                residue
+                for residue in residues
+                if is_nucleic_acid_by_entity[residue.atoms[0].entity_id]
+            ]
+        else:
+            residues = [residue for residue in residues if residue.is_nucleotide]
     return Structure3D(residues)
@@ -304,13 +351,14 @@ def get_residue_name(
 def get_one_letter_name(
-    label: Optional[ResidueLabel], sequence: Dict[Tuple[str, int], str], name: str
+    entity_id: Optional[str],
+    label: Optional[ResidueLabel],
+    sequence_by_entity: Dict[str, str],
+    name: str,
 ) -> str:
     # try getting the value from _entity_poly first
-    if label is not None:
-        key = (label.chain, label.number)
-        if key in sequence:
-            return sequence[key]
+    if entity_id is not None and label is not None and entity_id in sequence_by_entity:
+        return sequence_by_entity[entity_id][label.number - 1]
     # RNA
     if len(name) == 1:
         return name
@@ -334,11 +382,13 @@ def detect_one_letter_name(atoms: List[Atom]) -> str:
         ) / len(atom_names_expected)
         score[candidate] = count
     items = sorted(score.items(), key=lambda kv: kv[1], reverse=True)
+    if items[0][1] == 0:
+        return "?"
     return items[0][0]
 def try_parse_int(s: str) -> Optional[int]:
     try:
         return int(s)
-    except:
+    except ValueError:
         return None

{rnapolis-0.3.18 → rnapolis-0.4.0}/src/rnapolis/tertiary.py RENAMED Viewed

@@ -96,6 +96,7 @@ AVERAGE_OXYGEN_PHOSPHORUS_DISTANCE_COVALENT = 1.6
 @dataclass(frozen=True, order=True)
 class Atom:
+    entity_id: Optional[str]
     label: Optional[ResidueLabel]
     auth: Optional[ResidueAuth]
     model: int
@@ -128,6 +129,29 @@ class Residue3D(Residue):
         "C": set(["N1", "C2", "O2", "N3", "C4", "N4", "C5", "C6"]),
         "U": set(["N1", "C2", "O2", "N3", "C4", "O4", "C5", "C6"]),
     }
+    # Heavy atoms in nucleotide
+    nucleotide_heavy_atoms = (
+        set(
+            [
+                "P",
+                "OP1",
+                "OP2",
+                "O5'",
+                "C5'",
+                "C4'",
+                "O4'",
+                "C3'",
+                "O3'",
+                "C2'",
+                "O2'",
+                "C1'",
+            ]
+        )
+        .union(nucleobase_heavy_atoms["A"])
+        .union(nucleobase_heavy_atoms["G"])
+        .union(nucleobase_heavy_atoms["C"])
+        .union(nucleobase_heavy_atoms["U"])
+    )
     def __lt__(self, other):
         return (self.model, self.chain, self.number, self.icode or " ") < (
@@ -176,8 +200,8 @@ class Residue3D(Residue):
     @cached_property
     def is_nucleotide(self) -> bool:
-        return len(self.atoms) > 1 and any(
-            [atom for atom in self.atoms if atom.name == "C1'"]
+        return self.nucleotide_heavy_atoms.intersection(
+            set([atom.name for atom in self.atoms])
         )
     @cached_property
@@ -268,7 +292,7 @@ class Residue3D(Residue):
         logging.error(
             f"Failed to determine the outermost atom for nucleotide {self}, so an arbitrary atom will be used"
         )
-        yield Atom(self.label, self.auth, self.model, "UNK", 0.0, 0.0, 0.0, None)
+        yield Atom(None, self.label, self.auth, self.model, "UNK", 0.0, 0.0, 0.0, None)
     def __inner_generator(self):
         # try to find expected atom name
@@ -296,7 +320,7 @@ class Residue3D(Residue):
         logging.error(
             f"Failed to determine the innermost atom for nucleotide {self}, so an arbitrary atom will be used"
         )
-        yield Atom(self.label, self.auth, self.model, "UNK", 0.0, 0.0, 0.0, None)
+        yield Atom(None, self.label, self.auth, self.model, "UNK", 0.0, 0.0, 0.0, None)
 @dataclass(frozen=True, order=True)

{rnapolis-0.3.18 → rnapolis-0.4.0}/tests/test_annotator.py RENAMED Viewed

@@ -36,9 +36,7 @@ def test_1ehz():
                     for bp in interactions[i]
                     if (bp.nt1.full_name, bp.nt2.full_name) == element
                 ]
-                assert (
-                    False
-                ), f"Interaction {element} occurs {count} times among {labels[i]} type: {duplicates}"
+                assert False, f"Interaction {element} occurs {count} times among {labels[i]} type: {duplicates}"
 def test_8btk():

{rnapolis-0.3.18 → rnapolis-0.4.0}/tests/test_bugfixes.py RENAMED Viewed

@@ -34,7 +34,7 @@ def test_1DFU():
     assert b1u not in mapping.base_pair_graph[b2g]
-# in 4WTI the first residue has only O3' atom and so is not considered a nucleotide
+# in 4WTI the first residue has only O3' atom, but is still considered a nucleotide
 def test_4WTI():
     with open("tests/4WTI_1_T-P.cif") as f:
         structure3d = read_3d_structure(f, 1)
@@ -42,7 +42,7 @@ def test_4WTI():
     mapping = Mapping2D3D(
         structure3d, base_interactions.basePairs, base_interactions.stackings, True
     )
-    assert mapping.dot_bracket == ">strand_T\nCGG\n.((\n>strand_P\nCC\n))"
+    assert mapping.dot_bracket == ">strand_T\nACGG\n..((\n>strand_P\nCC\n))"
 # in 1HMH the bases are oriented in 45 degrees and it caused the program to identify invalid base pair
@@ -64,3 +64,14 @@ def test_6INQ():
     assert structure3d.find_residue(None, ResidueAuth("T", 0, None, "DC")) is not None
     assert structure3d.find_residue(ResidueLabel("O", 126, "DG"), None) is not None
     assert structure3d.find_residue(None, ResidueAuth("N", 0, None, "DG")) is not None
+# in 6g90 from rna3db, the sequence contains Ns which were ignored incorrectly
+def test_6g90():
+    with open("tests/6g90_1.cif") as f:
+        structure3d = read_3d_structure(f, nucleic_acid_only=True)
+    sequence = "".join([residue.one_letter_name for residue in structure3d.residues])
+    assert (
+        sequence
+        == "AUACUUACCUUAAGAUAUCAGAGGAGAUCAAGAAGUCCUACUGAUCAAACAUGCGCUUCCAAGAAGGACGUUAAGCAUUUAUCAUUGAACGUUCAUUGAACAUUGAUGCAAACUCCUUGGUCACACACACGCGGAAGGCGUGUUUGCUGACGUCCCUUGUUUCAAUCAUUGGUUAACUGAUUUUUGGGGCCCUUUGUUCUUCUGAGAAGUGACACCAAUUGGUGUUAGGGGAGCUGGGGCCUUUCAAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNUUUUGGAAGGUCUUGGUCGGGUGGAUCUUAUAAUUUUUGAUUUA"
+    )

{rnapolis-0.3.18 → rnapolis-0.4.0}/tests/test_common.py RENAMED Viewed

@@ -2,7 +2,6 @@ from collections import Counter
 from hypothesis import given, settings
 from hypothesis import strategies as st
 from rnapolis.common import (
     BaseInteractions,
     BasePair,
@@ -93,7 +92,7 @@ def test_rnapdbee_adapters_api_compliance_structure2d(obj):
 def test_bpseq_from_dotbracket():
     expected = BpSeq.from_file("tests/1ET4-A.bpseq")
-    actual = BpSeq.from_dotbracket(DotBracket.from_file(f"tests/1ET4-A.dbn"))
+    actual = BpSeq.from_dotbracket(DotBracket.from_file("tests/1ET4-A.dbn"))
     assert expected == actual

{rnapolis-0.3.18 → rnapolis-0.4.0}/tests/test_rfam_folder.py RENAMED Viewed

@@ -1,7 +1,6 @@
 import os
 import pytest
 from rnapolis.rfam_folder import generate_consensus_secondary_structure, parse_fasta
 IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true"

{rnapolis-0.3.18 → rnapolis-0.4.0}/tests/test_tertiary.py RENAMED Viewed

@@ -5,10 +5,10 @@ from rnapolis.tertiary import Atom, torsion_angle
 def test_torsion_angle():
-    a1 = Atom(None, None, 1, "P", 50.63, 49.73, 50.57, None)
-    a2 = Atom(None, None, 1, "O5'", 50.16, 49.14, 52.02, None)
-    a3 = Atom(None, None, 1, "C5'", 50.22, 49.95, 53.21, None)
-    a4 = Atom(None, None, 1, "C4'", 50.97, 49.23, 54.31, None)
+    a1 = Atom(None, None, None, 1, "P", 50.63, 49.73, 50.57, None)
+    a2 = Atom(None, None, None, 1, "O5'", 50.16, 49.14, 52.02, None)
+    a3 = Atom(None, None, None, 1, "C5'", 50.22, 49.95, 53.21, None)
+    a4 = Atom(None, None, None, 1, "C4'", 50.97, 49.23, 54.31, None)
     assert math.isclose(
         math.degrees(torsion_angle(a1, a2, a3, a4)), -127.83976634524326
     )