PyPI - RNApolis - Version diff - 0.4.4.tar.gz → 0.4.7.tar.gz - Mend

RNApolis 0.4.4 (tar.gz) → 0.4.7 (tar.gz)

Files changed (31) hide show

{rnapolis-0.4.4/src/RNApolis.egg-info → rnapolis-0.4.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: RNApolis
-Version: 0.4.4
+Version: 0.4.7
 Summary: A Python library containing RNA-related bioinformatics functions and classes
 Home-page: https://github.com/tzok/rnapolis-py
 Author: Tomasz Zok

{rnapolis-0.4.4 → rnapolis-0.4.7}/setup.py RENAMED Viewed

@@ -5,7 +5,7 @@ with open("README.md") as f:
 setup(
     name="RNApolis",
-    version="0.4.4",
+    version="0.4.7",
     packages=["rnapolis"],
     package_dir={"": "src"},
     author="Tomasz Zok",

{rnapolis-0.4.4 → rnapolis-0.4.7/src/RNApolis.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: RNApolis
-Version: 0.4.4
+Version: 0.4.7
 Summary: A Python library containing RNA-related bioinformatics functions and classes
 Home-page: https://github.com/tzok/rnapolis-py
 Author: Tomasz Zok

{rnapolis-0.4.4 → rnapolis-0.4.7}/src/rnapolis/common.py RENAMED Viewed

@@ -338,6 +338,9 @@ class Entry(Sequence):
             return self.pair
         raise IndexError()
+    def __lt__(self, other):
+        return self.index_ < other.index_
     def __len__(self) -> int:
         return 3
@@ -838,7 +841,7 @@ class BpSeq:
         for i in range(1, len(regions)):
             k, l, _ = regions[i]
-            available = [True for i in range(10)]
+            available = [True for _ in range(len("([{<" + string.ascii_uppercase))]
             for j in range(i):
                 m, n, _ = regions[j]

{rnapolis-0.4.4 → rnapolis-0.4.7}/src/rnapolis/parser.py RENAMED Viewed

@@ -1,7 +1,10 @@
 import logging
 from typing import IO, Dict, List, Optional, Tuple, Union
+import numpy as np
 from mmcif.io.IoAdapterPy import IoAdapterPy
+from scipy.spatial import KDTree
 from rnapolis.common import ResidueAuth, ResidueLabel
 from rnapolis.tertiary import BASE_ATOMS, Atom, Residue3D, Structure3D
@@ -53,10 +56,10 @@ def parse_cif(
     io_adapter = IoAdapterPy()
     data = io_adapter.readFile(cif.name)
-    atoms: List[Atom] = []
+    atoms_to_process: List[Atom] = []
     modified: Dict[Union[ResidueLabel, ResidueAuth], str] = {}
-    sequence_by_entity = {}
-    is_nucleic_acid_by_entity = {}
+    sequence_by_entity: Dict[str, str] = {}
+    is_nucleic_acid_by_entity: Dict[str, bool] = {}
     if data:
         atom_site = data[0].getObj("atom_site")
@@ -136,7 +139,7 @@ def parse_cif(
                     else None
                 )
-                atoms.append(
+                atoms_to_process.append(
                     Atom(
                         label_entity_id,
                         label,
@@ -216,6 +219,7 @@ def parse_cif(
                 if entity_id and pdbx_seq_one_letter_code_can:
                     sequence_by_entity[entity_id] = pdbx_seq_one_letter_code_can
+    atoms = filter_clashing_atoms(atoms_to_process)
     return atoms, modified, sequence_by_entity, is_nucleic_acid_by_entity
@@ -228,7 +232,7 @@ def parse_pdb(
     Dict[str, bool],
 ]:
     pdb.seek(0)
-    atoms: List[Atom] = []
+    atoms_to_process: List[Atom] = []
     modified: Dict[Union[ResidueLabel, ResidueAuth], str] = {}
     model = 1
@@ -236,9 +240,6 @@ def parse_pdb(
         if line.startswith("MODEL"):
             model = int(line[10:14].strip())
         elif line.startswith("ATOM") or line.startswith("HETATM"):
-            alternate_location = line[16]
-            if alternate_location != " ":
-                continue
             atom_name = line[12:16].strip()
             residue_name = line[17:20].strip()
             chain_identifier = line[21]
@@ -251,7 +252,10 @@ def parse_pdb(
             auth = ResidueAuth(
                 chain_identifier, residue_number, insertion_code, residue_name
             )
-            atoms.append(Atom(None, None, auth, model, atom_name, x, y, z, occupancy))
+            atoms_to_process.append(
+                Atom(None, None, auth, model, atom_name, x, y, z, occupancy)
+            )
         elif line.startswith("MODRES"):
             original_name = line[12:15]
             chain_identifier = line[16]
@@ -263,6 +267,7 @@ def parse_pdb(
             )
             modified[auth] = standard_residue_name
+    atoms = filter_clashing_atoms(atoms_to_process)
     return atoms, modified, {}, {}
@@ -392,3 +397,36 @@ def try_parse_int(s: str) -> Optional[int]:
         return int(s)
     except ValueError:
         return None
+def filter_clashing_atoms(atoms: List[Atom], clash_distance: float = 0.5) -> List[Atom]:
+    # First, remove duplicate atoms
+    unique_atoms = {}
+    for i, atom in enumerate(atoms):
+        key = (atom.label, atom.auth, atom.name)
+        if key not in unique_atoms or atom.occupancy > unique_atoms[key].occupancy:
+            unique_atoms[key] = atom
+    unique_atoms_list = list(unique_atoms.values())
+    # Now handle clashing atoms
+    coords = np.array([(atom.x, atom.y, atom.z) for atom in unique_atoms_list])
+    tree = KDTree(coords)
+    pairs = tree.query_pairs(r=clash_distance)
+    atoms_to_keep = set(range(len(unique_atoms_list)))
+    for i, j in pairs:
+        if (
+            unique_atoms_list[i].occupancy is None
+            or unique_atoms_list[j].occupancy is None
+        ):
+            continue
+        if unique_atoms_list[i].occupancy > unique_atoms_list[j].occupancy:
+            atoms_to_keep.discard(j)
+        else:
+            atoms_to_keep.discard(i)
+    return [unique_atoms_list[i] for i in atoms_to_keep]

{rnapolis-0.4.4 → rnapolis-0.4.7}/src/rnapolis/tertiary.py RENAMED Viewed

@@ -124,36 +124,17 @@ class Residue3D(Residue):
     outermost_atoms = {"A": "N9", "G": "N9", "C": "N1", "U": "N1", "T": "N1"}
     # Dist representing expected name of atom closest to the tetrad center
     innermost_atoms = {"A": "N6", "G": "O6", "C": "N4", "U": "O4", "T": "O4"}
+    # Heavy atoms in phosphate and ribose
+    phosphate_atoms = {"P", "OP1", "OP2", "O3'", "O5'"}
+    sugar_atoms = {"C1'", "C2'", "C3'", "C4'", "C5'", "O4'"}
     # Heavy atoms for each main nucleobase
     nucleobase_heavy_atoms = {
         "A": set(["N1", "C2", "N3", "C4", "C5", "C6", "N6", "N7", "C8", "N9"]),
         "G": set(["N1", "C2", "N2", "N3", "C4", "C5", "C6", "O6", "N7", "C8", "N9"]),
         "C": set(["N1", "C2", "O2", "N3", "C4", "N4", "C5", "C6"]),
         "U": set(["N1", "C2", "O2", "N3", "C4", "O4", "C5", "C6"]),
+        "T": set(["N1", "C2", "O2", "N3", "C4", "O4", "C5", "C5M", "C6"]),
     }
-    # Heavy atoms in nucleotide
-    nucleotide_heavy_atoms = (
-        set(
-            [
-                "P",
-                "OP1",
-                "OP2",
-                "O5'",
-                "C5'",
-                "C4'",
-                "O4'",
-                "C3'",
-                "O3'",
-                "C2'",
-                "O2'",
-                "C1'",
-            ]
-        )
-        .union(nucleobase_heavy_atoms["A"])
-        .union(nucleobase_heavy_atoms["G"])
-        .union(nucleobase_heavy_atoms["C"])
-        .union(nucleobase_heavy_atoms["U"])
-    )
     def __lt__(self, other):
         return (self.model, self.chain, self.number, self.icode or " ") < (
@@ -202,9 +183,59 @@ class Residue3D(Residue):
     @cached_property
     def is_nucleotide(self) -> bool:
-        return self.nucleotide_heavy_atoms.intersection(
-            set([atom.name for atom in self.atoms])
+        scores = {"phosphate": 0.0, "sugar": 0.0, "base": 0.0, "connections": 0.0}
+        weights = {"phosphate": 0.25, "sugar": 0.25, "base": 0.25, "connections": 0.25}
+        residue_atoms = {atom.name for atom in self.atoms}
+        phosphate_match = len(residue_atoms.intersection(self.phosphate_atoms))
+        scores["phosphate"] = phosphate_match / len(self.phosphate_atoms)
+        sugar_match = len(residue_atoms.intersection(self.sugar_atoms))
+        scores["sugar"] = sugar_match / len(self.sugar_atoms)
+        nucleobase_atoms = {
+            key: self.nucleobase_heavy_atoms[key] for key in self.nucleobase_heavy_atoms
+        }
+        matches = {
+            key: len(residue_atoms.intersection(nucleobase_atoms[key]))
+            / len(nucleobase_atoms[key])
+            for key in nucleobase_atoms
+        }
+        best_match = max(matches.items(), key=lambda x: x[1])
+        scores["base"] = best_match[1]
+        connection_score = 0.0
+        distance_threshold = 2.0
+        if "P" in residue_atoms and "O5'" in residue_atoms:
+            p_atom = next(atom for atom in self.atoms if atom.name == "P")
+            o5_atom = next(atom for atom in self.atoms if atom.name == "O5'")
+            if (
+                numpy.linalg.norm(p_atom.coordinates - o5_atom.coordinates)
+                <= distance_threshold
+            ):
+                connection_score += 0.5
+        if "C1'" in residue_atoms:
+            c1_atom = next(atom for atom in self.atoms if atom.name == "C1'")
+            for base_connection in ["N9", "N1"]:
+                if base_connection in residue_atoms:
+                    base_atom = next(
+                        atom for atom in self.atoms if atom.name == base_connection
+                    )
+                    if (
+                        numpy.linalg.norm(c1_atom.coordinates - base_atom.coordinates)
+                        <= distance_threshold
+                    ):
+                        connection_score += 0.5
+                        break
+        scores["connections"] = connection_score
+        probability = sum(
+            scores[component] * weights[component] for component in scores.keys()
         )
+        return probability > 0.5
     @cached_property
     def base_normal_vector(self) -> Optional[numpy.typing.NDArray[numpy.floating]]:
@@ -566,15 +597,14 @@ class Mapping2D3D:
         return self.__generate_bpseq(canonical)
     def __generate_bpseq(self, base_pairs):
+        nucleotides = list(filter(lambda r: r.is_nucleotide, self.structure3d.residues))
         result: Dict[int, List] = {}
         residue_map: Dict[Residue3D, int] = {}
         i = 1
-        for j, residue in enumerate(
-            filter(lambda r: r.is_nucleotide, self.structure3d.residues)
-        ):
+        for j, residue in enumerate(nucleotides):
             if self.find_gaps and j > 0:
-                previous = self.structure3d.residues[j - 1]
+                previous = nucleotides[j - 1]
                 if (
                     not previous.is_connected(residue)

{rnapolis-0.4.4 → rnapolis-0.4.7}/tests/test_annotator.py RENAMED Viewed

@@ -43,3 +43,14 @@ def test_8btk():
     with open("tests/8btk_B7.cif") as f:
         structure3d = read_3d_structure(f, 1)
     assert extract_secondary_structure(structure3d, 1) is not None
+def test_488d():
+    """
+    There are clashing residues 151 in chains B and D. The clash is caused by occupancy factors less than 1.
+    """
+    with open("tests/488d.pdb") as f:
+        structure3d = read_3d_structure(f)
+    base_interactions = extract_base_interactions(structure3d)
+    assert base_interactions is not None

{rnapolis-0.4.4 → rnapolis-0.4.7}/tests/test_bugfixes.py RENAMED Viewed

@@ -42,7 +42,7 @@ def test_4WTI():
     mapping = Mapping2D3D(
         structure3d, base_interactions.basePairs, base_interactions.stackings, True
     )
-    assert mapping.dot_bracket == ">strand_T\nACGG\n..((\n>strand_P\nCC\n))"
+    assert mapping.dot_bracket == ">strand_T\nCGG\n.((\n>strand_P\nCC\n))"
 # in 1HMH the bases are oriented in 45 degrees and it caused the program to identify invalid base pair

{rnapolis-0.4.4 → rnapolis-0.4.7}/tests/test_common.py RENAMED Viewed

@@ -1,3 +1,4 @@
+import string
 from collections import Counter
 import orjson
@@ -11,6 +12,7 @@ from rnapolis.common import (
     BaseRibose,
     BpSeq,
     DotBracket,
+    Entry,
     Interaction,
     LeontisWesthof,
     MultiStrandDotBracket,
@@ -180,3 +182,20 @@ def test_conflicted_base_pairs():
     assert (
         mapping.dot_bracket == ">strand_B\nGGACUAGCGGAGGCUAGUCC\n((((((((....))))))))"
     )
+def test_high_level_pseudoknot():
+    entries = []
+    brackets = "([{<" + string.ascii_uppercase
+    for i in range(len(brackets)):
+        entries.append(Entry(i + 1, "C", i + len(brackets) + 1))
+        entries.append(Entry(i + len(brackets) + 1, "G", i + 1))
+    bpseq = BpSeq(sorted(entries))
+    dot_bracket = bpseq.fcfs
+    assert dot_bracket.sequence == "C" * len(brackets) + "G" * len(brackets)
+    assert (
+        dot_bracket.structure
+        == "([{<" + string.ascii_uppercase + ")]}>" + string.ascii_lowercase
+    )

{rnapolis-0.4.4 → rnapolis-0.4.7}/tests/test_parser.py RENAMED Viewed

@@ -16,3 +16,18 @@ def test_1ato():
         structure3d = read_3d_structure(f)
     sequence = "".join([residue.one_letter_name for residue in structure3d.residues])
     assert sequence == "GGCACCUCCUCGCGGUGCC"
+def test_4qln_no_duplicate_atoms():
+    for ext in (".pdb", ".cif"):
+        with open(f"tests/4qln{ext}") as f:
+            structure3d = read_3d_structure(f)
+        chain_a = [r for r in structure3d.residues if r.auth.chain == "A"]
+        residues_to_check = [r for r in chain_a if r.auth.number in (18, 19, 20)]
+        for residue in residues_to_check:
+            atom_names = [atom.name for atom in residue.atoms]
+            assert len(atom_names) == len(
+                set(atom_names)
+            ), f"Duplicate atoms found in residue {residue.auth}"