PyPI - RNApolis - Versions diffs - 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl - Mend

RNApolis 0.2.1py3-none-any.whl → 0.3.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

{RNApolis-0.2.1.dist-info → RNApolis-0.3.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: RNApolis
-Version: 0.2.1
+Version: 0.3.1
 Summary: A Python library containing RNA-related bioinformatics functions and classes
 Home-page: https://github.com/tzok/rnapolis-py
 Author: Tomasz Zok
@@ -15,13 +15,16 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: appdirs
 Requires-Dist: graphviz
 Requires-Dist: mmcif
 Requires-Dist: numpy
 Requires-Dist: ordered-set
 Requires-Dist: orjson
 Requires-Dist: pulp
+Requires-Dist: requests
 Requires-Dist: scipy
+Requires-Dist: viennarna
 # RNApolis
@@ -2133,3 +2136,38 @@ To use `transformer.py`, specify the path to your input mmCIF file and the desir
 - `--copy-to COPY_TO`: Indicate the column name to copy data to (e.g., `auth_asym_id`).
 For additional guidance, use `-h` or `--help`.
+### `rfam-folder`
+`rfam-folder` is a command-line tool for generating consensus secondary structures for RNA sequences. This tool can process a single RNA sequence or multiple sequences from a FASTA file. It offers the flexibility to specify an Rfam family for targeted analysis, control the folding of the secondary structure, and set a limit on the number of structures generated.
+**Important!** You need to have [Infernal software](http://eddylab.org/infernal/) installed for this script to run.
+#### Usage:
+The general usage pattern for rfam_folder.py is as follows:
+```bash
+usage: rfam_folder.py [-h] [--family FAMILY] [--no-fold] [--count COUNT] sequence
+```
+Positional Arguments:
+- `sequence`: Either an RNA sequence given directly or a path to a FASTA file containing one or more sequences.
+Options:
+- `--family FAMILY`: (Optional) Specify the name of the Rfam family to use. If not given, the entire Rfam database is searched for the sequence.
+- `--no-fold`: (Optional) Disable folding of the consensus secondary structure by RNAfold with constraints.
+- `--count COUNT`: (Optional) Set the maximum number of consensus secondary structures to generate per sequence, with a default of 1.
+#### Examples
+Generate a consensus structure for a single RNA sequence given a specific Rfam family:
+```
+$ rfam-folder AUCGUUCACCUCAUAAAUUGAGUGAGACGGAAGUAGGUUAAAACCGAAGGAACGCAGU --family RF01739
+>header
+AUCGUUCACCUCAUAAAUUGAGUGAGACGGAAGUAGGUUAAAACCGAAGGAACGCAGU
+..(((((..(((((.......)))))((....)).(((....)))....)))))....
+```

{RNApolis-0.2.1.dist-info → RNApolis-0.3.1.dist-info}/RECORD RENAMED Viewed

@@ -5,12 +5,13 @@ rnapolis/metareader.py,sha256=4qtMKRvww2sUStLeV8WVrLEt-ScydHUv4Gxx96tnf-M,1683
 rnapolis/molecule_filter.py,sha256=NhjuqdCRnXgPefWZPeTq77tifmnAzamQtA0ODqPPG9k,6918
 rnapolis/motif_extractor.py,sha256=duHvpi9Ulcny9K60E6VBpz5RpJZw-KdTB4_Ph0iP478,774
 rnapolis/parser.py,sha256=Z3Zd_IuRyOP45x5BStgu7UgoyHthhw55fT3udHUhAE4,11905
+rnapolis/rfam_folder.py,sha256=MggwxechIE5f2K-p5nhwNqsL4ckuQw5bJQaFohC2u4c,8918
 rnapolis/tertiary.py,sha256=iWMPD9c21rjMPpEdBd7mPCQgds65IbOr4_Fy06s0NoU,18957
 rnapolis/transformer.py,sha256=V9nOQvdq4-p7yUWo0vQg0CDQMpmyxz9t4TMSRVEKHnw,1817
 rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
-RNApolis-0.2.1.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
-RNApolis-0.2.1.dist-info/METADATA,sha256=PQMiXyedtnIVHGawMug4hssMACTxbHBQjtW7_NrxbpQ,52712
-RNApolis-0.2.1.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
-RNApolis-0.2.1.dist-info/entry_points.txt,sha256=oI0ywRPjBQJBR_k4MIQIqwsy5MZu6D5dkj_rfQNZTV4,268
-RNApolis-0.2.1.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
-RNApolis-0.2.1.dist-info/RECORD,,
+RNApolis-0.3.1.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
+RNApolis-0.3.1.dist-info/METADATA,sha256=quiuTEU3oKIvg6Mkpa4CN8-MBgUjjxzFjytjGtDd2eE,54300
+RNApolis-0.3.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+RNApolis-0.3.1.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
+RNApolis-0.3.1.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
+RNApolis-0.3.1.dist-info/RECORD,,

{RNApolis-0.2.1.dist-info → RNApolis-0.3.1.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.41.3)
+Generator: bdist_wheel (0.42.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{RNApolis-0.2.1.dist-info → RNApolis-0.3.1.dist-info}/entry_points.txt RENAMED Viewed

@@ -4,4 +4,5 @@ clashfinder = rnapolis.clashfinder:main
 metareader = rnapolis.metareader:main
 molecule-filter = rnapolis.molecule_filter:main
 motif-extractor = rnapolis.motif_extractor:main
+rfam-folder = rnapolis.rfam_folder:main
 transformer = rnapolis.transformer:main

rnapolis/rfam_folder.py ADDED Viewed

@@ -0,0 +1,294 @@
+#! /usr/bin/env python
+import argparse
+import gzip
+import os
+import re
+import shutil
+import subprocess
+import tempfile
+from typing import List
+import appdirs
+import requests
+import RNA
+from rnapolis.common import BpSeq, DotBracket
+# Gzip-compressed file downloaded when no Rfam family is requested (see ensure_cm).
+COMBINED_CM = "https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz"
+# Tar archive downloaded when a family is requested; ensure_cm unpacks it and
+# expects to find a "<family>.cm" file among the extracted contents.
+SEPARATE_CM = "https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.tar.gz"
+class FASTA:
+    header: str
+    sequence: str
+    def __init__(self, header: str, sequence: str):
+        self.header = header
+        self.sequence = sequence
+    def __str__(self):
+        return f">{self.header}\n{self.sequence}"
+def parse_fasta(fasta_path: str) -> List[FASTA]:
+    """
+    Read FASTA entries from a file.
+    Args:
+        fasta_path (str): The path to the FASTA file.
+    Returns:
+        List[Fasta]: A list of FASTA objects representing the entries in the file.
+    """
+    with open(fasta_path) as f:
+        content = f.read()
+    entries = content.split(">")[1:]
+    fastas = []
+    for entry in entries:
+        lines = entry.splitlines()
+        header = lines[0]
+        sequence = "".join(lines[1:])
+        fastas.append(FASTA(header, sequence))
+    return fastas
+def ensure_cm(family: str = None):
+    if not os.path.exists(appdirs.user_data_dir("rnapolis")):
+        os.makedirs(appdirs.user_data_dir("rnapolis"))
+    if family is None:
+        cm_gz_path = appdirs.user_data_dir("rnapolis") + "/Rfam.cm.gz"
+        cm_path = appdirs.user_data_dir("rnapolis") + "/Rfam.cm"
+        if not os.path.exists(cm_gz_path):
+            response = requests.get(COMBINED_CM)
+            with open(cm_gz_path, "wb") as f:
+                f.write(response.content)
+        if not os.path.exists(cm_path):
+            with gzip.open(cm_gz_path, "rb") as f_in, open(cm_path, "wb") as f_out:
+                f_out.write(f_in.read())
+    else:
+        cm_gz_path = appdirs.user_data_dir("rnapolis") + "/Rfam.tar.gz"
+        cm_path = appdirs.user_data_dir("rnapolis") + f"/{family}.cm"
+        if not os.path.exists(cm_gz_path):
+            response = requests.get(SEPARATE_CM)
+            with open(cm_gz_path, "wb") as f:
+                f.write(response.content)
+        if not os.path.exists(cm_path):
+            shutil.unpack_archive(cm_gz_path, appdirs.user_data_dir("rnapolis"))
+            if not os.path.exists(cm_path):
+                raise RuntimeError(
+                    f"Failed to find covariance model for {family} from Rfam."
+                )
+    if not os.path.exists(cm_path + ".i1m"):
+        subprocess.run(["cmpress", cm_path], check=True, capture_output=True)
+    return cm_path
+def analyze_cmsearch(cmsearch: str, fasta: FASTA, count: int = 1):
+    result = []
+    lines = cmsearch.splitlines()
+    begins = [i for i, line in enumerate(lines) if line.startswith(">>")]
+    for i, begin in enumerate(begins):
+        nc_index, cs_index = None, None
+        for j in range(begin, begins[i + 1] if i + 1 < len(begins) else len(lines)):
+            if lines[j].endswith(" NC"):
+                nc_index = j
+            if lines[j].endswith(" CS"):
+                cs_index = j
+        assert len(lines[cs_index].split()) == 2
+        structure = lines[cs_index]
+        sequence = lines[cs_index + 3]
+        match = re.match(r"\s*.+?\s+(\d+)\s+.+\s+(\d+)", sequence)
+        assert match is not None, sequence
+        first, last = int(match.group(1)), int(match.group(2))
+        for i in range(len(structure)):
+            if structure[i] != " ":
+                break
+        j = structure.find(" CS")
+        structure = structure[i:j]
+        sequence = sequence[i:j].upper()
+        # remove pairs which did not match to consensus
+        if nc_index is not None:
+            non_canonical = lines[nc_index][i:j]
+            for match in re.finditer(r"[v?]", non_canonical):
+                i = match.start()
+                structure = structure[:i] + "." + structure[i + 1 :]
+        # replace *[n]* placeholders
+        while True:
+            match = re.search(r"[<*]\[ *(\d+)\][*>]", sequence)
+            if match is None:
+                break
+            i, j = match.start(), match.end()
+            n = int(match.group(1))
+            sequence = sequence[:i] + "." * n + sequence[j:]
+            structure = structure[:i] + "." * n + structure[j:]
+        # replace gaps
+        while True:
+            match = re.search(r"-+", sequence)
+            if match is None:
+                break
+            i, j = match.start(), match.end()
+            sequence = sequence[:i] + sequence[j:]
+            structure = structure[:i] + structure[j:]
+        assert len(sequence) == len(structure)
+        if first > last:
+            # https://en.wikipedia.org/wiki/Nucleic_acid_notation
+            complementary = {
+                "A": "U",
+                "C": "G",
+                "G": "C",
+                "U": "A",
+                "W": "W",
+                "S": "S",
+                "M": "K",
+                "K": "M",
+                "R": "Y",
+                "Y": "R",
+                "B": "V",
+                "D": "H",
+                "H": "D",
+                "V": "B",
+                "N": "N",
+                ".": ".",
+            }
+            assert set(sequence) <= set(complementary.keys()), (
+                set(sequence) - set(complementary.keys()),
+                sequence,
+            )
+            sequence_comp = "".join([complementary[c] for c in sequence[::-1]])
+            match = re.search(sequence_comp, fasta.sequence)
+            assert match is not None, (sequence, fasta.sequence)
+            sequence_comp = match.group()
+            sequence = "".join([complementary[c] for c in sequence_comp[::-1]])
+        else:
+            match = re.search(sequence, fasta.sequence)
+            assert match is not None, (sequence, fasta.sequence)
+            sequence = match.group()
+        assert len(sequence) == len(structure)
+        structure = (
+            structure.replace(":", ".")
+            .replace("-", ".")
+            .replace("_", ".")
+            .replace(",", ".")
+            .replace("~", ".")
+        )
+        if set(structure) == {"."}:
+            continue
+        dot_bracket = DotBracket.from_string("N" * len(structure), structure)
+        structure = BpSeq.from_dotbracket(dot_bracket).dot_bracket.structure
+        result.append([sequence, structure])
+        if len(result) >= count:
+            break
+    return result
+def generate_consensus_secondary_structure(
+    fasta: FASTA, family: str = None, fold: bool = True, count: int = 1
+):
+    if shutil.which("cmpress") is None or shutil.which("cmsearch") is None:
+        raise RuntimeError(
+            "cmpress/cmsearch not found in PATH, please install Infernal first."
+        )
+    cm_path = ensure_cm(family)
+    with tempfile.NamedTemporaryFile(suffix=".fa") as fin:
+        fin.write(str(fasta).encode())
+        fin.seek(0)
+        completed = subprocess.run(
+            ["cmsearch", "--notextw", cm_path, fin.name],
+            check=True,
+            capture_output=True,
+        )
+    results = analyze_cmsearch(completed.stdout.decode(), fasta, count)
+    if fold:
+        for i in range(len(results)):
+            RNAfold = RNA.fold_compound(results[i][0])
+            RNAfold.hc_add_from_db(results[i][1])
+            structure, _ = RNAfold.mfe()
+            results[i][1] = structure
+    return [
+        f">{fasta.header}\n{sequence}\n{structure}" for sequence, structure in results
+    ]
+def main():
+    parser = argparse.ArgumentParser(
+        description="Generate consensus secondary structure for a given sequence. IMPORTANT! You need to have Infernal software installed to use this script."
+    )
+    parser.add_argument(
+        "sequence",
+        type=str,
+        help="an RNA sequence or a path to FASTA file, possibly containing multiple sequences",
+    )
+    parser.add_argument(
+        "--family",
+        type=str,
+        help="(optional) name of the Rfam family to use, if not given, the whole Rfam will be checked for the given sequence",
+    )
+    parser.add_argument(
+        "--no-fold",
+        action="store_true",
+        help="(optional) whether to disable folding of the consensus secondary structure by RNAfold with constraints",
+    )
+    parser.add_argument(
+        "--count",
+        type=int,
+        default=1,
+        help="(optional) maximum number of consensus secondary structures to generate per sequence, default is 1",
+    )
+    args = parser.parse_args()
+    if os.path.exists(args.sequence):
+        fastas = parse_fasta(args.sequence)
+    else:
+        fastas = [FASTA("header", args.sequence)]
+    for fasta in fastas:
+        results = generate_consensus_secondary_structure(
+            fasta, args.family, not args.no_fold, args.count
+        )
+        for result in results:
+            print(result)
+if __name__ == "__main__":
+    main()

{RNApolis-0.2.1.dist-info → RNApolis-0.3.1.dist-info}/LICENSE RENAMED Viewed

File without changes

{RNApolis-0.2.1.dist-info → RNApolis-0.3.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

RNApolis 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl

RNApolis 0.2.1py3-none-any.whl → 0.3.1py3-none-any.whl