RNApolis 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: RNApolis
3
- Version: 0.2.1
3
+ Version: 0.3.1
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -15,13 +15,16 @@ Classifier: Programming Language :: Python :: 3
15
15
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
+ Requires-Dist: appdirs
18
19
  Requires-Dist: graphviz
19
20
  Requires-Dist: mmcif
20
21
  Requires-Dist: numpy
21
22
  Requires-Dist: ordered-set
22
23
  Requires-Dist: orjson
23
24
  Requires-Dist: pulp
25
+ Requires-Dist: requests
24
26
  Requires-Dist: scipy
27
+ Requires-Dist: viennarna
25
28
 
26
29
  # RNApolis
27
30
 
@@ -2133,3 +2136,38 @@ To use `transformer.py`, specify the path to your input mmCIF file and the desir
2133
2136
  - `--copy-to COPY_TO`: Indicate the column name to copy data to (e.g., `auth_asym_id`).
2134
2137
 
2135
2138
  For additional guidance, use `-h` or `--help`.
2139
+
2140
+ ### `rfam-folder`
2141
+
2142
+ `rfam-folder` is a command-line tool for generating consensus secondary structures for RNA sequences. This tool can process a single RNA sequence or multiple sequences from a FASTA file. It offers the flexibility to specify an Rfam family for targeted analysis, control the folding of the secondary structure, and set a limit on the number of structures generated.
2143
+
2144
+ **Important!** You need to have [Infernal software](http://eddylab.org/infernal/) installed for this script to run.
2145
+
2146
+ #### Usage:
2147
+
2148
+ The general usage pattern for rfam_folder.py is as follows:
2149
+
2150
+ ```bash
2151
+ usage: rfam_folder.py [-h] [--family FAMILY] [--no-fold] [--count COUNT] sequence
2152
+ ```
2153
+
2154
+ Positional Arguments:
2155
+
2156
+ - `sequence`: An RNA sequence given directly, or a path to a FASTA file containing one or more sequences.
2157
+
2158
+ Options:
2159
+
2160
+ - `--family FAMILY`: (Optional) Specify the name of the Rfam family to use. If not given, the entire Rfam database is searched for the sequence.
2161
+ - `--no-fold`: (Optional) Disable folding of the consensus secondary structure by RNAfold with constraints.
2162
+ - `--count COUNT`: (Optional) Set the maximum number of consensus secondary structures to generate per sequence, with a default of 1.
2163
+
2164
+ #### Examples
2165
+
2166
+ Generate a consensus structure for a single RNA sequence, given a specific Rfam family:
2167
+
2168
+ ```
2169
+ $ rfam-folder AUCGUUCACCUCAUAAAUUGAGUGAGACGGAAGUAGGUUAAAACCGAAGGAACGCAGU --family RF01739
2170
+ >header
2171
+ AUCGUUCACCUCAUAAAUUGAGUGAGACGGAAGUAGGUUAAAACCGAAGGAACGCAGU
2172
+ ..(((((..(((((.......)))))((....)).(((....)))....)))))....
2173
+ ```
@@ -5,12 +5,13 @@ rnapolis/metareader.py,sha256=4qtMKRvww2sUStLeV8WVrLEt-ScydHUv4Gxx96tnf-M,1683
5
5
  rnapolis/molecule_filter.py,sha256=NhjuqdCRnXgPefWZPeTq77tifmnAzamQtA0ODqPPG9k,6918
6
6
  rnapolis/motif_extractor.py,sha256=duHvpi9Ulcny9K60E6VBpz5RpJZw-KdTB4_Ph0iP478,774
7
7
  rnapolis/parser.py,sha256=Z3Zd_IuRyOP45x5BStgu7UgoyHthhw55fT3udHUhAE4,11905
8
+ rnapolis/rfam_folder.py,sha256=MggwxechIE5f2K-p5nhwNqsL4ckuQw5bJQaFohC2u4c,8918
8
9
  rnapolis/tertiary.py,sha256=iWMPD9c21rjMPpEdBd7mPCQgds65IbOr4_Fy06s0NoU,18957
9
10
  rnapolis/transformer.py,sha256=V9nOQvdq4-p7yUWo0vQg0CDQMpmyxz9t4TMSRVEKHnw,1817
10
11
  rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
11
- RNApolis-0.2.1.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
12
- RNApolis-0.2.1.dist-info/METADATA,sha256=PQMiXyedtnIVHGawMug4hssMACTxbHBQjtW7_NrxbpQ,52712
13
- RNApolis-0.2.1.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
14
- RNApolis-0.2.1.dist-info/entry_points.txt,sha256=oI0ywRPjBQJBR_k4MIQIqwsy5MZu6D5dkj_rfQNZTV4,268
15
- RNApolis-0.2.1.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
16
- RNApolis-0.2.1.dist-info/RECORD,,
12
+ RNApolis-0.3.1.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
13
+ RNApolis-0.3.1.dist-info/METADATA,sha256=quiuTEU3oKIvg6Mkpa4CN8-MBgUjjxzFjytjGtDd2eE,54300
14
+ RNApolis-0.3.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
15
+ RNApolis-0.3.1.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
16
+ RNApolis-0.3.1.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
17
+ RNApolis-0.3.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.41.3)
2
+ Generator: bdist_wheel (0.42.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -4,4 +4,5 @@ clashfinder = rnapolis.clashfinder:main
4
4
  metareader = rnapolis.metareader:main
5
5
  molecule-filter = rnapolis.molecule_filter:main
6
6
  motif-extractor = rnapolis.motif_extractor:main
7
+ rfam-folder = rnapolis.rfam_folder:main
7
8
  transformer = rnapolis.transformer:main
@@ -0,0 +1,294 @@
1
+ #! /usr/bin/env python
2
+ import argparse
3
+ import gzip
4
+ import os
5
+ import re
6
+ import shutil
7
+ import subprocess
8
+ import tempfile
9
+ from typing import List
10
+
11
+ import appdirs
12
+ import requests
13
+ import RNA
14
+
15
+ from rnapolis.common import BpSeq, DotBracket
16
+
17
# Gzipped covariance model file fetched by ensure_cm() when no Rfam family is
# requested; it is decompressed to a single "Rfam.cm" file.
COMBINED_CM = "https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz"
# Tarball fetched by ensure_cm() when a specific Rfam family is requested; it is
# expected to unpack into "<family>.cm" files.
SEPARATE_CM = "https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.tar.gz"
19
+
20
+
21
class FASTA:
    """
    A single FASTA record.

    Attributes:
        header (str): The header line without the leading ">".
        sequence (str): The sequence as one string.
    """

    header: str
    sequence: str

    def __init__(self, header: str, sequence: str):
        self.header, self.sequence = header, sequence

    def __str__(self):
        # FASTA text form: ">" + header on the first line, sequence on the second
        header, sequence = self.header, self.sequence
        return f">{header}\n{sequence}"
31
+
32
+
33
def parse_fasta(fasta_path: str) -> List[FASTA]:
    """
    Read FASTA entries from a file.

    A new entry starts at every line beginning with ">". The rest of that line
    is the header and all following lines, up to the next header, are joined
    into the sequence. Anything before the first header is ignored.

    Args:
        fasta_path (str): The path to the FASTA file.

    Returns:
        List[FASTA]: A list of FASTA objects representing the entries in the file.
    """
    fastas = []
    header = None
    chunks = []

    with open(fasta_path) as f:
        for raw_line in f:
            line = raw_line.rstrip("\r\n")

            if line.startswith(">"):
                # a new header closes the entry collected so far
                if header is not None:
                    fastas.append(FASTA(header, "".join(chunks)))
                header = line[1:]
                chunks = []
            elif header is not None:
                # surrounding whitespace and blank lines are not part of the sequence
                chunks.append(line.strip())

    if header is not None:
        fastas.append(FASTA(header, "".join(chunks)))

    return fastas
56
+
57
+
58
def ensure_cm(family: str = None):
    """
    Make sure an Rfam covariance model is available locally and return its path.

    Files are cached in the "rnapolis" user data directory. Without a family,
    the combined model (COMBINED_CM) is downloaded and decompressed. With a
    family, the archive of separate models (SEPARATE_CM) is downloaded and
    unpacked, and the "<family>.cm" file is used. Finally, `cmpress` is run on
    the model unless its ".i1m" index file already exists.

    Args:
        family (str): The name of the Rfam family, or None to use the combined
            model of all families.

    Returns:
        str: The path to the covariance model file.

    Raises:
        RuntimeError: If the archive does not provide a model for the family.
        requests.HTTPError: If the download ends with an HTTP error status.
        subprocess.CalledProcessError: If `cmpress` fails.
    """
    data_dir = appdirs.user_data_dir("rnapolis")
    os.makedirs(data_dir, exist_ok=True)

    if family is None:
        cm_gz_path = os.path.join(data_dir, "Rfam.cm.gz")
        cm_path = os.path.join(data_dir, "Rfam.cm")

        if not os.path.exists(cm_gz_path):
            _download_file(COMBINED_CM, cm_gz_path)

        if not os.path.exists(cm_path):
            _gunzip_file(cm_gz_path, cm_path)
    else:
        cm_gz_path = os.path.join(data_dir, "Rfam.tar.gz")
        cm_path = os.path.join(data_dir, f"{family}.cm")

        if not os.path.exists(cm_gz_path):
            _download_file(SEPARATE_CM, cm_gz_path)

        if not os.path.exists(cm_path):
            shutil.unpack_archive(cm_gz_path, data_dir)

        if not os.path.exists(cm_path):
            raise RuntimeError(
                f"Failed to find covariance model for {family} from Rfam."
            )

    if not os.path.exists(cm_path + ".i1m"):
        subprocess.run(["cmpress", cm_path], check=True, capture_output=True)

    return cm_path


def _download_file(url: str, destination: str):
    """Download url to destination without leaving a partial or error-page file behind."""
    partial = destination + ".part"

    try:
        with requests.get(url, stream=True, timeout=60) as response:
            # do not cache an HTML error page as if it were the model
            response.raise_for_status()

            with open(partial, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)

        # the final name appears only once the whole file has been written
        os.replace(partial, destination)
    finally:
        if os.path.exists(partial):
            os.remove(partial)


def _gunzip_file(source: str, destination: str):
    """Decompress a gzip file to destination without leaving a truncated file behind."""
    partial = destination + ".part"

    try:
        with gzip.open(source, "rb") as f_in, open(partial, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

        os.replace(partial, destination)
    finally:
        if os.path.exists(partial):
            os.remove(partial)
97
+
98
+
99
def analyze_cmsearch(cmsearch: str, fasta: FASTA, count: int = 1):
    """
    Extract consensus secondary structures from the text output of cmsearch.

    Each hit is the section starting at a line beginning with ">>". Within a
    hit, the line ending with " CS" holds the consensus structure, the line
    three below it holds the aligned target sequence with its first and last
    residue numbers, and an optional line ending with " NC" marks positions
    with "v" or "?" whose pairing is dropped. Hits whose structure has no base
    pair left are skipped.

    Args:
        cmsearch (str): The standard output of cmsearch.
        fasta (FASTA): The searched entry; the aligned fragment is located in
            its sequence to fill in residues which cmsearch did not print.
        count (int): The maximum number of structures to return.

    Returns:
        list: Two-element lists [sequence, structure] in the order of hits,
            with the structure in dot-bracket notation.

    Raises:
        RuntimeError: If a hit has no complete alignment (" CS" line followed
            by the sequence line).
        AssertionError: If the alignment cannot be parsed or the fragment
            cannot be found in the sequence of fasta.
    """
    if count <= 0:
        return []

    # https://en.wikipedia.org/wiki/Nucleic_acid_notation
    complementary = {
        "A": "U",
        "C": "G",
        "G": "C",
        "U": "A",
        "W": "W",
        "S": "S",
        "M": "K",
        "K": "M",
        "R": "Y",
        "Y": "R",
        "B": "V",
        "D": "H",
        "H": "D",
        "V": "B",
        "N": "N",
        ".": ".",
    }
    # all symbols other than brackets denote unpaired residues
    unpaired = str.maketrans(":-_,~", ".....")

    result = []
    lines = cmsearch.splitlines()
    begins = [index for index, line in enumerate(lines) if line.startswith(">>")]
    # a hit ends where the next one begins, the last one at the end of output
    ends = begins[1:] + [len(lines)]

    for begin, end in zip(begins, ends):
        nc_index, cs_index = None, None

        for index in range(begin, end):
            if lines[index].endswith(" NC"):
                nc_index = index
            if lines[index].endswith(" CS"):
                cs_index = index

        if cs_index is None or cs_index + 3 >= len(lines):
            raise RuntimeError(
                f"Failed to find alignment in cmsearch hit: {lines[begin]}"
            )

        assert len(lines[cs_index].split()) == 2

        structure = lines[cs_index]
        sequence = lines[cs_index + 3]

        match = re.match(r"\s*.+?\s+(\d+)\s+.+\s+(\d+)", sequence)
        assert match is not None, sequence
        first, last = int(match.group(1)), int(match.group(2))

        # the alignment spans the columns from the first non-space character
        # of the CS line up to its trailing " CS" marker
        left = len(structure) - len(structure.lstrip(" "))
        right = structure.find(" CS")

        structure = structure[left:right]
        sequence = sequence[left:right].upper()

        # remove pairs which did not match to consensus
        if nc_index is not None:
            non_canonical = lines[nc_index][left:right]
            for match in re.finditer(r"[v?]", non_canonical):
                position = match.start()
                structure = structure[:position] + "." + structure[position + 1 :]

        # replace *[n]* placeholders
        match = re.search(r"[<*]\[ *(\d+)\][*>]", sequence)
        while match is not None:
            start, stop = match.start(), match.end()
            n = int(match.group(1))
            sequence = sequence[:start] + "." * n + sequence[stop:]
            structure = structure[:start] + "." * n + structure[stop:]
            match = re.search(r"[<*]\[ *(\d+)\][*>]", sequence)

        # replace gaps
        match = re.search(r"-+", sequence)
        while match is not None:
            start, stop = match.start(), match.end()
            sequence = sequence[:start] + sequence[stop:]
            structure = structure[:start] + structure[stop:]
            match = re.search(r"-+", sequence)

        assert len(sequence) == len(structure)

        # The fragment is used as a regular expression, so every "." inserted
        # for a placeholder matches whatever residue the input has there.
        if first > last:
            # hit with first > last: search for the reverse complement
            assert set(sequence) <= set(complementary.keys()), (
                set(sequence) - set(complementary.keys()),
                sequence,
            )
            sequence_comp = "".join([complementary[c] for c in sequence[::-1]])
            match = re.search(sequence_comp, fasta.sequence)
            assert match is not None, (sequence, fasta.sequence)
            sequence_comp = match.group()
            sequence = "".join([complementary[c] for c in sequence_comp[::-1]])
        else:
            match = re.search(sequence, fasta.sequence)
            assert match is not None, (sequence, fasta.sequence)
            sequence = match.group()

        assert len(sequence) == len(structure)

        structure = structure.translate(unpaired)
        if set(structure) == {"."}:
            continue

        dot_bracket = DotBracket.from_string("N" * len(structure), structure)
        structure = BpSeq.from_dotbracket(dot_bracket).dot_bracket.structure
        result.append([sequence, structure])

        if len(result) >= count:
            break

    return result
216
+
217
+
218
def generate_consensus_secondary_structure(
    fasta: FASTA, family: str = None, fold: bool = True, count: int = 1
):
    """
    Generate consensus secondary structures for a FASTA entry.

    The entry is written to a temporary file and searched with `cmsearch`
    against the covariance model returned by ensure_cm(family). The output is
    parsed by analyze_cmsearch(). With fold enabled, each structure is then
    passed to RNA.fold_compound as constraints and replaced by the result of
    its mfe() call.

    Args:
        fasta (FASTA): The entry to process.
        family (str): The name of the Rfam family, passed to ensure_cm().
        fold (bool): Whether to fold each sequence with the structure as constraints.
        count (int): The maximum number of structures to generate.

    Returns:
        list: One string per structure, made of three lines: ">" and the
            header, the sequence, and the structure.

    Raises:
        RuntimeError: If cmpress or cmsearch is not found in PATH.
    """
    if any(shutil.which(tool) is None for tool in ("cmpress", "cmsearch")):
        raise RuntimeError(
            "cmpress/cmsearch not found in PATH, please install Infernal first."
        )

    model_path = ensure_cm(family)

    with tempfile.NamedTemporaryFile(suffix=".fa") as query:
        query.write(str(fasta).encode())
        # rewinding also pushes the buffered bytes out for cmsearch to read
        query.seek(0)

        command = ["cmsearch", "--notextw", model_path, query.name]
        completed = subprocess.run(command, check=True, capture_output=True)

    hits = analyze_cmsearch(completed.stdout.decode(), fasta, count)

    if fold:
        for hit in hits:
            compound = RNA.fold_compound(hit[0])
            compound.hc_add_from_db(hit[1])
            folded, _ = compound.mfe()
            hit[1] = folded

    entries = []
    for sequence, structure in hits:
        entries.append(f">{fasta.header}\n{sequence}\n{structure}")
    return entries
250
+
251
+
252
def main():
    """
    Run the rfam-folder command-line interface.

    The positional argument is read as a FASTA file if it names an existing
    path other than a directory. Otherwise it is taken as the RNA sequence
    itself and given the header "header". Every generated structure is printed
    to standard output.
    """
    parser = argparse.ArgumentParser(
        description="Generate consensus secondary structure for a given sequence. IMPORTANT! You need to have Infernal software installed to use this script."
    )
    parser.add_argument(
        "sequence",
        type=str,
        help="an RNA sequence or a path to FASTA file, possibly containing multiple sequences",
    )
    parser.add_argument(
        "--family",
        type=str,
        help="(optional) name of the Rfam family to use, if not given, the whole Rfam will be checked for the given sequence",
    )
    parser.add_argument(
        "--no-fold",
        action="store_true",
        help="(optional) whether to disable folding of the consensus secondary structure by RNAfold with constraints",
    )
    parser.add_argument(
        "--count",
        type=int,
        default=1,
        help="(optional) maximum number of consensus secondary structures to generate per sequence, default is 1",
    )

    args = parser.parse_args()

    # A directory which happens to be named like the given sequence cannot be
    # a FASTA file; pipes and other special files are still read as input.
    if os.path.exists(args.sequence) and not os.path.isdir(args.sequence):
        fastas = parse_fasta(args.sequence)
    else:
        fastas = [FASTA("header", args.sequence)]

    for fasta in fastas:
        results = generate_consensus_secondary_structure(
            fasta, args.family, not args.no_fold, args.count
        )
        for result in results:
            print(result)


if __name__ == "__main__":
    main()