pyaragorn 0.3.0__cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyaragorn/CMakeLists.txt +1 -0
- pyaragorn/__init__.py +36 -0
- pyaragorn/lib.cpython-38-aarch64-linux-gnu.so +0 -0
- pyaragorn/lib.pyi +107 -0
- pyaragorn/lib.pyx +834 -0
- pyaragorn/py.typed +0 -0
- pyaragorn/tests/__init__.py +10 -0
- pyaragorn/tests/data/CP001621.default.txt +95 -0
- pyaragorn/tests/data/CP001621.fna.gz +0 -0
- pyaragorn/tests/data/CP001621.ps95.txt +101 -0
- pyaragorn/tests/data/__init__.py +30 -0
- pyaragorn/tests/fasta.py +86 -0
- pyaragorn/tests/requirements.txt +1 -0
- pyaragorn/tests/test_doctest.py +93 -0
- pyaragorn/tests/test_rna_finder.py +104 -0
- pyaragorn-0.3.0.dist-info/METADATA +212 -0
- pyaragorn-0.3.0.dist-info/RECORD +19 -0
- pyaragorn-0.3.0.dist-info/WHEEL +7 -0
- pyaragorn-0.3.0.dist-info/licenses/COPYING +674 -0
pyaragorn/lib.pyx
ADDED
|
@@ -0,0 +1,834 @@
|
|
|
1
|
+
# coding: utf-8
|
|
2
|
+
# cython: language_level=3, linetrace=True, binding=True
|
|
3
|
+
|
|
4
|
+
"""Bindings to ARAGORN, a (t|mt|tm)RNA gene finder.
|
|
5
|
+
|
|
6
|
+
Attributes:
|
|
7
|
+
ARAGORN_VERSION (`str`): The version of ARAGORN currently wrapped
|
|
8
|
+
in PyARAGORN.
|
|
9
|
+
TRANSLATION_TABLES (`set` of `int`): A set containing all the
|
|
10
|
+
translation tables supported by PyARAGORN.
|
|
11
|
+
|
|
12
|
+
Example:
|
|
13
|
+
PyARAGORN can work on any DNA sequence stored in either a text or a
|
|
14
|
+
byte array. To load a sequence from one of the common sequence formats,
|
|
15
|
+
you can use an external dedicated library such as
|
|
16
|
+
`Biopython <https://github.com/biopython/biopython>`_::
|
|
17
|
+
|
|
18
|
+
>>> import gzip
|
|
19
|
+
>>> import Bio.SeqIO
|
|
20
|
+
>>> with gzip.open("CP001621.fna.gz", "rt") as f:
|
|
21
|
+
... record = Bio.SeqIO.read(f, "fasta")
|
|
22
|
+
|
|
23
|
+
Then use PyARAGORN to find the tRNA genes using the
|
|
24
|
+
bacterial genetic code (translation table 11):
|
|
25
|
+
|
|
26
|
+
>>> import pyaragorn
|
|
27
|
+
>>> rna_finder = pyaragorn.RNAFinder(11, trna=True, tmrna=False)
|
|
28
|
+
>>> for gene in rna_finder.find_rna(record.seq.encode()):
|
|
29
|
+
... print(gene.anticodon, gene.amino_acid, gene.begin, gene.end)
|
|
30
|
+
tag Leu 87124 87207
|
|
31
|
+
ttt Lys 87210 87285
|
|
32
|
+
...
|
|
33
|
+
|
|
34
|
+
The gene coordinates are 1-indexed, inclusive, similarly to
|
|
35
|
+
`Pyrodigal <https://pyrodigal.readthedocs.io>`_ genes.
|
|
36
|
+
|
|
37
|
+
References:
|
|
38
|
+
- Laslett, Dean, and Björn Canbäck.
|
|
39
|
+
“ARAGORN, a program to detect tRNA genes and tmRNA genes in nucleotide
|
|
40
|
+
sequences.” Nucleic acids research vol. 32,1 11-6. 2 Jan. 2004,
|
|
41
|
+
:doi:`10.1093/nar/gkh152`. :pmid:`14704338`. :pmcid:`PMC373265`.
|
|
42
|
+
- Laslett, Dean, and Björn Canbäck.
|
|
43
|
+
“ARWEN: a program to detect tRNA genes in metazoan mitochondrial
|
|
44
|
+
nucleotide sequences.” Bioinformatics (Oxford, England) vol. 24,2
|
|
45
|
+
(2008): 172-5. :doi:`10.1093/bioinformatics/btm573`. :pmid:`18033792`.
|
|
46
|
+
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
from cython.operator cimport postincrement, dereference
|
|
50
|
+
from cpython.bytes cimport PyBytes_FromStringAndSize
|
|
51
|
+
from cpython.exc cimport PyErr_CheckSignals
|
|
52
|
+
from cpython.unicode cimport PyUnicode_AsASCIIString
|
|
53
|
+
|
|
54
|
+
from libc.stdio cimport FILE, fopen, fdopen, fclose, fprintf, fputc, stdout, stderr
|
|
55
|
+
from libc.stdlib cimport calloc, free
|
|
56
|
+
from libc.string cimport memcpy
|
|
57
|
+
from libc.stdint cimport intptr_t
|
|
58
|
+
|
|
59
|
+
cimport aragorn
|
|
60
|
+
from aragorn cimport csw, data_set, gene
|
|
61
|
+
|
|
62
|
+
# --- Helpers ------------------------------------------------------------------
|
|
63
|
+
|
|
64
|
+
# Declaration of the CPython `PyUnicode_READ` accessor so that it can be called
# without the GIL. NOTE(review): nothing in the visible part of this module
# calls it -- confirm whether it is still needed.
cdef extern from * nogil:
    Py_UCS4 PyUnicode_READ(int kind, const void* data, size_t pos)
|
|
66
|
+
|
|
67
|
+
# Inline C helper that fills a caller-provided `csw` parameter block with a
# fixed set of default values: it builds a local `csw` with a positional
# initializer and `memcpy`s it over `*sw`.
#
# The initializer is positional, so the order of the values must match the
# field order of the `csw` struct declared in the ARAGORN headers (not visible
# here) -- do not reorder or insert values without checking that declaration.
# The last array, once decoded as ASCII, is the banner
# "------------------------------\nARAGORN v1.2.41   Dean Laslett\n"
# "------------------------------\n" followed by the `TERM` sentinel.
cdef extern from * nogil:
    """
    void default_sw(csw* sw) {
        csw x = {
            {"tRNA", "tmRNA", "", "", "CDS", "overall"},
            NULL, NULL, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, STANDARD, 0,
            {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
            0, METAZOAN_MT, 1, 0, 5, 5, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
            3, 0, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
            {0, 0, 0, 0, 0, 0}, 0, 0, 0, 0, NTAG, 10, 30,
            {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0},
            0, 0, 0, 0, 0L, 100.0, 1.0, tRNAthresh, 4.0, 29.0, 26.0, 7.5, 8.0,
            mtRNAtthresh, mtRNAdthresh, mtRNAdtthresh, -7.9, -6.0, tmRNAthresh,
            14.0, 10.0, 25.0, 9.0, srpRNAthresh, CDSthresh,
            {tRNAthresh, tmRNAthresh, srpRNAthresh, 0.0, CDSthresh},
            {
                45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
                45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 10, 65,
                82, 65, 71, 79, 82, 78, 32, 118, 49, 46, 50, 46, 52, 49, 32,
                32, 32, 68, 101, 97, 110, 32, 76, 97, 115, 108, 101, 116, 116,
                10, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
                45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 10,
                TERM
            }
        };
        memcpy(sw, &x, sizeof(csw));
    }
    """
    void default_sw(csw* sw)
|
|
96
|
+
|
|
97
|
+
# Wrap a 1-based position onto the range [1, d.psmax]: `pos == 0` maps to
# `d.psmax` and `pos == d.psmax + 1` maps back to 1.
# NOTE(review): only referenced from commented-out code in the visible part
# of this module; assumes `d.psmax > 0` (the modulo is undefined otherwise).
cdef inline long int sq(data_set* d, long int pos) nogil:
    return (pos + d.psmax - 1) % d.psmax + 1
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# --- Constants ----------------------------------------------------------------
|
|
102
|
+
|
|
103
|
+
import functools
|
|
104
|
+
|
|
105
|
+
# Translation table identifiers accepted by `RNAFinder`: 1-6, 9-16, 21-26,
# 29, 30, 32 and 33.
cdef set _TRANSLATION_TABLES = {
    *range(1, 7),
    *range(9, 17),
    *range(21, 27),
    29, 30,
    32, 33,
}

# `PROJECT_VERSION` and `PROJECT_ARAGORN_VERSION` are not defined in this
# file; presumably they are injected by the build system -- TODO confirm.
__version__ = PROJECT_VERSION

# Public alias: the very same set object used for validation in `RNAFinder`.
TRANSLATION_TABLES = _TRANSLATION_TABLES
ARAGORN_VERSION = PROJECT_ARAGORN_VERSION
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# --- Classes ------------------------------------------------------------------
|
|
114
|
+
|
|
115
|
+
cdef class Gene:
    """A gene identified by ARAGORN.

    Each instance owns a private copy of the C ``gene`` record it wraps,
    along with the genetic code passed to `Gene._new_gene` when it was
    created.

    """

    # private copy of the ARAGORN gene record
    cdef gene _gene
    # genetic code given to `_new_gene`, reused by subclasses for translation
    cdef int _genetic_code

    @staticmethod
    cdef Gene _new_gene(gene* _gene, int _genetic_code):
        # Instantiate the subclass matching the record's gene type (bypassing
        # `__init__`), then copy the record so the wrapper never aliases the
        # caller's buffer. Other gene types are not supported.
        cdef Gene wrapper

        if _gene.genetype == aragorn.tmRNA:
            wrapper = TMRNAGene.__new__(TMRNAGene)
        elif _gene.genetype == aragorn.tRNA:
            wrapper = TRNAGene.__new__(TRNAGene)
        else:
            raise NotImplementedError

        wrapper._genetic_code = _genetic_code
        memcpy(&wrapper._gene, _gene, sizeof(gene))
        return wrapper

    def __sizeof__(self):
        # NOTE(review): `sizeof(self)` is evaluated on the object reference;
        # confirm that this yields the instance size and not a pointer size.
        return sizeof(self)

    @property
    def type(self):
        """`str`: The label of the gene type, looked up by ``genetype`` index.
        """
        cdef tuple labels = ("tRNA", "tmRNA", "", "", "CDS")
        return labels[<int> self._gene.genetype]

    @property
    def begin(self):
        """`int`: The sequence coordinate at which the gene begins.

        Hint:
            This coordinate is 1-based, inclusive. To use it to index
            a Python array or string, subtract one.

        """
        return self._gene.start

    @property
    def end(self):
        """`int`: The sequence coordinate at which the gene ends.

        Hint:
            This coordinate is 1-based, inclusive. To use it to index
            a Python array or string, subtract one.

        """
        return self._gene.stop

    @property
    def length(self):
        """`int`: The length of the RNA gene, as computed by ``seqlen``.
        """
        return aragorn.seqlen(&self._gene)

    @property
    def strand(self):
        """`int`: *-1* if the gene is on the reverse strand, *+1* otherwise.
        """
        if self._gene.comp:
            return -1
        return 1

    @property
    def energy(self):
        """`float`: The approximated normalised energy of the RNA structure.
        """
        # FIXME: should use the same parameters as the RNAFinder that
        #        produced the gene; default parameters are used instead.
        cdef csw params
        default_sw(&params)
        return aragorn.nenergy(&self._gene, &params)

    @property
    def raw_energy(self):
        """`float`: The un-normalized energy value of the RNA structure."""
        return <double> self._gene.energy

    def sequence(self):
        """Retrieve the full sequence of the RNA gene.

        Returns:
            `str`: One character per stored base, for the first ``seqlen``
            positions of the gene record.

        """
        cdef int n = aragorn.seqlen(&self._gene)
        cdef bytearray buf = bytearray(n)
        cdef int k = 0
        while k < n:
            buf[k] = aragorn.cpbase(self._gene.seq[k])
            k += 1
        return buf.decode('ascii')
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
cdef class TRNAGene(Gene):
    """A transfer RNA (tRNA) gene.
    """

    def __repr__(self):
        return (
            f"<TRNAGene begin={self.begin} end={self.end} "
            f"strand={self.strand:+} "
            f"length={self.length} anticodon={self.anticodon!r} "
            f"energy={self.energy:.2f}>"
        )

    @property
    def amino_acid(self):
        """`str`: The 3-letter amino-acid(s) for this gene.

        Hint:
            If the anticodon loop contains 6 or 8 bases, ``???`` is
            returned.

        """
        # Only the genetic code is set on this scratch parameter block.
        # NOTE(review): the rest of `sw` is left uninitialized; presumably
        # `aragorn.aa` reads nothing but `geneticcode` -- confirm.
        cdef csw sw
        cdef int* s = self._gene.seq + self._gene.anticodon
        (<int*> &sw.geneticcode)[0] = self._genetic_code
        if self._gene.cloop == 6 or self._gene.cloop == 8:
            return "???"
        return aragorn.aa(s, &sw).decode('ascii')

    @property
    def amino_acids(self):
        """`tuple` of `str`: All possible 3-letter amino-acids for this gene.

        Hint:
            If the anticodon loop contains 6 or 8 bases, a tuple of two
            amino-acid is returned, otherwise a tuple with a single element
            is returned.

        """
        # See `amino_acid` for the note on the partially initialized `sw`.
        cdef csw sw
        cdef int* s = self._gene.seq + self._gene.anticodon
        (<int*> &sw.geneticcode)[0] = self._genetic_code
        if self._gene.cloop == 6:
            # 2-base anticodon: read the triplets starting one base before
            # and exactly at the anticodon offset
            return (
                aragorn.aa(s - 1, &sw).decode('ascii'),
                aragorn.aa(s, &sw).decode('ascii'),
            )
        elif self._gene.cloop == 8:
            # 4-base anticodon: read the two overlapping triplets
            return (
                aragorn.aa(s, &sw).decode('ascii'),
                aragorn.aa(s + 1, &sw).decode('ascii'),
            )
        else:
            # always return a tuple, as documented (a bare `str` used to be
            # returned here)
            return (aragorn.aa(s, &sw).decode('ascii'),)

    @property
    def anticodon(self):
        """`str`: The anticodon of the tRNA gene.

        Contains 2, 4 or 3 characters for anticodon loops of 6, 8 or any
        other number of bases, respectively.

        """
        cdef tuple c
        cdef int* s = self._gene.seq + self._gene.anticodon
        if self._gene.cloop == 6:
            c = ( aragorn.cbase(s[0]), aragorn.cbase(s[1]) )
        elif self._gene.cloop == 8:
            c = ( aragorn.cbase(s[0]), aragorn.cbase(s[1]), aragorn.cbase(s[2]), aragorn.cbase(s[3]) )
        else:
            c = ( aragorn.cbase(s[0]), aragorn.cbase(s[1]), aragorn.cbase(s[2]) )
        return ''.join(map(chr, c))

    @property
    def anticodon_offset(self):
        """`int`: The offset in the gene at which the anticodon starts.

        The offset is 1-based, and is shifted by the intron length when the
        gene has an intron located at or before the anticodon.

        """
        cdef int x = 1 + self._gene.anticodon
        if self._gene.nintron > 0 and self._gene.intron <= self._gene.anticodon:
            x += self._gene.nintron
        return x

    @property
    def anticodon_length(self):
        """`int`: The length of the anticodon (in nucleotides).
        """
        if self._gene.cloop == 6:
            return 2
        elif self._gene.cloop == 8:
            return 4
        else:
            return 3
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
cdef class TMRNAGene(Gene):
    """A transfer-messenger RNA (tmRNA) gene.

    Example:
        >>> rna_finder = pyaragorn.RNAFinder(11, trna=False, tmrna=True)
        >>> tmrna = rna_finder.find_rna(str(record.seq))[0]
        >>> tmrna.begin, tmrna.end
        (198037, 198447)
        >>> tmrna.peptide()
        'AEKNEENFEMPAFMINNASAGANYMFA**'

    """

    def __repr__(self):
        return (
            f"<TMRNAGene begin={self.begin} end={self.end} "
            f"strand={self.strand:+} "
            f"length={self.length} orf_length={self.orf_length} "
            f"energy={self.energy:.2f}>"
        )

    cdef int* _orf_end(self, csw* sw, bint include_stop) noexcept:
        # Locate the (exclusive) end of the open-reading frame in `eseq`.
        #
        # The search starts right after the position recorded in `tpe`.
        # With `include_stop`, the end is pushed forward over every
        # consecutive STOP codon found there; otherwise it is pulled back
        # over any STOP codon immediately preceding it, never going before
        # the ORF start, so that no sense codon is ever dropped.
        cdef int* sb = self._gene.eseq + self._gene.tps
        cdef int* se = (self._gene.eseq + self._gene.tpe) + 1

        if include_stop:
            while aragorn.ltranslate(se, &self._gene, sw) == ord('*'):
                se += 3
        else:
            while se - 3 >= sb and aragorn.ltranslate(se - 3, &self._gene, sw) == ord('*'):
                se -= 3
        return se

    @property
    def permuted(self):
        """`bool`: Whether this tmRNA gene is a permuted gene.
        """
        return self._gene.asst != 0

    @property
    def orf_offset(self):
        """`int`: The offset in the gene at which the open-reading frame starts.
        """
        return self._gene.tps + 1

    @property
    def orf_length(self):
        """`int`: The length of the open-reading frame (in nucleotides).

        STOP codons are included, so that this is always equal to
        ``len(self.orf())``.

        """
        cdef csw sw
        cdef int* sb = self._gene.eseq + self._gene.tps
        (<int*> &sw.geneticcode)[0] = self._genetic_code
        # pointer difference, so the result agrees with `orf()` (the previous
        # `tpe - tps` arithmetic was one nucleotide short)
        return <int> (self._orf_end(&sw, True) - sb)

    def orf(self, include_stop=True):
        """Retrieve the open-reading frame of the mRNA-like region.

        Arguments:
            include_stop (`bool`): Whether or not to include the STOP codons
                in the returned nucleotide sequence. Defaults to `True`.

        Returns:
            `str`: The sequence of the mRNA-like region in the tmRNA
            gene, optionally without STOP codons.

        """
        cdef csw sw
        cdef int* se
        cdef int* sb = self._gene.eseq + self._gene.tps

        # only the genetic code is needed for codon translation here
        (<int*> &sw.geneticcode)[0] = self._genetic_code
        se = self._orf_end(&sw, include_stop)

        cds = bytearray()
        while sb < se:
            cds.append(aragorn.cpbase(sb[0]))
            sb += 1

        return cds.decode('ascii')

    def peptide(self, include_stop=True):
        """Retrieve the peptide sequence of the mRNA-like region.

        Arguments:
            include_stop (`bool`): Whether or not to include the STOP codons
                in the returned peptide sequence. Defaults to `True`.

        Returns:
            `str`: The translation of the mRNA-like region of the tmRNA
            gene, optionally without STOP codons.

        """
        cdef csw sw
        cdef int* se
        cdef int* sb = self._gene.eseq + self._gene.tps

        # only the genetic code is needed for codon translation here
        (<int*> &sw.geneticcode)[0] = self._genetic_code
        se = self._orf_end(&sw, include_stop)

        peptide = bytearray()
        while sb < se:
            # translate one codon at a time
            peptide.append(aragorn.ltranslate(sb, &self._gene, &sw))
            sb += 3

        return peptide.decode('ascii')
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
cdef class Cursor:
    """A forward-only reader feeding an in-memory sequence to ARAGORN.

    The cursor wraps a `str` (encoded to ASCII) or any object exposing a
    one-dimensional byte buffer, and keeps the book-keeping fields of an
    ARAGORN ``data_set`` (current position, sequence length, GC content)
    in sync with it.

    """

    # object owning the buffer, kept alive for as long as the cursor exists
    cdef object obj
    # typed view over the raw sequence bytes
    cdef const unsigned char[:] data
    # NOTE(review): never assigned nor read in the visible code
    cdef int kind
    # number of bytes in `data`
    cdef size_t length
    # ARAGORN dataset record describing the wrapped sequence
    cdef data_set ds

    def __init__(self, obj):
        """Create a new cursor over ``obj``.

        Arguments:
            obj (`str` or buffer): The sequence to read from. Text is
                first encoded to ASCII bytes.

        """
        if isinstance(obj, str):
            obj = PyUnicode_AsASCIIString(obj)

        # get a memoryview to the data contents
        self.data = obj
        self.length = self.data.shape[0]

        # keep a reference to the data source
        self.obj = obj

        # reinitialize dataset book-keeping
        self.ds.filepointer = 0
        self.ds.ns = 0
        self.ds.nf = 0
        self.ds.nextseq = 0
        self.ds.nextseqoff = 0
        self.ds.seqstart = 0
        self.ds.seqstartoff = 0
        self.ds.ps = 0
        self.ds.psmax = self.length

        # count GC%
        self.ds.gc = self._gc()

    cdef int _forward(self) noexcept nogil:
        # Return the code of the base under the cursor and advance by one,
        # `TERM` once the end of the sequence is reached, or `NOBASE` when
        # the current byte is not a recognized nucleotide.
        #
        # NOTE(review): the cursor is NOT advanced when `NOBASE` is returned
        # here, so every later call keeps reporting `NOBASE`; this looks like
        # it can stall the scan of `RNAFinder._bopt` on long inputs
        # containing an invalid character -- confirm the intended behavior.
        #
        # `x` must be unsigned: with a plain `char` (signed on e.g. x86) the
        # `>= 128` guard never fires and bytes above 127 would index the
        # lookup table with a negative offset.
        cdef unsigned char x
        cdef int base

        if self.ds.ps >= self.ds.psmax:
            return <int> aragorn.base.TERM

        x = self.data[self.ds.ps]
        if x >= 128:
            return <int> aragorn.base.NOBASE

        base = aragorn.map[x]
        if base >= <int> aragorn.base.Adenine:
            self.ds.ps += 1
            return base
        else:
            return <int> aragorn.base.NOBASE

    cdef double _gc(self) noexcept nogil:
        # Compute the fraction of G/C among the scanned positions. Scanning
        # stops at the first byte mapped to -1 by the lookup table.
        cdef long i
        cdef unsigned char x
        cdef int base
        cdef long ngc = 0
        cdef long ps = 0

        for i in range(self.length):
            x = self.data[i]
            if x >= 128:
                # non-ASCII byte: same guard as in `_forward`, so the lookup
                # table is never indexed with it; counted as a non-GC position
                ps += 1
                continue
            base = aragorn.map[x]
            if base == -1:
                break
            ngc += (base == <int> aragorn.base.Cytosine) or (base == <int> aragorn.base.Guanine)
            ps += 1

        # nothing scanned (e.g. empty sequence): avoid dividing by zero
        if ps == 0:
            return 0.0
        return <double> ngc / <double> ps
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
cdef class RNAFinder:
    """A configurable RNA gene finder.
    """

    # ARAGORN parameter block holding the configuration of this finder;
    # `find_rna` works on a copy, so this one is only changed by `__init__`
    # and the `threshold_scale` setter.
    cdef csw _sw

    def __init__(
        self,
        int translation_table = 1,
        *,
        bint trna = True,
        bint tmrna = True,
        bint linear = False,
        double threshold_scale = 1.0,
    ):
        """__init__(self, translation_table=1, *, trna=True, tmrna=True, linear=False, threshold_scale=1.0)\n--\n

        Create a new RNA finder.

        Arguments:
            translation_table (`int`, optional): The translation table to
                use. Check the :wiki:`List of genetic codes` page
                listing all genetic codes for the available values, or
                the :attr:`pyaragorn.TRANSLATION_TABLES` constant for allowed
                values.

        Keyword Arguments:
            trna (`bool`): Enable detection of tRNA genes. Set to `False` to
                disable.
            tmrna (`bool`): Enable detection of tmRNA genes. Set to `False` to
                disable.
            linear (`bool`): Set to `True` to assume that the given sequences
                have linear topology (no closed genomes).
            threshold_scale (`float`, optional): Rescale scoring thresholds
                from the default levels. Defaults to 1.0 (no rescaling). Set
                to e.g. 0.95 to report possible pseudogenes by lowering
                the threshold by 5%.

        Raises:
            `ValueError`: When ``translation_table`` is not a supported
                translation table, or when ``threshold_scale`` is not
                a positive number.

        .. versionadded:: 0.3.0
           The ``threshold_scale`` keyword argument.

        """
        default_sw(&self._sw)
        self._sw.trna = trna
        self._sw.tmrna = tmrna
        self._sw.linear = linear
        self._sw.f = stdout
        self._sw.verbose = False
        # goes through the property setter, which validates the value
        self.threshold_scale = threshold_scale

        if translation_table not in _TRANSLATION_TABLES:
            raise ValueError(f"invalid translation table: {translation_table!r}")
        self._sw.geneticcode = translation_table

    def __reduce__(self):
        # Pickle as a call to the class with the current configuration
        # passed as keyword arguments.
        return functools.partial(
            type(self),
            translation_table=self.translation_table,
            trna=self.trna,
            tmrna=self.tmrna,
            linear=self.linear,
            threshold_scale=self.threshold_scale,
        ), ()

    def __repr__(self):
        # only show the arguments that differ from the constructor defaults
        cdef str ty = type(self).__name__
        cdef list args = []

        if self.translation_table != 1:
            args.append(f"{self.translation_table!r}")
        if not self.trna:
            args.append(f"trna={self.trna!r}")
        if not self.tmrna:
            args.append(f"tmrna={self.tmrna!r}")
        if self.linear:
            args.append(f"linear={self.linear!r}")
        if self.threshold_scale != 1.0:
            args.append(f"threshold_scale={self.threshold_scale!r}")
        return f"{ty}({', '.join(args)})"

    @property
    def translation_table(self):
        """`int`: The translation table in use by this object.
        """
        return self._sw.geneticcode

    @property
    def trna(self):
        """`bool`: Whether tRNA detection is enabled.
        """
        return bool(self._sw.trna)

    @property
    def tmrna(self):
        """`bool`: Whether tmRNA detection is enabled.
        """
        return bool(self._sw.tmrna)

    @property
    def linear(self):
        """`bool`: Whether input sequences are assumed to have linear topology.
        """
        return bool(self._sw.linear)

    @property
    def threshold_scale(self):
        """`float`: The scale used to change the default thresholds.

        .. versionadded:: 0.3.0

        """
        return self._sw.threshlevel

    @threshold_scale.setter
    def threshold_scale(self, double threshold_scale):
        # written as a negated comparison so that NaN is rejected as well
        if not threshold_scale > 0.0:
            raise ValueError(f"threshold_scale must be positive (got {threshold_scale!r})")
        aragorn.change_thresholds(&self._sw, threshold_scale)

    def find_rna(self, object sequence):
        """Find RNA genes in the input DNA sequence.

        Arguments:
            sequence (`str` or buffer): The nucleotide sequence to process,
                either as a string of nucleotides (upper- or lowercase), or
                as an object implementing the buffer protocol.

        Returns:
            `list` of `~pyaragorn.Gene`: A list of `~pyaragorn.Gene` (either
            `~pyaragorn.TRNAGene` or `~pyaragorn.TMRNAGene`) corresponding
            to RNA genes detected in the sequence according to the `RNAFinder`
            parameters.

        Raises:
            `MemoryError`: When the buffers for the results cannot be
                allocated.

        """
        cdef int n
        cdef int nt
        cdef int i
        cdef csw sw
        cdef list genes
        cdef int* vsort = NULL
        cdef Cursor cursor = Cursor(sequence)

        # copy parameters to ensure the `find_rna` method is re-entrant
        memcpy(&sw, &self._sw, sizeof(csw))

        try:
            with nogil:
                # allocate memory for the result genes
                sw.genespace = aragorn.NT
                sw.genes = <gene*> calloc(sw.genespace, sizeof(gene))
                if sw.genes is NULL:
                    raise MemoryError("failed to allocate memory")
                # detect RNA genes with the "batched" algorithm
                nt = self._bopt(cursor, &sw)
                # allocate array for sorting genes; always request at least
                # one slot, since `calloc(0, ...)` is allowed to return NULL
                # and would be mistaken for an allocation failure when no
                # gene was found
                vsort = <int*> calloc(nt if nt > 0 else 1, sizeof(int))
                if vsort is NULL:
                    raise MemoryError("failed to allocate memory")
                # sort and threshold genes
                n = aragorn.gene_sort(&cursor.ds, nt, vsort, &sw)
            # recover genes: each wrapper copies its record, so the C arrays
            # can be released as soon as the list is built
            genes = []
            for i in range(n):
                genes.append(Gene._new_gene(&sw.genes[vsort[i]], sw.geneticcode))
        finally:
            free(vsort)
            free(sw.genes)

        return genes

    cdef int _bopt(
        self,
        Cursor cursor,
        csw* sw
    ) except -1 nogil:
        # adapted from bopt_fastafile to use with our own `Cursor` dataset
        #
        # Scan the whole sequence in overlapping windows: fill `seq` from the
        # cursor, search the window (and/or its complement `cseq`, depending
        # on `sw.both`), keep the last `drewind` positions as the start of
        # the next window, and repeat until the cursor is exhausted. Returns
        # the gene count as tracked through `tmioptimise`.
        #
        # The NX / SH flags emulate the `goto` labels of the original C code.
        # NOTE: `seq`, `cseq` and `wseq` are large arrays living on the stack.
        cdef int nt
        cdef int seq[((2 * aragorn.LSEQ) + aragorn.WRAP) + 1]
        cdef int cseq[((2 * aragorn.LSEQ) + aragorn.WRAP) + 1]
        cdef int wseq[(2 * aragorn.WRAP) + 1]
        cdef long i
        cdef long rewind
        cdef long drewind
        cdef long tmaxlen
        cdef bint flag
        cdef int length
        cdef int *s
        cdef int *sf
        cdef int *se
        cdef int *sc
        cdef int *swrap
        cdef long gap
        cdef long start
        cdef bint loop
        cdef bint NX
        cdef bint SH

        # compute width of sliding windows
        rewind = aragorn.MAXTAGDIST + 20
        if sw.trna or sw.mtrna:
            tmaxlen = aragorn.MAXTRNALEN + sw.maxintronlen
            if rewind < tmaxlen:
                rewind = tmaxlen
        if sw.tmrna:
            if rewind < aragorn.MAXTMRNALEN:
                rewind = aragorn.MAXTMRNALEN
        if sw.peptide:
            if sw.tagthresh >= 5 and rewind < aragorn.TSWEEP:
                rewind = aragorn.TSWEEP

        sw.loffset = rewind
        sw.roffset = rewind
        drewind = 2 * rewind

        # cleanly initialize gene array
        aragorn.init_gene(sw.genes, 0, aragorn.NT)

        nt = 0
        flag = 0
        start = 1L

        loop = True
        NX = True
        SH = True

        se = seq
        if sw.linear:
            # linear topology: pad the left side of the first window
            for i in range(rewind):
                postincrement(se)[0] = aragorn.NOBASE
            start -= rewind
        else:
            if cursor.ds.psmax <= drewind:
                # circular sequence no longer than the overlap: lay out the
                # padding and the wrapped copies of the whole sequence at
                # once, then search a single window
                gap = drewind - cursor.ds.psmax
                sc = se + gap
                while se < sc:
                    postincrement(se)[0] = aragorn.NOBASE

                swrap = wseq
                sc = se + cursor.ds.psmax
                while se < sc:
                    se[0] = cursor._forward()
                    postincrement(swrap)[0] = postincrement(se)[0]

                sc = swrap + gap
                while swrap < sc:
                    postincrement(swrap)[0] = aragorn.NOBASE

                swrap = wseq
                sc = swrap + cursor.ds.psmax
                while swrap < sc:
                    postincrement(se)[0] = postincrement(swrap)[0]

                swrap = wseq
                sc = swrap + drewind
                while swrap < sc:
                    postincrement(se)[0] = postincrement(swrap)[0]

                sw.loffset = drewind
                sw.roffset = drewind
                start -= drewind
                flag = 1
                # goto SH
                loop = True
                SH = True
                NX = False

            else:
                # circular sequence: remember the first `drewind` bases in
                # `wseq`, to append them once the end has been reached
                swrap = wseq
                sc = seq + drewind
                while se < sc:
                    se[0] = cursor._forward()
                    postincrement(swrap)[0] = postincrement(se)[0]

        # weird ass loop to emulate a GOTO
        while loop:

            # label NX: next
            sc = seq + aragorn.LSEQ
            if NX:
                while (se < sc):
                    postincrement(se)[0] = cursor._forward()
                    if cursor.ds.ps >= cursor.ds.psmax:
                        # end of sequence: pad (linear) or wrap around
                        # (circular), then mark this window as the last one
                        if sw.linear:
                            for i in range(rewind):
                                postincrement(se)[0] = aragorn.NOBASE
                        else:
                            sc = wseq + drewind
                            swrap = wseq
                            while (swrap < sc):
                                postincrement(se)[0] = postincrement(swrap)[0]
                        flag = 1
                        SH = True
                        break

            # label SH: search
            if SH:
                length = <int> (se - seq)

                # let pending signals (e.g. Ctrl-C) interrupt long searches
                with gil:
                    PyErr_CheckSignals()

                if (sw.both != 1):
                    sw.start = start
                    sw.comp = 0
                    nt = aragorn.tmioptimise(&cursor.ds, seq, length, nt, sw)

                if (sw.both > 0):
                    aragorn.sense_switch(seq, cseq, length)
                    sw.start = start + length
                    sw.comp = 1
                    nt = aragorn.tmioptimise(&cursor.ds, cseq, length, nt, sw)

                if not flag:
                    # not the last window: move the last `drewind` positions
                    # to the front of the buffer and continue reading
                    s = seq
                    sf = se - drewind
                    se = seq + drewind
                    while (s < se):
                        postincrement(s)[0] = postincrement(sf)[0]
                    start += length - drewind
                    # goto NX
                    NX = SH = loop = True
                    continue

                if nt < 1:
                    cursor.ds.nf += 1
                if sw.maxintronlen > 0:
                    aragorn.remove_overlapping_trna(&cursor.ds, nt, sw)
                if sw.updatetmrnatags:
                    aragorn.update_tmrna_tag_database(sw.genes, nt, sw)

                # FIXME: here should sort genes and filter them with `gene_sort`
                # aragorn.batch_gene_set(d, nt, sw)

                cursor.ds.ns += 1
                # exit loop
                loop = False

        return nt
|
|
822
|
+
|
|
823
|
+
# if (d.ns > 1) and (sw.batch < 2):
|
|
824
|
+
# fprintf(f, ">end \t%d sequences", d.ns)
|
|
825
|
+
# if sw.trna or sw.mtrna:
|
|
826
|
+
# fprintf(f, " %d tRNA genes", sw.ngene[<int> aragorn.tRNA])
|
|
827
|
+
# if sw.tmrna:
|
|
828
|
+
# fprintf(f, " %d tmRNA genes", sw.ngene[<int> aragorn.tmRNA])
|
|
829
|
+
# if d.nf > 0:
|
|
830
|
+
# sens = (100.0 * (d.ns - d.nf)) / d.ns
|
|
831
|
+
# fprintf(f, ", nothing found in %d sequences, (%.2lf%% sensitivity)", d.nf, sens)
|
|
832
|
+
# fputc('\n', f)
|
|
833
|
+
# if sw.updatetmrnatags:
|
|
834
|
+
# aragorn.report_new_tmrna_tags(sw)
|