pyaragorn 0.1.0__pp37-pypy37_pp73-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyaragorn might be problematic. Click here for more details.
- pyaragorn/CMakeLists.txt +1 -0
- pyaragorn/__init__.py +36 -0
- pyaragorn/lib.pypy37-pp73-win_amd64.pyd +0 -0
- pyaragorn/lib.pyx +715 -0
- pyaragorn/tests/__init__.py +10 -0
- pyaragorn/tests/data/CP001621.default.txt +95 -0
- pyaragorn/tests/data/CP001621.fna.gz +0 -0
- pyaragorn/tests/data/__init__.py +30 -0
- pyaragorn/tests/fasta.py +86 -0
- pyaragorn/tests/requirements.txt +1 -0
- pyaragorn/tests/test_doctest.py +93 -0
- pyaragorn/tests/test_rna_finder.py +69 -0
- pyaragorn-0.1.0.dist-info/METADATA +884 -0
- pyaragorn-0.1.0.dist-info/RECORD +16 -0
- pyaragorn-0.1.0.dist-info/WHEEL +5 -0
- pyaragorn-0.1.0.dist-info/licenses/COPYING +674 -0
pyaragorn/lib.pyx
ADDED
|
@@ -0,0 +1,715 @@
|
|
|
1
|
+
# coding: utf-8
|
|
2
|
+
# cython: language_level=3, linetrace=True, binding=True
|
|
3
|
+
|
|
4
|
+
"""Bindings to ARAGORN, a (t|mt|tm)RNA gene finder.
|
|
5
|
+
|
|
6
|
+
Attributes:
|
|
7
|
+
ARAGORN_VERSION (`str`): The version of ARAGORN currently wrapped
|
|
8
|
+
in PyARAGORN.
|
|
9
|
+
TRANSLATION_TABLES (`set` of `int`): A set containing all the
|
|
10
|
+
translation tables supported by PyARAGORN.
|
|
11
|
+
|
|
12
|
+
Example:
|
|
13
|
+
PyARAGORN can work on any DNA sequence stored in either a text or a
|
|
14
|
+
byte array. To load a sequence from one of the common sequence formats,
|
|
15
|
+
you can use an external dedicated library such as
|
|
16
|
+
`Biopython <https://github.com/biopython/biopython>`_::
|
|
17
|
+
|
|
18
|
+
>>> import gzip
|
|
19
|
+
>>> import Bio.SeqIO
|
|
20
|
+
>>> with gzip.open("CP001621.fna.gz", "rt") as f:
|
|
21
|
+
... record = Bio.SeqIO.read(f, "fasta")
|
|
22
|
+
|
|
23
|
+
Then use PyARAGORN to find the tRNA genes using the
|
|
24
|
+
bacterial genetic code (translation table 11):
|
|
25
|
+
|
|
26
|
+
>>> import pyaragorn
|
|
27
|
+
>>> rna_finder = pyaragorn.RNAFinder(11, trna=True, tmrna=False)
|
|
28
|
+
>>> for gene in rna_finder.find_rna(record.seq.encode()):
|
|
29
|
+
... print(gene.anticodon, gene.amino_acid, gene.begin, gene.end)
|
|
30
|
+
tag Leu 87124 87207
|
|
31
|
+
ttt Lys 87210 87285
|
|
32
|
+
...
|
|
33
|
+
|
|
34
|
+
The gene coordinates are 1-indexed, inclusive, similarly to
|
|
35
|
+
`Pyrodigal <https://pyrodigal.readthedocs.io>`_ genes.
|
|
36
|
+
|
|
37
|
+
References:
|
|
38
|
+
- Laslett, Dean, and Björn Canbäck.
|
|
39
|
+
“ARAGORN, a program to detect tRNA genes and tmRNA genes in nucleotide
|
|
40
|
+
sequences.” Nucleic acids research vol. 32,1 11-6. 2 Jan. 2004,
|
|
41
|
+
:doi:`10.1093/nar/gkh152`. :pmid:`14704338`. :pmcid:`PMC373265`.
|
|
42
|
+
- Laslett, Dean, and Björn Canbäck.
|
|
43
|
+
“ARWEN: a program to detect tRNA genes in metazoan mitochondrial
|
|
44
|
+
nucleotide sequences.” Bioinformatics (Oxford, England) vol. 24,2
|
|
45
|
+
(2008): 172-5. :doi:`10.1093/bioinformatics/btm573`. :pmid:`18033792`.
|
|
46
|
+
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
from cython.operator cimport postincrement, dereference
|
|
50
|
+
from cpython.bytes cimport PyBytes_FromStringAndSize
|
|
51
|
+
from cpython.exc cimport PyErr_CheckSignals
|
|
52
|
+
from cpython.unicode cimport (
|
|
53
|
+
PyUnicode_KIND,
|
|
54
|
+
PyUnicode_DATA,
|
|
55
|
+
PyUnicode_1BYTE_KIND,
|
|
56
|
+
PyUnicode_GET_LENGTH,
|
|
57
|
+
# PyUnicode_READ,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
from libc.stdio cimport FILE, fopen, fdopen, fclose, fprintf, fputc, stdout, stderr
|
|
61
|
+
from libc.stdlib cimport calloc, free
|
|
62
|
+
from libc.string cimport memcpy
|
|
63
|
+
from libc.stdint cimport intptr_t
|
|
64
|
+
|
|
65
|
+
cimport aragorn
|
|
66
|
+
from aragorn cimport csw, data_set, gene
|
|
67
|
+
|
|
68
|
+
# --- Helpers ------------------------------------------------------------------
|
|
69
|
+
|
|
70
|
+
# `PyUnicode_READ` is declared by hand in a `nogil` extern block (its entry in
# the `cpython.unicode` cimport list is commented out), presumably so that it
# can be called from the `nogil` methods of `Cursor`.
cdef extern from * nogil:
    Py_UCS4 PyUnicode_READ(int kind, const void* data, size_t pos)
|
|
72
|
+
|
|
73
|
+
# Verbatim C helper `default_sw(csw* sw)`: builds an aggregate-initialised
# local `csw` holding a fixed set of default switch values, then `memcpy`s it
# over `*sw`, overwriting every field of the destination.
#
# The initialiser is purely positional, so it depends on the field order of
# `csw` in the ARAGORN headers.
# NOTE(review): keep in sync with the `csw` declaration when updating ARAGORN.
#
# The last integer array contains ASCII codes spelling a banner made of a
# dashed line, "ARAGORN v1.2.41   Dean Laslett", and another dashed line,
# terminated by `TERM`.
cdef extern from * nogil:
    """
    void default_sw(csw* sw) {
        csw x = {
            {"tRNA", "tmRNA", "", "", "CDS", "overall"},
            NULL, NULL, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, STANDARD, 0,
            {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
            0, METAZOAN_MT, 1, 0, 5, 5, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
            3, 0, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
            {0, 0, 0, 0, 0, 0}, 0, 0, 0, 0, NTAG, 10, 30,
            {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0},
            0, 0, 0, 0, 0L, 100.0, 1.0, tRNAthresh, 4.0, 29.0, 26.0, 7.5, 8.0,
            mtRNAtthresh, mtRNAdthresh, mtRNAdtthresh, -7.9, -6.0, tmRNAthresh,
            14.0, 10.0, 25.0, 9.0, srpRNAthresh, CDSthresh,
            {tRNAthresh, tmRNAthresh, srpRNAthresh, 0.0, CDSthresh},
            {
                45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
                45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 10, 65,
                82, 65, 71, 79, 82, 78, 32, 118, 49, 46, 50, 46, 52, 49, 32,
                32, 32, 68, 101, 97, 110, 32, 76, 97, 115, 108, 101, 116, 116,
                10, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
                45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 10,
                TERM
            }
        };
        memcpy(sw, &x, sizeof(csw));
    }
    """
    void default_sw(csw* sw)
|
|
102
|
+
|
|
103
|
+
cdef inline long int sq(data_set* d, long int pos) nogil:
    # Map a (possibly out-of-range) 1-based position onto the sequence of
    # length `d.psmax`, wrapping around its end; any `pos` for which
    # `pos + d.psmax - 1` is non-negative lands in [1, d.psmax].
    cdef long int shifted = pos + d.psmax - 1
    return shifted % d.psmax + 1
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# --- Constants ----------------------------------------------------------------
|
|
108
|
+
|
|
109
|
+
# Identifiers of the translation tables accepted by `RNAFinder`:
# 1-6, 9-16, 21-26, 29, 30, 32 and 33.
cdef set _TRANSLATION_TABLES = set(range(1, 7)).union(
    range(9, 17),
    range(21, 27),
    (29, 30),
    (32, 33),
)

# version of the Python package itself
__version__ = PROJECT_VERSION

# public module attributes (see the module docstring)
TRANSLATION_TABLES = _TRANSLATION_TABLES
ARAGORN_VERSION = PROJECT_ARAGORN_VERSION
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# --- Classes ------------------------------------------------------------------
|
|
118
|
+
|
|
119
|
+
cdef class Gene:
    """An RNA gene reported by ARAGORN.

    Instances are built by `RNAFinder.find_rna` and are always one of the
    two subclasses, `TRNAGene` or `TMRNAGene`.

    """

    # private copy of the ARAGORN record describing this gene
    cdef gene _gene
    # translation table the gene was detected with
    cdef int _genetic_code

    @staticmethod
    cdef Gene _new_gene(gene* _gene, int _genetic_code):
        # Choose the wrapper class from the ARAGORN gene type, then copy the
        # C record into the wrapper so that it does not depend on the
        # caller's gene array (which `find_rna` frees before returning).
        cdef Gene wrapper

        if _gene.genetype == aragorn.tRNA:
            wrapper = TRNAGene.__new__(TRNAGene)
        elif _gene.genetype == aragorn.tmRNA:
            wrapper = TMRNAGene.__new__(TMRNAGene)
        else:
            raise NotImplementedError

        wrapper._genetic_code = _genetic_code
        memcpy(&wrapper._gene, _gene, sizeof(gene))
        return wrapper

    def __sizeof__(self):
        return sizeof(self)

    @property
    def type(self):
        """`str`: The kind of gene, named after the ARAGORN gene type.
        """
        cdef int kind = <int> self._gene.genetype
        return ("tRNA", "tmRNA", "", "", "CDS")[kind]

    @property
    def begin(self):
        """`int`: The sequence coordinate at which the gene begins.

        Hint:
            Coordinates are 1-based and inclusive: subtract one before
            using this value to index a Python string or array.

        """
        return self._gene.start

    @property
    def end(self):
        """`int`: The sequence coordinate at which the gene ends.

        Hint:
            Coordinates are 1-based and inclusive: subtract one before
            using this value to index a Python string or array.

        """
        return self._gene.stop

    @property
    def length(self):
        """`int`: The number of nucleotides in the RNA gene.
        """
        return aragorn.seqlen(&self._gene)

    @property
    def strand(self):
        """`int`: *+1* for a gene on the direct strand, *-1* otherwise.
        """
        if self._gene.comp:
            return -1
        return +1

    @property
    def energy(self):
        """`float`: The approximated energy of the RNA structure.
        """
        cdef csw switches
        # FIXME?: the energy is computed with the default ARAGORN switches
        #         rather than with the switches of the finder that produced
        #         this gene.
        default_sw(&switches)
        return aragorn.nenergy(&self._gene, &switches)

    def sequence(self):
        """Retrieve the full nucleotide sequence of the RNA gene.

        Returns:
            `str`: The nucleotides of the gene, as ASCII letters.

        """
        cdef int n = aragorn.seqlen(&self._gene)
        cdef bytearray buffer = bytearray(n)
        cdef int pos = 0

        # translate every ARAGORN base code to its one-letter symbol
        while pos < n:
            buffer[pos] = aragorn.cpbase(self._gene.seq[pos])
            pos += 1
        return buffer.decode('ascii')
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
cdef class TRNAGene(Gene):
    """A transfer RNA (tRNA) gene.
    """

    @property
    def amino_acid(self):
        """`str` or (`str`, `str`): The 3-letter amino-acid(s) for this gene.

        Hint:
            A single string is given if the anticodon loop was identified
            with exactly 3 nucleotides. Otherwise, this property stores
            a pair of amino-acids.

        """
        cdef csw sw
        cdef int* s = self._gene.seq + self._gene.anticodon

        # Start from fully initialised switches (like `Gene.energy`) so that
        # ARAGORN is never handed indeterminate stack memory, then select
        # the genetic code this gene was detected with.
        default_sw(&sw)
        sw.geneticcode = self._genetic_code

        if self._gene.cloop == 6:
            # 2-base anticodon: report both triplets overlapping it
            return (
                aragorn.aa(s - 1, &sw).decode('ascii'),
                aragorn.aa(s, &sw).decode('ascii'),
            )
        elif self._gene.cloop == 8:
            # 4-base anticodon: report both triplets it contains
            return (
                aragorn.aa(s, &sw).decode('ascii'),
                aragorn.aa(s + 1, &sw).decode('ascii'),
            )
        else:
            return aragorn.aa(s, &sw).decode('ascii')

    @property
    def anticodon(self):
        """`str`: The anticodon of the tRNA gene.

        The string has as many letters as reported by `anticodon_length`.

        """
        cdef int i
        cdef int n = self.anticodon_length
        cdef int* s = self._gene.seq + self._gene.anticodon
        cdef list letters = []

        for i in range(n):
            letters.append(chr(aragorn.cbase(s[i])))
        return ''.join(letters)

    @property
    def anticodon_offset(self):
        """`int`: The offset in the gene at which the anticodon starts.
        """
        cdef int x = 1 + self._gene.anticodon
        # an intron located at or before the anticodon shifts its offset
        if self._gene.nintron > 0 and self._gene.intron <= self._gene.anticodon:
            x += self._gene.nintron
        return x

    @property
    def anticodon_length(self):
        """`int`: The length of the anticodon (in nucleotides).

        This is 2 for a 6-base anticodon loop, 4 for an 8-base loop, and
        3 in every other case.

        """
        if self._gene.cloop == 6:
            return 2
        elif self._gene.cloop == 8:
            return 4
        else:
            return 3
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
cdef class TMRNAGene(Gene):
    """A transfer-messenger RNA (tmRNA) gene.

    Example:
        >>> rna_finder = pyaragorn.RNAFinder(11, trna=False, tmrna=True)
        >>> tmrna = rna_finder.find_rna(str(record.seq))[0]
        >>> tmrna.begin, tmrna.end
        (198037, 198447)
        >>> tmrna.peptide()
        'AEKNEENFEMPAFMINNASAGANYMFA**'

    """

    cdef int _cds_end(self, csw* sw, bint include_stop) noexcept:
        # Return the offset in `eseq` one past the last nucleotide of the
        # coding sequence, which starts at offset `tps`.
        cdef int begin = self._gene.tps
        cdef int end = self._gene.tpe + 1

        if include_stop:
            # extend the region over every STOP codon found right after it
            while aragorn.ltranslate(self._gene.eseq + end, &self._gene, sw) == ord('*'):
                end += 3
        else:
            # Trim trailing STOP codons only. The codon *before* `end` is
            # tested, so the last sense codon is never dropped, and the
            # region never shrinks past its own start.
            while end - 3 >= begin and aragorn.ltranslate(self._gene.eseq + end - 3, &self._gene, sw) == ord('*'):
                end -= 3

        return end

    @property
    def cds_offset(self):
        """`int`: The offset in the gene at which the coding sequence starts.
        """
        return self._gene.tps + 1

    @property
    def cds_length(self):
        """`int`: The length of the coding sequence (in nucleotides).

        STOP codons are included, so this is equal to ``len(self.cds())``.

        """
        cdef csw sw

        default_sw(&sw)
        sw.geneticcode = self._genetic_code
        # `_cds_end` is exclusive, so the difference is the exact length
        return self._cds_end(&sw, True) - self._gene.tps

    def cds(self, include_stop=True):
        """Retrieve the coding sequence of the mRNA-like region.

        Arguments:
            include_stop (`bool`): Whether or not to include the STOP codons
                in the returned nucleotide sequence. Defaults to `True`.

        Returns:
            `str`: The sequence of the mRNA-like region in the tmRNA
            gene, optionally without STOP codons.

        """
        cdef int i
        cdef int end
        cdef csw sw
        cdef bytearray cds = bytearray()

        # fully initialise the switches before selecting the genetic code
        default_sw(&sw)
        sw.geneticcode = self._genetic_code

        end = self._cds_end(&sw, include_stop)
        for i in range(self._gene.tps, end):
            cds.append(aragorn.cpbase(self._gene.eseq[i]))

        return cds.decode('ascii')

    def peptide(self, include_stop=True):
        """Retrieve the peptide sequence of the mRNA-like region.

        Arguments:
            include_stop (`bool`): Whether or not to include the STOP codons
                in the returned peptide sequence. Defaults to `True`.

        Returns:
            `str`: The translation of the mRNA-like region of the tmRNA
            gene, optionally without STOP codons.

        """
        cdef int i
        cdef int end
        cdef csw sw
        cdef bytearray peptide = bytearray()

        # fully initialise the switches before selecting the genetic code
        default_sw(&sw)
        sw.geneticcode = self._genetic_code

        end = self._cds_end(&sw, include_stop)
        # translate the region one codon at a time
        for i in range(self._gene.tps, end, 3):
            peptide.append(aragorn.ltranslate(self._gene.eseq + i, &self._gene, &sw))

        return peptide.decode('ascii')
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
cdef class Cursor:
    """A reader over an in-memory nucleotide sequence.

    The sequence is either a `str` or an object exposing a contiguous byte
    buffer. The read position and the sequence length are stored in the
    embedded ARAGORN ``data_set`` (``ds.ps`` and ``ds.psmax``).

    """

    # Python object backing `data`, referenced for as long as the cursor lives
    cdef object obj
    # raw pointer to the first character of the sequence
    cdef const void* data
    # `PyUnicode_*_KIND` constant telling how wide each character is
    cdef int kind
    # number of characters in the sequence
    cdef size_t length
    # ARAGORN book-keeping for the sequence being read
    cdef data_set ds

    def __init__(self, obj):
        cdef const unsigned char[::1] view
        if isinstance(obj, str):
            self.kind = PyUnicode_KIND(obj)
            self.data = PyUnicode_DATA(obj)
            self.length = PyUnicode_GET_LENGTH(obj)
        else:
            # any other object must expose a contiguous buffer of bytes,
            # which is read like a 1-byte-per-character string
            view = obj
            self.kind = PyUnicode_1BYTE_KIND
            # NOTE(review): the raw pointer outlives the local memoryview;
            #               this relies on `obj` never reallocating its
            #               buffer (e.g. a resized `bytearray`) - confirm.
            self.data = &view[0]
            self.length = view.shape[0]

        # keep a reference to the data source
        self.obj = obj

        # reinitialize dataset book-keeping
        self.ds.filepointer = 0
        self.ds.ns = 0
        self.ds.nf = 0
        self.ds.nextseq = 0L
        self.ds.nextseqoff = 0L
        self.ds.seqstart = 0
        self.ds.seqstartoff = 0
        self.ds.ps = 0
        self.ds.psmax = self.length

        # count GC%
        self.ds.gc = self._gc()

    cdef int _forward(self) noexcept nogil:
        # Return the ARAGORN code of the character under the cursor, or
        # `TERM` once the end of the sequence has been reached.
        cdef Py_UCS4 x
        cdef int     base

        if self.ds.ps >= self.ds.psmax:
            return <int> aragorn.base.TERM

        x = PyUnicode_READ(self.kind, self.data, self.ds.ps)
        # characters >= 128 are never looked up in `aragorn.map`
        if x >= 128:
            return <int> aragorn.base.NOBASE

        base = aragorn.map[x]
        if base >= <int> aragorn.base.Adenine:
            self.ds.ps += 1
            return base
        else:
            # NOTE(review): the cursor only advances in the branch above, so
            #               once a character is rejected every later call
            #               returns NOBASE for that same position - confirm
            #               this is intended.
            return <int> aragorn.base.NOBASE

    cdef double _gc(self) noexcept nogil:
        # Return the fraction of G/C among the characters of the sequence,
        # scanning until the end or until a character mapped to -1.
        cdef long    i
        cdef Py_UCS4 x
        cdef int     base
        cdef long    ngc  = 0
        cdef long    ps   = 0

        for i in range(self.length):
            x = PyUnicode_READ(self.kind, self.data, i)
            if x >= 128:
                # Same guard as `_forward`: never index `aragorn.map` with
                # such a character. It counts as a position that is
                # neither G nor C.
                ps += 1
                continue
            base = aragorn.map[x]
            if base == -1:
                break
            ngc += (base == <int> aragorn.base.Cytosine) or (base == <int> aragorn.base.Guanine)
            ps += 1

        # an empty sequence (or one starting with a -1 character) has no
        # defined GC content: report zero rather than dividing by zero
        if ps == 0:
            return 0.0
        return <double> ngc / <double> ps
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
cdef class RNAFinder:
    """A configurable RNA gene finder.
    """
    # ARAGORN switches configured in `__init__`; copied for every search
    cdef csw _sw

    def __init__(
        self,
        int translation_table = 1,
        *,
        bint trna = True,
        bint tmrna = True,
        bint linear = False,
    ):
        """__init__(self, translation_table=1, *, trna=True, tmrna=True, linear=False)\n--\n

        Create a new RNA finder.

        Arguments:
            translation_table (`int`, optional): The translation table to
                use. Check the :wiki:`List of genetic codes` page
                listing all genetic codes for the available values, or
                the :attr:`pyaragorn.TRANSLATION_TABLES` constant for allowed
                values.
            trna (`bool`): Whether to search for tRNA genes. Defaults
                to `True`.
            tmrna (`bool`): Whether to search for tmRNA genes. Defaults
                to `True`.
            linear (`bool`): Whether to process the input as a linear
                sequence. When `False` (the default), the beginning of the
                sequence is appended after its end so that the search
                wraps around.

        Raises:
            `ValueError`: When ``translation_table`` is not one of the
                supported translation tables.

        """
        default_sw(&self._sw)
        self._sw.trna = trna
        self._sw.tmrna = tmrna
        self._sw.linear = linear
        self._sw.f = stdout
        self._sw.verbose = False #True

        if translation_table not in _TRANSLATION_TABLES:
            raise ValueError(f"invalid translation table: {translation_table!r}")
        self._sw.geneticcode = translation_table

    def find_rna(self, object sequence):
        """Find RNA genes in the input DNA sequence.

        Arguments:
            sequence (`str` or buffer): The nucleotide sequence to process,
                either as a string of nucleotides (upper- or lowercase), or
                as an object implementing the buffer protocol.

        Returns:
            `list` of `~pyaragorn.Gene`: A list of `~pyaragorn.Gene` (either
            `~pyaragorn.TRNAGene` or `~pyaragorn.TMRNAGene`) corresponding
            to RNA genes detected in the sequence according to the `RNAFinder`
            parameters.

        Raises:
            `MemoryError`: When the buffers needed for the search cannot
                be allocated.

        """
        cdef int    i
        cdef int    n
        cdef int    nt
        cdef csw    sw
        cdef list   genes
        cdef int*   vsort  = NULL
        cdef Cursor cursor = Cursor(sequence)

        # copy parameters to ensure the `find_rna` method is re-entrant
        memcpy(&sw, &self._sw, sizeof(csw))

        try:
            with nogil:
                # allocate memory for the result genes
                sw.genespace = aragorn.NT
                sw.genes = <gene*> calloc(sw.genespace, sizeof(gene))
                if sw.genes is NULL:
                    raise MemoryError("failed to allocate memory")
                # detect RNA genes with the "batched" algorithm
                nt = self._bopt(cursor, &sw)
                # Allocate array for sorting genes. At least one element is
                # always requested, since `calloc(0, ...)` is allowed to
                # return NULL, which must not be mistaken for a failure
                # when no gene was found.
                vsort = <int*> calloc(nt if nt > 0 else 1, sizeof(int))
                if vsort is NULL:
                    raise MemoryError("failed to allocate memory")
                # sort and threshold genes
                n = aragorn.gene_sort(&cursor.ds, nt, vsort, &sw)
            # recover genes, copying each record into its own Python object
            genes = []
            for i in range(n):
                genes.append(Gene._new_gene(&sw.genes[vsort[i]], sw.geneticcode))
        finally:
            # `free(NULL)` is a no-op, so this is safe on every error path
            free(vsort)
            free(sw.genes)

        return genes

    cdef int _bopt(
        self,
        Cursor cursor,
        csw* sw
    ) except -1 nogil:
        # adapted from bopt_fastafile to use with our own `Cursor` dataset
        #
        # Reads the sequence through `cursor` in overlapping windows, runs
        # the ARAGORN search on each window (and on its complement when
        # `sw.both` asks for it), and returns the number of genes stored
        # in `sw.genes`.
        cdef int   nt
        cdef int   seq[((2 * aragorn.LSEQ) + aragorn.WRAP) + 1]
        cdef int   cseq[((2 * aragorn.LSEQ) + aragorn.WRAP) + 1]
        cdef int   wseq[(2 * aragorn.WRAP) + 1]
        cdef long  i
        cdef long  rewind
        cdef long  drewind
        cdef long  tmaxlen
        cdef bint  flag
        cdef int   length
        cdef int  *s
        cdef int  *sf
        cdef int  *se
        cdef int  *sc
        cdef int  *swrap
        cdef long  gap
        cdef long  start
        cdef bint  loop
        cdef bint  NX
        cdef bint  SH

        # compute width of sliding windows
        rewind = aragorn.MAXTAGDIST + 20
        if sw.trna or sw.mtrna:
            tmaxlen = aragorn.MAXTRNALEN + sw.maxintronlen
            if rewind < tmaxlen:
                rewind = tmaxlen
        if sw.tmrna:
            if rewind < aragorn.MAXTMRNALEN:
                rewind = aragorn.MAXTMRNALEN
        if sw.peptide:
            if sw.tagthresh >= 5 and rewind < aragorn.TSWEEP:
                rewind = aragorn.TSWEEP

        sw.loffset = rewind
        sw.roffset = rewind
        drewind = 2 * rewind

        # cleanly initialize gene array
        aragorn.init_gene(sw.genes, 0, aragorn.NT)

        nt = 0
        flag = 0  # set once the last window of the sequence has been built
        start = 1L

        # state of the emulated `goto` labels (see the main loop below)
        loop = True
        NX = True
        SH = True

        se = seq
        if sw.linear:
            # linear sequence: pad the left of the first window with NOBASE
            for i in range(rewind):
                postincrement(se)[0] = aragorn.NOBASE
            start -= rewind
        else:
            if cursor.ds.psmax <= drewind:
                # sequence no longer than the overlap: build a single
                # window made of padding and repeated copies of the sequence
                gap = drewind - cursor.ds.psmax
                sc = se + gap
                while se < sc:
                    postincrement(se)[0] = aragorn.NOBASE

                # read the whole sequence, saving a copy in `wseq`
                swrap = wseq
                sc = se + cursor.ds.psmax
                while se < sc:
                    se[0] = cursor._forward()
                    postincrement(swrap)[0] = postincrement(se)[0]

                # pad the saved copy up to `drewind` elements
                sc = swrap + gap
                while swrap < sc:
                    postincrement(swrap)[0] = aragorn.NOBASE

                # append the sequence a second time ...
                swrap = wseq
                sc = swrap + cursor.ds.psmax
                while swrap < sc:
                    postincrement(se)[0] = postincrement(swrap)[0]

                # ... followed by the padded copy
                swrap = wseq
                sc = swrap + drewind
                while swrap < sc:
                    postincrement(se)[0] = postincrement(swrap)[0]

                sw.loffset = drewind
                sw.roffset = drewind
                start -= drewind
                flag = 1
                # goto SH
                loop = True
                SH = True
                NX = False

            else:
                # read the first `drewind` bases, saving them in `wseq` so
                # they can be appended again after the end of the sequence
                swrap = wseq
                sc = seq + drewind
                while se < sc:
                    se[0] = cursor._forward()
                    postincrement(swrap)[0] = postincrement(se)[0]

        # flag-driven loop emulating the `goto NX` / `goto SH` jumps of the
        # C implementation
        while loop:

            # label NX: next
            sc = seq + aragorn.LSEQ
            if NX:
                while (se < sc):
                    postincrement(se)[0] = cursor._forward()
                    if cursor.ds.ps >= cursor.ds.psmax:
                        # end of the sequence: close the final window with
                        # padding (linear) or with the saved start (wrap)
                        if sw.linear:
                            for i in range(rewind):
                                postincrement(se)[0] = aragorn.NOBASE
                        else:
                            sc = wseq + drewind
                            swrap = wseq
                            while (swrap < sc):
                                postincrement(se)[0] = postincrement(swrap)[0]
                        flag = 1
                        SH = True
                        break

            # label SH: search
            if SH:
                length = <int> (se - seq)

                # give pending signals (e.g. Ctrl+C) a chance to interrupt
                with gil:
                    PyErr_CheckSignals()

                # if (sw.verbose):
                #     vstart = sq(d, start + sw.loffset)
                #     vstop = sq(d, ((start + length) - sw.roffset) - 1)
                #     if (vstop < vstart):
                #         fprintf(stderr, "Searching from %ld to %ld\n", vstart, d.psmax)
                #         fprintf(stderr, "Searching from 1 to %ld\n", vstop)
                #     else:
                #         fprintf(stderr, "Searching from %ld to %ld\n", vstart, vstop)

                # search the window as-is
                if (sw.both != 1):
                    sw.start = start
                    sw.comp = 0
                    nt = aragorn.tmioptimise(&cursor.ds, seq, length, nt, sw)

                # search the complement of the window
                if (sw.both > 0):
                    aragorn.sense_switch(seq, cseq, length)
                    sw.start = start + length
                    sw.comp = 1
                    nt = aragorn.tmioptimise(&cursor.ds, cseq, length, nt, sw)

                if not flag:
                    # more sequence to read: move the last `drewind` bases
                    # to the front of the buffer and refill after them
                    s = seq
                    sf = se - drewind
                    se = seq + drewind
                    while (s < se):
                        postincrement(s)[0] = postincrement(sf)[0]
                    start += length - drewind
                    # goto NX
                    NX = SH = loop = True
                    continue

                if nt < 1:
                    cursor.ds.nf += 1
                if sw.maxintronlen > 0:
                    aragorn.remove_overlapping_trna(&cursor.ds, nt, sw)
                if sw.updatetmrnatags:
                    aragorn.update_tmrna_tag_database(sw.genes, nt, sw)

                # FIXME: here should sort genes and filter them with `gene_sort`
                # aragorn.batch_gene_set(d, nt, sw)

                # if sw.verbose:
                #     fprintf(stderr, "%s\nSearch Finished\n\n", d.seqname)

                cursor.ds.ns += 1
                # exit loop
                loop = False

        return nt
|
|
703
|
+
|
|
704
|
+
# if (d.ns > 1) and (sw.batch < 2):
|
|
705
|
+
# fprintf(f, ">end \t%d sequences", d.ns)
|
|
706
|
+
# if sw.trna or sw.mtrna:
|
|
707
|
+
# fprintf(f, " %d tRNA genes", sw.ngene[<int> aragorn.tRNA])
|
|
708
|
+
# if sw.tmrna:
|
|
709
|
+
# fprintf(f, " %d tmRNA genes", sw.ngene[<int> aragorn.tmRNA])
|
|
710
|
+
# if d.nf > 0:
|
|
711
|
+
# sens = (100.0 * (d.ns - d.nf)) / d.ns
|
|
712
|
+
# fprintf(f, ", nothing found in %d sequences, (%.2lf%% sensitivity)", d.nf, sens)
|
|
713
|
+
# fputc('\n', f)
|
|
714
|
+
# if sw.updatetmrnatags:
|
|
715
|
+
# aragorn.report_new_tmrna_tags(sw)
|