pyaragorn 0.3.0__cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pyaragorn/lib.pyx ADDED
@@ -0,0 +1,834 @@
1
+ # coding: utf-8
2
+ # cython: language_level=3, linetrace=True, binding=True
3
+
4
+ """Bindings to ARAGORN, a (t|mt|tm)RNA gene finder.
5
+
6
+ Attributes:
7
+ ARAGORN_VERSION (`str`): The version of ARAGORN currently wrapped
8
+ in PyARAGORN.
9
+ TRANSLATION_TABLES (`set` of `int`): A set containing all the
10
+ translation tables supported by PyARAGORN.
11
+
12
+ Example:
13
+ PyARAGORN can work on any DNA sequence stored in either a text or a
14
+ byte array. To load a sequence from one of the common sequence formats,
15
+ you can use an external dedicated library such as
16
+ `Biopython <https://github.com/biopython/biopython>`_::
17
+
18
+ >>> import gzip
19
+ >>> import Bio.SeqIO
20
+ >>> with gzip.open("CP001621.fna.gz", "rt") as f:
21
+ ... record = Bio.SeqIO.read(f, "fasta")
22
+
23
+ Then use PyARAGORN to find the tRNA genes using the
24
+ bacterial genetic code (translation table 11):
25
+
26
+ >>> import pyaragorn
27
+ >>> rna_finder = pyaragorn.RNAFinder(11, trna=True, tmrna=False)
28
+ >>> for gene in rna_finder.find_rna(record.seq.encode()):
29
+ ... print(gene.anticodon, gene.amino_acid, gene.begin, gene.end)
30
+ tag Leu 87124 87207
31
+ ttt Lys 87210 87285
32
+ ...
33
+
34
+ The gene coordinates are 1-indexed, inclusive, similarly to
35
+ `Pyrodigal <https://pyrodigal.readthedocs.io>`_ genes.
36
+
37
+ References:
38
+ - Laslett, Dean, and Björn Canbäck.
39
+ “ARAGORN, a program to detect tRNA genes and tmRNA genes in nucleotide
40
+ sequences.” Nucleic acids research vol. 32,1 11-6. 2 Jan. 2004,
41
+ :doi:`10.1093/nar/gkh152`. :pmid:`14704338`. :pmcid:`PMC373265`.
42
+ - Laslett, Dean, and Björn Canbäck.
43
+ “ARWEN: a program to detect tRNA genes in metazoan mitochondrial
44
+ nucleotide sequences.” Bioinformatics (Oxford, England) vol. 24,2
45
+ (2008): 172-5. :doi:`10.1093/bioinformatics/btm573`. :pmid:`18033792`.
46
+
47
+ """
48
+
49
+ from cython.operator cimport postincrement, dereference
50
+ from cpython.bytes cimport PyBytes_FromStringAndSize
51
+ from cpython.exc cimport PyErr_CheckSignals
52
+ from cpython.unicode cimport PyUnicode_AsASCIIString
53
+
54
+ from libc.stdio cimport FILE, fopen, fdopen, fclose, fprintf, fputc, stdout, stderr
55
+ from libc.stdlib cimport calloc, free
56
+ from libc.string cimport memcpy
57
+ from libc.stdint cimport intptr_t
58
+
59
+ cimport aragorn
60
+ from aragorn cimport csw, data_set, gene
61
+
62
+ # --- Helpers ------------------------------------------------------------------
63
+
64
# Expose the CPython ``PyUnicode_READ`` accessor to Cython code; the block is
# declared ``nogil`` so the accessor may be called without holding the GIL.
cdef extern from * nogil:
    Py_UCS4 PyUnicode_READ(int kind, const void* data, size_t pos)
66
+
67
# ``default_sw`` is emitted as verbatim C: it builds a ``csw`` value from a
# positional initialiser and copies it into ``*sw`` with ``memcpy``.
#
# The initialiser is positional, so its entries must stay in the exact order
# of the ``csw`` struct fields declared by ARAGORN (declaration not visible
# here -- verify against the wrapped ARAGORN sources before editing).
#
# The trailing integer array spells, as ASCII codes, a three-line banner:
# a rule of 30 dashes, ``ARAGORN v1.2.41   Dean Laslett``, and another rule
# of 30 dashes, each followed by a newline and closed by ``TERM``.
cdef extern from * nogil:
    """
    void default_sw(csw* sw) {
        csw x = {
            {"tRNA", "tmRNA", "", "", "CDS", "overall"},
            NULL, NULL, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, STANDARD, 0,
            {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
            0, METAZOAN_MT, 1, 0, 5, 5, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
            3, 0, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
            {0, 0, 0, 0, 0, 0}, 0, 0, 0, 0, NTAG, 10, 30,
            {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0},
            0, 0, 0, 0, 0L, 100.0, 1.0, tRNAthresh, 4.0, 29.0, 26.0, 7.5, 8.0,
            mtRNAtthresh, mtRNAdthresh, mtRNAdtthresh, -7.9, -6.0, tmRNAthresh,
            14.0, 10.0, 25.0, 9.0, srpRNAthresh, CDSthresh,
            {tRNAthresh, tmRNAthresh, srpRNAthresh, 0.0, CDSthresh},
            {
                45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
                45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 10, 65,
                82, 65, 71, 79, 82, 78, 32, 118, 49, 46, 50, 46, 52, 49, 32,
                32, 32, 68, 101, 97, 110, 32, 76, 97, 115, 108, 101, 116, 116,
                10, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
                45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 10,
                TERM
            }
        };
        memcpy(sw, &x, sizeof(csw));
    }
    """
    void default_sw(csw* sw)
96
+
97
cdef inline long int sq(data_set* d, long int pos) nogil:
    # Wrap `pos` onto the range [1, d.psmax]: shift to a 0-based offset,
    # reduce it modulo the sequence length, then shift back to 1-based.
    cdef long int shifted = pos + d.psmax - 1
    return shifted % d.psmax + 1
99
+
100
+
101
+ # --- Constants ----------------------------------------------------------------
102
+
103
+ import functools
104
+
105
# Identifiers of the translation tables accepted by `RNAFinder`:
# 1-6, 9-16, 21-26, 29, 30, 32 and 33.
cdef set _TRANSLATION_TABLES = {
    *range(1, 7),
    *range(9, 17),
    *range(21, 27),
    29, 30,
    32, 33,
}

# Public module attributes; the `PROJECT_*` names are not defined in this
# file (presumably injected at build time -- confirm against the build setup).
__version__ = PROJECT_VERSION
ARAGORN_VERSION = PROJECT_ARAGORN_VERSION
TRANSLATION_TABLES = _TRANSLATION_TABLES
111
+
112
+
113
+ # --- Classes ------------------------------------------------------------------
114
+
115
cdef class Gene:
    """A gene identified by ARAGORN.

    Wraps a private copy of the C ``gene`` record together with the
    translation table that was in use when the gene was detected.
    """

    # private copy of the ARAGORN gene record
    cdef gene _gene
    # translation table used by the finder that produced this gene
    cdef int _genetic_code

    @staticmethod
    cdef Gene _new_gene(gene* _gene, int _genetic_code):
        # Instantiate the subclass matching the gene type (bypassing
        # `__init__`), then copy the C record and translation table into it.
        cdef Gene wrapper

        if _gene.genetype == aragorn.tmRNA:
            wrapper = TMRNAGene.__new__(TMRNAGene)
        elif _gene.genetype == aragorn.tRNA:
            wrapper = TRNAGene.__new__(TRNAGene)
        else:
            raise NotImplementedError

        wrapper._genetic_code = _genetic_code
        memcpy(&wrapper._gene, _gene, sizeof(gene))
        return wrapper

    def __sizeof__(self):
        # NOTE(review): `sizeof(self)` looks like it measures the object
        # reference rather than the wrapped struct -- confirm this is the
        # value that is meant to be reported.
        return sizeof(self)

    @property
    def type(self):
        """`str`: The name of the gene type, indexed by the ARAGORN gene type code.
        """
        cdef int code = <int> self._gene.genetype
        return ("tRNA", "tmRNA", "", "", "CDS")[code]

    @property
    def begin(self):
        """`int`: The sequence coordinate at which the gene begins.

        Hint:
            This coordinate is 1-based, inclusive. To use it to index
            a Python array or string, subtract one.

        """
        return self._gene.start

    @property
    def end(self):
        """`int`: The sequence coordinate at which the gene ends.

        Hint:
            This coordinate is 1-based, inclusive. To use it to index
            a Python array or string, subtract one.

        """
        return self._gene.stop

    @property
    def length(self):
        """`int`: The length of the RNA gene, as computed by ``aragorn.seqlen``.
        """
        return aragorn.seqlen(&self._gene)

    @property
    def strand(self):
        """`int`: *-1* if the gene is on the reverse strand, *+1* otherwise.
        """
        if self._gene.comp:
            return -1
        return +1

    @property
    def energy(self):
        """`float`: The approximated normalised energy of the RNA structure.
        """
        # FIXME: should use the same parameters as the RNAFinder that
        #        produced the gene instead of the library defaults
        cdef csw params
        default_sw(&params)
        return aragorn.nenergy(&self._gene, &params)

    @property
    def raw_energy(self):
        """`float`: The un-normalized energy value of the RNA structure."""
        return <double> self._gene.energy

    def sequence(self):
        """Retrieve the full sequence of the RNA gene.

        Returns:
            `str`: The gene nucleotides, one character per base.

        """
        cdef int size = aragorn.seqlen(&self._gene)
        cdef bytearray buffer = bytearray(size)
        cdef int k = 0
        # convert each encoded base back to its ASCII letter
        while k < size:
            buffer[k] = aragorn.cpbase(self._gene.seq[k])
            k += 1
        return buffer.decode('ascii')
201
+
202
+
203
cdef class TRNAGene(Gene):
    """A transfer RNA (tRNA) gene.
    """

    def __repr__(self):
        return (
            f"<TRNAGene begin={self.begin} end={self.end} "
            f"strand={self.strand:+} "
            f"length={self.length} anticodon={self.anticodon!r} "
            f"energy={self.energy:.2f}>"
        )

    @property
    def amino_acid(self):
        """`str`: The 3-letter amino-acid(s) for this gene.

        Hint:
            If the anticodon loop contains 6 or 8 bases, ``???`` is
            returned.

        """
        # scratch parameters: only the genetic code is initialised here
        cdef csw sw
        cdef int* s = self._gene.seq + self._gene.anticodon
        (<int*> &sw.geneticcode)[0] = self._genetic_code
        if self._gene.cloop == 6 or self._gene.cloop == 8:
            return "???"
        return aragorn.aa(s, &sw).decode('ascii')

    @property
    def amino_acids(self):
        """`tuple` of `str`: All possible 3-letter amino-acids for this gene.

        Hint:
            If the anticodon loop contains 6 or 8 bases, a tuple of two
            amino-acid is returned, otherwise a tuple with a single element
            is returned.

        """
        # scratch parameters: only the genetic code is initialised here
        cdef csw sw
        cdef int* s = self._gene.seq + self._gene.anticodon
        (<int*> &sw.geneticcode)[0] = self._genetic_code
        if self._gene.cloop == 6:
            # short loop: try the triplets starting one base before and
            # at the anticodon position
            return (
                aragorn.aa(s - 1, &sw).decode('ascii'),
                aragorn.aa(s, &sw).decode('ascii'),
            )
        elif self._gene.cloop == 8:
            # long loop: try the triplets starting at and one base after
            # the anticodon position
            return (
                aragorn.aa(s, &sw).decode('ascii'),
                aragorn.aa(s + 1, &sw).decode('ascii'),
            )
        else:
            # always return a tuple, as documented (a bare `str` was
            # returned here before)
            return (aragorn.aa(s, &sw).decode('ascii'),)

    @property
    def anticodon(self):
        """`str`: The anticodon of the tRNA gene.

        Contains 2, 4 or 3 nucleotides when the anticodon loop has
        6, 8 or any other number of bases, respectively.
        """
        cdef tuple c
        cdef int* s = self._gene.seq + self._gene.anticodon
        if self._gene.cloop == 6:
            c = ( aragorn.cbase(s[0]), aragorn.cbase(s[1]) )
        elif self._gene.cloop == 8:
            c = ( aragorn.cbase(s[0]), aragorn.cbase(s[1]), aragorn.cbase(s[2]), aragorn.cbase(s[3]) )
        else:
            c = ( aragorn.cbase(s[0]), aragorn.cbase(s[1]), aragorn.cbase(s[2]) )
        return ''.join(map(chr, c))

    @property
    def anticodon_offset(self):
        """`int`: The offset in the gene at which the anticodon starts.
        """
        cdef int x = 1 + self._gene.anticodon
        # shift by the intron length when the intron starts at or before
        # the anticodon
        if self._gene.nintron > 0 and self._gene.intron <= self._gene.anticodon:
            x += self._gene.nintron
        return x

    @property
    def anticodon_length(self):
        """`int`: The length of the anticodon (in nucleotides).
        """
        if self._gene.cloop == 6:
            return 2
        elif self._gene.cloop == 8:
            return 4
        else:
            return 3
291
+
292
+
293
cdef class TMRNAGene(Gene):
    """A transfer-messenger RNA (tmRNA) gene.

    Example:
        >>> rna_finder = pyaragorn.RNAFinder(11, trna=False, tmrna=True)
        >>> tmrna = rna_finder.find_rna(str(record.seq))[0]
        >>> tmrna.begin, tmrna.end
        (198037, 198447)
        >>> tmrna.peptide()
        'AEKNEENFEMPAFMINNASAGANYMFA**'

    """

    def __repr__(self):
        return (
            f"<TMRNAGene begin={self.begin} end={self.end} "
            f"strand={self.strand:+} "
            f"length={self.length} orf_length={self.orf_length} "
            f"energy={self.energy:.2f}>"
        )

    cdef int _orf_extent(self, csw* sw, bint include_stop) noexcept:
        # Number of nucleotides in the open-reading frame, counted from
        # `tps`. The frame nominally covers `tps .. tpe` inclusive; the
        # exclusive end is then moved codon by codon so that STOP codons
        # are either all included or all excluded.
        cdef int* first = self._gene.eseq + self._gene.tps
        cdef int* last = self._gene.eseq + self._gene.tpe + 1

        if include_stop:
            # extend over every STOP codon that follows the frame
            while aragorn.ltranslate(last, &self._gene, sw) == ord('*'):
                last += 3
        else:
            # trim only codons that *are* STOP codons; the codon before
            # `last` is inspected so the final coding codon is never lost
            while last - first >= 3 and aragorn.ltranslate(last - 3, &self._gene, sw) == ord('*'):
                last -= 3

        return <int> (last - first)

    @property
    def permuted(self):
        """`bool`: Whether this tmRNA gene is a permuted gene.
        """
        return self._gene.asst != 0

    @property
    def orf_offset(self):
        """`int`: The offset in the gene at which the open-reading frame starts.
        """
        return self._gene.tps + 1

    @property
    def orf_length(self):
        """`int`: The length of the open-reading frame (in nucleotides).

        STOP codons are included, so this is equal to ``len(self.orf())``.
        """
        # scratch parameters: only the genetic code is initialised here
        cdef csw sw
        (<int*> &sw.geneticcode)[0] = self._genetic_code
        return self._orf_extent(&sw, True)

    def orf(self, include_stop=True):
        """Retrieve the open-reading frame of the mRNA-like region.

        Arguments:
            include_stop (`bool`): Whether or not to include the STOP codons
                in the returned nucleotide sequence. Defaults to `True`.

        Returns:
            `str`: The sequence of the mRNA-like region in the tmRNA
            gene, optionally without STOP codons.

        """
        cdef int i
        cdef int n
        cdef int* sb = self._gene.eseq + self._gene.tps
        cdef bytearray cds = bytearray()

        # scratch parameters: only the genetic code is initialised here
        cdef csw sw
        (<int*> &sw.geneticcode)[0] = self._genetic_code

        n = self._orf_extent(&sw, include_stop)
        for i in range(n):
            cds.append(aragorn.cpbase(sb[i]))

        return cds.decode('ascii')

    def peptide(self, include_stop=True):
        """Retrieve the peptide sequence of the mRNA-like region.

        Arguments:
            include_stop (`bool`): Whether or not to include the STOP codons
                in the returned peptide sequence. Defaults to `True`.

        Returns:
            `str`: The translation of the mRNA-like region of the tmRNA
            gene, optionally without STOP codons.

        """
        cdef int i
        cdef int n
        cdef int* sb = self._gene.eseq + self._gene.tps
        cdef bytearray peptide = bytearray()

        # scratch parameters: only the genetic code is initialised here
        cdef csw sw
        (<int*> &sw.geneticcode)[0] = self._genetic_code

        n = self._orf_extent(&sw, include_stop)
        # translate one codon at a time
        for i in range(0, n, 3):
            peptide.append(aragorn.ltranslate(sb + i, &self._gene, &sw))

        return peptide.decode('ascii')
405
+
406
+
407
cdef class Cursor:
    """A read cursor over an in-memory nucleotide sequence.

    Holds a byte view of the input together with the ARAGORN ``data_set``
    book-keeping structure used while scanning it.
    """

    # the object backing `data`, kept alive for the lifetime of the cursor
    cdef object obj
    # byte view of the sequence
    cdef const unsigned char[:] data
    cdef int kind
    # number of bytes in `data`
    cdef size_t length
    # ARAGORN dataset book-keeping (current position, length, GC content...)
    cdef data_set ds

    def __init__(self, obj):
        """Create a cursor over ``obj``.

        Arguments:
            obj (`str` or buffer): The sequence to read. Text is encoded
                to ASCII first; any other object must support the buffer
                protocol.

        """
        if isinstance(obj, str):
            obj = PyUnicode_AsASCIIString(obj)

        # get a memoryview to the data contents
        self.data = obj
        self.length = self.data.shape[0]

        # keep a reference to the data source
        self.obj = obj

        # reinitialize dataset book-keeping
        self.ds.filepointer = 0
        self.ds.ns = 0
        self.ds.nf = 0
        self.ds.nextseq = 0
        self.ds.nextseqoff = 0
        self.ds.seqstart = 0
        self.ds.seqstartoff = 0
        self.ds.ps = 0
        self.ds.psmax = self.length

        # count GC%
        self.ds.gc = self._gc()

    cdef int _forward(self) noexcept nogil:
        # Return the encoded base at the current position and advance by
        # one, or `TERM` once the end of the sequence has been reached.
        #
        # The byte is read as *unsigned*: with a plain `char` (signed on
        # many platforms) the non-ASCII guard below could never trigger
        # and `aragorn.map` would be indexed with a negative value.
        cdef unsigned char x
        cdef int base

        if self.ds.ps >= self.ds.psmax:
            return <int> aragorn.base.TERM

        x = self.data[self.ds.ps]
        # Always consume the byte, even when it is not a nucleotide:
        # otherwise the cursor stays on it forever and a caller looping
        # until `ps >= psmax` never terminates.
        self.ds.ps += 1

        if x >= 128:
            return <int> aragorn.base.NOBASE

        base = aragorn.map[x]
        if base >= <int> aragorn.base.Adenine:
            return base
        return <int> aragorn.base.NOBASE

    cdef double _gc(self) noexcept nogil:
        # Fraction of scanned positions that are cytosine or guanine.
        # Scanning stops at the first byte mapped to -1; non-ASCII bytes
        # count as positions but are never looked up in `aragorn.map`.
        cdef long i
        cdef unsigned char x
        cdef int base
        cdef long ngc = 0
        cdef long ps = 0

        for i in range(self.length):
            x = self.data[i]
            if x < 128:
                base = aragorn.map[x]
                if base == -1:
                    break
                if base == <int> aragorn.base.Cytosine or base == <int> aragorn.base.Guanine:
                    ngc += 1
            ps += 1

        # an empty sequence has no GC content; avoid dividing by zero
        if ps == 0:
            return 0.0
        return <double> ngc / <double> ps
473
+
474
+
475
cdef class RNAFinder:
    """A configurable RNA gene finder.
    """
    # ARAGORN search parameters; copied for each `find_rna` call
    cdef csw _sw

    def __init__(
        self,
        int translation_table = 1,
        *,
        bint trna = True,
        bint tmrna = True,
        bint linear = False,
        double threshold_scale = 1.0,
    ):
        """__init__(self, translation_table=1, *, trna=True, tmrna=True, linear=False, threshold_scale=1.0)\n--\n

        Create a new RNA finder.

        Arguments:
            translation_table (`int`, optional): The translation table to
                use. Check the :wiki:`List of genetic codes` page
                listing all genetic codes for the available values, or
                the :attr:`pyaragorn.TRANSLATION_TABLES` constant for allowed
                values.

        Keyword Arguments:
            trna (`bool`): Enable detection of tRNA genes. Set to `False` to
                disable.
            tmrna (`bool`): Enable detection of tmRNA genes. Set to `False` to
                disable.
            linear (`bool`): Set to `True` to assume that the given sequences
                have linear topology (no closed genomes).
            threshold_scale (`float`, optional): Rescale scoring thresholds
                from the default levels. Defaults to 1.0 (no rescaling). Set
                to e.g. 0.95 to report possible pseudogenes by lowering
                the threshold by 5%.

        Raises:
            `ValueError`: When ``translation_table`` is not a supported
                translation table, or ``threshold_scale`` is not positive.

        .. versionadded:: 0.3.0
           The ``threshold_scale`` keyword argument.

        """
        default_sw(&self._sw)
        self._sw.trna = trna
        self._sw.tmrna = tmrna
        self._sw.linear = linear
        self._sw.f = stdout
        self._sw.verbose = False #True
        # goes through the property setter, which validates the value
        self.threshold_scale = threshold_scale

        if translation_table not in _TRANSLATION_TABLES:
            raise ValueError(f"invalid translation table: {translation_table!r}")
        self._sw.geneticcode = translation_table

    def __reduce__(self):
        # pickle as a call to the constructor with the current settings
        return functools.partial(
            type(self),
            translation_table=self.translation_table,
            trna=self.trna,
            tmrna=self.tmrna,
            linear=self.linear,
            threshold_scale=self.threshold_scale,
        ), ()

    def __repr__(self):
        cdef str ty = type(self).__name__
        cdef list args = []

        # only show the arguments that differ from the defaults
        if self.translation_table != 1:
            args.append(f"{self.translation_table!r}")
        if not self.trna:
            args.append(f"trna={self.trna!r}")
        if not self.tmrna:
            args.append(f"tmrna={self.tmrna!r}")
        if self.linear:
            args.append(f"linear={self.linear!r}")
        if self.threshold_scale != 1.0:
            args.append(f"threshold_scale={self.threshold_scale!r}")
        return f"{ty}({', '.join(args)})"

    @property
    def translation_table(self):
        """`int`: The translation table in use by this object.
        """
        return self._sw.geneticcode

    @property
    def trna(self):
        """`bool`: Whether tRNA detection is enabled.
        """
        return bool(self._sw.trna)

    @property
    def tmrna(self):
        """`bool`: Whether tmRNA detection is enabled.
        """
        return bool(self._sw.tmrna)

    @property
    def linear(self):
        """`bool`: Whether input sequences are assumed to have linear topology.
        """
        return bool(self._sw.linear)

    @property
    def threshold_scale(self):
        """`float`: The scale used to change the default thresholds.

        .. versionadded:: 0.3.0

        """
        return self._sw.threshlevel

    @threshold_scale.setter
    def threshold_scale(self, double threshold_scale):
        if threshold_scale <= 0.0:
            raise ValueError(f"threshold_scale must be positive (got {threshold_scale!r})")
        aragorn.change_thresholds(&self._sw, threshold_scale)

    def find_rna(self, object sequence):
        """Find RNA genes in the input DNA sequence.

        Arguments:
            sequence (`str` or buffer): The nucleotide sequence to process,
                either as a string of nucleotides (upper- or lowercase), or
                as an object implementing the buffer protocol.

        Returns:
            `list` of `~pyaragorn.Gene`: A list of `~pyaragorn.Gene` (either
            `~pyaragorn.TRNAGene` or `~pyaragorn.TMRNAGene`) corresponding
            to RNA genes detected in the sequence according to the `RNAFinder`
            parameters.

        Raises:
            `MemoryError`: When the result buffers cannot be allocated.

        """
        cdef int i
        cdef int n
        cdef int nt
        cdef csw sw
        cdef list genes
        cdef int* vsort = NULL
        cdef Cursor cursor = Cursor(sequence)

        # copy parameters to ensure the `find_rna` method is re-entrant
        memcpy(&sw, &self._sw, sizeof(csw))

        try:
            with nogil:
                # allocate memory for the result genes
                sw.genespace = aragorn.NT
                sw.genes = <gene*> calloc(sw.genespace, sizeof(gene))
                if sw.genes is NULL:
                    raise MemoryError("failed to allocate memory")
                # detect RNA genes with the "batched" algorithm
                nt = self._bopt(cursor, &sw)
                # allocate array for sorting genes; always request at least
                # one slot, since `calloc(0, ...)` may legitimately return
                # NULL and must not be mistaken for an allocation failure
                # when no gene was found
                vsort = <int*> calloc(nt if nt > 0 else 1, sizeof(int))
                if vsort is NULL:
                    raise MemoryError("failed to allocate memory")
                # sort and threshold genes
                n = aragorn.gene_sort(&cursor.ds, nt, vsort, &sw)
            # recover genes
            genes = []
            for i in range(n):
                genes.append(Gene._new_gene(&sw.genes[vsort[i]], sw.geneticcode))
        finally:
            # `free(NULL)` is a no-op, so this is safe on every exit path
            free(vsort)
            free(sw.genes)

        return genes

    cdef int _bopt(
        self,
        Cursor cursor,
        csw* sw
    ) except -1 nogil:
        # adapted from bopt_fastafile to use with our own `Cursor` dataset
        #
        # The sequence is scanned in overlapping windows stored in `seq`
        # (`cseq` holds the opposite strand, `wseq` the first bases of the
        # sequence so they can be appended again at the end of a circular
        # sequence). Returns the number of candidate genes stored in
        # `sw.genes`.
        cdef int nt
        cdef int seq[((2 * aragorn.LSEQ) + aragorn.WRAP) + 1]
        cdef int cseq[((2 * aragorn.LSEQ) + aragorn.WRAP) + 1]
        cdef int wseq[(2 * aragorn.WRAP) + 1]
        cdef long i
        cdef long rewind
        cdef long drewind
        cdef long tmaxlen
        cdef bint flag
        cdef int length
        cdef int *s
        cdef int *sf
        cdef int *se
        cdef int *sc
        cdef int *swrap
        cdef long gap
        cdef long start
        cdef bint loop
        cdef bint NX
        cdef bint SH

        # compute width of sliding windows
        rewind = aragorn.MAXTAGDIST + 20
        if sw.trna or sw.mtrna:
            tmaxlen = aragorn.MAXTRNALEN + sw.maxintronlen
            if rewind < tmaxlen:
                rewind = tmaxlen
        if sw.tmrna:
            if rewind < aragorn.MAXTMRNALEN:
                rewind = aragorn.MAXTMRNALEN
        if sw.peptide:
            if sw.tagthresh >= 5 and rewind < aragorn.TSWEEP:
                rewind = aragorn.TSWEEP

        sw.loffset = rewind
        sw.roffset = rewind
        drewind = 2 * rewind

        # cleanly initialize gene array
        aragorn.init_gene(sw.genes, 0, aragorn.NT)

        nt = 0
        flag = 0
        start = 1L

        loop = True
        NX = True
        SH = True

        se = seq
        if sw.linear:
            # linear topology: pad the left side of the first window
            for i in range(rewind):
                postincrement(se)[0] = aragorn.NOBASE
            start -= rewind
        else:
            if cursor.ds.psmax <= drewind:
                # the whole sequence is shorter than the overlap: build
                # a single padded window and search it once
                gap = drewind - cursor.ds.psmax
                sc = se + gap
                while se < sc:
                    postincrement(se)[0] = aragorn.NOBASE

                swrap = wseq
                sc = se + cursor.ds.psmax
                while se < sc:
                    se[0] = cursor._forward()
                    postincrement(swrap)[0] = postincrement(se)[0]

                sc = swrap + gap
                while swrap < sc:
                    postincrement(swrap)[0] = aragorn.NOBASE

                swrap = wseq
                sc = swrap + cursor.ds.psmax
                while swrap < sc:
                    postincrement(se)[0] = postincrement(swrap)[0]

                swrap = wseq
                sc = swrap + drewind
                while swrap < sc:
                    postincrement(se)[0] = postincrement(swrap)[0]

                sw.loffset = drewind
                sw.roffset = drewind
                start -= drewind
                flag = 1
                # goto SH
                loop = True
                SH = True
                NX = False

            else:
                # remember the first bases so they can be appended again
                # after the end of the sequence
                swrap = wseq
                sc = seq + drewind
                while se < sc:
                    se[0] = cursor._forward()
                    postincrement(swrap)[0] = postincrement(se)[0]

        # weird ass loop to emulate a GOTO
        while loop:

            # label NX: next
            sc = seq + aragorn.LSEQ
            if NX:
                while (se < sc):
                    postincrement(se)[0] = cursor._forward()
                    if cursor.ds.ps >= cursor.ds.psmax:
                        if sw.linear:
                            for i in range(rewind):
                                postincrement(se)[0] = aragorn.NOBASE
                        else:
                            sc = wseq + drewind
                            swrap = wseq
                            while (swrap < sc):
                                postincrement(se)[0] = postincrement(swrap)[0]
                        flag = 1
                        SH = True
                        break

            # label SH: search
            if SH:
                length = <int> (se - seq)

                # let Python handle pending signals (e.g. Ctrl-C) between
                # two windows
                with gil:
                    PyErr_CheckSignals()

                # if (sw.verbose):
                #     vstart = sq(d, start + sw.loffset)
                #     vstop = sq(d, ((start + length) - sw.roffset) - 1)
                #     if (vstop < vstart):
                #         fprintf(stderr, "Searching from %ld to %ld\n", vstart, d.psmax)
                #         fprintf(stderr, "Searching from 1 to %ld\n", vstop)
                #     else:
                #         fprintf(stderr, "Searching from %ld to %ld\n", vstart, vstop)

                if (sw.both != 1):
                    sw.start = start
                    sw.comp = 0
                    nt = aragorn.tmioptimise(&cursor.ds, seq, length, nt, sw)

                if (sw.both > 0):
                    aragorn.sense_switch(seq, cseq, length)
                    sw.start = start + length
                    sw.comp = 1
                    nt = aragorn.tmioptimise(&cursor.ds, cseq, length, nt, sw)

                if not flag:
                    # move the tail of the window to its beginning, then
                    # read the next chunk after it
                    s = seq
                    sf = se - drewind
                    se = seq + drewind
                    while (s < se):
                        postincrement(s)[0] = postincrement(sf)[0]
                    start += length - drewind
                    # goto NX
                    NX = SH = loop = True
                    continue

                if nt < 1:
                    cursor.ds.nf += 1
                if sw.maxintronlen > 0:
                    aragorn.remove_overlapping_trna(&cursor.ds, nt, sw)
                if sw.updatetmrnatags:
                    aragorn.update_tmrna_tag_database(sw.genes, nt, sw)

                # FIXME: here should sort genes and filter them with `gene_sort`
                # aragorn.batch_gene_set(d, nt, sw)

                # if sw.verbose:
                #     fprintf(stderr, "%s\nSearch Finished\n\n", d.seqname)

                cursor.ds.ns += 1
                # exit loop
                loop = False

        return nt
822
+
823
+ # if (d.ns > 1) and (sw.batch < 2):
824
+ # fprintf(f, ">end \t%d sequences", d.ns)
825
+ # if sw.trna or sw.mtrna:
826
+ # fprintf(f, " %d tRNA genes", sw.ngene[<int> aragorn.tRNA])
827
+ # if sw.tmrna:
828
+ # fprintf(f, " %d tmRNA genes", sw.ngene[<int> aragorn.tmRNA])
829
+ # if d.nf > 0:
830
+ # sens = (100.0 * (d.ns - d.nf)) / d.ns
831
+ # fprintf(f, ", nothing found in %d sequences, (%.2lf%% sensitivity)", d.nf, sens)
832
+ # fputc('\n', f)
833
+ # if sw.updatetmrnatags:
834
+ # aragorn.report_new_tmrna_tags(sw)