minimap2 0.2.22.0 → 0.2.24.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +60 -76
- data/ext/Rakefile +55 -0
- data/ext/cmappy/cmappy.c +129 -0
- data/ext/cmappy/cmappy.h +44 -0
- data/ext/minimap2/FAQ.md +46 -0
- data/ext/minimap2/LICENSE.txt +24 -0
- data/ext/minimap2/MANIFEST.in +10 -0
- data/ext/minimap2/Makefile +132 -0
- data/ext/minimap2/Makefile.simde +97 -0
- data/ext/minimap2/NEWS.md +821 -0
- data/ext/minimap2/README.md +403 -0
- data/ext/minimap2/align.c +1020 -0
- data/ext/minimap2/bseq.c +169 -0
- data/ext/minimap2/bseq.h +64 -0
- data/ext/minimap2/code_of_conduct.md +30 -0
- data/ext/minimap2/cookbook.md +243 -0
- data/ext/minimap2/esterr.c +64 -0
- data/ext/minimap2/example.c +63 -0
- data/ext/minimap2/format.c +559 -0
- data/ext/minimap2/hit.c +466 -0
- data/ext/minimap2/index.c +775 -0
- data/ext/minimap2/kalloc.c +205 -0
- data/ext/minimap2/kalloc.h +76 -0
- data/ext/minimap2/kdq.h +132 -0
- data/ext/minimap2/ketopt.h +120 -0
- data/ext/minimap2/khash.h +615 -0
- data/ext/minimap2/krmq.h +474 -0
- data/ext/minimap2/kseq.h +256 -0
- data/ext/minimap2/ksort.h +153 -0
- data/ext/minimap2/ksw2.h +184 -0
- data/ext/minimap2/ksw2_dispatch.c +96 -0
- data/ext/minimap2/ksw2_extd2_sse.c +402 -0
- data/ext/minimap2/ksw2_exts2_sse.c +416 -0
- data/ext/minimap2/ksw2_extz2_sse.c +313 -0
- data/ext/minimap2/ksw2_ll_sse.c +152 -0
- data/ext/minimap2/kthread.c +159 -0
- data/ext/minimap2/kthread.h +15 -0
- data/ext/minimap2/kvec.h +105 -0
- data/ext/minimap2/lchain.c +369 -0
- data/ext/minimap2/main.c +459 -0
- data/ext/minimap2/map.c +714 -0
- data/ext/minimap2/minimap.h +410 -0
- data/ext/minimap2/minimap2.1 +725 -0
- data/ext/minimap2/misc/README.md +179 -0
- data/ext/minimap2/misc/mmphase.js +335 -0
- data/ext/minimap2/misc/paftools.js +3149 -0
- data/ext/minimap2/misc.c +162 -0
- data/ext/minimap2/mmpriv.h +132 -0
- data/ext/minimap2/options.c +234 -0
- data/ext/minimap2/pe.c +177 -0
- data/ext/minimap2/python/README.rst +196 -0
- data/ext/minimap2/python/cmappy.h +152 -0
- data/ext/minimap2/python/cmappy.pxd +153 -0
- data/ext/minimap2/python/mappy.pyx +273 -0
- data/ext/minimap2/python/minimap2.py +39 -0
- data/ext/minimap2/sdust.c +213 -0
- data/ext/minimap2/sdust.h +25 -0
- data/ext/minimap2/seed.c +131 -0
- data/ext/minimap2/setup.py +55 -0
- data/ext/minimap2/sketch.c +143 -0
- data/ext/minimap2/splitidx.c +84 -0
- data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
- data/ext/minimap2/test/MT-human.fa +278 -0
- data/ext/minimap2/test/MT-orang.fa +276 -0
- data/ext/minimap2/test/q-inv.fa +4 -0
- data/ext/minimap2/test/q2.fa +2 -0
- data/ext/minimap2/test/t-inv.fa +127 -0
- data/ext/minimap2/test/t2.fa +2 -0
- data/ext/minimap2/tex/Makefile +21 -0
- data/ext/minimap2/tex/bioinfo.cls +930 -0
- data/ext/minimap2/tex/blasr-mc.eval +17 -0
- data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
- data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
- data/ext/minimap2/tex/bwa.eval +55 -0
- data/ext/minimap2/tex/eval2roc.pl +33 -0
- data/ext/minimap2/tex/graphmap.eval +4 -0
- data/ext/minimap2/tex/hs38-simu.sh +10 -0
- data/ext/minimap2/tex/minialign.eval +49 -0
- data/ext/minimap2/tex/minimap2.bib +460 -0
- data/ext/minimap2/tex/minimap2.tex +724 -0
- data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
- data/ext/minimap2/tex/mm2-update.tex +240 -0
- data/ext/minimap2/tex/mm2.approx.eval +12 -0
- data/ext/minimap2/tex/mm2.eval +13 -0
- data/ext/minimap2/tex/natbib.bst +1288 -0
- data/ext/minimap2/tex/natbib.sty +803 -0
- data/ext/minimap2/tex/ngmlr.eval +38 -0
- data/ext/minimap2/tex/roc.gp +60 -0
- data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
- data/ext/minimap2.patch +19 -0
- data/lib/minimap2/aligner.rb +4 -4
- data/lib/minimap2/alignment.rb +11 -11
- data/lib/minimap2/ffi/constants.rb +20 -16
- data/lib/minimap2/ffi/functions.rb +5 -0
- data/lib/minimap2/ffi.rb +4 -5
- data/lib/minimap2/version.rb +2 -2
- data/lib/minimap2.rb +51 -15
- metadata +97 -79
- data/lib/minimap2/ffi_helper.rb +0 -53
- data/vendor/libminimap2.so +0 -0
data/ext/minimap2/pe.c
ADDED
@@ -0,0 +1,177 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <math.h>
|
3
|
+
#include "mmpriv.h"
|
4
|
+
#include "kvec.h"
|
5
|
+
|
6
|
+
/*
 * Drop weak secondary hits of a (possibly two-segment) query, compacting r[]
 * in place and storing the surviving count back into *n_.
 *
 * Nothing happens unless pri_ratio > 0 and *n_ > 0.  A hit is always kept
 * when it is its own parent (primary) or when its score is within min_diff
 * of its parent's score.  Otherwise it must reach a fraction of the parent's
 * score:
 *   - pri1      if child and parent are on the same strand and reference and
 *               lie within max_dist of each other (max_dist is
 *               qlens[0] + qlens[1] + max_gap_ref for n_segs == 2, else 0);
 *   - pri2      if the parent spans the boundary at qlens[0] but the child
 *               does not;
 *   - pri_ratio in every other case.
 * At most best_n non-primary hits are kept.  The `p` member of every dropped
 * hit is freed, and mm_sync_regs() is called when anything was removed.
 */
void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r)
{
    int idx, n_total, n_kept = 0, n_secondary = 0, max_dist = 0;

    if (!(pri_ratio > 0.0f) || *n_ <= 0) return; /* filtering disabled or nothing to filter */
    n_total = *n_;
    if (n_segs == 2) max_dist = qlens[0] + qlens[1] + max_gap_ref;

    for (idx = 0; idx < n_total; ++idx) {
        mm_reg1_t *child = &r[idx];
        int keep;

        if (child->parent == idx) { /* primary */
            keep = 1;
        } else if (child->score + min_diff >= r[child->parent].score) {
            keep = 1;
        } else {
            mm_reg1_t *par = &r[child->parent];
            float ratio;

            if (par->rev == child->rev && par->rid == child->rid
                && child->re - par->rs < max_dist && par->re - child->rs < max_dist) {
                ratio = pri1; /* child and parent are close on the reference */
            } else {
                int par_both = (n_segs == 2 && par->qs < qlens[0] && par->qe > qlens[0]);
                int chi_both = (n_segs == 2 && child->qs < qlens[0] && child->qe > qlens[0]);
                /* pri2 only applies when chi_both == 0 and par_both == 1 */
                ratio = (chi_both || chi_both == par_both) ? pri_ratio : pri2;
            }
            keep = (child->score >= par->score * ratio);
        }

        /* cap the number of secondary hits; the counter only advances for kept candidates */
        if (keep && child->parent != idx && n_secondary++ >= best_n) keep = 0;

        if (keep) r[n_kept++] = *child;
        else free(child->p); /* free(NULL) is a no-op */
    }

    if (n_kept != n_total) mm_sync_regs(km, n_kept, r); /* removing hits requires sync() */
    *n_ = n_kept;
}
|
44
|
+
|
45
|
+
/*
 * Set the pe_thru flag on the primary hits of both segments when the two
 * reads cover the same reference interval.
 *
 * The flag is only set when each segment has exactly one primary hit
 * (id == parent), both are on the same reference and strand, their reference
 * start and end coordinates each differ by less than 3, and one hit starts
 * at query offset 0 while the other extends to the end of its read.
 */
void mm_set_pe_thru(const int *qlens, int *n_regs, mm_reg1_t **regs)
{
    int seg, n_pri[2] = { 0, 0 }, pri[2] = { -1, -1 };
    mm_reg1_t *first, *second;

    /* count primaries per segment, remembering the index of the last one seen */
    for (seg = 0; seg < 2; ++seg) {
        int j;
        for (j = 0; j < n_regs[seg]; ++j) {
            if (regs[seg][j].id == regs[seg][j].parent) {
                ++n_pri[seg];
                pri[seg] = j;
            }
        }
    }
    if (n_pri[0] != 1 || n_pri[1] != 1) return;

    first = &regs[0][pri[0]];
    second = &regs[1][pri[1]];
    if (first->rid != second->rid || first->rev != second->rev) return;
    if (abs(first->rs - second->rs) >= 3 || abs(first->re - second->re) >= 3) return;

    if ((first->qs == 0 && qlens[1] - second->qe == 0) || (second->qs == 0 && qlens[0] - first->qe == 0))
        first->pe_thru = second->pe_thru = 1;
}
|
64
|
+
|
65
|
+
#include "ksort.h"
|
66
|
+
|
67
|
+
typedef struct {
|
68
|
+
int s, rev;
|
69
|
+
uint64_t key;
|
70
|
+
mm_reg1_t *r;
|
71
|
+
} pair_arr_t;
|
72
|
+
|
73
|
+
#define sort_key_pair(a) ((a).key)
|
74
|
+
KRADIX_SORT_INIT(pair, pair_arr_t, sort_key_pair, 8)
|
75
|
+
|
76
|
+
/*
 * Pair the hits of a two-segment fragment and adjust flags and mapping
 * quality of the best pair.
 *
 * All hits of both segments are collected into one array and radix-sorted on
 * a 64-bit key laid out as
 *     rid (bits 63..32) | rs (bits 32..1) | (segment ^ rev) (bit 0).
 * Walking the sorted array, each entry with bit 0 set is combined with
 * earlier entries that carry the same rev flag but come from the other
 * segment, as long as both are on the same reference and the later hit
 * starts at most max_gap_ref after the earlier hit ends.  Pairs whose summed
 * dp_max is below (best dp_max of segment 0 + best dp_max of segment 1 -
 * pe_bonus, floored at 0) are ignored.
 *
 * For the best-scoring pair the function sets proper_frag, makes each mate
 * the parent of its former parent's family (zeroing the old parent's mapq),
 * moves sam_pri to the mate, and raises the mates' mapq towards a pair-level
 * estimate derived from the score gap to the second-best pair, match_sc and
 * the number of pairs within sub_diff of the best.
 *
 * Returns early, changing nothing, unless both segments have at least one
 * hit.  Otherwise mm_set_pe_thru() is called last.
 *
 * km is the kalloc handle used for the temporary arrays.
 */
void mm_pair(void *km, int max_gap_ref, int pe_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs)
{
    int i, j, s, n, last[2], dp_thres, segs = 0, max_idx[2];
    int64_t max;
    pair_arr_t *a;
    kvec_t(uint64_t) sc = {0,0,0};

    a = (pair_arr_t*)kmalloc(km, (n_regs[0] + n_regs[1]) * sizeof(pair_arr_t));
    for (s = n = 0, dp_thres = 0; s < 2; ++s) {
        int seg_max = 0; /* best dp_max within this segment */
        for (i = 0; i < n_regs[s]; ++i) {
            a[n].s = s;
            a[n].r = &regs[s][i];
            a[n].rev = a[n].r->rev;
            /* Widen rs to unsigned 64-bit BEFORE shifting: a plain `rs<<1` on the
             * signed 32-bit field overflows for rs >= 2^30 and would sign-extend
             * into the rid half of the key. */
            a[n].key = (uint64_t)a[n].r->rid << 32 | (uint64_t)(uint32_t)a[n].r->rs << 1 | (uint64_t)(s ^ a[n].rev);
            seg_max = seg_max > a[n].r->p->dp_max? seg_max : a[n].r->p->dp_max;
            ++n;
            segs |= 1<<s;
        }
        dp_thres += seg_max;
    }
    if (segs != 3) {
        kfree(km, a); // only one end is mapped
        return;
    }
    dp_thres -= pe_bonus;
    if (dp_thres < 0) dp_thres = 0;
    radix_sort_pair(a, a + n);

    max = -1;
    max_idx[0] = max_idx[1] = -1;
    last[0] = last[1] = -1;
    kv_resize(uint64_t, km, sc, (size_t)n);
    for (i = 0; i < n; ++i) {
        if (a[i].key & 1) { // reverse first read or forward second read
            mm_reg1_t *q, *r;
            if (last[a[i].rev] < 0) continue;
            r = a[i].r;
            q = a[last[a[i].rev]].r;
            if (r->rid != q->rid || r->rs - q->re > max_gap_ref) continue;
            for (j = last[a[i].rev]; j >= 0; --j) {
                int64_t score;
                if (a[j].rev != a[i].rev || a[j].s == a[i].s) continue;
                q = a[j].r;
                if (r->rid != q->rid || r->rs - q->re > max_gap_ref) break;
                if (r->p->dp_max + q->p->dp_max < dp_thres) continue;
                /* high 32 bits: summed DP score; low 32 bits: hash sum used as tie-breaker */
                score = (int64_t)(r->p->dp_max + q->p->dp_max) << 32 | (r->hash + q->hash);
                if (score > max)
                    max = score, max_idx[a[j].s] = j, max_idx[a[i].s] = i;
                kv_push(uint64_t, km, sc, score);
            }
        } else { // forward first read or reverse second read
            last[a[i].rev] = i;
        }
    }
    if (sc.n > 1)
        radix_sort_64(sc.a, sc.a + sc.n);

    if (sc.n > 0 && max > 0) { // found at least one pair
        int n_sub = 0, mapq_pe;
        mm_reg1_t *r[2];
        r[0] = a[max_idx[0]].r, r[1] = a[max_idx[1]].r;
        r[0]->proper_frag = r[1]->proper_frag = 1;
        for (s = 0; s < 2; ++s) {
            if (r[s]->id != r[s]->parent) { // then lift to primary and update parent
                mm_reg1_t *p = &regs[s][r[s]->parent];
                for (i = 0; i < n_regs[s]; ++i)
                    if (regs[s][i].parent == p->id)
                        regs[s][i].parent = r[s]->id;
                p->mapq = 0;
            }
            if (!r[s]->sam_pri) { // then sync sam_pri
                for (i = 0; i < n_regs[s]; ++i)
                    regs[s][i].sam_pri = 0;
                r[s]->sam_pri = 1;
            }
        }
        mapq_pe = r[0]->mapq > r[1]->mapq? r[0]->mapq : r[1]->mapq;
        for (i = 0; i < (int)sc.n; ++i)
            if ((sc.a[i]>>32) + sub_diff >= (uint64_t)max>>32)
                ++n_sub;
        if (sc.n > 1) {
            int mapq_pe_alt;
            mapq_pe_alt = (int)(6.02f * ((max>>32) - (sc.a[sc.n - 2]>>32)) / match_sc - 4.343f * logf(n_sub)); // n_sub > 0 because it counts the optimal, too
            mapq_pe = mapq_pe < mapq_pe_alt? mapq_pe : mapq_pe_alt;
        }
        if (r[0]->mapq < mapq_pe) r[0]->mapq = (int)(.2f * r[0]->mapq + .8f * mapq_pe + .499f);
        if (r[1]->mapq < mapq_pe) r[1]->mapq = (int)(.2f * r[1]->mapq + .8f * mapq_pe + .499f);
        if (sc.n == 1) {
            if (r[0]->mapq < 2) r[0]->mapq = 2;
            if (r[1]->mapq < 2) r[1]->mapq = 2;
        } else if ((uint64_t)max>>32 > sc.a[sc.n - 2]>>32) {
            if (r[0]->mapq < 1) r[0]->mapq = 1;
            if (r[1]->mapq < 1) r[1]->mapq = 1;
        }
    }

    kfree(km, a);
    kfree(km, sc.a);

    mm_set_pe_thru(qlens, n_regs, regs);
}
|
@@ -0,0 +1,196 @@
|
|
1
|
+
==============================
|
2
|
+
Mappy: Minimap2 Python Binding
|
3
|
+
==============================
|
4
|
+
|
5
|
+
Mappy provides a convenient interface to `minimap2
|
6
|
+
<https://github.com/lh3/minimap2>`_, a fast and accurate C program to align
|
7
|
+
genomic and transcribed nucleotide sequences.
|
8
|
+
|
9
|
+
Installation
|
10
|
+
------------
|
11
|
+
|
12
|
+
Mappy depends on `zlib <http://zlib.net>`_. It can be installed with `pip
|
13
|
+
<https://en.wikipedia.org/wiki/Pip_(package_manager)>`_:
|
14
|
+
|
15
|
+
.. code:: shell
|
16
|
+
|
17
|
+
pip install --user mappy
|
18
|
+
|
19
|
+
or from the minimap2 github repo (`Cython <http://cython.org>`_ required):
|
20
|
+
|
21
|
+
.. code:: shell
|
22
|
+
|
23
|
+
git clone https://github.com/lh3/minimap2
|
24
|
+
cd minimap2
|
25
|
+
python setup.py install
|
26
|
+
|
27
|
+
Usage
|
28
|
+
-----
|
29
|
+
|
30
|
+
The following Python script demonstrates the key functionality of mappy:
|
31
|
+
|
32
|
+
.. code:: python
|
33
|
+
|
34
|
+
import mappy as mp
|
35
|
+
a = mp.Aligner("test/MT-human.fa") # load or build index
|
36
|
+
if not a: raise Exception("ERROR: failed to load/build index")
|
37
|
+
s = a.seq("MT_human", 100, 200) # retrieve a subsequence from the index
|
38
|
+
print(mp.revcomp(s)) # reverse complement
|
39
|
+
for name, seq, qual in mp.fastx_read("test/MT-orang.fa"): # read a fasta/q sequence
|
40
|
+
for hit in a.map(seq): # traverse alignments
|
41
|
+
print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str))
|
42
|
+
|
43
|
+
APIs
|
44
|
+
----
|
45
|
+
|
46
|
+
Mappy implements two classes and two global functions.
|
47
|
+
|
48
|
+
Class mappy.Aligner
|
49
|
+
~~~~~~~~~~~~~~~~~~~
|
50
|
+
|
51
|
+
.. code:: python
|
52
|
+
|
53
|
+
mappy.Aligner(fn_idx_in=None, preset=None, ...)
|
54
|
+
|
55
|
+
This constructor accepts the following arguments:
|
56
|
+
|
57
|
+
* **fn_idx_in**: index or sequence file name. Minimap2 automatically tests the
|
58
|
+
file type. If a sequence file is provided, minimap2 builds an index. The
|
59
|
+
sequence file can be optionally gzip'd. This option has no effect if **seq**
|
60
|
+
is set.
|
61
|
+
|
62
|
+
* **seq**: a single sequence to index. The sequence name will be set to
|
63
|
+
:code:`N/A`.
|
64
|
+
|
65
|
+
* **preset**: minimap2 preset. Currently, minimap2 supports the following
|
66
|
+
presets: **sr** for single-end short reads; **map-pb** for PacBio
|
67
|
+
read-to-reference mapping; **map-ont** for Oxford Nanopore read mapping;
|
68
|
+
**splice** for long-read spliced alignment; **asm5** for assembly-to-assembly
|
69
|
+
alignment; **asm10** for full genome alignment of closely related species. Note
|
70
|
+
that the Python module does not support all-vs-all read overlapping.
|
71
|
+
|
72
|
+
* **k**: k-mer length, no larger than 28
|
73
|
+
|
74
|
+
* **w**: minimizer window size, no larger than 255
|
75
|
+
|
76
|
+
* **min_cnt**: minimum number of minimizers on a chain
|
77
|
+
|
78
|
+
* **min_chain_score**: minimum chaining score
|
79
|
+
|
80
|
+
* **bw**: chaining and alignment band width
|
81
|
+
|
82
|
+
* **best_n**: max number of alignments to return
|
83
|
+
|
84
|
+
* **n_threads**: number of indexing threads; 3 by default
|
85
|
+
|
86
|
+
* **extra_flags**: additional flags defined in minimap.h
|
87
|
+
|
88
|
+
* **fn_idx_out**: name of file to which the index is written. This parameter
|
89
|
+
has no effect if **seq** is set.
|
90
|
+
|
91
|
+
* **scoring**: scoring system. It is a tuple/list consisting of 4, 6 or 7
|
92
|
+
positive integers. The first 4 elements specify match scoring, mismatch
|
93
|
+
penalty, gap open and gap extension penalty. The 5th and 6th elements, if
|
94
|
+
present, set long-gap open and long-gap extension penalty. The 7th sets a
|
95
|
+
mismatch penalty involving ambiguous bases.
|
96
|
+
|
97
|
+
.. code:: python
|
98
|
+
|
99
|
+
mappy.Aligner.map(seq, seq2=None, cs=False, MD=False)
|
100
|
+
|
101
|
+
This method aligns :code:`seq` against the index. It is a generator, *yielding*
|
102
|
+
a series of :code:`mappy.Alignment` objects. If :code:`seq2` is present, mappy
|
103
|
+
performs paired-end alignment, assuming the two ends are in the FR orientation.
|
104
|
+
Alignments of the two ends can be distinguished by the :code:`read_num` field
|
105
|
+
(see Class mappy.Alignment below). Argument :code:`cs` asks mappy to generate
|
106
|
+
the :code:`cs` tag; :code:`MD` is similar. These two arguments might slightly
|
107
|
+
degrade performance and are not enabled by default.
|
108
|
+
|
109
|
+
.. code:: python
|
110
|
+
|
111
|
+
mappy.Aligner.seq(name, start=0, end=0x7fffffff)
|
112
|
+
|
113
|
+
This method retrieves a (sub)sequence from the index and returns it as a Python
|
114
|
+
string. :code:`None` is returned if :code:`name` is not present in the index or
|
115
|
+
the start/end coordinates are invalid.
|
116
|
+
|
117
|
+
.. code:: python
|
118
|
+
|
119
|
+
mappy.Aligner.seq_names
|
120
|
+
|
121
|
+
This property gives the array of sequence names in the index.
|
122
|
+
|
123
|
+
Class mappy.Alignment
|
124
|
+
~~~~~~~~~~~~~~~~~~~~~
|
125
|
+
|
126
|
+
This class describes an alignment. An object of this class has the following
|
127
|
+
properties:
|
128
|
+
|
129
|
+
* **ctg**: name of the reference sequence the query is mapped to
|
130
|
+
|
131
|
+
* **ctg_len**: total length of the reference sequence
|
132
|
+
|
133
|
+
* **r_st** and **r_en**: start and end positions on the reference
|
134
|
+
|
135
|
+
* **q_st** and **q_en**: start and end positions on the query
|
136
|
+
|
137
|
+
* **strand**: +1 if on the forward strand; -1 if on the reverse strand
|
138
|
+
|
139
|
+
* **mapq**: mapping quality
|
140
|
+
|
141
|
+
* **blen**: length of the alignment, including both alignment matches and gaps
|
142
|
+
but excluding ambiguous bases.
|
143
|
+
|
144
|
+
* **mlen**: length of the matching bases in the alignment, excluding ambiguous
|
145
|
+
base matches.
|
146
|
+
|
147
|
+
* **NM**: number of mismatches, gaps and ambiguous positions in the alignment
|
148
|
+
|
149
|
+
* **trans_strand**: transcript strand. +1 if on the forward strand; -1 if on the
|
150
|
+
reverse strand; 0 if unknown
|
151
|
+
|
152
|
+
* **is_primary**: if the alignment is primary (typically the best and the first
|
153
|
+
to generate)
|
154
|
+
|
155
|
+
* **read_num**: read number that the alignment corresponds to; 1 for the first
|
156
|
+
read and 2 for the second read
|
157
|
+
|
158
|
+
* **cigar_str**: CIGAR string
|
159
|
+
|
160
|
+
* **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two
|
161
|
+
numbers give the length and the operator of each CIGAR operation.
|
162
|
+
|
163
|
+
* **MD**: the :code:`MD` tag as in the SAM format. It is an empty string unless
|
164
|
+
the :code:`MD` argument is applied when calling :code:`mappy.Aligner.map()`.
|
165
|
+
|
166
|
+
* **cs**: the :code:`cs` tag.
|
167
|
+
|
168
|
+
An :code:`Alignment` object can be converted to a string with :code:`str()` in
|
169
|
+
the following format:
|
170
|
+
|
171
|
+
::
|
172
|
+
|
173
|
+
q_st q_en strand ctg ctg_len r_st r_en mlen blen mapq cg:Z:cigar_str
|
174
|
+
|
175
|
+
It is effectively the PAF format without the QueryName and QueryLength columns
|
176
|
+
(the first two columns in PAF).
|
177
|
+
|
178
|
+
Miscellaneous Functions
|
179
|
+
~~~~~~~~~~~~~~~~~~~~~~~
|
180
|
+
|
181
|
+
.. code:: python
|
182
|
+
|
183
|
+
mappy.fastx_read(fn, read_comment=False)
|
184
|
+
|
185
|
+
This generator function opens a FASTA/FASTQ file and *yields* a
|
186
|
+
:code:`(name,seq,qual)` tuple for each sequence entry. The input file may be
|
187
|
+
optionally gzip'd. If :code:`read_comment` is True, this generator yields
|
188
|
+
a :code:`(name,seq,qual,comment)` tuple instead.
|
189
|
+
|
190
|
+
.. code:: python
|
191
|
+
|
192
|
+
mappy.revcomp(seq)
|
193
|
+
|
194
|
+
Return the reverse complement of DNA string :code:`seq`. This function
|
195
|
+
recognizes IUB code and preserves the letter cases. Uracil :code:`U` is
|
196
|
+
complemented to :code:`A`.
|
@@ -0,0 +1,152 @@
|
|
1
|
+
#ifndef CMAPPY_H
|
2
|
+
#define CMAPPY_H
|
3
|
+
|
4
|
+
#include <stdlib.h>
|
5
|
+
#include <string.h>
|
6
|
+
#include <zlib.h>
|
7
|
+
#include "minimap.h"
|
8
|
+
#include "kseq.h"
|
9
|
+
KSEQ_DECLARE(gzFile)
|
10
|
+
|
11
|
+
/* Flat, Cython-friendly summary of one alignment, filled by mm_reg2hitpy(). */
typedef struct {
    const char *ctg;      /* reference sequence name (borrowed from the index) */
    int32_t ctg_start;    /* start on the reference */
    int32_t ctg_end;      /* end on the reference */
    int32_t qry_start;    /* start on the query */
    int32_t qry_end;      /* end on the query */
    int32_t blen;         /* alignment block length */
    int32_t mlen;         /* number of matching bases */
    int32_t NM;           /* blen - mlen + number of ambiguous bases */
    int32_t ctg_len;      /* total length of the reference sequence */
    uint8_t mapq;         /* mapping quality */
    uint8_t is_primary;   /* non-zero when the hit is its own parent */
    int8_t strand;        /* +1 forward, -1 reverse */
    int8_t trans_strand;  /* +1, -1, or 0 when unknown */
    int32_t seg_id;       /* segment the hit belongs to */
    int32_t n_cigar32;    /* number of entries in cigar32 */
    uint32_t *cigar32;    /* CIGAR array (borrowed from the hit) */
} mm_hitpy_t;
|
22
|
+
|
23
|
+
/*
 * Copy the fields of hit `r` that the Python binding exposes into `h`.
 *
 * `h->ctg` and `h->cigar32` point into `mi` and `r->p` respectively; nothing
 * is duplicated, so `h` must not outlive either of them.  `r->p` is
 * dereferenced unconditionally.
 */
static inline void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
{
    /* reference sequence */
    h->ctg = mi->seq[r->rid].name;
    h->ctg_len = mi->seq[r->rid].len;

    /* coordinates and strand */
    h->ctg_start = r->rs;
    h->ctg_end = r->re;
    h->qry_start = r->qs;
    h->qry_end = r->qe;
    if (r->rev) h->strand = -1;
    else h->strand = 1;

    /* alignment statistics */
    h->mapq = r->mapq;
    h->mlen = r->mlen;
    h->blen = r->blen;
    h->NM = r->blen - r->mlen + r->p->n_ambi;

    /* transcript strand: 1 -> +1, 2 -> -1, anything else -> 0 */
    if (r->p->trans_strand == 1) h->trans_strand = 1;
    else if (r->p->trans_strand == 2) h->trans_strand = -1;
    else h->trans_strand = 0;

    h->is_primary = (r->id == r->parent);
    h->seg_id = r->seg_id;

    /* CIGAR is shared with the hit, not copied */
    h->n_cigar32 = r->p->n_cigar;
    h->cigar32 = r->p->cigar;
}
|
40
|
+
|
41
|
+
/* Free the extra record hanging off `r` (its `p` member); `r` itself is left alone. */
static inline void mm_free_reg1(mm_reg1_t *r)
{
    void *extra = r->p;
    free(extra);
}
|
45
|
+
|
46
|
+
static inline kseq_t *mm_fastx_open(const char *fn)
|
47
|
+
{
|
48
|
+
gzFile fp;
|
49
|
+
fp = fn && strcmp(fn, "-") != 0? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
|
50
|
+
return kseq_init(fp);
|
51
|
+
}
|
52
|
+
|
53
|
+
/* Destroy a reader returned by mm_fastx_open() and close its gz stream. */
static inline void mm_fastx_close(kseq_t *ks)
{
    gzFile stream = ks->f->f; /* fetch the handle before ks is destroyed */

    kseq_destroy(ks);
    gzclose(stream);
}
|
60
|
+
|
61
|
+
static inline int mm_verbose_level(int v)
|
62
|
+
{
|
63
|
+
if (v >= 0) mm_verbose = v;
|
64
|
+
return mm_verbose;
|
65
|
+
}
|
66
|
+
|
67
|
+
static inline void mm_reset_timer(void)
|
68
|
+
{
|
69
|
+
extern double realtime(void);
|
70
|
+
mm_realtime0 = realtime();
|
71
|
+
}
|
72
|
+
|
73
|
+
extern unsigned char seq_comp_table[256];
|
74
|
+
/*
 * Map one read (seq2 == NULL) or a read pair against index `mi`, with the
 * Python GIL released for the duration of the call.
 *
 * Single read: forwards to mm_map().
 * Pair: seq2 is reverse-complemented into a private copy, both reads are
 * mapped together with mm_map_frag(), the rev flag of every second-read hit
 * is flipped back, and the two hit arrays are concatenated (first-read hits
 * first).
 *
 * Returns a malloc'ed array of *n_regs hits that the caller must free.
 * NULL with *n_regs == 0 is returned if the private copy of seq2 cannot be
 * allocated.
 */
static inline mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)
{
    mm_reg1_t *r;

    Py_BEGIN_ALLOW_THREADS
    if (seq2 == 0) {
        r = mm_map(mi, strlen(seq1), seq1, n_regs, b, opt, NULL);
    } else {
        int _n_regs[2];
        mm_reg1_t *regs[2];
        char *seq[2];
        int i, len[2];

        len[0] = strlen(seq1);
        len[1] = strlen(seq2);
        seq[0] = (char*)seq1;
        seq[1] = strdup(seq2);
        if (seq[1] == 0) { /* out of memory: report "no hits" instead of crashing */
            *n_regs = 0;
            r = 0;
        } else {
            /* reverse-complement the copy in place, swapping from both ends;
             * bytes are read as uint8_t so the table index is never negative */
            for (i = 0; i < len[1]>>1; ++i) {
                uint8_t t = (uint8_t)seq[1][len[1] - i - 1];
                seq[1][len[1] - i - 1] = seq_comp_table[(uint8_t)seq[1][i]];
                seq[1][i] = seq_comp_table[t];
            }
            if (len[1]&1) seq[1][len[1]>>1] = seq_comp_table[(uint8_t)seq[1][len[1]>>1]];
            mm_map_frag(mi, 2, len, (const char**)seq, _n_regs, regs, b, opt, NULL);
            free(seq[1]); /* the copy is only needed while mapping; it used to leak */
            for (i = 0; i < _n_regs[1]; ++i)
                regs[1][i].rev = !regs[1][i].rev;
            *n_regs = _n_regs[0] + _n_regs[1];
            /* NOTE(review): a failed realloc() is not handled here */
            regs[0] = (mm_reg1_t*)realloc(regs[0], sizeof(mm_reg1_t) * (*n_regs));
            memcpy(&regs[0][_n_regs[0]], regs[1], _n_regs[1] * sizeof(mm_reg1_t));
            free(regs[1]);
            r = regs[0];
        }
    }
    Py_END_ALLOW_THREADS

    return r;
}
|
110
|
+
|
111
|
+
static inline char *mappy_revcomp(int len, const uint8_t *seq)
|
112
|
+
{
|
113
|
+
int i;
|
114
|
+
char *rev;
|
115
|
+
rev = (char*)malloc(len + 1);
|
116
|
+
for (i = 0; i < len; ++i)
|
117
|
+
rev[len - i - 1] = seq_comp_table[seq[i]];
|
118
|
+
rev[len] = 0;
|
119
|
+
return rev;
|
120
|
+
}
|
121
|
+
|
122
|
+
static char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *len)
|
123
|
+
{
|
124
|
+
int i, rid;
|
125
|
+
char *s;
|
126
|
+
*len = 0;
|
127
|
+
rid = mm_idx_name2id(mi, name);
|
128
|
+
if (rid < 0) return 0;
|
129
|
+
if ((uint32_t)st >= mi->seq[rid].len || st >= en) return 0;
|
130
|
+
if (en < 0 || (uint32_t)en > mi->seq[rid].len)
|
131
|
+
en = mi->seq[rid].len;
|
132
|
+
s = (char*)malloc(en - st + 1);
|
133
|
+
*len = mm_idx_getseq(mi, rid, st, en, (uint8_t*)s);
|
134
|
+
for (i = 0; i < *len; ++i)
|
135
|
+
s[i] = "ACGTN"[(uint8_t)s[i]];
|
136
|
+
s[*len] = 0;
|
137
|
+
return s;
|
138
|
+
}
|
139
|
+
|
140
|
+
/*
 * Build an index from a single in-memory sequence of `len` bytes.
 *
 * The bytes are copied into a temporary NUL-terminated buffer, handed to
 * mm_idx_str() as the only sequence under the placeholder name "N/A", and
 * the temporary is freed again before returning the index.
 */
static mm_idx_t *mappy_idx_seq(int w, int k, int is_hpc, int bucket_bits, const char *seq, int len)
{
    const char *fake_name = "N/A";
    char *copy;
    mm_idx_t *index;

    copy = (char*)malloc(len + 1);
    memcpy(copy, seq, len);
    copy[len] = 0;
    index = mm_idx_str(w, k, is_hpc, bucket_bits, 1, (const char**)&copy, (const char**)&fake_name);
    free(copy);
    return index;
}
|
151
|
+
|
152
|
+
#endif
|
@@ -0,0 +1,153 @@
|
|
1
|
+
from libc.stdint cimport int8_t, uint8_t, int32_t, int64_t, uint32_t, uint64_t
|
2
|
+
|
3
|
+
cdef extern from "minimap.h":
    #
    # Options
    #
    ctypedef struct mm_idxopt_t:
        short k
        short w
        short flag
        short bucket_bits
        int64_t mini_batch_size
        uint64_t batch_size

    ctypedef struct mm_mapopt_t:
        int64_t flag
        int seed
        int sdust_thres

        int max_qlen

        int bw
        int bw_long
        int max_gap
        int max_gap_ref
        int max_frag_len
        int max_chain_skip
        int max_chain_iter
        int min_cnt
        int min_chain_score
        float chain_gap_scale
        float chain_skip_scale
        int rmq_size_cap
        int rmq_inner_dist
        int rmq_rescue_size
        float rmq_rescue_ratio

        float mask_level
        int mask_len
        float pri_ratio
        int best_n

        float alt_drop

        int a
        int b
        int q
        int e
        int q2
        int e2
        int sc_ambi
        int noncan
        int junc_bonus
        int zdrop
        int zdrop_inv
        int end_bonus
        int min_dp_max
        int min_ksw_len
        int anchor_ext_len
        int anchor_ext_shift
        float max_clip_ratio

        int rank_min_len
        float rank_frac

        int pe_ori
        int pe_bonus

        float mid_occ_frac
        float q_occ_frac
        int32_t min_mid_occ
        int32_t mid_occ
        int32_t max_occ
        int64_t mini_batch_size
        int64_t max_sw_mat
        int64_t cap_kalloc

        const char *split_prefix

    int mm_set_opt(char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
    int mm_verbose

    #
    # Indexing
    #
    ctypedef struct mm_idx_seq_t:
        char *name
        uint64_t offset
        uint32_t len

    # opaque: no members are accessed from Cython
    ctypedef struct mm_idx_bucket_t:
        pass

    ctypedef struct mm_idx_t:
        int32_t b
        int32_t w
        int32_t k
        int32_t flag
        uint32_t n_seq
        mm_idx_seq_t *seq
        uint32_t *S
        mm_idx_bucket_t *B
        void *km
        void *h

    # opaque: no members are accessed from Cython
    ctypedef struct mm_idx_reader_t:
        pass

    mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out)
    mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
    void mm_idx_reader_close(mm_idx_reader_t *r)
    void mm_idx_destroy(mm_idx_t *mi)
    void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)

    int mm_idx_index_name(mm_idx_t *mi)

    #
    # Mapping (key struct defined in cmappy.h below)
    #
    ctypedef struct mm_reg1_t:
        pass

    ctypedef struct mm_tbuf_t:
        pass

    mm_tbuf_t *mm_tbuf_init()
    void mm_tbuf_destroy(mm_tbuf_t *b)
    void *mm_tbuf_get_km(mm_tbuf_t *b)
    int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden)
    int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq)
|
113
|
+
|
114
|
+
#
|
115
|
+
# Helper header (because it is hard to expose mm_reg1_t with Cython)
|
116
|
+
#
|
117
|
+
cdef extern from "cmappy.h":
    # flat per-alignment record filled by mm_reg2hitpy()
    ctypedef struct mm_hitpy_t:
        const char *ctg
        int32_t ctg_start
        int32_t ctg_end
        int32_t qry_start
        int32_t qry_end
        int32_t blen
        int32_t mlen
        int32_t NM
        int32_t ctg_len
        uint8_t mapq
        uint8_t is_primary
        int8_t strand
        int8_t trans_strand
        int32_t seg_id
        int32_t n_cigar32
        uint32_t *cigar32

    void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
    void mm_free_reg1(mm_reg1_t *r)
    mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)
    char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *l)
    mm_idx_t *mappy_idx_seq(int w, int k, int is_hpc, int bucket_bits, const char *seq, int l)

    # FASTA/FASTQ reader types
    ctypedef struct kstring_t:
        unsigned l
        unsigned m
        char *s

    ctypedef struct kstream_t:
        pass

    ctypedef struct kseq_t:
        kstring_t name
        kstring_t comment
        kstring_t seq
        kstring_t qual
        int last_char
        kstream_t *f

    kseq_t *mm_fastx_open(const char *fn)
    void mm_fastx_close(kseq_t *ks)
    int kseq_read(kseq_t *seq)

    char *mappy_revcomp(int l, const uint8_t *seq)
    int mm_verbose_level(int v)
    void mm_reset_timer()
|