minimap2 0.2.22.0 → 0.2.24.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -76
- data/ext/Rakefile +55 -0
- data/ext/cmappy/cmappy.c +129 -0
- data/ext/cmappy/cmappy.h +44 -0
- data/ext/minimap2/FAQ.md +46 -0
- data/ext/minimap2/LICENSE.txt +24 -0
- data/ext/minimap2/MANIFEST.in +10 -0
- data/ext/minimap2/Makefile +132 -0
- data/ext/minimap2/Makefile.simde +97 -0
- data/ext/minimap2/NEWS.md +821 -0
- data/ext/minimap2/README.md +403 -0
- data/ext/minimap2/align.c +1020 -0
- data/ext/minimap2/bseq.c +169 -0
- data/ext/minimap2/bseq.h +64 -0
- data/ext/minimap2/code_of_conduct.md +30 -0
- data/ext/minimap2/cookbook.md +243 -0
- data/ext/minimap2/esterr.c +64 -0
- data/ext/minimap2/example.c +63 -0
- data/ext/minimap2/format.c +559 -0
- data/ext/minimap2/hit.c +466 -0
- data/ext/minimap2/index.c +775 -0
- data/ext/minimap2/kalloc.c +205 -0
- data/ext/minimap2/kalloc.h +76 -0
- data/ext/minimap2/kdq.h +132 -0
- data/ext/minimap2/ketopt.h +120 -0
- data/ext/minimap2/khash.h +615 -0
- data/ext/minimap2/krmq.h +474 -0
- data/ext/minimap2/kseq.h +256 -0
- data/ext/minimap2/ksort.h +153 -0
- data/ext/minimap2/ksw2.h +184 -0
- data/ext/minimap2/ksw2_dispatch.c +96 -0
- data/ext/minimap2/ksw2_extd2_sse.c +402 -0
- data/ext/minimap2/ksw2_exts2_sse.c +416 -0
- data/ext/minimap2/ksw2_extz2_sse.c +313 -0
- data/ext/minimap2/ksw2_ll_sse.c +152 -0
- data/ext/minimap2/kthread.c +159 -0
- data/ext/minimap2/kthread.h +15 -0
- data/ext/minimap2/kvec.h +105 -0
- data/ext/minimap2/lchain.c +369 -0
- data/ext/minimap2/main.c +459 -0
- data/ext/minimap2/map.c +714 -0
- data/ext/minimap2/minimap.h +410 -0
- data/ext/minimap2/minimap2.1 +725 -0
- data/ext/minimap2/misc/README.md +179 -0
- data/ext/minimap2/misc/mmphase.js +335 -0
- data/ext/minimap2/misc/paftools.js +3149 -0
- data/ext/minimap2/misc.c +162 -0
- data/ext/minimap2/mmpriv.h +132 -0
- data/ext/minimap2/options.c +234 -0
- data/ext/minimap2/pe.c +177 -0
- data/ext/minimap2/python/README.rst +196 -0
- data/ext/minimap2/python/cmappy.h +152 -0
- data/ext/minimap2/python/cmappy.pxd +153 -0
- data/ext/minimap2/python/mappy.pyx +273 -0
- data/ext/minimap2/python/minimap2.py +39 -0
- data/ext/minimap2/sdust.c +213 -0
- data/ext/minimap2/sdust.h +25 -0
- data/ext/minimap2/seed.c +131 -0
- data/ext/minimap2/setup.py +55 -0
- data/ext/minimap2/sketch.c +143 -0
- data/ext/minimap2/splitidx.c +84 -0
- data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
- data/ext/minimap2/test/MT-human.fa +278 -0
- data/ext/minimap2/test/MT-orang.fa +276 -0
- data/ext/minimap2/test/q-inv.fa +4 -0
- data/ext/minimap2/test/q2.fa +2 -0
- data/ext/minimap2/test/t-inv.fa +127 -0
- data/ext/minimap2/test/t2.fa +2 -0
- data/ext/minimap2/tex/Makefile +21 -0
- data/ext/minimap2/tex/bioinfo.cls +930 -0
- data/ext/minimap2/tex/blasr-mc.eval +17 -0
- data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
- data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
- data/ext/minimap2/tex/bwa.eval +55 -0
- data/ext/minimap2/tex/eval2roc.pl +33 -0
- data/ext/minimap2/tex/graphmap.eval +4 -0
- data/ext/minimap2/tex/hs38-simu.sh +10 -0
- data/ext/minimap2/tex/minialign.eval +49 -0
- data/ext/minimap2/tex/minimap2.bib +460 -0
- data/ext/minimap2/tex/minimap2.tex +724 -0
- data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
- data/ext/minimap2/tex/mm2-update.tex +240 -0
- data/ext/minimap2/tex/mm2.approx.eval +12 -0
- data/ext/minimap2/tex/mm2.eval +13 -0
- data/ext/minimap2/tex/natbib.bst +1288 -0
- data/ext/minimap2/tex/natbib.sty +803 -0
- data/ext/minimap2/tex/ngmlr.eval +38 -0
- data/ext/minimap2/tex/roc.gp +60 -0
- data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
- data/ext/minimap2.patch +19 -0
- data/lib/minimap2/aligner.rb +4 -4
- data/lib/minimap2/alignment.rb +11 -11
- data/lib/minimap2/ffi/constants.rb +20 -16
- data/lib/minimap2/ffi/functions.rb +5 -0
- data/lib/minimap2/ffi.rb +4 -5
- data/lib/minimap2/version.rb +2 -2
- data/lib/minimap2.rb +51 -15
- metadata +97 -79
- data/lib/minimap2/ffi_helper.rb +0 -53
- data/vendor/libminimap2.so +0 -0
data/ext/minimap2/pe.c
ADDED
@@ -0,0 +1,177 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <math.h>
|
3
|
+
#include "mmpriv.h"
|
4
|
+
#include "kvec.h"
|
5
|
+
|
6
|
+
/*
 * Prune secondary hits of a multi-segment query, compacting r[] in place.
 *
 * Nothing happens unless pri_ratio > 0 and *n_ > 0. A hit whose parent is
 * its own index is primary and always survives. A secondary hit survives if
 *   - its score is within min_diff of its parent's score, or
 *   - it lies on the same strand/reference as the parent and close to it
 *     (within qlens[0]+qlens[1]+max_gap_ref when n_segs == 2) and scores at
 *     least pri1 times the parent, or
 *   - otherwise, it scores at least pri_ratio times the parent (pri2 instead
 *     when the parent spans both segments but the child does not).
 * Surviving secondaries beyond the first best_n are dropped as well. The
 * extra data (->p) of every dropped hit is freed, *n_ receives the new count
 * and mm_sync_regs() is called whenever something was removed.
 */
void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r)
{
	int src, dst = 0, total, n_secondary = 0, max_dist = 0;

	if (!(pri_ratio > 0.0f) || *n_ <= 0) return;
	total = *n_;
	if (n_segs == 2) max_dist = qlens[0] + qlens[1] + max_gap_ref;

	for (src = 0; src < total; ++src) {
		mm_reg1_t *hit = &r[src];
		int is_primary = (hit->parent == src);
		int keep;

		if (is_primary) {
			keep = 1;
		} else {
			mm_reg1_t *par = &r[hit->parent];
			if (hit->score + min_diff >= par->score) {
				keep = 1;
			} else if (par->rev == hit->rev && par->rid == hit->rid && hit->re - par->rs < max_dist && par->re - hit->rs < max_dist) {
				// child and parent are close on the ref
				keep = (hit->score >= par->score * pri1);
			} else {
				int par_both = (n_segs == 2 && par->qs < qlens[0] && par->qe > qlens[0]);
				int hit_both = (n_segs == 2 && hit->qs < qlens[0] && hit->qe > qlens[0]);
				// pri2 applies only when the parent spans both segments and the child does not
				float ratio = (hit_both || hit_both == par_both)? pri_ratio : pri2;
				keep = (hit->score >= par->score * ratio);
			}
		}
		// don't keep if there are too many secondary hits
		if (keep && !is_primary && n_secondary++ >= best_n) keep = 0;
		if (keep) r[dst++] = *hit;
		else free(hit->p);
	}
	if (dst != total) mm_sync_regs(km, dst, r); // removing hits requires sync()
	*n_ = dst;
}
|
44
|
+
|
45
|
+
/*
 * Flag a read pair as "pe_thru" when the two primary hits cover essentially
 * the same reference interval.
 *
 * qlens:  lengths of the two reads
 * n_regs: number of hits for each read
 * regs:   hit arrays for each read; pe_thru is set on the two primaries
 *
 * The flag is set only if each read has exactly one primary hit (id ==
 * parent), both primaries are on the same reference and strand, their
 * reference starts and ends each differ by less than 3, and one read starts
 * at query offset 0 while the other reaches the end of its query.
 */
void mm_set_pe_thru(const int *qlens, int *n_regs, mm_reg1_t **regs)
{
	int s, i, n_pri[2], pri[2];
	n_pri[0] = n_pri[1] = 0;
	pri[0] = pri[1] = -1;
	// count the primary hits of each read and remember the last one seen
	for (s = 0; s < 2; ++s)
		for (i = 0; i < n_regs[s]; ++i)
			if (regs[s][i].id == regs[s][i].parent)
				++n_pri[s], pri[s] = i;
	if (n_pri[0] == 1 && n_pri[1] == 1) {
		mm_reg1_t *p = &regs[0][pri[0]];
		mm_reg1_t *q = &regs[1][pri[1]];
		if (p->rid == q->rid && p->rev == q->rev && abs(p->rs - q->rs) < 3 && abs(p->re - q->re) < 3
			&& ((p->qs == 0 && qlens[1] - q->qe == 0) || (q->qs == 0 && qlens[0] - p->qe == 0)))
		{
			p->pe_thru = q->pe_thru = 1;
		}
	}
}
|
64
|
+
|
65
|
+
#include "ksort.h"
|
66
|
+
|
67
|
+
typedef struct {
|
68
|
+
int s, rev;
|
69
|
+
uint64_t key;
|
70
|
+
mm_reg1_t *r;
|
71
|
+
} pair_arr_t;
|
72
|
+
|
73
|
+
#define sort_key_pair(a) ((a).key)
|
74
|
+
KRADIX_SORT_INIT(pair, pair_arr_t, sort_key_pair, 8)
|
75
|
+
|
76
|
+
/*
 * Pair the hits of two reads and adjust primary flags and mapping qualities.
 *
 * km:          kalloc handle used for the temporary arrays
 * max_gap_ref: largest allowed r->rs - q->re between the two mates
 * pe_bonus:    subtracted from the sum of the per-read best dp_max to obtain
 *              the minimum combined dp_max a candidate pair must reach
 * sub_diff:    pairs whose score is within sub_diff of the best are counted
 *              as (sub)optimal when computing the pair mapping quality
 * match_sc:    divisor applied to the score gap in the mapq formula
 * qlens:       read lengths, forwarded to mm_set_pe_thru()
 * n_regs/regs: number of hits and hit arrays of the two reads; modified in
 *              place (proper_frag, parent, sam_pri, mapq, pe_thru)
 *
 * If only one read has hits, the function returns without touching anything.
 * Every hit is expected to carry alignment data (r->p is dereferenced).
 */
void mm_pair(void *km, int max_gap_ref, int pe_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs)
{
	int i, j, s, n, last[2], dp_thres, segs = 0, max_idx[2];
	int64_t max;
	pair_arr_t *a;
	kvec_t(uint64_t) sc = {0,0,0};

	a = (pair_arr_t*)kmalloc(km, (n_regs[0] + n_regs[1]) * sizeof(pair_arr_t));
	for (s = n = 0, dp_thres = 0; s < 2; ++s) {
		int seg_max = 0; // best dp_max among the hits of read s
		for (i = 0; i < n_regs[s]; ++i) {
			a[n].s = s;
			a[n].r = &regs[s][i];
			a[n].rev = a[n].r->rev;
			// widen rs before shifting: rs<<1 in int overflows for rs >= 2^30 and would corrupt the rid bits
			a[n].key = (uint64_t)a[n].r->rid << 32 | (uint64_t)(uint32_t)a[n].r->rs << 1 | (uint64_t)(s ^ a[n].rev);
			seg_max = seg_max > a[n].r->p->dp_max? seg_max : a[n].r->p->dp_max;
			++n;
			segs |= 1<<s;
		}
		dp_thres += seg_max;
	}
	if (segs != 3) {
		kfree(km, a); // only one end is mapped
		return;
	}
	dp_thres -= pe_bonus;
	if (dp_thres < 0) dp_thres = 0;
	radix_sort_pair(a, a + n);

	max = -1;
	max_idx[0] = max_idx[1] = -1;
	last[0] = last[1] = -1;
	kv_resize(uint64_t, km, sc, (size_t)n);
	for (i = 0; i < n; ++i) {
		if (a[i].key & 1) { // reverse first read or forward second read
			mm_reg1_t *q, *r;
			if (last[a[i].rev] < 0) continue;
			r = a[i].r;
			q = a[last[a[i].rev]].r;
			if (r->rid != q->rid || r->rs - q->re > max_gap_ref) continue;
			// walk back over earlier hits with the same rev flag that belong to the other read
			for (j = last[a[i].rev]; j >= 0; --j) {
				int64_t score;
				if (a[j].rev != a[i].rev || a[j].s == a[i].s) continue;
				q = a[j].r;
				if (r->rid != q->rid || r->rs - q->re > max_gap_ref) break;
				if (r->p->dp_max + q->p->dp_max < dp_thres) continue;
				// combined dp_max in the high 32 bits, hash sum in the low bits as tie-breaker
				score = (int64_t)(r->p->dp_max + q->p->dp_max) << 32 | (r->hash + q->hash);
				if (score > max)
					max = score, max_idx[a[j].s] = j, max_idx[a[i].s] = i;
				kv_push(uint64_t, km, sc, score);
			}
		} else { // forward first read or reverse second read
			last[a[i].rev] = i;
		}
	}
	if (sc.n > 1)
		radix_sort_64(sc.a, sc.a + sc.n);

	if (sc.n > 0 && max > 0) { // found at least one pair
		int n_sub = 0, mapq_pe;
		mm_reg1_t *r[2];
		r[0] = a[max_idx[0]].r, r[1] = a[max_idx[1]].r;
		r[0]->proper_frag = r[1]->proper_frag = 1;
		for (s = 0; s < 2; ++s) {
			if (r[s]->id != r[s]->parent) { // then lift to primary and update parent
				mm_reg1_t *p = &regs[s][r[s]->parent];
				for (i = 0; i < n_regs[s]; ++i)
					if (regs[s][i].parent == p->id)
						regs[s][i].parent = r[s]->id;
				p->mapq = 0;
			}
			if (!r[s]->sam_pri) { // then sync sam_pri
				for (i = 0; i < n_regs[s]; ++i)
					regs[s][i].sam_pri = 0;
				r[s]->sam_pri = 1;
			}
		}
		mapq_pe = r[0]->mapq > r[1]->mapq? r[0]->mapq : r[1]->mapq;
		for (i = 0; i < (int)sc.n; ++i)
			if ((sc.a[i]>>32) + sub_diff >= (uint64_t)max>>32)
				++n_sub;
		if (sc.n > 1) {
			int mapq_pe_alt;
			mapq_pe_alt = (int)(6.02f * ((max>>32) - (sc.a[sc.n - 2]>>32)) / match_sc - 4.343f * logf(n_sub)); // n_sub > 0 because it counts the optimal, too
			mapq_pe = mapq_pe < mapq_pe_alt? mapq_pe : mapq_pe_alt;
		}
		if (r[0]->mapq < mapq_pe) r[0]->mapq = (int)(.2f * r[0]->mapq + .8f * mapq_pe + .499f);
		if (r[1]->mapq < mapq_pe) r[1]->mapq = (int)(.2f * r[1]->mapq + .8f * mapq_pe + .499f);
		if (sc.n == 1) {
			if (r[0]->mapq < 2) r[0]->mapq = 2;
			if (r[1]->mapq < 2) r[1]->mapq = 2;
		} else if ((uint64_t)max>>32 > sc.a[sc.n - 2]>>32) {
			if (r[0]->mapq < 1) r[0]->mapq = 1;
			if (r[1]->mapq < 1) r[1]->mapq = 1;
		}
	}

	kfree(km, a);
	kfree(km, sc.a);

	mm_set_pe_thru(qlens, n_regs, regs);
}
|
@@ -0,0 +1,196 @@
|
|
1
|
+
==============================
|
2
|
+
Mappy: Minimap2 Python Binding
|
3
|
+
==============================
|
4
|
+
|
5
|
+
Mappy provides a convenient interface to `minimap2
|
6
|
+
<https://github.com/lh3/minimap2>`_, a fast and accurate C program to align
|
7
|
+
genomic and transcript nucleotide sequences.
|
8
|
+
|
9
|
+
Installation
|
10
|
+
------------
|
11
|
+
|
12
|
+
Mappy depends on `zlib <http://zlib.net>`_. It can be installed with `pip
|
13
|
+
<https://en.wikipedia.org/wiki/Pip_(package_manager)>`_:
|
14
|
+
|
15
|
+
.. code:: shell
|
16
|
+
|
17
|
+
pip install --user mappy
|
18
|
+
|
19
|
+
or from the minimap2 github repo (`Cython <http://cython.org>`_ required):
|
20
|
+
|
21
|
+
.. code:: shell
|
22
|
+
|
23
|
+
git clone https://github.com/lh3/minimap2
|
24
|
+
cd minimap2
|
25
|
+
python setup.py install
|
26
|
+
|
27
|
+
Usage
|
28
|
+
-----
|
29
|
+
|
30
|
+
The following Python script demonstrates the key functionality of mappy:
|
31
|
+
|
32
|
+
.. code:: python
|
33
|
+
|
34
|
+
import mappy as mp
|
35
|
+
a = mp.Aligner("test/MT-human.fa") # load or build index
|
36
|
+
if not a: raise Exception("ERROR: failed to load/build index")
|
37
|
+
s = a.seq("MT_human", 100, 200) # retrieve a subsequence from the index
|
38
|
+
print(mp.revcomp(s)) # reverse complement
|
39
|
+
for name, seq, qual in mp.fastx_read("test/MT-orang.fa"): # read a fasta/q sequence
|
40
|
+
for hit in a.map(seq): # traverse alignments
|
41
|
+
print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str))
|
42
|
+
|
43
|
+
APIs
|
44
|
+
----
|
45
|
+
|
46
|
+
Mappy implements two classes and two global functions.
|
47
|
+
|
48
|
+
Class mappy.Aligner
|
49
|
+
~~~~~~~~~~~~~~~~~~~
|
50
|
+
|
51
|
+
.. code:: python
|
52
|
+
|
53
|
+
mappy.Aligner(fn_idx_in=None, preset=None, ...)
|
54
|
+
|
55
|
+
This constructor accepts the following arguments:
|
56
|
+
|
57
|
+
* **fn_idx_in**: index or sequence file name. Minimap2 automatically tests the
|
58
|
+
file type. If a sequence file is provided, minimap2 builds an index. The
|
59
|
+
sequence file can be optionally gzip'd. This option has no effect if **seq**
|
60
|
+
is set.
|
61
|
+
|
62
|
+
* **seq**: a single sequence to index. The sequence name will be set to
|
63
|
+
:code:`N/A`.
|
64
|
+
|
65
|
+
* **preset**: minimap2 preset. Currently, minimap2 supports the following
|
66
|
+
presets: **sr** for single-end short reads; **map-pb** for PacBio
|
67
|
+
read-to-reference mapping; **map-ont** for Oxford Nanopore read mapping;
|
68
|
+
**splice** for long-read spliced alignment; **asm5** for assembly-to-assembly
|
69
|
+
alignment; **asm10** for full genome alignment of closely related species. Note
|
70
|
+
that the Python module does not support all-vs-all read overlapping.
|
71
|
+
|
72
|
+
* **k**: k-mer length, no larger than 28
|
73
|
+
|
74
|
+
* **w**: minimizer window size, no larger than 255
|
75
|
+
|
76
|
+
* **min_cnt**: minimum number of minimizers on a chain
|
77
|
+
|
78
|
+
* **min_chain_score**: minimum chaining score
|
79
|
+
|
80
|
+
* **bw**: chaining and alignment band width
|
81
|
+
|
82
|
+
* **best_n**: max number of alignments to return
|
83
|
+
|
84
|
+
* **n_threads**: number of indexing threads; 3 by default
|
85
|
+
|
86
|
+
* **extra_flags**: additional flags defined in minimap.h
|
87
|
+
|
88
|
+
* **fn_idx_out**: name of file to which the index is written. This parameter
|
89
|
+
has no effect if **seq** is set.
|
90
|
+
|
91
|
+
* **scoring**: scoring system. It is a tuple/list consisting of 4, 6 or 7
|
92
|
+
positive integers. The first 4 elements specify match scoring, mismatch
|
93
|
+
penalty, gap open and gap extension penalty. The 5th and 6th elements, if
|
94
|
+
present, set long-gap open and long-gap extension penalty. The 7th sets a
|
95
|
+
mismatch penalty involving ambiguous bases.
|
96
|
+
|
97
|
+
.. code:: python
|
98
|
+
|
99
|
+
mappy.Aligner.map(seq, seq2=None, cs=False, MD=False)
|
100
|
+
|
101
|
+
This method aligns :code:`seq` against the index. It is a generator, *yielding*
|
102
|
+
a series of :code:`mappy.Alignment` objects. If :code:`seq2` is present, mappy
|
103
|
+
performs paired-end alignment, assuming the two ends are in the FR orientation.
|
104
|
+
Alignments of the two ends can be distinguished by the :code:`read_num` field
|
105
|
+
(see Class mappy.Alignment below). Argument :code:`cs` asks mappy to generate
|
106
|
+
the :code:`cs` tag; :code:`MD` is similar. These two arguments might slightly
|
107
|
+
degrade performance and are not enabled by default.
|
108
|
+
|
109
|
+
.. code:: python
|
110
|
+
|
111
|
+
mappy.Aligner.seq(name, start=0, end=0x7fffffff)
|
112
|
+
|
113
|
+
This method retrieves a (sub)sequence from the index and returns it as a Python
|
114
|
+
string. :code:`None` is returned if :code:`name` is not present in the index or
|
115
|
+
the start/end coordinates are invalid.
|
116
|
+
|
117
|
+
.. code:: python
|
118
|
+
|
119
|
+
mappy.Aligner.seq_names
|
120
|
+
|
121
|
+
This property gives the array of sequence names in the index.
|
122
|
+
|
123
|
+
Class mappy.Alignment
|
124
|
+
~~~~~~~~~~~~~~~~~~~~~
|
125
|
+
|
126
|
+
This class describes an alignment. An object of this class has the following
|
127
|
+
properties:
|
128
|
+
|
129
|
+
* **ctg**: name of the reference sequence the query is mapped to
|
130
|
+
|
131
|
+
* **ctg_len**: total length of the reference sequence
|
132
|
+
|
133
|
+
* **r_st** and **r_en**: start and end positions on the reference
|
134
|
+
|
135
|
+
* **q_st** and **q_en**: start and end positions on the query
|
136
|
+
|
137
|
+
* **strand**: +1 if on the forward strand; -1 if on the reverse strand
|
138
|
+
|
139
|
+
* **mapq**: mapping quality
|
140
|
+
|
141
|
+
* **blen**: length of the alignment, including both alignment matches and gaps
|
142
|
+
but excluding ambiguous bases.
|
143
|
+
|
144
|
+
* **mlen**: length of the matching bases in the alignment, excluding ambiguous
|
145
|
+
base matches.
|
146
|
+
|
147
|
+
* **NM**: number of mismatches, gaps and ambiguous positions in the alignment
|
148
|
+
|
149
|
+
* **trans_strand**: transcript strand. +1 if on the forward strand; -1 if on the
|
150
|
+
reverse strand; 0 if unknown
|
151
|
+
|
152
|
+
* **is_primary**: if the alignment is primary (typically the best and the first
|
153
|
+
to generate)
|
154
|
+
|
155
|
+
* **read_num**: read number that the alignment corresponds to; 1 for the first
|
156
|
+
read and 2 for the second read
|
157
|
+
|
158
|
+
* **cigar_str**: CIGAR string
|
159
|
+
|
160
|
+
* **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two
|
161
|
+
numbers give the length and the operator of each CIGAR operation.
|
162
|
+
|
163
|
+
* **MD**: the :code:`MD` tag as in the SAM format. It is an empty string unless
|
164
|
+
the :code:`MD` argument is applied when calling :code:`mappy.Aligner.map()`.
|
165
|
+
|
166
|
+
* **cs**: the :code:`cs` tag.
|
167
|
+
|
168
|
+
An :code:`Alignment` object can be converted to a string with :code:`str()` in
|
169
|
+
the following format:
|
170
|
+
|
171
|
+
::
|
172
|
+
|
173
|
+
q_st q_en strand ctg ctg_len r_st r_en mlen blen mapq cg:Z:cigar_str
|
174
|
+
|
175
|
+
It is effectively the PAF format without the QueryName and QueryLength columns
|
176
|
+
(the first two columns in PAF).
|
177
|
+
|
178
|
+
Miscellaneous Functions
|
179
|
+
~~~~~~~~~~~~~~~~~~~~~~~
|
180
|
+
|
181
|
+
.. code:: python
|
182
|
+
|
183
|
+
mappy.fastx_read(fn, read_comment=False)
|
184
|
+
|
185
|
+
This generator function opens a FASTA/FASTQ file and *yields* a
|
186
|
+
:code:`(name,seq,qual)` tuple for each sequence entry. The input file may be
|
187
|
+
optionally gzip'd. If :code:`read_comment` is True, this generator yields
|
188
|
+
a :code:`(name,seq,qual,comment)` tuple instead.
|
189
|
+
|
190
|
+
.. code:: python
|
191
|
+
|
192
|
+
mappy.revcomp(seq)
|
193
|
+
|
194
|
+
Return the reverse complement of DNA string :code:`seq`. This function
|
195
|
+
recognizes IUB code and preserves the letter cases. Uracil :code:`U` is
|
196
|
+
complemented to :code:`A`.
|
@@ -0,0 +1,152 @@
|
|
1
|
+
#ifndef CMAPPY_H
|
2
|
+
#define CMAPPY_H
|
3
|
+
|
4
|
+
#include <stdlib.h>
|
5
|
+
#include <string.h>
|
6
|
+
#include <zlib.h>
|
7
|
+
#include "minimap.h"
|
8
|
+
#include "kseq.h"
|
9
|
+
KSEQ_DECLARE(gzFile)
|
10
|
+
|
11
|
+
/* Flat copy of one alignment record, filled in by mm_reg2hitpy(). */
typedef struct {
	const char *ctg;      // name of the reference sequence
	int32_t ctg_start;    // start on the reference
	int32_t ctg_end;      // end on the reference
	int32_t qry_start;    // start on the query
	int32_t qry_end;      // end on the query
	int32_t blen;
	int32_t mlen;
	int32_t NM;
	int32_t ctg_len;      // length of the reference sequence
	uint8_t mapq;
	uint8_t is_primary;   // non-zero when the hit is its own parent
	int8_t strand;        // -1 for reverse, +1 otherwise
	int8_t trans_strand;  // +1, -1 or 0
	int32_t seg_id;
	int32_t n_cigar32;    // number of entries in cigar32
	uint32_t *cigar32;
} mm_hitpy_t;
|
22
|
+
|
23
|
+
/*
 * Copy the fields of hit r into the flat structure h.
 *
 * mi supplies the name and length of the reference r->rid. r->p must be
 * non-NULL: n_ambi, trans_strand and the CIGAR are read from it. h->cigar32
 * points into r->p rather than at a copy.
 */
static inline void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
{
	int8_t ts;

	/* reference side */
	h->ctg = mi->seq[r->rid].name;
	h->ctg_len = mi->seq[r->rid].len;
	h->ctg_start = r->rs;
	h->ctg_end = r->re;

	/* query side */
	h->qry_start = r->qs;
	h->qry_end = r->qe;
	h->seg_id = r->seg_id;

	/* orientation and quality */
	if (r->rev) h->strand = -1;
	else h->strand = 1;
	h->mapq = r->mapq;
	h->is_primary = (r->id == r->parent);

	/* alignment statistics */
	h->mlen = r->mlen;
	h->blen = r->blen;
	h->NM = r->blen - r->mlen + r->p->n_ambi;

	/* trans_strand codes 1 and 2 map to +1 and -1; anything else to 0 */
	if (r->p->trans_strand == 1) ts = 1;
	else if (r->p->trans_strand == 2) ts = -1;
	else ts = 0;
	h->trans_strand = ts;

	/* CIGAR is shared with r->p, not duplicated */
	h->n_cigar32 = r->p->n_cigar;
	h->cigar32 = r->p->cigar;
}
|
40
|
+
|
41
|
+
/* Release the extra data (r->p) attached to a hit; r itself is not freed. */
static inline void mm_free_reg1(mm_reg1_t *r)
{
	void *extra = r->p;
	free(extra);
}
|
45
|
+
|
46
|
+
static inline kseq_t *mm_fastx_open(const char *fn)
|
47
|
+
{
|
48
|
+
gzFile fp;
|
49
|
+
fp = fn && strcmp(fn, "-") != 0? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
|
50
|
+
return kseq_init(fp);
|
51
|
+
}
|
52
|
+
|
53
|
+
/* Destroy a reader created by mm_fastx_open() and close its zlib stream. */
static inline void mm_fastx_close(kseq_t *ks)
{
	gzFile fp = ks->f->f; /* grab the handle before ks is destroyed */
	kseq_destroy(ks);
	gzclose(fp);
}
|
60
|
+
|
61
|
+
static inline int mm_verbose_level(int v)
|
62
|
+
{
|
63
|
+
if (v >= 0) mm_verbose = v;
|
64
|
+
return mm_verbose;
|
65
|
+
}
|
66
|
+
|
67
|
+
static inline void mm_reset_timer(void)
|
68
|
+
{
|
69
|
+
extern double realtime(void);
|
70
|
+
mm_realtime0 = realtime();
|
71
|
+
}
|
72
|
+
|
73
|
+
extern unsigned char seq_comp_table[256];
|
74
|
+
/*
 * Map one read (seq2 == NULL) or a read pair against index mi, with the
 * Python GIL released for the duration of the mapping.
 *
 * For a pair, a reverse-complemented copy of seq2 is mapped together with
 * seq1 via mm_map_frag(); the rev flag of every hit of the second read is
 * then flipped back, and the hits of both reads are concatenated (first
 * read first) into a single array.
 *
 * Returns the hit array and stores the number of hits in *n_regs. The hits
 * of the two reads are not separated in the result; per-hit fields must be
 * used to tell them apart.
 */
static inline mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)
{
	mm_reg1_t *r;

	Py_BEGIN_ALLOW_THREADS
	if (seq2 == 0) {
		r = mm_map(mi, strlen(seq1), seq1, n_regs, b, opt, NULL);
	} else {
		int _n_regs[2];
		mm_reg1_t *regs[2];
		char *seq[2];
		int i, len[2];

		len[0] = strlen(seq1);
		len[1] = strlen(seq2);
		seq[0] = (char*)seq1;
		seq[1] = strdup(seq2);
		// reverse-complement the private copy of seq2 in place
		for (i = 0; i < len[1]>>1; ++i) {
			// index the table with an unsigned byte; a plain char may be negative
			uint8_t t = (uint8_t)seq[1][len[1] - i - 1];
			seq[1][len[1] - i - 1] = seq_comp_table[(uint8_t)seq[1][i]];
			seq[1][i] = seq_comp_table[t];
		}
		if (len[1]&1) seq[1][len[1]>>1] = seq_comp_table[(uint8_t)seq[1][len[1]>>1]];
		mm_map_frag(mi, 2, len, (const char**)seq, _n_regs, regs, b, opt, NULL);
		free(seq[1]); // the copy is only needed while mapping; it was leaked before
		for (i = 0; i < _n_regs[1]; ++i)
			regs[1][i].rev = !regs[1][i].rev;
		*n_regs = _n_regs[0] + _n_regs[1];
		regs[0] = (mm_reg1_t*)realloc(regs[0], sizeof(mm_reg1_t) * (*n_regs));
		memcpy(&regs[0][_n_regs[0]], regs[1], _n_regs[1] * sizeof(mm_reg1_t));
		free(regs[1]);
		r = regs[0];
	}
	Py_END_ALLOW_THREADS

	return r;
}
|
110
|
+
|
111
|
+
static inline char *mappy_revcomp(int len, const uint8_t *seq)
|
112
|
+
{
|
113
|
+
int i;
|
114
|
+
char *rev;
|
115
|
+
rev = (char*)malloc(len + 1);
|
116
|
+
for (i = 0; i < len; ++i)
|
117
|
+
rev[len - i - 1] = seq_comp_table[seq[i]];
|
118
|
+
rev[len] = 0;
|
119
|
+
return rev;
|
120
|
+
}
|
121
|
+
|
122
|
+
static char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *len)
|
123
|
+
{
|
124
|
+
int i, rid;
|
125
|
+
char *s;
|
126
|
+
*len = 0;
|
127
|
+
rid = mm_idx_name2id(mi, name);
|
128
|
+
if (rid < 0) return 0;
|
129
|
+
if ((uint32_t)st >= mi->seq[rid].len || st >= en) return 0;
|
130
|
+
if (en < 0 || (uint32_t)en > mi->seq[rid].len)
|
131
|
+
en = mi->seq[rid].len;
|
132
|
+
s = (char*)malloc(en - st + 1);
|
133
|
+
*len = mm_idx_getseq(mi, rid, st, en, (uint8_t*)s);
|
134
|
+
for (i = 0; i < *len; ++i)
|
135
|
+
s[i] = "ACGTN"[(uint8_t)s[i]];
|
136
|
+
s[*len] = 0;
|
137
|
+
return s;
|
138
|
+
}
|
139
|
+
|
140
|
+
/*
 * Build an index from a single in-memory sequence of len bytes.
 *
 * The sequence is copied into a NUL-terminated scratch buffer, indexed with
 * mm_idx_str() under the placeholder name "N/A", and the scratch buffer is
 * released before returning.
 */
static mm_idx_t *mappy_idx_seq(int w, int k, int is_hpc, int bucket_bits, const char *seq, int len)
{
	const char *fake_name = "N/A";
	char *copy = (char*)calloc(len + 1, 1); /* zero-filled, so the copy is terminated */
	mm_idx_t *mi;

	memcpy(copy, seq, len);
	mi = mm_idx_str(w, k, is_hpc, bucket_bits, 1, (const char**)&copy, (const char**)&fake_name);
	free(copy);
	return mi;
}
|
151
|
+
|
152
|
+
#endif
|
@@ -0,0 +1,153 @@
|
|
1
|
+
from libc.stdint cimport int8_t, uint8_t, int32_t, int64_t, uint32_t, uint64_t
|
2
|
+
|
3
|
+
cdef extern from "minimap.h":
	#
	# Options
	#
	ctypedef struct mm_idxopt_t:
		short k
		short w
		short flag
		short bucket_bits
		int64_t mini_batch_size
		uint64_t batch_size

	ctypedef struct mm_mapopt_t:
		int64_t flag
		int seed
		int sdust_thres

		int max_qlen

		# chaining
		int bw
		int bw_long
		int max_gap
		int max_gap_ref
		int max_frag_len
		int max_chain_skip
		int max_chain_iter
		int min_cnt
		int min_chain_score
		float chain_gap_scale
		float chain_skip_scale
		int rmq_size_cap
		int rmq_inner_dist
		int rmq_rescue_size
		float rmq_rescue_ratio

		# hit selection
		float mask_level
		int mask_len
		float pri_ratio
		int best_n

		float alt_drop

		# alignment scoring
		int a
		int b
		int q
		int e
		int q2
		int e2
		int sc_ambi
		int noncan
		int junc_bonus
		int zdrop
		int zdrop_inv
		int end_bonus
		int min_dp_max
		int min_ksw_len
		int anchor_ext_len
		int anchor_ext_shift
		float max_clip_ratio

		int rank_min_len
		float rank_frac

		# paired-end
		int pe_ori
		int pe_bonus

		float mid_occ_frac
		float q_occ_frac
		int32_t min_mid_occ
		int32_t mid_occ
		int32_t max_occ
		int64_t mini_batch_size
		int64_t max_sw_mat
		int64_t cap_kalloc

		const char *split_prefix

	int mm_set_opt(char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
	int mm_verbose

	#
	# Indexing
	#
	ctypedef struct mm_idx_seq_t:
		char *name
		uint64_t offset
		uint32_t len

	ctypedef struct mm_idx_bucket_t:
		pass

	ctypedef struct mm_idx_t:
		int32_t b
		int32_t w
		int32_t k
		int32_t flag
		uint32_t n_seq
		mm_idx_seq_t *seq
		uint32_t *S
		mm_idx_bucket_t *B
		void *km
		void *h

	ctypedef struct mm_idx_reader_t:
		pass

	mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out)
	mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
	void mm_idx_reader_close(mm_idx_reader_t *r)
	void mm_idx_destroy(mm_idx_t *mi)
	void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)

	int mm_idx_index_name(mm_idx_t *mi)

	#
	# Mapping (key struct defined in cmappy.h below)
	#
	ctypedef struct mm_reg1_t:
		pass

	ctypedef struct mm_tbuf_t:
		pass

	mm_tbuf_t *mm_tbuf_init()
	void mm_tbuf_destroy(mm_tbuf_t *b)
	void *mm_tbuf_get_km(mm_tbuf_t *b)
	int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden)
	int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq)
|
113
|
+
|
114
|
+
#
|
115
|
+
# Helper header (because it is hard to expose mm_reg1_t with Cython)
|
116
|
+
#
|
117
|
+
cdef extern from "cmappy.h":
	# flat alignment record filled in by mm_reg2hitpy()
	ctypedef struct mm_hitpy_t:
		const char *ctg
		int32_t ctg_start
		int32_t ctg_end
		int32_t qry_start
		int32_t qry_end
		int32_t blen
		int32_t mlen
		int32_t NM
		int32_t ctg_len
		uint8_t mapq
		uint8_t is_primary
		int8_t strand
		int8_t trans_strand
		int32_t seg_id
		int32_t n_cigar32
		uint32_t *cigar32

	# mapping helpers
	void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
	void mm_free_reg1(mm_reg1_t *r)
	mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)
	char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *l)
	mm_idx_t *mappy_idx_seq(int w, int k, int is_hpc, int bucket_bits, const char *seq, int l)

	# FASTA/FASTQ reading
	ctypedef struct kstring_t:
		unsigned l
		unsigned m
		char *s

	ctypedef struct kstream_t:
		pass

	ctypedef struct kseq_t:
		kstring_t name
		kstring_t comment
		kstring_t seq
		kstring_t qual
		int last_char
		kstream_t *f

	kseq_t *mm_fastx_open(const char *fn)
	void mm_fastx_close(kseq_t *ks)
	int kseq_read(kseq_t *seq)

	# miscellaneous
	char *mappy_revcomp(int l, const uint8_t *seq)
	int mm_verbose_level(int v)
	void mm_reset_timer()
|