minimap2 0.2.22.0 → 0.2.24.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
data/ext/minimap2/pe.c ADDED
@@ -0,0 +1,177 @@
1
+ #include <stdlib.h>
2
+ #include <math.h>
3
+ #include "mmpriv.h"
4
+ #include "kvec.h"
5
+
6
+ void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r)
7
+ {
8
+ if (pri_ratio > 0.0f && *n_ > 0) {
9
+ int i, k, n = *n_, n_2nd = 0;
10
+ int max_dist = n_segs == 2? qlens[0] + qlens[1] + max_gap_ref : 0;
11
+ for (i = k = 0; i < n; ++i) {
12
+ int to_keep = 0;
13
+ if (r[i].parent == i) { // primary
14
+ to_keep = 1;
15
+ } else if (r[i].score + min_diff >= r[r[i].parent].score) {
16
+ to_keep = 1;
17
+ } else {
18
+ mm_reg1_t *p = &r[r[i].parent], *q = &r[i];
19
+ if (p->rev == q->rev && p->rid == q->rid && q->re - p->rs < max_dist && p->re - q->rs < max_dist) { // child and parent are close on the ref
20
+ if (q->score >= p->score * pri1)
21
+ to_keep = 1;
22
+ } else {
23
+ int is_par_both = (n_segs == 2 && p->qs < qlens[0] && p->qe > qlens[0]);
24
+ int is_chi_both = (n_segs == 2 && q->qs < qlens[0] && q->qe > qlens[0]);
25
+ if (is_chi_both || is_chi_both == is_par_both) {
26
+ if (q->score >= p->score * pri_ratio)
27
+ to_keep = 1;
28
+ } else { // the remaining case: is_chi_both == 0 && is_par_both == 1
29
+ if (q->score >= p->score * pri2)
30
+ to_keep = 1;
31
+ }
32
+ }
33
+ }
34
+ if (to_keep && r[i].parent != i) {
35
+ if (n_2nd++ >= best_n) to_keep = 0; // don't keep if there are too many secondary hits
36
+ }
37
+ if (to_keep) r[k++] = r[i];
38
+ else if (r[i].p) free(r[i].p);
39
+ }
40
+ if (k != n) mm_sync_regs(km, k, r); // removing hits requires sync()
41
+ *n_ = k;
42
+ }
43
+ }
44
+
45
+ void mm_set_pe_thru(const int *qlens, int *n_regs, mm_reg1_t **regs)
46
+ {
47
+ int s, i, n_pri[2], pri[2];
48
+ n_pri[0] = n_pri[1] = 0;
49
+ pri[0] = pri[1] = -1;
50
+ for (s = 0; s < 2; ++s)
51
+ for (i = 0; i < n_regs[s]; ++i)
52
+ if (regs[s][i].id == regs[s][i].parent)
53
+ ++n_pri[s], pri[s] = i;
54
+ if (n_pri[0] == 1 && n_pri[1] == 1) {
55
+ mm_reg1_t *p = &regs[0][pri[0]];
56
+ mm_reg1_t *q = &regs[1][pri[1]];
57
+ if (p->rid == q->rid && p->rev == q->rev && abs(p->rs - q->rs) < 3 && abs(p->re - q->re) < 3
58
+ && ((p->qs == 0 && qlens[1] - q->qe == 0) || (q->qs == 0 && qlens[0] - p->qe == 0)))
59
+ {
60
+ p->pe_thru = q->pe_thru = 1;
61
+ }
62
+ }
63
+ }
64
+
65
+ #include "ksort.h"
66
+
67
+ typedef struct {
68
+ int s, rev;
69
+ uint64_t key;
70
+ mm_reg1_t *r;
71
+ } pair_arr_t;
72
+
73
+ #define sort_key_pair(a) ((a).key)
74
+ KRADIX_SORT_INIT(pair, pair_arr_t, sort_key_pair, 8)
75
+
76
+ void mm_pair(void *km, int max_gap_ref, int pe_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs)
77
+ {
78
+ int i, j, s, n, last[2], dp_thres, segs = 0, max_idx[2];
79
+ int64_t max;
80
+ pair_arr_t *a;
81
+ kvec_t(uint64_t) sc = {0,0,0};
82
+
83
+ a = (pair_arr_t*)kmalloc(km, (n_regs[0] + n_regs[1]) * sizeof(pair_arr_t));
84
+ for (s = n = 0, dp_thres = 0; s < 2; ++s) {
85
+ int max = 0;
86
+ for (i = 0; i < n_regs[s]; ++i) {
87
+ a[n].s = s;
88
+ a[n].r = &regs[s][i];
89
+ a[n].rev = a[n].r->rev;
90
+ a[n].key = (uint64_t)a[n].r->rid << 32 | a[n].r->rs<<1 | (s^a[n].rev);
91
+ max = max > a[n].r->p->dp_max? max : a[n].r->p->dp_max;
92
+ ++n;
93
+ segs |= 1<<s;
94
+ }
95
+ dp_thres += max;
96
+ }
97
+ if (segs != 3) {
98
+ kfree(km, a); // only one end is mapped
99
+ return;
100
+ }
101
+ dp_thres -= pe_bonus;
102
+ if (dp_thres < 0) dp_thres = 0;
103
+ radix_sort_pair(a, a + n);
104
+
105
+ max = -1;
106
+ max_idx[0] = max_idx[1] = -1;
107
+ last[0] = last[1] = -1;
108
+ kv_resize(uint64_t, km, sc, (size_t)n);
109
+ for (i = 0; i < n; ++i) {
110
+ if (a[i].key & 1) { // reverse first read or forward second read
111
+ mm_reg1_t *q, *r;
112
+ if (last[a[i].rev] < 0) continue;
113
+ r = a[i].r;
114
+ q = a[last[a[i].rev]].r;
115
+ if (r->rid != q->rid || r->rs - q->re > max_gap_ref) continue;
116
+ for (j = last[a[i].rev]; j >= 0; --j) {
117
+ int64_t score;
118
+ if (a[j].rev != a[i].rev || a[j].s == a[i].s) continue;
119
+ q = a[j].r;
120
+ if (r->rid != q->rid || r->rs - q->re > max_gap_ref) break;
121
+ if (r->p->dp_max + q->p->dp_max < dp_thres) continue;
122
+ score = (int64_t)(r->p->dp_max + q->p->dp_max) << 32 | (r->hash + q->hash);
123
+ if (score > max)
124
+ max = score, max_idx[a[j].s] = j, max_idx[a[i].s] = i;
125
+ kv_push(uint64_t, km, sc, score);
126
+ }
127
+ } else { // forward first read or reverse second read
128
+ last[a[i].rev] = i;
129
+ }
130
+ }
131
+ if (sc.n > 1)
132
+ radix_sort_64(sc.a, sc.a + sc.n);
133
+
134
+ if (sc.n > 0 && max > 0) { // found at least one pair
135
+ int n_sub = 0, mapq_pe;
136
+ mm_reg1_t *r[2];
137
+ r[0] = a[max_idx[0]].r, r[1] = a[max_idx[1]].r;
138
+ r[0]->proper_frag = r[1]->proper_frag = 1;
139
+ for (s = 0; s < 2; ++s) {
140
+ if (r[s]->id != r[s]->parent) { // then lift to primary and update parent
141
+ mm_reg1_t *p = &regs[s][r[s]->parent];
142
+ for (i = 0; i < n_regs[s]; ++i)
143
+ if (regs[s][i].parent == p->id)
144
+ regs[s][i].parent = r[s]->id;
145
+ p->mapq = 0;
146
+ }
147
+ if (!r[s]->sam_pri) { // then sync sam_pri
148
+ for (i = 0; i < n_regs[s]; ++i)
149
+ regs[s][i].sam_pri = 0;
150
+ r[s]->sam_pri = 1;
151
+ }
152
+ }
153
+ mapq_pe = r[0]->mapq > r[1]->mapq? r[0]->mapq : r[1]->mapq;
154
+ for (i = 0; i < (int)sc.n; ++i)
155
+ if ((sc.a[i]>>32) + sub_diff >= (uint64_t)max>>32)
156
+ ++n_sub;
157
+ if (sc.n > 1) {
158
+ int mapq_pe_alt;
159
+ mapq_pe_alt = (int)(6.02f * ((max>>32) - (sc.a[sc.n - 2]>>32)) / match_sc - 4.343f * logf(n_sub)); // n_sub > 0 because it counts the optimal, too
160
+ mapq_pe = mapq_pe < mapq_pe_alt? mapq_pe : mapq_pe_alt;
161
+ }
162
+ if (r[0]->mapq < mapq_pe) r[0]->mapq = (int)(.2f * r[0]->mapq + .8f * mapq_pe + .499f);
163
+ if (r[1]->mapq < mapq_pe) r[1]->mapq = (int)(.2f * r[1]->mapq + .8f * mapq_pe + .499f);
164
+ if (sc.n == 1) {
165
+ if (r[0]->mapq < 2) r[0]->mapq = 2;
166
+ if (r[1]->mapq < 2) r[1]->mapq = 2;
167
+ } else if ((uint64_t)max>>32 > sc.a[sc.n - 2]>>32) {
168
+ if (r[0]->mapq < 1) r[0]->mapq = 1;
169
+ if (r[1]->mapq < 1) r[1]->mapq = 1;
170
+ }
171
+ }
172
+
173
+ kfree(km, a);
174
+ kfree(km, sc.a);
175
+
176
+ mm_set_pe_thru(qlens, n_regs, regs);
177
+ }
@@ -0,0 +1,196 @@
1
+ ==============================
2
+ Mappy: Minimap2 Python Binding
3
+ ==============================
4
+
5
+ Mappy provides a convenient interface to `minimap2
6
+ <https://github.com/lh3/minimap2>`_, a fast and accurate C program to align
7
+ genomic and transcript nucleotide sequences.
8
+
9
+ Installation
10
+ ------------
11
+
12
+ Mappy depends on `zlib <http://zlib.net>`_. It can be installed with `pip
13
+ <https://en.wikipedia.org/wiki/Pip_(package_manager)>`_:
14
+
15
+ .. code:: shell
16
+
17
+ pip install --user mappy
18
+
19
+ or from the minimap2 github repo (`Cython <http://cython.org>`_ required):
20
+
21
+ .. code:: shell
22
+
23
+ git clone https://github.com/lh3/minimap2
24
+ cd minimap2
25
+ python setup.py install
26
+
27
+ Usage
28
+ -----
29
+
30
+ The following Python script demonstrates the key functionality of mappy:
31
+
32
+ .. code:: python
33
+
34
+ import mappy as mp
35
+ a = mp.Aligner("test/MT-human.fa") # load or build index
36
+ if not a: raise Exception("ERROR: failed to load/build index")
37
+ s = a.seq("MT_human", 100, 200) # retrieve a subsequence from the index
38
+ print(mp.revcomp(s)) # reverse complement
39
+ for name, seq, qual in mp.fastx_read("test/MT-orang.fa"): # read a fasta/q sequence
40
+ for hit in a.map(seq): # traverse alignments
41
+ print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str))
42
+
43
+ APIs
44
+ ----
45
+
46
+ Mappy implements two classes and two global functions.
47
+
48
+ Class mappy.Aligner
49
+ ~~~~~~~~~~~~~~~~~~~
50
+
51
+ .. code:: python
52
+
53
+ mappy.Aligner(fn_idx_in=None, preset=None, ...)
54
+
55
+ This constructor accepts the following arguments:
56
+
57
+ * **fn_idx_in**: index or sequence file name. Minimap2 automatically tests the
58
+ file type. If a sequence file is provided, minimap2 builds an index. The
59
+ sequence file can be optionally gzip'd. This option has no effect if **seq**
60
+ is set.
61
+
62
+ * **seq**: a single sequence to index. The sequence name will be set to
63
+ :code:`N/A`.
64
+
65
+ * **preset**: minimap2 preset. Currently, minimap2 supports the following
66
+ presets: **sr** for single-end short reads; **map-pb** for PacBio
67
+ read-to-reference mapping; **map-ont** for Oxford Nanopore read mapping;
68
+ **splice** for long-read spliced alignment; **asm5** for assembly-to-assembly
69
+ alignment; **asm10** for full genome alignment of closely related species. Note
70
+ that the Python module does not support all-vs-all read overlapping.
71
+
72
+ * **k**: k-mer length, no larger than 28
73
+
74
+ * **w**: minimizer window size, no larger than 255
75
+
76
+ * **min_cnt**: minimum number of minimizers on a chain
77
+
78
+ * **min_chain_score**: minimum chaining score
79
+
80
+ * **bw**: chaining and alignment band width
81
+
82
+ * **best_n**: max number of alignments to return
83
+
84
+ * **n_threads**: number of indexing threads; 3 by default
85
+
86
+ * **extra_flags**: additional flags defined in minimap.h
87
+
88
+ * **fn_idx_out**: name of file to which the index is written. This parameter
89
+ has no effect if **seq** is set.
90
+
91
+ * **scoring**: scoring system. It is a tuple/list consisting of 4, 6 or 7
92
+ positive integers. The first 4 elements specify match scoring, mismatch
93
+ penalty, gap open and gap extension penalty. The 5th and 6th elements, if
94
+ present, set long-gap open and long-gap extension penalty. The 7th sets a
95
+ mismatch penalty involving ambiguous bases.
96
+
97
+ .. code:: python
98
+
99
+ mappy.Aligner.map(seq, seq2=None, cs=False, MD=False)
100
+
101
+ This method aligns :code:`seq` against the index. It is a generator, *yielding*
102
+ a series of :code:`mappy.Alignment` objects. If :code:`seq2` is present, mappy
103
+ performs paired-end alignment, assuming the two ends are in the FR orientation.
104
+ Alignments of the two ends can be distinguished by the :code:`read_num` field
105
+ (see Class mappy.Alignment below). Argument :code:`cs` asks mappy to generate
106
+ the :code:`cs` tag; :code:`MD` is similar. These two arguments might slightly
107
+ degrade performance and are not enabled by default.
108
+
109
+ .. code:: python
110
+
111
+ mappy.Aligner.seq(name, start=0, end=0x7fffffff)
112
+
113
+ This method retrieves a (sub)sequence from the index and returns it as a Python
114
+ string. :code:`None` is returned if :code:`name` is not present in the index or
115
+ the start/end coordinates are invalid.
116
+
117
+ .. code:: python
118
+
119
+ mappy.Aligner.seq_names
120
+
121
+ This property gives the array of sequence names in the index.
122
+
123
+ Class mappy.Alignment
124
+ ~~~~~~~~~~~~~~~~~~~~~
125
+
126
+ This class describes an alignment. An object of this class has the following
127
+ properties:
128
+
129
+ * **ctg**: name of the reference sequence the query is mapped to
130
+
131
+ * **ctg_len**: total length of the reference sequence
132
+
133
+ * **r_st** and **r_en**: start and end positions on the reference
134
+
135
+ * **q_st** and **q_en**: start and end positions on the query
136
+
137
+ * **strand**: +1 if on the forward strand; -1 if on the reverse strand
138
+
139
+ * **mapq**: mapping quality
140
+
141
+ * **blen**: length of the alignment, including both alignment matches and gaps
142
+ but excluding ambiguous bases.
143
+
144
+ * **mlen**: length of the matching bases in the alignment, excluding ambiguous
145
+ base matches.
146
+
147
+ * **NM**: number of mismatches, gaps and ambiguous positions in the alignment
148
+
149
+ * **trans_strand**: transcript strand. +1 if on the forward strand; -1 if on the
150
+ reverse strand; 0 if unknown
151
+
152
+ * **is_primary**: if the alignment is primary (typically the best and the first
153
+ to generate)
154
+
155
+ * **read_num**: read number that the alignment corresponds to; 1 for the first
156
+ read and 2 for the second read
157
+
158
+ * **cigar_str**: CIGAR string
159
+
160
+ * **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two
161
+ numbers give the length and the operator of each CIGAR operation.
162
+
163
+ * **MD**: the :code:`MD` tag as in the SAM format. It is an empty string unless
164
+ the :code:`MD` argument is applied when calling :code:`mappy.Aligner.map()`.
165
+
166
+ * **cs**: the :code:`cs` tag.
167
+
168
+ An :code:`Alignment` object can be converted to a string with :code:`str()` in
169
+ the following format:
170
+
171
+ ::
172
+
173
+ q_st q_en strand ctg ctg_len r_st r_en mlen blen mapq cg:Z:cigar_str
174
+
175
+ It is effectively the PAF format without the QueryName and QueryLength columns
176
+ (the first two columns in PAF).
177
+
178
+ Miscellaneous Functions
179
+ ~~~~~~~~~~~~~~~~~~~~~~~
180
+
181
+ .. code:: python
182
+
183
+ mappy.fastx_read(fn, read_comment=False)
184
+
185
+ This generator function opens a FASTA/FASTQ file and *yields* a
186
+ :code:`(name,seq,qual)` tuple for each sequence entry. The input file may be
187
+ optionally gzip'd. If :code:`read_comment` is True, this generator yields
188
+ a :code:`(name,seq,qual,comment)` tuple instead.
189
+
190
+ .. code:: python
191
+
192
+ mappy.revcomp(seq)
193
+
194
+ Return the reverse complement of DNA string :code:`seq`. This function
195
+ recognizes IUB code and preserves the letter cases. Uracil :code:`U` is
196
+ complemented to :code:`A`.
@@ -0,0 +1,152 @@
1
+ #ifndef CMAPPY_H
2
+ #define CMAPPY_H
3
+
4
+ #include <stdlib.h>
5
+ #include <string.h>
6
+ #include <zlib.h>
7
+ #include "minimap.h"
8
+ #include "kseq.h"
9
+ KSEQ_DECLARE(gzFile)
10
+
11
/* Flat, Cython-friendly view of one alignment; filled in by mm_reg2hitpy(). */
typedef struct {
	const char *ctg;        // reference sequence name
	int32_t ctg_start;      // start on the reference
	int32_t ctg_end;        // end on the reference
	int32_t qry_start;      // start on the query
	int32_t qry_end;        // end on the query
	int32_t blen;
	int32_t mlen;
	int32_t NM;
	int32_t ctg_len;        // length of the reference sequence
	uint8_t mapq;
	uint8_t is_primary;     // non-zero when the hit is its own parent
	int8_t strand;          // -1 for reverse, 1 otherwise
	int8_t trans_strand;    // 1, -1, or 0
	int32_t seg_id;
	int32_t n_cigar32;      // number of entries in cigar32
	uint32_t *cigar32;      // CIGAR array
} mm_hitpy_t;
22
+
23
+ static inline void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
24
+ {
25
+ h->ctg = mi->seq[r->rid].name;
26
+ h->ctg_len = mi->seq[r->rid].len;
27
+ h->ctg_start = r->rs, h->ctg_end = r->re;
28
+ h->qry_start = r->qs, h->qry_end = r->qe;
29
+ h->strand = r->rev? -1 : 1;
30
+ h->mapq = r->mapq;
31
+ h->mlen = r->mlen;
32
+ h->blen = r->blen;
33
+ h->NM = r->blen - r->mlen + r->p->n_ambi;
34
+ h->trans_strand = r->p->trans_strand == 1? 1 : r->p->trans_strand == 2? -1 : 0;
35
+ h->is_primary = (r->id == r->parent);
36
+ h->seg_id = r->seg_id;
37
+ h->n_cigar32 = r->p->n_cigar;
38
+ h->cigar32 = r->p->cigar;
39
+ }
40
+
41
+ static inline void mm_free_reg1(mm_reg1_t *r)
42
+ {
43
+ free(r->p);
44
+ }
45
+
46
+ static inline kseq_t *mm_fastx_open(const char *fn)
47
+ {
48
+ gzFile fp;
49
+ fp = fn && strcmp(fn, "-") != 0? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
50
+ return kseq_init(fp);
51
+ }
52
+
53
+ static inline void mm_fastx_close(kseq_t *ks)
54
+ {
55
+ gzFile fp;
56
+ fp = ks->f->f;
57
+ kseq_destroy(ks);
58
+ gzclose(fp);
59
+ }
60
+
61
+ static inline int mm_verbose_level(int v)
62
+ {
63
+ if (v >= 0) mm_verbose = v;
64
+ return mm_verbose;
65
+ }
66
+
67
+ static inline void mm_reset_timer(void)
68
+ {
69
+ extern double realtime(void);
70
+ mm_realtime0 = realtime();
71
+ }
72
+
73
+ extern unsigned char seq_comp_table[256];
74
+ static inline mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)
75
+ {
76
+ mm_reg1_t *r;
77
+
78
+ Py_BEGIN_ALLOW_THREADS
79
+ if (seq2 == 0) {
80
+ r = mm_map(mi, strlen(seq1), seq1, n_regs, b, opt, NULL);
81
+ } else {
82
+ int _n_regs[2];
83
+ mm_reg1_t *regs[2];
84
+ char *seq[2];
85
+ int i, len[2];
86
+
87
+ len[0] = strlen(seq1);
88
+ len[1] = strlen(seq2);
89
+ seq[0] = (char*)seq1;
90
+ seq[1] = strdup(seq2);
91
+ for (i = 0; i < len[1]>>1; ++i) {
92
+ int t = seq[1][len[1] - i - 1];
93
+ seq[1][len[1] - i - 1] = seq_comp_table[(uint8_t)seq[1][i]];
94
+ seq[1][i] = seq_comp_table[t];
95
+ }
96
+ if (len[1]&1) seq[1][len[1]>>1] = seq_comp_table[(uint8_t)seq[1][len[1]>>1]];
97
+ mm_map_frag(mi, 2, len, (const char**)seq, _n_regs, regs, b, opt, NULL);
98
+ for (i = 0; i < _n_regs[1]; ++i)
99
+ regs[1][i].rev = !regs[1][i].rev;
100
+ *n_regs = _n_regs[0] + _n_regs[1];
101
+ regs[0] = (mm_reg1_t*)realloc(regs[0], sizeof(mm_reg1_t) * (*n_regs));
102
+ memcpy(&regs[0][_n_regs[0]], regs[1], _n_regs[1] * sizeof(mm_reg1_t));
103
+ free(regs[1]);
104
+ r = regs[0];
105
+ }
106
+ Py_END_ALLOW_THREADS
107
+
108
+ return r;
109
+ }
110
+
111
+ static inline char *mappy_revcomp(int len, const uint8_t *seq)
112
+ {
113
+ int i;
114
+ char *rev;
115
+ rev = (char*)malloc(len + 1);
116
+ for (i = 0; i < len; ++i)
117
+ rev[len - i - 1] = seq_comp_table[seq[i]];
118
+ rev[len] = 0;
119
+ return rev;
120
+ }
121
+
122
+ static char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *len)
123
+ {
124
+ int i, rid;
125
+ char *s;
126
+ *len = 0;
127
+ rid = mm_idx_name2id(mi, name);
128
+ if (rid < 0) return 0;
129
+ if ((uint32_t)st >= mi->seq[rid].len || st >= en) return 0;
130
+ if (en < 0 || (uint32_t)en > mi->seq[rid].len)
131
+ en = mi->seq[rid].len;
132
+ s = (char*)malloc(en - st + 1);
133
+ *len = mm_idx_getseq(mi, rid, st, en, (uint8_t*)s);
134
+ for (i = 0; i < *len; ++i)
135
+ s[i] = "ACGTN"[(uint8_t)s[i]];
136
+ s[*len] = 0;
137
+ return s;
138
+ }
139
+
140
+ static mm_idx_t *mappy_idx_seq(int w, int k, int is_hpc, int bucket_bits, const char *seq, int len)
141
+ {
142
+ const char *fake_name = "N/A";
143
+ char *s;
144
+ mm_idx_t *mi;
145
+ s = (char*)calloc(len + 1, 1);
146
+ memcpy(s, seq, len);
147
+ mi = mm_idx_str(w, k, is_hpc, bucket_bits, 1, (const char**)&s, (const char**)&fake_name);
148
+ free(s);
149
+ return mi;
150
+ }
151
+
152
+ #endif
@@ -0,0 +1,153 @@
1
+ from libc.stdint cimport int8_t, uint8_t, int32_t, int64_t, uint32_t, uint64_t
2
+
3
cdef extern from "minimap.h":
	# ------------------------------------------------------------------
	# Options
	# ------------------------------------------------------------------

	# Indexing options.
	ctypedef struct mm_idxopt_t:
		short k
		short w
		short flag
		short bucket_bits
		int64_t mini_batch_size
		uint64_t batch_size

	# Mapping options. Only the fields listed here are visible to Cython.
	ctypedef struct mm_mapopt_t:
		int64_t flag
		int seed
		int sdust_thres

		int max_qlen

		# chaining
		int bw
		int bw_long
		int max_gap
		int max_gap_ref
		int max_frag_len
		int max_chain_skip
		int max_chain_iter
		int min_cnt
		int min_chain_score
		float chain_gap_scale
		float chain_skip_scale
		int rmq_size_cap
		int rmq_inner_dist
		int rmq_rescue_size
		float rmq_rescue_ratio

		# hit selection
		float mask_level
		int mask_len
		float pri_ratio
		int best_n

		float alt_drop

		# alignment scoring
		int a
		int b
		int q
		int e
		int q2
		int e2
		int sc_ambi
		int noncan
		int junc_bonus
		int zdrop
		int zdrop_inv
		int end_bonus
		int min_dp_max
		int min_ksw_len
		int anchor_ext_len
		int anchor_ext_shift
		float max_clip_ratio

		int rank_min_len
		float rank_frac

		# paired-end
		int pe_ori
		int pe_bonus

		float mid_occ_frac
		float q_occ_frac
		int32_t min_mid_occ
		int32_t mid_occ
		int32_t max_occ
		int64_t mini_batch_size
		int64_t max_sw_mat
		int64_t cap_kalloc

		const char *split_prefix

	int mm_set_opt(char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
	int mm_verbose

	# ------------------------------------------------------------------
	# Indexing
	# ------------------------------------------------------------------

	# One reference sequence stored in the index.
	ctypedef struct mm_idx_seq_t:
		char *name
		uint64_t offset
		uint32_t len

	# Opaque to Cython.
	ctypedef struct mm_idx_bucket_t:
		pass

	ctypedef struct mm_idx_t:
		int32_t b
		int32_t w
		int32_t k
		int32_t flag
		uint32_t n_seq
		mm_idx_seq_t *seq
		uint32_t *S
		mm_idx_bucket_t *B
		void *km
		void *h

	# Opaque to Cython.
	ctypedef struct mm_idx_reader_t:
		pass

	mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out)
	mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
	void mm_idx_reader_close(mm_idx_reader_t *r)
	void mm_idx_destroy(mm_idx_t *mi)
	void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)

	int mm_idx_index_name(mm_idx_t *mi)

	# ------------------------------------------------------------------
	# Mapping (key struct defined in cmappy.h below)
	# ------------------------------------------------------------------

	# Opaque to Cython.
	ctypedef struct mm_reg1_t:
		pass

	# Opaque to Cython.
	ctypedef struct mm_tbuf_t:
		pass

	mm_tbuf_t *mm_tbuf_init()
	void mm_tbuf_destroy(mm_tbuf_t *b)
	void *mm_tbuf_get_km(mm_tbuf_t *b)
	int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden)
	int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq)
113
+
114
+ #
115
+ # Helper header (because it is hard to expose mm_reg1_t with Cython)
116
+ #
117
cdef extern from "cmappy.h":
	# Flat view of one alignment.
	ctypedef struct mm_hitpy_t:
		const char *ctg
		int32_t ctg_start
		int32_t ctg_end
		int32_t qry_start
		int32_t qry_end
		int32_t blen
		int32_t mlen
		int32_t NM
		int32_t ctg_len
		uint8_t mapq
		uint8_t is_primary
		int8_t strand
		int8_t trans_strand
		int32_t seg_id
		int32_t n_cigar32
		uint32_t *cigar32

	# Mapping helpers
	void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
	void mm_free_reg1(mm_reg1_t *r)
	mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)
	char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *l)
	mm_idx_t *mappy_idx_seq(int w, int k, int is_hpc, int bucket_bits, const char *seq, int l)

	# Sequence reading
	ctypedef struct kstring_t:
		unsigned l
		unsigned m
		char *s

	# Opaque to Cython.
	ctypedef struct kstream_t:
		pass

	ctypedef struct kseq_t:
		kstring_t name
		kstring_t comment
		kstring_t seq
		kstring_t qual
		int last_char
		kstream_t *f

	kseq_t *mm_fastx_open(const char *fn)
	void mm_fastx_close(kseq_t *ks)
	int kseq_read(kseq_t *seq)

	# Miscellaneous
	char *mappy_revcomp(int l, const uint8_t *seq)
	int mm_verbose_level(int v)
	void mm_reset_timer()