minimap2 0.2.22.0 → 0.2.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
data/ext/minimap2/pe.c ADDED
@@ -0,0 +1,177 @@
1
+ #include <stdlib.h>
2
+ #include <math.h>
3
+ #include "mmpriv.h"
4
+ #include "kvec.h"
5
+
6
+ void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r)
7
+ {
8
+ if (pri_ratio > 0.0f && *n_ > 0) {
9
+ int i, k, n = *n_, n_2nd = 0;
10
+ int max_dist = n_segs == 2? qlens[0] + qlens[1] + max_gap_ref : 0;
11
+ for (i = k = 0; i < n; ++i) {
12
+ int to_keep = 0;
13
+ if (r[i].parent == i) { // primary
14
+ to_keep = 1;
15
+ } else if (r[i].score + min_diff >= r[r[i].parent].score) {
16
+ to_keep = 1;
17
+ } else {
18
+ mm_reg1_t *p = &r[r[i].parent], *q = &r[i];
19
+ if (p->rev == q->rev && p->rid == q->rid && q->re - p->rs < max_dist && p->re - q->rs < max_dist) { // child and parent are close on the ref
20
+ if (q->score >= p->score * pri1)
21
+ to_keep = 1;
22
+ } else {
23
+ int is_par_both = (n_segs == 2 && p->qs < qlens[0] && p->qe > qlens[0]);
24
+ int is_chi_both = (n_segs == 2 && q->qs < qlens[0] && q->qe > qlens[0]);
25
+ if (is_chi_both || is_chi_both == is_par_both) {
26
+ if (q->score >= p->score * pri_ratio)
27
+ to_keep = 1;
28
+ } else { // the remaining case: is_chi_both == 0 && is_par_both == 1
29
+ if (q->score >= p->score * pri2)
30
+ to_keep = 1;
31
+ }
32
+ }
33
+ }
34
+ if (to_keep && r[i].parent != i) {
35
+ if (n_2nd++ >= best_n) to_keep = 0; // don't keep if there are too many secondary hits
36
+ }
37
+ if (to_keep) r[k++] = r[i];
38
+ else if (r[i].p) free(r[i].p);
39
+ }
40
+ if (k != n) mm_sync_regs(km, k, r); // removing hits requires sync()
41
+ *n_ = k;
42
+ }
43
+ }
44
+
45
+ void mm_set_pe_thru(const int *qlens, int *n_regs, mm_reg1_t **regs)
46
+ {
47
+ int s, i, n_pri[2], pri[2];
48
+ n_pri[0] = n_pri[1] = 0;
49
+ pri[0] = pri[1] = -1;
50
+ for (s = 0; s < 2; ++s)
51
+ for (i = 0; i < n_regs[s]; ++i)
52
+ if (regs[s][i].id == regs[s][i].parent)
53
+ ++n_pri[s], pri[s] = i;
54
+ if (n_pri[0] == 1 && n_pri[1] == 1) {
55
+ mm_reg1_t *p = &regs[0][pri[0]];
56
+ mm_reg1_t *q = &regs[1][pri[1]];
57
+ if (p->rid == q->rid && p->rev == q->rev && abs(p->rs - q->rs) < 3 && abs(p->re - q->re) < 3
58
+ && ((p->qs == 0 && qlens[1] - q->qe == 0) || (q->qs == 0 && qlens[0] - p->qe == 0)))
59
+ {
60
+ p->pe_thru = q->pe_thru = 1;
61
+ }
62
+ }
63
+ }
64
+
65
+ #include "ksort.h"
66
+
67
+ typedef struct {
68
+ int s, rev;
69
+ uint64_t key;
70
+ mm_reg1_t *r;
71
+ } pair_arr_t;
72
+
73
+ #define sort_key_pair(a) ((a).key)
74
+ KRADIX_SORT_INIT(pair, pair_arr_t, sort_key_pair, 8)
75
+
76
+ void mm_pair(void *km, int max_gap_ref, int pe_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs)
77
+ {
78
+ int i, j, s, n, last[2], dp_thres, segs = 0, max_idx[2];
79
+ int64_t max;
80
+ pair_arr_t *a;
81
+ kvec_t(uint64_t) sc = {0,0,0};
82
+
83
+ a = (pair_arr_t*)kmalloc(km, (n_regs[0] + n_regs[1]) * sizeof(pair_arr_t));
84
+ for (s = n = 0, dp_thres = 0; s < 2; ++s) {
85
+ int max = 0;
86
+ for (i = 0; i < n_regs[s]; ++i) {
87
+ a[n].s = s;
88
+ a[n].r = &regs[s][i];
89
+ a[n].rev = a[n].r->rev;
90
+ a[n].key = (uint64_t)a[n].r->rid << 32 | a[n].r->rs<<1 | (s^a[n].rev);
91
+ max = max > a[n].r->p->dp_max? max : a[n].r->p->dp_max;
92
+ ++n;
93
+ segs |= 1<<s;
94
+ }
95
+ dp_thres += max;
96
+ }
97
+ if (segs != 3) {
98
+ kfree(km, a); // only one end is mapped
99
+ return;
100
+ }
101
+ dp_thres -= pe_bonus;
102
+ if (dp_thres < 0) dp_thres = 0;
103
+ radix_sort_pair(a, a + n);
104
+
105
+ max = -1;
106
+ max_idx[0] = max_idx[1] = -1;
107
+ last[0] = last[1] = -1;
108
+ kv_resize(uint64_t, km, sc, (size_t)n);
109
+ for (i = 0; i < n; ++i) {
110
+ if (a[i].key & 1) { // reverse first read or forward second read
111
+ mm_reg1_t *q, *r;
112
+ if (last[a[i].rev] < 0) continue;
113
+ r = a[i].r;
114
+ q = a[last[a[i].rev]].r;
115
+ if (r->rid != q->rid || r->rs - q->re > max_gap_ref) continue;
116
+ for (j = last[a[i].rev]; j >= 0; --j) {
117
+ int64_t score;
118
+ if (a[j].rev != a[i].rev || a[j].s == a[i].s) continue;
119
+ q = a[j].r;
120
+ if (r->rid != q->rid || r->rs - q->re > max_gap_ref) break;
121
+ if (r->p->dp_max + q->p->dp_max < dp_thres) continue;
122
+ score = (int64_t)(r->p->dp_max + q->p->dp_max) << 32 | (r->hash + q->hash);
123
+ if (score > max)
124
+ max = score, max_idx[a[j].s] = j, max_idx[a[i].s] = i;
125
+ kv_push(uint64_t, km, sc, score);
126
+ }
127
+ } else { // forward first read or reverse second read
128
+ last[a[i].rev] = i;
129
+ }
130
+ }
131
+ if (sc.n > 1)
132
+ radix_sort_64(sc.a, sc.a + sc.n);
133
+
134
+ if (sc.n > 0 && max > 0) { // found at least one pair
135
+ int n_sub = 0, mapq_pe;
136
+ mm_reg1_t *r[2];
137
+ r[0] = a[max_idx[0]].r, r[1] = a[max_idx[1]].r;
138
+ r[0]->proper_frag = r[1]->proper_frag = 1;
139
+ for (s = 0; s < 2; ++s) {
140
+ if (r[s]->id != r[s]->parent) { // then lift to primary and update parent
141
+ mm_reg1_t *p = &regs[s][r[s]->parent];
142
+ for (i = 0; i < n_regs[s]; ++i)
143
+ if (regs[s][i].parent == p->id)
144
+ regs[s][i].parent = r[s]->id;
145
+ p->mapq = 0;
146
+ }
147
+ if (!r[s]->sam_pri) { // then sync sam_pri
148
+ for (i = 0; i < n_regs[s]; ++i)
149
+ regs[s][i].sam_pri = 0;
150
+ r[s]->sam_pri = 1;
151
+ }
152
+ }
153
+ mapq_pe = r[0]->mapq > r[1]->mapq? r[0]->mapq : r[1]->mapq;
154
+ for (i = 0; i < (int)sc.n; ++i)
155
+ if ((sc.a[i]>>32) + sub_diff >= (uint64_t)max>>32)
156
+ ++n_sub;
157
+ if (sc.n > 1) {
158
+ int mapq_pe_alt;
159
+ mapq_pe_alt = (int)(6.02f * ((max>>32) - (sc.a[sc.n - 2]>>32)) / match_sc - 4.343f * logf(n_sub)); // n_sub > 0 because it counts the optimal, too
160
+ mapq_pe = mapq_pe < mapq_pe_alt? mapq_pe : mapq_pe_alt;
161
+ }
162
+ if (r[0]->mapq < mapq_pe) r[0]->mapq = (int)(.2f * r[0]->mapq + .8f * mapq_pe + .499f);
163
+ if (r[1]->mapq < mapq_pe) r[1]->mapq = (int)(.2f * r[1]->mapq + .8f * mapq_pe + .499f);
164
+ if (sc.n == 1) {
165
+ if (r[0]->mapq < 2) r[0]->mapq = 2;
166
+ if (r[1]->mapq < 2) r[1]->mapq = 2;
167
+ } else if ((uint64_t)max>>32 > sc.a[sc.n - 2]>>32) {
168
+ if (r[0]->mapq < 1) r[0]->mapq = 1;
169
+ if (r[1]->mapq < 1) r[1]->mapq = 1;
170
+ }
171
+ }
172
+
173
+ kfree(km, a);
174
+ kfree(km, sc.a);
175
+
176
+ mm_set_pe_thru(qlens, n_regs, regs);
177
+ }
@@ -0,0 +1,196 @@
1
+ ==============================
2
+ Mappy: Minimap2 Python Binding
3
+ ==============================
4
+
5
+ Mappy provides a convenient interface to `minimap2
6
+ <https://github.com/lh3/minimap2>`_, a fast and accurate C program to align
7
+ genomic and transcribed nucleotide sequences.
8
+
9
+ Installation
10
+ ------------
11
+
12
+ Mappy depends on `zlib <http://zlib.net>`_. It can be installed with `pip
13
+ <https://en.wikipedia.org/wiki/Pip_(package_manager)>`_:
14
+
15
+ .. code:: shell
16
+
17
+ pip install --user mappy
18
+
19
+ or from the minimap2 github repo (`Cython <http://cython.org>`_ required):
20
+
21
+ .. code:: shell
22
+
23
+ git clone https://github.com/lh3/minimap2
24
+ cd minimap2
25
+ python setup.py install
26
+
27
+ Usage
28
+ -----
29
+
30
+ The following Python script demonstrates the key functionality of mappy:
31
+
32
+ .. code:: python
33
+
34
+ import mappy as mp
35
+ a = mp.Aligner("test/MT-human.fa") # load or build index
36
+ if not a: raise Exception("ERROR: failed to load/build index")
37
+ s = a.seq("MT_human", 100, 200) # retrieve a subsequence from the index
38
+ print(mp.revcomp(s)) # reverse complement
39
+ for name, seq, qual in mp.fastx_read("test/MT-orang.fa"): # read a fasta/q sequence
40
+ for hit in a.map(seq): # traverse alignments
41
+ print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str))
42
+
43
+ APIs
44
+ ----
45
+
46
+ Mappy implements two classes and two global functions.
47
+
48
+ Class mappy.Aligner
49
+ ~~~~~~~~~~~~~~~~~~~
50
+
51
+ .. code:: python
52
+
53
+ mappy.Aligner(fn_idx_in=None, preset=None, ...)
54
+
55
+ This constructor accepts the following arguments:
56
+
57
+ * **fn_idx_in**: index or sequence file name. Minimap2 automatically tests the
58
+ file type. If a sequence file is provided, minimap2 builds an index. The
59
+ sequence file can be optionally gzip'd. This option has no effect if **seq**
60
+ is set.
61
+
62
+ * **seq**: a single sequence to index. The sequence name will be set to
63
+ :code:`N/A`.
64
+
65
+ * **preset**: minimap2 preset. Currently, minimap2 supports the following
66
+ presets: **sr** for single-end short reads; **map-pb** for PacBio
67
+ read-to-reference mapping; **map-ont** for Oxford Nanopore read mapping;
68
+ **splice** for long-read spliced alignment; **asm5** for assembly-to-assembly
69
+ alignment; **asm10** for full genome alignment of closely related species. Note
70
+ that the Python module does not support all-vs-all read overlapping.
71
+
72
+ * **k**: k-mer length, no larger than 28
73
+
74
+ * **w**: minimizer window size, no larger than 255
75
+
76
+ * **min_cnt**: minimum number of minimizers on a chain
77
+
78
+ * **min_chain_score**: minimum chaining score
79
+
80
+ * **bw**: chaining and alignment band width
81
+
82
+ * **best_n**: max number of alignments to return
83
+
84
+ * **n_threads**: number of indexing threads; 3 by default
85
+
86
+ * **extra_flags**: additional flags defined in minimap.h
87
+
88
+ * **fn_idx_out**: name of file to which the index is written. This parameter
89
+ has no effect if **seq** is set.
90
+
91
+ * **scoring**: scoring system. It is a tuple/list consisting of 4, 6 or 7
92
+ positive integers. The first 4 elements specify match scoring, mismatch
93
+ penalty, gap open and gap extension penalty. The 5th and 6th elements, if
94
+ present, set long-gap open and long-gap extension penalty. The 7th sets a
95
+ mismatch penalty involving ambiguous bases.
96
+
97
+ .. code:: python
98
+
99
+ mappy.Aligner.map(seq, seq2=None, cs=False, MD=False)
100
+
101
+ This method aligns :code:`seq` against the index. It is a generator, *yielding*
102
+ a series of :code:`mappy.Alignment` objects. If :code:`seq2` is present, mappy
103
+ performs paired-end alignment, assuming the two ends are in the FR orientation.
104
+ Alignments of the two ends can be distinguished by the :code:`read_num` field
105
+ (see Class mappy.Alignment below). Argument :code:`cs` asks mappy to generate
106
+ the :code:`cs` tag; :code:`MD` is similar. These two arguments might slightly
107
+ degrade performance and are not enabled by default.
108
+
109
+ .. code:: python
110
+
111
+ mappy.Aligner.seq(name, start=0, end=0x7fffffff)
112
+
113
+ This method retrieves a (sub)sequence from the index and returns it as a Python
114
+ string. :code:`None` is returned if :code:`name` is not present in the index or
115
+ the start/end coordinates are invalid.
116
+
117
+ .. code:: python
118
+
119
+ mappy.Aligner.seq_names
120
+
121
+ This property gives the array of sequence names in the index.
122
+
123
+ Class mappy.Alignment
124
+ ~~~~~~~~~~~~~~~~~~~~~
125
+
126
+ This class describes an alignment. An object of this class has the following
127
+ properties:
128
+
129
+ * **ctg**: name of the reference sequence the query is mapped to
130
+
131
+ * **ctg_len**: total length of the reference sequence
132
+
133
+ * **r_st** and **r_en**: start and end positions on the reference
134
+
135
+ * **q_st** and **q_en**: start and end positions on the query
136
+
137
+ * **strand**: +1 if on the forward strand; -1 if on the reverse strand
138
+
139
+ * **mapq**: mapping quality
140
+
141
+ * **blen**: length of the alignment, including both alignment matches and gaps
142
+ but excluding ambiguous bases.
143
+
144
+ * **mlen**: length of the matching bases in the alignment, excluding ambiguous
145
+ base matches.
146
+
147
+ * **NM**: number of mismatches, gaps and ambiguous positions in the alignment
148
+
149
+ * **trans_strand**: transcript strand. +1 if on the forward strand; -1 if on the
150
+ reverse strand; 0 if unknown
151
+
152
+ * **is_primary**: if the alignment is primary (typically the best and the first
153
+ to generate)
154
+
155
+ * **read_num**: read number that the alignment corresponds to; 1 for the first
156
+ read and 2 for the second read
157
+
158
+ * **cigar_str**: CIGAR string
159
+
160
+ * **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two
161
+ numbers give the length and the operator of each CIGAR operation.
162
+
163
+ * **MD**: the :code:`MD` tag as in the SAM format. It is an empty string unless
164
+ the :code:`MD` argument is applied when calling :code:`mappy.Aligner.map()`.
165
+
166
+ * **cs**: the :code:`cs` tag.
167
+
168
+ An :code:`Alignment` object can be converted to a string with :code:`str()` in
169
+ the following format:
170
+
171
+ ::
172
+
173
+ q_st q_en strand ctg ctg_len r_st r_en mlen blen mapq cg:Z:cigar_str
174
+
175
+ It is effectively the PAF format without the QueryName and QueryLength columns
176
+ (the first two columns in PAF).
177
+
178
+ Miscellaneous Functions
179
+ ~~~~~~~~~~~~~~~~~~~~~~~
180
+
181
+ .. code:: python
182
+
183
+ mappy.fastx_read(fn, read_comment=False)
184
+
185
+ This generator function opens a FASTA/FASTQ file and *yields* a
186
+ :code:`(name,seq,qual)` tuple for each sequence entry. The input file may be
187
+ optionally gzip'd. If :code:`read_comment` is True, this generator yields
188
+ a :code:`(name,seq,qual,comment)` tuple instead.
189
+
190
+ .. code:: python
191
+
192
+ mappy.revcomp(seq)
193
+
194
+ Return the reverse complement of DNA string :code:`seq`. This function
195
+ recognizes IUB code and preserves the letter cases. Uracil :code:`U` is
196
+ complemented to :code:`A`.
@@ -0,0 +1,152 @@
1
+ #ifndef CMAPPY_H
2
+ #define CMAPPY_H
3
+
4
+ #include <stdlib.h>
5
+ #include <string.h>
6
+ #include <zlib.h>
7
+ #include "minimap.h"
8
+ #include "kseq.h"
9
+ KSEQ_DECLARE(gzFile)
10
+
11
/* Flat description of one hit; filled from an mm_reg1_t by mm_reg2hitpy(). */
typedef struct {
	const char *ctg;       /* reference sequence name (points into the index, not copied) */
	int32_t ctg_start;     /* start on the reference */
	int32_t ctg_end;       /* end on the reference */
	int32_t qry_start;     /* start on the query */
	int32_t qry_end;       /* end on the query */
	int32_t blen;          /* alignment block length */
	int32_t mlen;          /* number of matching bases */
	int32_t NM;            /* blen - mlen + number of ambiguous bases */
	int32_t ctg_len;       /* total length of the reference sequence */
	uint8_t mapq;          /* mapping quality */
	uint8_t is_primary;    /* non-zero if the hit is its own parent */
	int8_t strand;         /* +1 forward, -1 reverse */
	int8_t trans_strand;   /* transcript strand: +1, -1, or 0 if unknown */
	int32_t seg_id;        /* segment (read end) the hit belongs to */
	int32_t n_cigar32;     /* number of entries in cigar32 */
	uint32_t *cigar32;     /* CIGAR array (points into the hit's ->p record, not copied) */
} mm_hitpy_t;
22
+
23
+ static inline void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
24
+ {
25
+ h->ctg = mi->seq[r->rid].name;
26
+ h->ctg_len = mi->seq[r->rid].len;
27
+ h->ctg_start = r->rs, h->ctg_end = r->re;
28
+ h->qry_start = r->qs, h->qry_end = r->qe;
29
+ h->strand = r->rev? -1 : 1;
30
+ h->mapq = r->mapq;
31
+ h->mlen = r->mlen;
32
+ h->blen = r->blen;
33
+ h->NM = r->blen - r->mlen + r->p->n_ambi;
34
+ h->trans_strand = r->p->trans_strand == 1? 1 : r->p->trans_strand == 2? -1 : 0;
35
+ h->is_primary = (r->id == r->parent);
36
+ h->seg_id = r->seg_id;
37
+ h->n_cigar32 = r->p->n_cigar;
38
+ h->cigar32 = r->p->cigar;
39
+ }
40
+
41
+ static inline void mm_free_reg1(mm_reg1_t *r)
42
+ {
43
+ free(r->p);
44
+ }
45
+
46
+ static inline kseq_t *mm_fastx_open(const char *fn)
47
+ {
48
+ gzFile fp;
49
+ fp = fn && strcmp(fn, "-") != 0? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
50
+ return kseq_init(fp);
51
+ }
52
+
53
+ static inline void mm_fastx_close(kseq_t *ks)
54
+ {
55
+ gzFile fp;
56
+ fp = ks->f->f;
57
+ kseq_destroy(ks);
58
+ gzclose(fp);
59
+ }
60
+
61
+ static inline int mm_verbose_level(int v)
62
+ {
63
+ if (v >= 0) mm_verbose = v;
64
+ return mm_verbose;
65
+ }
66
+
67
+ static inline void mm_reset_timer(void)
68
+ {
69
+ extern double realtime(void);
70
+ mm_realtime0 = realtime();
71
+ }
72
+
73
+ extern unsigned char seq_comp_table[256];
74
+ static inline mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)
75
+ {
76
+ mm_reg1_t *r;
77
+
78
+ Py_BEGIN_ALLOW_THREADS
79
+ if (seq2 == 0) {
80
+ r = mm_map(mi, strlen(seq1), seq1, n_regs, b, opt, NULL);
81
+ } else {
82
+ int _n_regs[2];
83
+ mm_reg1_t *regs[2];
84
+ char *seq[2];
85
+ int i, len[2];
86
+
87
+ len[0] = strlen(seq1);
88
+ len[1] = strlen(seq2);
89
+ seq[0] = (char*)seq1;
90
+ seq[1] = strdup(seq2);
91
+ for (i = 0; i < len[1]>>1; ++i) {
92
+ int t = seq[1][len[1] - i - 1];
93
+ seq[1][len[1] - i - 1] = seq_comp_table[(uint8_t)seq[1][i]];
94
+ seq[1][i] = seq_comp_table[t];
95
+ }
96
+ if (len[1]&1) seq[1][len[1]>>1] = seq_comp_table[(uint8_t)seq[1][len[1]>>1]];
97
+ mm_map_frag(mi, 2, len, (const char**)seq, _n_regs, regs, b, opt, NULL);
98
+ for (i = 0; i < _n_regs[1]; ++i)
99
+ regs[1][i].rev = !regs[1][i].rev;
100
+ *n_regs = _n_regs[0] + _n_regs[1];
101
+ regs[0] = (mm_reg1_t*)realloc(regs[0], sizeof(mm_reg1_t) * (*n_regs));
102
+ memcpy(&regs[0][_n_regs[0]], regs[1], _n_regs[1] * sizeof(mm_reg1_t));
103
+ free(regs[1]);
104
+ r = regs[0];
105
+ }
106
+ Py_END_ALLOW_THREADS
107
+
108
+ return r;
109
+ }
110
+
111
+ static inline char *mappy_revcomp(int len, const uint8_t *seq)
112
+ {
113
+ int i;
114
+ char *rev;
115
+ rev = (char*)malloc(len + 1);
116
+ for (i = 0; i < len; ++i)
117
+ rev[len - i - 1] = seq_comp_table[seq[i]];
118
+ rev[len] = 0;
119
+ return rev;
120
+ }
121
+
122
+ static char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *len)
123
+ {
124
+ int i, rid;
125
+ char *s;
126
+ *len = 0;
127
+ rid = mm_idx_name2id(mi, name);
128
+ if (rid < 0) return 0;
129
+ if ((uint32_t)st >= mi->seq[rid].len || st >= en) return 0;
130
+ if (en < 0 || (uint32_t)en > mi->seq[rid].len)
131
+ en = mi->seq[rid].len;
132
+ s = (char*)malloc(en - st + 1);
133
+ *len = mm_idx_getseq(mi, rid, st, en, (uint8_t*)s);
134
+ for (i = 0; i < *len; ++i)
135
+ s[i] = "ACGTN"[(uint8_t)s[i]];
136
+ s[*len] = 0;
137
+ return s;
138
+ }
139
+
140
+ static mm_idx_t *mappy_idx_seq(int w, int k, int is_hpc, int bucket_bits, const char *seq, int len)
141
+ {
142
+ const char *fake_name = "N/A";
143
+ char *s;
144
+ mm_idx_t *mi;
145
+ s = (char*)calloc(len + 1, 1);
146
+ memcpy(s, seq, len);
147
+ mi = mm_idx_str(w, k, is_hpc, bucket_bits, 1, (const char**)&s, (const char**)&fake_name);
148
+ free(s);
149
+ return mi;
150
+ }
151
+
152
+ #endif
@@ -0,0 +1,153 @@
1
+ from libc.stdint cimport int8_t, uint8_t, int32_t, int64_t, uint32_t, uint64_t
2
+
3
+ cdef extern from "minimap.h":  # declarations that Cython takes from the C header minimap.h
4
+ #
5
+ # Options: indexing options (mm_idxopt_t) and mapping options (mm_mapopt_t)
6
+ #
7
+ ctypedef struct mm_idxopt_t:  # indexing options
8
+ short k, w, flag, bucket_bits  # k: k-mer length; w: minimizer window size
9
+ int64_t mini_batch_size
10
+ uint64_t batch_size
11
+
12
+ ctypedef struct mm_mapopt_t:  # mapping options
13
+ int64_t flag
14
+ int seed
15
+ int sdust_thres
16
+
17
+ int max_qlen
18
+
19
+ int bw, bw_long  # bw: chaining and alignment band width
20
+ int max_gap, max_gap_ref
21
+ int max_frag_len
22
+ int max_chain_skip, max_chain_iter
23
+ int min_cnt  # minimum number of minimizers on a chain
24
+ int min_chain_score  # minimum chaining score
25
+ float chain_gap_scale
26
+ float chain_skip_scale
27
+ int rmq_size_cap, rmq_inner_dist
28
+ int rmq_rescue_size
29
+ float rmq_rescue_ratio
30
+
31
+ float mask_level
32
+ int mask_len
33
+ float pri_ratio
34
+ int best_n  # max number of alignments to return
35
+
36
+ float alt_drop
37
+
38
+ int a, b, q, e, q2, e2  # presumably match, mismatch, gap open/extend and long-gap open/extend scores; confirm against minimap.h
39
+ int sc_ambi  # presumably the mismatch penalty involving ambiguous bases; confirm against minimap.h
40
+ int noncan
41
+ int junc_bonus
42
+ int zdrop, zdrop_inv
43
+ int end_bonus
44
+ int min_dp_max
45
+ int min_ksw_len
46
+ int anchor_ext_len, anchor_ext_shift
47
+ float max_clip_ratio
48
+
49
+ int rank_min_len
50
+ float rank_frac
51
+
52
+ int pe_ori, pe_bonus
53
+
54
+ float mid_occ_frac
55
+ float q_occ_frac
56
+ int32_t min_mid_occ
57
+ int32_t mid_occ
58
+ int32_t max_occ
59
+ int64_t mini_batch_size
60
+ int64_t max_sw_mat
61
+ int64_t cap_kalloc
62
+
63
+ const char *split_prefix
64
+
65
+ int mm_set_opt(char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)  # applies a named preset to both option structs
66
+ int mm_verbose  # global verbosity level (a variable, not a function)
67
+
68
+ #
69
+ # Indexing: index data structures and the index reader API
70
+ #
71
+ ctypedef struct mm_idx_seq_t:  # one reference sequence stored in the index
72
+ char *name
73
+ uint64_t offset
74
+ uint32_t len
75
+
76
+ ctypedef struct mm_idx_bucket_t:  # opaque here: no fields are exposed to Cython
77
+ pass
78
+
79
+ ctypedef struct mm_idx_t:  # the index itself
80
+ int32_t b, w, k, flag
81
+ uint32_t n_seq  # presumably the number of entries in seq; confirm against minimap.h
82
+ mm_idx_seq_t *seq
83
+ uint32_t *S
84
+ mm_idx_bucket_t *B
85
+ void *km
86
+ void *h
87
+
88
+ ctypedef struct mm_idx_reader_t:  # opaque here: no fields are exposed to Cython
89
+ pass
90
+
91
+ mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out)
92
+ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
93
+ void mm_idx_reader_close(mm_idx_reader_t *r)
94
+ void mm_idx_destroy(mm_idx_t *mi)
95
+ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
96
+
97
+ int mm_idx_index_name(mm_idx_t *mi)
98
+
99
+ #
100
+ # Mapping (the hit struct exposed to Python, mm_hitpy_t, is declared in the cmappy.h section of this file)
101
+ #
102
+ ctypedef struct mm_reg1_t:  # opaque here; its fields are read on the C side by mm_reg2hitpy()
103
+ pass
104
+
105
+ ctypedef struct mm_tbuf_t:  # opaque here: no fields are exposed to Cython
106
+ pass
107
+
108
+ mm_tbuf_t *mm_tbuf_init()
109
+ void mm_tbuf_destroy(mm_tbuf_t *b)
110
+ void *mm_tbuf_get_km(mm_tbuf_t *b)
111
+ int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden)  # generates the cs tag for hit r
112
+ int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq)  # generates the MD tag for hit r
113
+
114
+ #
115
+ # Helper header (because it is hard to expose mm_reg1_t with Cython): C-side wrappers that take and return plain types
116
+ #
117
+ cdef extern from "cmappy.h":
118
+ ctypedef struct mm_hitpy_t:  # flat copy of one hit, filled by mm_reg2hitpy()
119
+ const char *ctg  # reference sequence name
120
+ int32_t ctg_start, ctg_end  # start and end on the reference
121
+ int32_t qry_start, qry_end  # start and end on the query
122
+ int32_t blen, mlen, NM, ctg_len
123
+ uint8_t mapq, is_primary
124
+ int8_t strand, trans_strand  # strand: +1 forward, -1 reverse; trans_strand: +1, -1, or 0 if unknown
125
+ int32_t seg_id
126
+ int32_t n_cigar32  # number of entries in cigar32
127
+ uint32_t *cigar32
128
+
129
+ void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)  # copies the fields of r into h
130
+ void mm_free_reg1(mm_reg1_t *r)  # frees r->p only, not r itself
131
+ mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)  # seq2 == NULL selects single-end mapping
132
+ char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *l)  # returns a malloc'ed string, or NULL on failure
133
+ mm_idx_t *mappy_idx_seq(int w, int k, int is_hpc, int bucket_bits, const char *seq, int l)  # builds an index from one in-memory sequence
134
+
135
+ ctypedef struct kstring_t:  # string buffer type used by the kseq_t fields below
136
+ unsigned l, m
137
+ char *s
138
+
139
+ ctypedef struct kstream_t:  # opaque here: no fields are exposed to Cython
140
+ pass
141
+
142
+ ctypedef struct kseq_t:  # FASTA/FASTQ reader state; holds the most recently read record
143
+ kstring_t name, comment, seq, qual
144
+ int last_char
145
+ kstream_t *f
146
+
147
+ kseq_t *mm_fastx_open(const char *fn)  # NULL or "-" as fn reads standard input
148
+ void mm_fastx_close(kseq_t *ks)
149
+ int kseq_read(kseq_t *seq)
150
+
151
+ char *mappy_revcomp(int l, const uint8_t *seq)  # returns a malloc'ed reverse complement
152
+ int mm_verbose_level(int v)  # v >= 0 sets the level; the current level is returned
153
+ void mm_reset_timer()