minimap2 0.2.28.0 → 0.2.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,201 @@
1
+ #include <stdio.h>
2
+ #include "mmpriv.h"
3
+ #include "kalloc.h"
4
+
5
+ #define MM_MIN_EXON_LEN 20
6
+
7
+ static int32_t mm_jump_check(void *km, const mm_idx_t *mi, int32_t qlen, const uint8_t *qseq0, const mm_reg1_t *r, int32_t ext, int32_t is_left) // TODO: check close N
8
+ {
9
+ int32_t clip, clen, e = !r->rev ^ !is_left; // 0 for left of the alignment; 1 for right
10
+ uint32_t cigar;
11
+ if (!r->p || r->p->n_cigar <= 0) return -1; // only working with CIGAR
12
+ clip = e == 0? r->qs : qlen - r->qe;
13
+ cigar = r->p->cigar[is_left? 0 : r->p->n_cigar - 1];
14
+ clen = (cigar&0xf) == MM_CIGAR_MATCH? cigar>>4 : 0;
15
+ if (clen <= ext) return -1;
16
+ if (is_left) {
17
+ if (clip >= r->rs) return -1; // no space to jump
18
+ } else {
19
+ if (clip >= mi->seq[r->rid].len - r->re) return -1; // no space to jump
20
+ }
21
+ return 0;
22
+ }
23
+
24
+ static uint8_t *mm_jump_get_qseq_seq(void *km, int32_t qlen, const uint8_t *qseq0, const mm_reg1_t *r, int32_t is_left, int32_t ql0, uint8_t *qseq)
25
+ {
26
+ extern unsigned char seq_nt4_table[256];
27
+ int32_t i, k = 0;
28
+ if (!r->rev) {
29
+ if (is_left)
30
+ for (i = 0; i < ql0; ++i)
31
+ qseq[k++] = seq_nt4_table[(uint8_t)qseq0[i]];
32
+ else
33
+ for (i = qlen - ql0; i < qlen; ++i)
34
+ qseq[k++] = seq_nt4_table[(uint8_t)qseq0[i]];
35
+ } else {
36
+ if (is_left)
37
+ for (i = qlen - 1; i >= qlen - ql0; --i) {
38
+ uint8_t c = seq_nt4_table[(uint8_t)qseq0[i]];
39
+ qseq[k++] = c >= 4? c : 3 - c;
40
+ }
41
+ else
42
+ for (i = ql0 - 1; i >= 0; --i) {
43
+ uint8_t c = seq_nt4_table[(uint8_t)qseq0[i]];
44
+ qseq[k++] = c >= 4? c : 3 - c;
45
+ }
46
+ }
47
+ return qseq;
48
+ }
49
+
50
+ static void mm_jump_split_left(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq0, mm_reg1_t *r, int32_t ts_strand)
51
+ {
52
+ uint8_t *tseq = 0, *qseq = 0;
53
+ int32_t i, n, l, i0, m, mm0;
54
+ int32_t i0_anno = -1, n_anno = 0, mm0_anno = 0, i0_misc = -1, n_misc = 0, mm0_misc = 0;
55
+ int32_t ext = 1 + (opt->b + opt->a - 1) / opt->a + 1;
56
+ int32_t clip = !r->rev? r->qs : qlen - r->qe;
57
+ int32_t extt = clip < ext? clip : ext;
58
+ const mm_idx_jjump1_t *a;
59
+
60
+ if (mm_jump_check(km, mi, qlen, qseq0, r, ext + MM_MIN_EXON_LEN, 1) < 0) return;
61
+ a = mm_idx_jump_get(mi, r->rid, r->rs - extt, r->rs + ext, &n);
62
+ if (n == 0) return;
63
+
64
+ for (i = 0; i < n; ++i) { // traverse possible jumps
65
+ const mm_idx_jjump1_t *ai = &a[i];
66
+ int32_t tlen, tl1, j, mm1, mm2;
67
+ assert(ai->off >= r->rs - extt && ai->off <= r->rs + ext);
68
+ if (ts_strand * ai->strand < 0) continue; // wrong strand
69
+ if (ai->off2 >= ai->off) continue; // wrong direction
70
+ if (ai->off - ai->off2 < 6) continue; // intron too small
71
+ if (ai->off2 < clip + ext) continue; // not long enough
72
+ if (tseq == 0) {
73
+ tseq = Kcalloc(km, uint8_t, (clip + ext) * 2); // tseq and qseq are allocated together
74
+ qseq = tseq + clip + ext;
75
+ mm_jump_get_qseq_seq(km, qlen, qseq0, r, 1, clip + ext, qseq);
76
+ }
77
+ tl1 = clip + (ai->off - r->rs);
78
+ tlen = mm_idx_getseq2(mi, 0, r->rid, ai->off, r->rs + ext, &tseq[tl1]);
79
+ assert(tlen == r->rs + ext - ai->off);
80
+ tlen = mm_idx_getseq2(mi, 0, r->rid, ai->off2 - tl1, ai->off2, tseq);
81
+ assert(tlen == tl1);
82
+ for (j = 0, mm1 = 0; j < tl1; ++j)
83
+ if (qseq[j] != tseq[j] || qseq[j] > 3 || tseq[j] > 3)
84
+ ++mm1;
85
+ for (mm2 = 0; j < clip + ext; ++j)
86
+ if (qseq[j] != tseq[j] || qseq[j] > 3 || tseq[j] > 3)
87
+ ++mm2;
88
+ if (mm1 == 0 && mm2 <= 1) {
89
+ if (ai->flag & MM_JUNC_ANNO)
90
+ i0_anno = i, mm0_anno = mm1 + mm2, ++n_anno; // i0 points to the rightmost i
91
+ else
92
+ i0_misc = i, mm0_misc = mm1 + mm2, ++n_misc;
93
+ }
94
+ }
95
+ if (n_anno > 0) m = n_anno, i0 = i0_anno, mm0 = mm0_anno;
96
+ else m = n_misc, i0 = i0_misc, mm0 = mm0_misc;
97
+ kfree(km, tseq);
98
+
99
+ l = m > 0? a[i0].off - r->rs : 0; // may be negative
100
+ if (m == 1 && clip + l >= opt->jump_min_match) { // add one more exon
101
+ mm_enlarge_cigar(r, 2);
102
+ memmove(r->p->cigar + 2, r->p->cigar, r->p->n_cigar * 4);
103
+ r->p->cigar[0] = (clip + l) << 4 | MM_CIGAR_MATCH;
104
+ r->p->cigar[1] = (a[i0].off - a[i0].off2) << 4 | MM_CIGAR_N_SKIP;
105
+ r->p->cigar[2] = ((r->p->cigar[2]>>4) - l) << 4 | MM_CIGAR_MATCH;
106
+ r->p->n_cigar += 2;
107
+ r->rs = a[i0].off2 - (clip + l);
108
+ if (!r->rev) r->qs = 0;
109
+ else r->qe = qlen;
110
+ r->blen += clip, r->mlen += clip - mm0;
111
+ r->p->dp_max0 += (clip - mm0) * opt->a - mm0 * opt->b;
112
+ r->p->dp_max += (clip - mm0) * opt->a - mm0 * opt->b;
113
+ if (!r->is_spliced) r->is_spliced = 1, r->p->dp_max += (opt->a + opt->b) + ((opt->a + opt->b) >> 1);
114
+ } else if (m > 0 && a[i0].off > r->rs) { // trim by l; l is always positive
115
+ r->p->cigar[0] -= l << 4 | MM_CIGAR_MATCH;
116
+ r->rs += l;
117
+ if (!r->rev) r->qs += l;
118
+ else r->qe -= l;
119
+ }
120
+ }
121
+
122
+ static void mm_jump_split_right(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq0, mm_reg1_t *r, int32_t ts_strand)
123
+ {
124
+ uint8_t *tseq = 0, *qseq = 0;
125
+ int32_t i, n, l, i0, m, mm0;
126
+ int32_t i0_anno = -1, n_anno = 0, mm0_anno = 0, i0_misc = -1, n_misc = 0, mm0_misc = 0;
127
+ int32_t ext = 1 + (opt->b + opt->a - 1) / opt->a + 1;
128
+ int32_t clip = !r->rev? qlen - r->qe : r->qs;
129
+ int32_t extt = clip < ext? clip : ext;
130
+ const mm_idx_jjump1_t *a;
131
+
132
+ if (mm_jump_check(km, mi, qlen, qseq0, r, ext + MM_MIN_EXON_LEN, 0) < 0) return;
133
+ a = mm_idx_jump_get(mi, r->rid, r->re - ext, r->re + extt, &n);
134
+ if (n == 0) return;
135
+
136
+ for (i = 0; i < n; ++i) { // traverse possible jumps
137
+ const mm_idx_jjump1_t *ai = &a[i];
138
+ int32_t tlen, tl1, j, mm1, mm2;
139
+ assert(ai->off >= r->re - ext && ai->off <= r->re + extt);
140
+ if (ts_strand * ai->strand < 0) continue; // wrong strand
141
+ if (ai->off2 <= ai->off) continue; // wrong direction
142
+ if (ai->off2 - ai->off < 6) continue; // intron too small
143
+ if (ai->off2 + clip + ext > mi->seq[r->rid].len) continue; // not long enough
144
+ if (tseq == 0) {
145
+ tseq = Kcalloc(km, uint8_t, (clip + ext) * 2); // tseq and qseq are allocated together
146
+ qseq = tseq + clip + ext;
147
+ mm_jump_get_qseq_seq(km, qlen, qseq0, r, 0, clip + ext, qseq);
148
+ }
149
+ tl1 = clip + (r->re - ai->off);
150
+ tlen = mm_idx_getseq2(mi, 0, r->rid, r->re - ext, ai->off, tseq);
151
+ assert(tlen == ai->off - (r->re - ext));
152
+ tlen = mm_idx_getseq2(mi, 0, r->rid, ai->off2, ai->off2 + tl1, &tseq[clip + ext - tl1]);
153
+ assert(tlen == tl1);
154
+ for (j = 0, mm2 = 0; j < clip + ext - tl1; ++j)
155
+ if (qseq[j] != tseq[j] || qseq[j] > 3 || tseq[j] > 3)
156
+ ++mm2;
157
+ for (mm1 = 0; j < clip + ext; ++j)
158
+ if (qseq[j] != tseq[j] || qseq[j] > 3 || tseq[j] > 3)
159
+ ++mm1;
160
+ if (mm1 == 0 && mm2 <= 1) {
161
+ if (ai->flag & MM_JUNC_ANNO) {
162
+ if (i0_anno < 0) i0_anno = i, mm0_anno = mm1 + mm2;
163
+ ++n_anno;
164
+ } else {
165
+ if (i0_misc < 0) i0_misc = i, mm0_misc = mm1 + mm2;
166
+ ++n_misc;
167
+ }
168
+ }
169
+ }
170
+ if (n_anno > 0) m = n_anno, i0 = i0_anno, mm0 = mm0_anno;
171
+ else m = n_misc, i0 = i0_misc, mm0 = mm0_misc;
172
+ kfree(km, tseq);
173
+
174
+ l = m > 0? r->re - a[i0].off : 0; // may be negative
175
+ if (m == 1 && clip + l >= opt->jump_min_match) { // add one more exon
176
+ mm_enlarge_cigar(r, 2);
177
+ r->p->cigar[r->p->n_cigar - 1] = ((r->p->cigar[r->p->n_cigar - 1]>>4) - l) << 4 | MM_CIGAR_MATCH;
178
+ r->p->cigar[r->p->n_cigar] = (a[i0].off2 - a[i0].off) << 4 | MM_CIGAR_N_SKIP;
179
+ r->p->cigar[r->p->n_cigar + 1] = (clip + l) << 4 | MM_CIGAR_MATCH;
180
+ r->p->n_cigar += 2;
181
+ r->re = a[i0].off2 + (clip + l);
182
+ if (!r->rev) r->qe = qlen;
183
+ else r->qs = 0;
184
+ r->blen += clip, r->mlen += clip - mm0;
185
+ r->p->dp_max0 += (clip - mm0) * opt->a - mm0 * opt->b;
186
+ r->p->dp_max += (clip - mm0) * opt->a - mm0 * opt->b;
187
+ if (!r->is_spliced) r->is_spliced = 1, r->p->dp_max += (opt->a + opt->b) + ((opt->a + opt->b) >> 1);
188
+ } else if (m > 0 && r->re > a[i0].off) { // trim by l; l is always positive
189
+ r->p->cigar[r->p->n_cigar - 1] -= l << 4 | MM_CIGAR_MATCH;
190
+ r->re -= l;
191
+ if (!r->rev) r->qe -= l;
192
+ else r->qs += l;
193
+ }
194
+ }
195
+
196
+ void mm_jump_split(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq, mm_reg1_t *r, int32_t ts_strand)
197
+ {
198
+ assert((opt->flag & MM_F_EQX) == 0);
199
+ mm_jump_split_left(km, mi, opt, qlen, qseq, r, ts_strand);
200
+ mm_jump_split_right(km, mi, opt, qlen, qseq, r, ts_strand);
201
+ }
@@ -31,6 +31,14 @@ void km_stat_print(const void *km);
31
31
  #define Kcalloc(km, type, cnt) ((type*)kcalloc((km), (cnt), sizeof(type)))
32
32
  #define Krealloc(km, type, ptr, cnt) ((type*)krealloc((km), (ptr), (cnt) * sizeof(type)))
33
33
 
34
/*
 * Kgrow(km, type, ptr, idx, cap): make sure index idx fits in array ptr.
 * If idx >= cap, cap is set to idx + 1, then grown by half of that plus 16,
 * and ptr is reallocated to cap elements through Krealloc(). Nothing happens
 * when idx < cap. idx is evaluated twice and cap several times, so pass
 * side-effect-free expressions; cap and ptr must be assignable.
 */
#define Kgrow(km, type, ptr, idx, cap) do { \
		if ((idx) >= (cap)) { \
			(cap) = (idx) + 1; \
			(cap) += ((cap)>>1) + 16; \
			(ptr) = Krealloc(km, type, ptr, (cap)); \
		} \
	} while (0)
41
+
34
42
  #define Kexpand(km, type, a, m) do { \
35
43
  (m) = (m) >= 4? (m) + ((m)>>1) : 16; \
36
44
  (a) = Krealloc(km, type, (a), (m)); \
data/ext/minimap2/ksw2.h CHANGED
@@ -15,7 +15,8 @@
15
15
  #define KSW_EZ_SPLICE_FOR 0x100
16
16
  #define KSW_EZ_SPLICE_REV 0x200
17
17
  #define KSW_EZ_SPLICE_FLANK 0x400
18
- #define KSW_EZ_SPLICE_CMPLX 0x800
18
+ #define KSW_EZ_SPLICE_CMPLX 0x800 // use the miniprot splice model
19
+ #define KSW_EZ_SPLICE_SCORE 0x1000 // use splice score
19
20
 
20
21
  // The subset of CIGAR operators used by ksw code.
21
22
  // Use MM_CIGAR_* from minimap.h if you need the full list.
@@ -24,6 +25,8 @@
24
25
  #define KSW_CIGAR_DEL 2
25
26
  #define KSW_CIGAR_N_SKIP 3
26
27
 
28
+ #define KSW_SPSC_OFFSET 64
29
+
27
30
  #ifdef __cplusplus
28
31
  extern "C" {
29
32
  #endif
@@ -69,7 +72,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
69
72
  int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez);
70
73
 
71
74
  void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
72
- int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez);
75
+ int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int end_bonus, int8_t junc_bonus, int8_t junc_pen, int flag, const uint8_t *junc, ksw_extz_t *ez);
73
76
 
74
77
  void ksw_extf2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez);
75
78
 
@@ -80,17 +80,17 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
80
80
  }
81
81
 
82
82
  void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
83
- int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez)
83
+ int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int end_bonus, int8_t junc_bonus, int8_t junc_pen, int flag, const uint8_t *junc, ksw_extz_t *ez)
84
84
  {
85
85
  extern void ksw_exts2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
86
- int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez);
86
+ int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int end_bonus, int8_t junc_bonus, int8_t junc_pen, int flag, const uint8_t *junc, ksw_extz_t *ez);
87
87
  extern void ksw_exts2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
88
- int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez);
88
+ int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int end_bonus, int8_t junc_bonus, int8_t junc_pen, int flag, const uint8_t *junc, ksw_extz_t *ez);
89
89
  if (ksw_simd < 0) ksw_simd = x86_simd();
90
90
  if (ksw_simd & SIMD_SSE4_1)
91
- ksw_exts2_sse41(km, qlen, query, tlen, target, m, mat, q, e, q2, noncan, zdrop, junc_bonus, flag, junc, ez);
91
+ ksw_exts2_sse41(km, qlen, query, tlen, target, m, mat, q, e, q2, noncan, zdrop, end_bonus, junc_bonus, junc_pen, flag, junc, ez);
92
92
  else if (ksw_simd & SIMD_SSE2)
93
- ksw_exts2_sse2(km, qlen, query, tlen, target, m, mat, q, e, q2, noncan, zdrop, junc_bonus, flag, junc, ez);
93
+ ksw_exts2_sse2(km, qlen, query, tlen, target, m, mat, q, e, q2, noncan, zdrop, end_bonus, junc_bonus, junc_pen, flag, junc, ez);
94
94
  else abort();
95
95
  }
96
96
  #endif
@@ -24,14 +24,14 @@
24
24
  #ifdef KSW_CPU_DISPATCH
25
25
  #ifdef __SSE4_1__
26
26
  void ksw_exts2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
27
- int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez)
27
+ int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int end_bonus, int8_t junc_bonus, int8_t junc_pen, int flag, const uint8_t *junc, ksw_extz_t *ez)
28
28
  #else
29
29
  void ksw_exts2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
30
- int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez)
30
+ int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int end_bonus, int8_t junc_bonus, int8_t junc_pen, int flag, const uint8_t *junc, ksw_extz_t *ez)
31
31
  #endif
32
32
  #else
33
33
  void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
34
- int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez)
34
+ int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int end_bonus, int8_t junc_bonus, int8_t junc_pen, int flag, const uint8_t *junc, ksw_extz_t *ez)
35
35
  #endif // ~KSW_CPU_DISPATCH
36
36
  {
37
37
  #define __dp_code_block1 \
@@ -191,7 +191,14 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
191
191
  }
192
192
  }
193
193
 
194
- if (junc) {
194
+ if (junc && (flag & KSW_EZ_SPLICE_SCORE)) { // junc[] keeps the donor score
195
+ uint8_t donor_val = !!(flag & KSW_EZ_SPLICE_FOR) == !(flag & KSW_EZ_REV_CIGAR)? 0 : 1;
196
+ for (t = 0; t < tlen - 1; ++t)
197
+ ((int8_t*)donor)[t] += junc[t+1] == 0xff || (junc[t+1]&1) != donor_val? -junc_pen : (int8_t)(junc[t+1]>>1) - (int8_t)KSW_SPSC_OFFSET;
198
+ for (t = 0; t < tlen - 1; ++t)
199
+ ((int8_t*)acceptor)[t] += junc[t+1] == 0xff || (junc[t+1]&1) != !donor_val? -junc_pen : (int8_t)(junc[t+1]>>1) - (int8_t)KSW_SPSC_OFFSET;
200
+ //for (t = 0; t < tlen - 1; ++t) if (junc[t+1] != 0xff) fprintf(stderr, "Y2\t%d\t%d\t%c\t%d\n", ((int8_t*)donor)[t], ((int8_t*)acceptor)[t], "DA"[junc[t+1]&1], (int8_t)(junc[t+1]>>1) - (int8_t)KSW_SPSC_OFFSET);
201
+ } else if (junc) { // junc[] keeps the splice sites
195
202
  if (!(flag & KSW_EZ_REV_CIGAR)) {
196
203
  for (t = 0; t < tlen - 1; ++t)
197
204
  if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t+1]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t+1]&8)))
@@ -445,10 +452,14 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
445
452
  if (!approx_max) kfree(km, H);
446
453
  if (with_cigar) { // backtrack
447
454
  int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
448
- if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
455
+ if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY)) {
449
456
  ksw_backtrack(km, 1, rev_cigar, long_thres, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
450
- else if (ez->max_t >= 0 && ez->max_q >= 0)
457
+ } else if (!ez->zdropped && (flag&KSW_EZ_EXTZ_ONLY) && ez->mqe + end_bonus > (int)ez->max) {
458
+ ez->reach_end = 1;
459
+ ksw_backtrack(km, 1, rev_cigar, long_thres, (uint8_t*)p, off, off_end, n_col_*16, ez->mqe_t, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
460
+ } else if (ez->max_t >= 0 && ez->max_q >= 0) {
451
461
  ksw_backtrack(km, 1, rev_cigar, long_thres, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
462
+ }
452
463
  kfree(km, mem2); kfree(km, off);
453
464
  }
454
465
  }
data/ext/minimap2/main.c CHANGED
@@ -35,12 +35,12 @@ static ko_longopt_t long_options[] = {
35
35
  { "splice", ko_no_argument, 310 },
36
36
  { "cost-non-gt-ag", ko_required_argument, 'C' },
37
37
  { "no-long-join", ko_no_argument, 312 },
38
- { "sr", ko_no_argument, 313 },
38
+ { "sr", ko_optional_argument, 313 },
39
39
  { "frag", ko_required_argument, 314 },
40
40
  { "secondary", ko_required_argument, 315 },
41
41
  { "cs", ko_optional_argument, 316 },
42
42
  { "end-bonus", ko_required_argument, 317 },
43
- { "no-pairing", ko_no_argument, 318 },
43
+ { "no-pairing", ko_no_argument, 318 }, // deprecated but reserved for backward compatibility
44
44
  { "splice-flank", ko_required_argument, 319 },
45
45
  { "idx-no-seq", ko_no_argument, 320 },
46
46
  { "end-seed-pen", ko_required_argument, 321 },
@@ -79,6 +79,12 @@ static ko_longopt_t long_options[] = {
79
79
  { "secondary-seq", ko_no_argument, 354 },
80
80
  { "ds", ko_no_argument, 355 },
81
81
  { "rmq-inner", ko_required_argument, 356 },
82
+ { "spsc", ko_required_argument, 357 },
83
+ { "junc-pen", ko_required_argument, 358 },
84
+ { "pairing", ko_required_argument, 359 },
85
+ { "jump-min-match", ko_required_argument, 360 },
86
+ { "write-junc", ko_no_argument, 361 },
87
+ { "pass1", ko_required_argument, 362 },
82
88
  { "dbg-seed-occ", ko_no_argument, 501 },
83
89
  { "help", ko_no_argument, 'h' },
84
90
  { "max-intron-len", ko_required_argument, 'G' },
@@ -123,12 +129,12 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const
123
129
 
124
130
  int main(int argc, char *argv[])
125
131
  {
126
- const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:b:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
132
+ const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:b:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:j:";
127
133
  ketopt_t o = KETOPT_INIT;
128
134
  mm_mapopt_t opt;
129
135
  mm_idxopt_t ipt;
130
136
  int i, c, n_threads = 3, n_parts, old_best_n = -1;
131
- char *fnw = 0, *rg = 0, *junc_bed = 0, *s, *alt_list = 0;
137
+ char *fnw = 0, *rg = 0, *fn_bed_junc = 0, *fn_bed_jump = 0, *fn_bed_pass1 = 0, *fn_spsc = 0, *s, *alt_list = 0;
132
138
  FILE *fp_help = stderr;
133
139
  mm_idx_reader_t *idx_rdr;
134
140
  mm_idx_t *mi;
@@ -190,6 +196,7 @@ int main(int argc, char *argv[])
190
196
  else if (c == 'R') rg = o.arg;
191
197
  else if (c == 'h') fp_help = stdout;
192
198
  else if (c == '2') opt.flag |= MM_F_2_IO_THREADS;
199
+ else if (c == 'j') fn_bed_jump = o.arg;
193
200
  else if (c == 'J') {
194
201
  int t;
195
202
  t = atoi(o.arg);
@@ -214,9 +221,8 @@ int main(int argc, char *argv[])
214
221
  else if (c == 309) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ, n_threads = 1; // --print-aln-seq
215
222
  else if (c == 310) opt.flag |= MM_F_SPLICE; // --splice
216
223
  else if (c == 312) opt.flag |= MM_F_NO_LJOIN; // --no-long-join
217
- else if (c == 313) opt.flag |= MM_F_SR; // --sr
218
224
  else if (c == 317) opt.end_bonus = atoi(o.arg); // --end-bonus
219
- else if (c == 318) opt.flag |= MM_F_INDEPEND_SEG; // --no-pairing
225
+ else if (c == 318) opt.flag |= MM_F_INDEPEND_SEG; // --no-pairing (deprecated)
220
226
  else if (c == 320) ipt.flag |= MM_I_NO_SEQ; // --idx-no-seq
221
227
  else if (c == 321) opt.anchor_ext_shift = atoi(o.arg); // --end-seed-pen
222
228
  else if (c == 322) opt.flag |= MM_F_FOR_ONLY; // --for-only
@@ -232,8 +238,9 @@ int main(int argc, char *argv[])
232
238
  else if (c == 336) opt.flag |= MM_F_HARD_MLEVEL; // --hard-mask-level
233
239
  else if (c == 337) opt.max_sw_mat = mm_parse_num(o.arg); // --cap-sw-mat
234
240
  else if (c == 338) opt.max_qlen = mm_parse_num(o.arg); // --max-qlen
235
- else if (c == 340) junc_bed = o.arg; // --junc-bed
241
+ else if (c == 340) fn_bed_junc = o.arg; // --junc-bed
236
242
  else if (c == 341) opt.junc_bonus = atoi(o.arg); // --junc-bonus
243
+ else if (c == 358) opt.junc_pen = atoi(o.arg); // --junc-pen
237
244
  else if (c == 342) opt.flag |= MM_F_SAM_HIT_ONLY; // --sam-hit-only
238
245
  else if (c == 343) opt.chain_gap_scale = atof(o.arg); // --chain-gap-scale
239
246
  else if (c == 351) opt.chain_skip_scale = atof(o.arg); // --chain-skip-scale
@@ -248,9 +255,24 @@ int main(int argc, char *argv[])
248
255
  else if (c == 354) opt.flag |= MM_F_SECONDARY_SEQ; // --secondary-seq
249
256
  else if (c == 355) opt.flag |= MM_F_OUT_DS; // --ds
250
257
  else if (c == 356) opt.rmq_inner_dist = mm_parse_num(o.arg); // --rmq-inner
258
+ else if (c == 357) fn_spsc = o.arg; // --spsc
259
+ else if (c == 360) opt.jump_min_match = mm_parse_num(o.arg); // --jump-min-match
260
+ else if (c == 361) opt.flag |= MM_F_OUT_JUNC | MM_F_CIGAR; // --write-junc
261
+ else if (c == 362) fn_bed_pass1 = o.arg; // --jump-pass1
251
262
  else if (c == 501) mm_dbg_flag |= MM_DBG_SEED_FREQ; // --dbg-seed-occ
252
263
  else if (c == 330) {
253
264
  fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
265
+ } else if (c == 313) { // --sr
266
+ if (o.arg == 0 || strcmp(o.arg, "dna") == 0) {
267
+ opt.flag |= MM_F_SR;
268
+ } else if (strcmp(o.arg, "rna") == 0) {
269
+ opt.flag |= MM_F_SR_RNA;
270
+ } else if (strcmp(o.arg, "no") == 0) {
271
+ opt.flag &= ~(uint64_t)(MM_F_SR|MM_F_SR_RNA);
272
+ } else if (mm_verbose >= 2) {
273
+ opt.flag |= MM_F_SR;
274
+ fprintf(stderr, "[WARNING]\033[1;31m --sr only takes 'dna' or 'rna'. Invalid values are assumed to be 'dna'.\033[0m\n");
275
+ }
254
276
  } else if (c == 314) { // --frag
255
277
  yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1);
256
278
  } else if (c == 315) { // --secondary
@@ -275,6 +297,14 @@ int main(int argc, char *argv[])
275
297
  } else if (c == 347) { // --rmq
276
298
  if (o.arg) yes_or_no(&opt, MM_F_RMQ, o.longidx, o.arg, 1);
277
299
  else opt.flag |= MM_F_RMQ;
300
+ } else if (c == 359) { // --pairing
301
+ if (strcmp(o.arg, "no") == 0) opt.flag |= MM_F_INDEPEND_SEG;
302
+ else if (strcmp(o.arg, "weak") == 0) opt.flag |= MM_F_WEAK_PAIRING, opt.flag &= ~(uint64_t)MM_F_INDEPEND_SEG;
303
+ else {
304
+ if (strcmp(o.arg, "strong") != 0 && mm_verbose >= 2)
305
+ fprintf(stderr, "[WARNING]\033[1;31m unrecognized argument for --pairing; assuming 'strong'.\033[0m\n");
306
+ opt.flag &= ~(uint64_t)(MM_F_INDEPEND_SEG|MM_F_WEAK_PAIRING);
307
+ }
278
308
  } else if (c == 'S') {
279
309
  opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG;
280
310
  if (mm_verbose >= 2)
@@ -315,10 +345,6 @@ int main(int argc, char *argv[])
315
345
  if (*s == ',') opt.e2 = strtol(s + 1, &s, 10);
316
346
  }
317
347
  }
318
- if ((opt.flag & MM_F_SPLICE) && (opt.flag & MM_F_FRAG_MODE)) {
319
- fprintf(stderr, "[ERROR]\033[1;31m --splice and --frag should not be specified at the same time.\033[0m\n");
320
- return 1;
321
- }
322
348
  if (!fnw && !(opt.flag&MM_F_CIGAR))
323
349
  ipt.flag |= MM_I_NO_SEQ;
324
350
  if (mm_check_opt(&ipt, &opt) < 0)
@@ -358,6 +384,7 @@ int main(int argc, char *argv[])
358
384
  fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max);
359
385
  fprintf(fp_help, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n");
360
386
  fprintf(fp_help, " -J INT splice mode. 0: original minimap2 model; 1: miniprot model [1]\n");
387
+ fprintf(fp_help, " -j FILE junctions in BED12 to extend *short* RNA-seq alignment []\n");
361
388
  fprintf(fp_help, " Input/Output:\n");
362
389
  fprintf(fp_help, " -a output in the SAM format (PAF by default)\n");
363
390
  fprintf(fp_help, " -o FILE output alignments to FILE [stdout]\n");
@@ -369,6 +396,7 @@ int main(int argc, char *argv[])
369
396
  fprintf(fp_help, " --MD output the MD tag\n");
370
397
  fprintf(fp_help, " --eqx write =/X CIGAR operators\n");
371
398
  fprintf(fp_help, " -Y use soft clipping for supplementary alignments\n");
399
+ fprintf(fp_help, " -y copy FASTA/Q comments to output SAM\n");
372
400
  fprintf(fp_help, " -t INT number of threads [%d]\n", n_threads);
373
401
  fprintf(fp_help, " -K NUM minibatch size for mapping [500M]\n");
374
402
  // fprintf(fp_help, " -v INT verbose level [%d]\n", mm_verbose);
@@ -377,6 +405,7 @@ int main(int argc, char *argv[])
377
405
  fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n");
378
406
  fprintf(fp_help, " - lr:hq - accurate long reads (error rate <1%%) against a reference genome\n");
379
407
  fprintf(fp_help, " - splice/splice:hq - spliced alignment for long reads/accurate long reads\n");
408
+ fprintf(fp_help, " - splice:sr - spliced alignment for short RNA-seq reads\n");
380
409
  fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");
381
410
  fprintf(fp_help, " - sr - short reads against a reference\n");
382
411
  fprintf(fp_help, " - map-pb/map-hifi/map-ont/map-iclr - CLR/HiFi/Nanopore/ICLR vs reference mapping\n");
@@ -431,7 +460,26 @@ int main(int argc, char *argv[])
431
460
  __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq);
432
461
  if (argc != o.ind + 1) mm_mapopt_update(&opt, mi);
433
462
  if (mm_verbose >= 3) mm_idx_stat(mi);
434
- if (junc_bed) mm_idx_bed_read(mi, junc_bed, 1);
463
+ if (fn_bed_junc) {
464
+ mm_idx_bed_read(mi, fn_bed_junc, 1);
465
+ if (mi->I == 0 && mm_verbose >= 2)
466
+ fprintf(stderr, "[WARNING] failed to load the junction BED file\n");
467
+ }
468
+ if (fn_bed_jump) {
469
+ mm_idx_jjump_read(mi, fn_bed_jump, MM_JUNC_ANNO, -1);
470
+ if (mi->J == 0 && mm_verbose >= 2)
471
+ fprintf(stderr, "[WARNING] failed to load the jump BED file\n");
472
+ }
473
+ if (fn_bed_pass1) {
474
+ mm_idx_jjump_read(mi, fn_bed_pass1, MM_JUNC_MISC, 5);
475
+ if (mi->J == 0 && mm_verbose >= 2)
476
+ fprintf(stderr, "[WARNING] failed to load the pass-1 jump BED file\n");
477
+ }
478
+ if (fn_spsc) {
479
+ mm_idx_spsc_read(mi, fn_spsc, mm_max_spsc_bonus(&opt));
480
+ if (mi->spsc == 0 && mm_verbose >= 2)
481
+ fprintf(stderr, "[WARNING] failed to load the splice score file\n");
482
+ }
435
483
  if (alt_list) mm_idx_alt_read(mi, alt_list);
436
484
  if (argc - (o.ind + 1) == 0) {
437
485
  mm_idx_destroy(mi);
data/ext/minimap2/map.c CHANGED
@@ -224,10 +224,10 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k
224
224
  return regs;
225
225
  }
226
226
 
227
- void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
227
+ void mm_map_frag_core(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
228
228
  {
229
229
  int i, j, rep_len, qlen_sum, n_regs0, n_mini_pos;
230
- int max_chain_gap_qry, max_chain_gap_ref, is_splice = !!(opt->flag & MM_F_SPLICE), is_sr = !!(opt->flag & MM_F_SR);
230
+ int max_chain_gap_qry, max_chain_gap_ref, is_splice = !!(opt->flag & MM_F_SPLICE), is_sr = !!(opt->flag & MM_F_SR), is_sr_rna = !!(opt->flag & MM_F_SR_RNA);
231
231
  uint32_t hash;
232
232
  int64_t n_a;
233
233
  uint64_t *u, *mini_pos;
@@ -338,7 +338,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
338
338
  if (n_segs == 1) { // uni-segment
339
339
  regs0 = align_regs(opt, mi, b->km, qlens[0], seqs[0], &n_regs0, regs0, a);
340
340
  regs0 = (mm_reg1_t*)realloc(regs0, sizeof(*regs0) * n_regs0);
341
- mm_set_mapq(b->km, n_regs0, regs0, opt->min_chain_score, opt->a, rep_len, is_sr);
341
+ mm_set_mapq2(b->km, n_regs0, regs0, opt->min_chain_score, opt->a, rep_len, is_sr || is_sr_rna, is_splice);
342
342
  n_regs[0] = n_regs0, regs[0] = regs0;
343
343
  } else { // multi-segment
344
344
  mm_seg_t *seg;
@@ -347,7 +347,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
347
347
  for (i = 0; i < n_segs; ++i) {
348
348
  mm_set_parent(b->km, opt->mask_level, opt->mask_len, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); // update mm_reg1_t::parent
349
349
  regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], &n_regs[i], regs[i], seg[i].a);
350
- mm_set_mapq(b->km, n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len, is_sr);
350
+ mm_set_mapq2(b->km, n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len, is_sr || is_sr_rna, is_splice);
351
351
  }
352
352
  mm_seg_free(b->km, n_segs, seg);
353
353
  if (n_segs == 2 && opt->pe_ori >= 0 && (opt->flag&MM_F_CIGAR))
@@ -359,6 +359,10 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
359
359
  kfree(b->km, u);
360
360
  kfree(b->km, mini_pos);
361
361
 
362
+ if (mi->J && n_segs == 1 && is_splice)
363
+ for (i = 0; i < n_regs0; ++i)
364
+ mm_jump_split(b->km, mi, opt, qlens[0], (const uint8_t*)seqs[0], &regs0[i], 0);
365
+
362
366
  if (b->km) {
363
367
  km_stat(b->km, &kmst);
364
368
  if (mm_dbg_flag & MM_DBG_PRINT_QNAME)
@@ -373,6 +377,18 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
373
377
  }
374
378
  }
375
379
 
380
+ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
381
+ {
382
+ if ((opt->flag & MM_F_WEAK_PAIRING) && n_segs == 2 && opt->pe_ori >= 0 && (opt->flag&MM_F_CIGAR)) {
383
+ int i;
384
+ for (i = 0; i < n_segs; ++i)
385
+ mm_map_frag_core(mi, 1, &qlens[i], &seqs[i], &n_regs[i], &regs[i], b, opt, qname);
386
+ mm_pair(b->km, opt->max_gap_ref, opt->pe_bonus, opt->a * 2 + opt->b, opt->a, qlens, n_regs, regs);
387
+ } else {
388
+ mm_map_frag_core(mi, n_segs, qlens, seqs, n_regs, regs, b, opt, qname);
389
+ }
390
+ }
391
+
376
392
  mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
377
393
  {
378
394
  mm_reg1_t *regs;
@@ -447,6 +463,10 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback
447
463
  r->qs = qlens[j] - r->qe;
448
464
  r->qe = qlens[j] - t;
449
465
  r->rev = !r->rev;
466
+ if (r->p) {
467
+ if (r->p->trans_strand == 1) r->p->trans_strand = 2;
468
+ else if (r->p->trans_strand == 2) r->p->trans_strand = 1;
469
+ }
450
470
  }
451
471
  }
452
472
  if (mm_dbg_flag & MM_DBG_PRINT_QNAME)
@@ -509,7 +529,7 @@ static void merge_hits(step_t *s)
509
529
  mm_select_sub(km, opt->pri_ratio, s->p->mi->k*2, opt->best_n, 0, opt->max_gap * 0.8, &s->n_reg[k], s->reg[k]);
510
530
  mm_set_sam_pri(s->n_reg[k], s->reg[k]);
511
531
  }
512
- mm_set_mapq(km, s->n_reg[k], s->reg[k], opt->min_chain_score, opt->a, rep_len, !!(opt->flag & MM_F_SR));
532
+ mm_set_mapq2(km, s->n_reg[k], s->reg[k], opt->min_chain_score, opt->a, rep_len, !!(opt->flag & (MM_F_SR|MM_F_SR_RNA)), !!(opt->flag & MM_F_SPLICE));
513
533
  }
514
534
  if (s->n_seg[f] == 2 && opt->pe_ori >= 0 && (opt->flag&MM_F_CIGAR))
515
535
  mm_pair(km, frag_gap_part[0], opt->pe_bonus, opt->a * 2 + opt->b, opt->a, qlens, &s->n_reg[k0], &s->reg[k0]);
@@ -578,23 +598,30 @@ static void *worker_pipeline(void *shared, int step, void *in)
578
598
  mm_err_fwrite(r->p, r->p->capacity, 4, p->fp_split);
579
599
  }
580
600
  }
601
+ } else if (p->opt->flag & MM_F_OUT_JUNC) { // extra logic for --write-junc
602
+ for (j = 0; j < s->n_reg[i]; ++j) {
603
+ const mm_reg1_t *r = &s->reg[i][j];
604
+ if (r->id != r->parent || r->mapq < 10) continue;
605
+ mm_write_junc(&p->str, mi, t, r);
606
+ if (p->str.l > 0) mm_err_puts(p->str.s);
607
+ }
581
608
  } else if (s->n_reg[i] > 0) { // the query has at least one hit
582
609
  for (j = 0; j < s->n_reg[i]; ++j) {
583
- mm_reg1_t *r = &s->reg[i][j];
610
+ const mm_reg1_t *r = &s->reg[i][j];
584
611
  assert(!r->sam_pri || r->id == r->parent);
585
612
  if ((p->opt->flag & MM_F_NO_PRINT_2ND) && r->id != r->parent)
586
613
  continue;
587
614
  if (p->opt->flag & MM_F_OUT_SAM)
588
615
  mm_write_sam3(&p->str, mi, t, i - seg_st, j, s->n_seg[k], &s->n_reg[seg_st], (const mm_reg1_t*const*)&s->reg[seg_st], km, p->opt->flag, s->rep_len[i]);
589
616
  else
590
- mm_write_paf3(&p->str, mi, t, r, km, p->opt->flag, s->rep_len[i]);
617
+ mm_write_paf4(&p->str, mi, t, r, km, p->opt->flag, s->rep_len[i], s->n_seg[k], i - seg_st);
591
618
  mm_err_puts(p->str.s);
592
619
  }
593
620
  } else if ((p->opt->flag & MM_F_PAF_NO_HIT) || ((p->opt->flag & MM_F_OUT_SAM) && !(p->opt->flag & MM_F_SAM_HIT_ONLY))) { // output an empty hit, if requested
594
621
  if (p->opt->flag & MM_F_OUT_SAM)
595
622
  mm_write_sam3(&p->str, mi, t, i - seg_st, -1, s->n_seg[k], &s->n_reg[seg_st], (const mm_reg1_t*const*)&s->reg[seg_st], km, p->opt->flag, s->rep_len[i]);
596
623
  else
597
- mm_write_paf3(&p->str, mi, t, 0, 0, p->opt->flag, s->rep_len[i]);
624
+ mm_write_paf4(&p->str, mi, t, 0, 0, p->opt->flag, s->rep_len[i], s->n_seg[k], i - seg_st);
598
625
  mm_err_puts(p->str.s);
599
626
  }
600
627
  }