minimap2 0.2.22.0 → 0.2.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,559 @@
1
+ #include <stdarg.h>
2
+ #include <stdlib.h>
3
+ #include <string.h>
4
+ #include <assert.h>
5
+ #include <stdio.h>
6
+ #include "kalloc.h"
7
+ #include "mmpriv.h"
8
+
9
+ static char mm_rg_id[256]; // read-group ID; cleared and refilled by sam_write_rg_line(), emitted as RG:Z: by mm_write_sam3() when non-empty
10
+
11
+ static inline void str_enlarge(kstring_t *s, int l)
12
+ {
13
+ if (s->l + l + 1 > s->m) {
14
+ s->m = s->l + l + 1;
15
+ kroundup32(s->m);
16
+ s->s = (char*)realloc(s->s, s->m);
17
+ }
18
+ }
19
+
20
+ static inline void str_copy(kstring_t *s, const char *st, const char *en)
21
+ {
22
+ str_enlarge(s, en - st);
23
+ memcpy(&s->s[s->l], st, en - st);
24
+ s->l += en - st;
25
+ }
26
+
27
+ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...)
28
+ {
29
+ char buf[16]; // for integer to string conversion
30
+ const char *p, *q;
31
+ va_list ap;
32
+ va_start(ap, fmt);
33
+ for (q = p = fmt; *p; ++p) {
34
+ if (*p == '%') {
35
+ if (p > q) str_copy(s, q, p);
36
+ ++p;
37
+ if (*p == 'd') {
38
+ int c, i, l = 0;
39
+ unsigned int x;
40
+ c = va_arg(ap, int);
41
+ x = c >= 0? c : -c;
42
+ do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
43
+ if (c < 0) buf[l++] = '-';
44
+ str_enlarge(s, l);
45
+ for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
46
+ } else if (*p == 'u') {
47
+ int i, l = 0;
48
+ uint32_t x;
49
+ x = va_arg(ap, uint32_t);
50
+ do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
51
+ str_enlarge(s, l);
52
+ for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
53
+ } else if (*p == 's') {
54
+ char *r = va_arg(ap, char*);
55
+ str_copy(s, r, r + strlen(r));
56
+ } else if (*p == 'c') {
57
+ str_enlarge(s, 1);
58
+ s->s[s->l++] = va_arg(ap, int);
59
+ } else abort();
60
+ q = p + 1;
61
+ }
62
+ }
63
+ if (p > q) str_copy(s, q, p);
64
+ va_end(ap);
65
+ s->s[s->l] = 0;
66
+ }
67
+
68
/*
 * Decode backslash escapes in `s` in place and return `s`.
 *
 * "\t" (backslash, 't') becomes a tab and "\\" becomes a single backslash.
 * Any other escaped character is dropped together with its backslash.
 * A backslash that is the last character of the string is dropped and ends
 * the scan; without that check the loop would step past the terminator.
 */
static char *mm_escape(char *s)
{
	char *p, *q; // p reads, q writes; q never overtakes p
	for (p = q = s; *p; ++p) {
		if (*p == '\\') {
			++p;
			if (*p == '\0') break; // dangling backslash at end of input
			if (*p == 't') *q++ = '\t';
			else if (*p == '\\') *q++ = '\\';
		} else *q++ = *p;
	}
	*q = '\0';
	return s;
}
81
+
82
+ static int sam_write_rg_line(kstring_t *str, const char *s)
83
+ {
84
+ char *p, *q, *r, *rg_line = 0;
85
+ memset(mm_rg_id, 0, 256);
86
+ if (s == 0) return 0;
87
+ if (strstr(s, "@RG") != s) {
88
+ if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line is not started with @RG\n");
89
+ goto err_set_rg;
90
+ }
91
+ if (strstr(s, "\t") != NULL) {
92
+ if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line contained literal <tab> characters -- replace with escaped tabs: \\t\n");
93
+ goto err_set_rg;
94
+ }
95
+ rg_line = (char*)malloc(strlen(s) + 1);
96
+ strcpy(rg_line, s);
97
+ mm_escape(rg_line);
98
+ if ((p = strstr(rg_line, "\tID:")) == 0) {
99
+ if (mm_verbose >= 1) fprintf(stderr, "[ERROR] no ID within the read group line\n");
100
+ goto err_set_rg;
101
+ }
102
+ p += 4;
103
+ for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
104
+ if (q - p + 1 > 256) {
105
+ if (mm_verbose >= 1) fprintf(stderr, "[ERROR] @RG:ID is longer than 255 characters\n");
106
+ goto err_set_rg;
107
+ }
108
+ for (q = p, r = mm_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
109
+ *r++ = *q;
110
+ mm_sprintf_lite(str, "%s\n", rg_line);
111
+ return 0;
112
+
113
+ err_set_rg:
114
+ free(rg_line);
115
+ return -1;
116
+ }
117
+
118
+ int mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int argc, char *argv[])
119
+ {
120
+ kstring_t str = {0,0,0};
121
+ int ret = 0;
122
+ if (idx) {
123
+ uint32_t i;
124
+ for (i = 0; i < idx->n_seq; ++i)
125
+ mm_sprintf_lite(&str, "@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
126
+ }
127
+ if (rg) ret = sam_write_rg_line(&str, rg);
128
+ mm_sprintf_lite(&str, "@PG\tID:minimap2\tPN:minimap2");
129
+ if (ver) mm_sprintf_lite(&str, "\tVN:%s", ver);
130
+ if (argc > 1) {
131
+ int i;
132
+ mm_sprintf_lite(&str, "\tCL:minimap2");
133
+ for (i = 1; i < argc; ++i)
134
+ mm_sprintf_lite(&str, " %s", argv[i]);
135
+ }
136
+ mm_err_puts(str.s);
137
+ free(str.s);
138
+ return ret;
139
+ }
140
+
141
+ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int write_tag)
142
+ {
143
+ int i, q_off, t_off;
144
+ if (write_tag) mm_sprintf_lite(s, "\tcs:Z:");
145
+ for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
146
+ int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
147
+ assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
148
+ if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) {
149
+ int l_tmp = 0;
150
+ for (j = 0; j < len; ++j) {
151
+ if (qseq[q_off + j] != tseq[t_off + j]) {
152
+ if (l_tmp > 0) {
153
+ if (!no_iden) {
154
+ tmp[l_tmp] = 0;
155
+ mm_sprintf_lite(s, "=%s", tmp);
156
+ } else mm_sprintf_lite(s, ":%d", l_tmp);
157
+ l_tmp = 0;
158
+ }
159
+ mm_sprintf_lite(s, "*%c%c", "acgtn"[tseq[t_off + j]], "acgtn"[qseq[q_off + j]]);
160
+ } else tmp[l_tmp++] = "ACGTN"[qseq[q_off + j]];
161
+ }
162
+ if (l_tmp > 0) {
163
+ if (!no_iden) {
164
+ tmp[l_tmp] = 0;
165
+ mm_sprintf_lite(s, "=%s", tmp);
166
+ } else mm_sprintf_lite(s, ":%d", l_tmp);
167
+ }
168
+ q_off += len, t_off += len;
169
+ } else if (op == MM_CIGAR_INS) {
170
+ for (j = 0, tmp[len] = 0; j < len; ++j)
171
+ tmp[j] = "acgtn"[qseq[q_off + j]];
172
+ mm_sprintf_lite(s, "+%s", tmp);
173
+ q_off += len;
174
+ } else if (op == MM_CIGAR_DEL) {
175
+ for (j = 0, tmp[len] = 0; j < len; ++j)
176
+ tmp[j] = "acgtn"[tseq[t_off + j]];
177
+ mm_sprintf_lite(s, "-%s", tmp);
178
+ t_off += len;
179
+ } else { // intron
180
+ assert(len >= 2);
181
+ mm_sprintf_lite(s, "~%c%c%d%c%c", "acgtn"[tseq[t_off]], "acgtn"[tseq[t_off+1]],
182
+ len, "acgtn"[tseq[t_off+len-2]], "acgtn"[tseq[t_off+len-1]]);
183
+ t_off += len;
184
+ }
185
+ }
186
+ assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
187
+ }
188
+
189
+ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int write_tag)
190
+ {
191
+ int i, q_off, t_off, l_MD = 0;
192
+ if (write_tag) mm_sprintf_lite(s, "\tMD:Z:");
193
+ for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
194
+ int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
195
+ assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
196
+ if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) {
197
+ for (j = 0; j < len; ++j) {
198
+ if (qseq[q_off + j] != tseq[t_off + j]) {
199
+ mm_sprintf_lite(s, "%d%c", l_MD, "ACGTN"[tseq[t_off + j]]);
200
+ l_MD = 0;
201
+ } else ++l_MD;
202
+ }
203
+ q_off += len, t_off += len;
204
+ } else if (op == MM_CIGAR_INS) {
205
+ q_off += len;
206
+ } else if (op == MM_CIGAR_DEL) {
207
+ for (j = 0, tmp[len] = 0; j < len; ++j)
208
+ tmp[j] = "ACGTN"[tseq[t_off + j]];
209
+ mm_sprintf_lite(s, "%d^%s", l_MD, tmp);
210
+ l_MD = 0;
211
+ t_off += len;
212
+ } else if (op == MM_CIGAR_N_SKIP) {
213
+ t_off += len;
214
+ }
215
+ }
216
+ if (l_MD > 0) mm_sprintf_lite(s, "%d", l_MD);
217
+ assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
218
+ }
219
+
220
+ static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int write_tag, int is_qstrand)
221
+ {
222
+ extern unsigned char seq_nt4_table[256];
223
+ int i;
224
+ uint8_t *qseq, *tseq;
225
+ char *tmp;
226
+ if (r->p == 0) return;
227
+ qseq = (uint8_t*)kmalloc(km, r->qe - r->qs);
228
+ tseq = (uint8_t*)kmalloc(km, r->re - r->rs);
229
+ tmp = (char*)kmalloc(km, r->re - r->rs > r->qe - r->qs? r->re - r->rs + 1 : r->qe - r->qs + 1);
230
+ if (is_qstrand) {
231
+ mm_idx_getseq2(mi, r->rev, r->rid, r->rs, r->re, tseq);
232
+ for (i = r->qs; i < r->qe; ++i)
233
+ qseq[i - r->qs] = seq_nt4_table[(uint8_t)t->seq[i]];
234
+ } else {
235
+ mm_idx_getseq(mi, r->rid, r->rs, r->re, tseq);
236
+ if (!r->rev) {
237
+ for (i = r->qs; i < r->qe; ++i)
238
+ qseq[i - r->qs] = seq_nt4_table[(uint8_t)t->seq[i]];
239
+ } else {
240
+ for (i = r->qs; i < r->qe; ++i) {
241
+ uint8_t c = seq_nt4_table[(uint8_t)t->seq[i]];
242
+ qseq[r->qe - i - 1] = c >= 4? 4 : 3 - c;
243
+ }
244
+ }
245
+ }
246
+ if (is_MD) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
247
+ else write_cs_core(s, tseq, qseq, r, tmp, no_iden, write_tag);
248
+ kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
249
+ }
250
+
251
+ int mm_gen_cs_or_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int is_MD, int no_iden, int is_qstrand)
252
+ {
253
+ mm_bseq1_t t;
254
+ kstring_t str;
255
+ str.s = *buf, str.l = 0, str.m = *max_len;
256
+ t.l_seq = strlen(seq);
257
+ t.seq = (char*)seq;
258
+ write_cs_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0, is_qstrand);
259
+ *max_len = str.m;
260
+ *buf = str.s;
261
+ return str.l;
262
+ }
263
+
264
+ int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden)
265
+ {
266
+ return mm_gen_cs_or_MD(km, buf, max_len, mi, r, seq, 0, no_iden, 0);
267
+ }
268
+
269
+ int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq)
270
+ {
271
+ return mm_gen_cs_or_MD(km, buf, max_len, mi, r, seq, 1, 0, 0);
272
+ }
273
+
274
+ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
275
+ {
276
+ int type;
277
+ if (r->id == r->parent) type = r->inv? 'I' : 'P';
278
+ else type = r->inv? 'i' : 'S';
279
+ if (r->p) {
280
+ mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->dp_max, r->p->dp_score, r->p->n_ambi);
281
+ if (r->p->trans_strand == 1 || r->p->trans_strand == 2)
282
+ mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]);
283
+ }
284
+ mm_sprintf_lite(s, "\ttp:A:%c\tcm:i:%d\ts1:i:%d", type, r->cnt, r->score);
285
+ if (r->parent == r->id) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc);
286
+ if (r->p) {
287
+ char buf[16];
288
+ double div;
289
+ div = 1.0 - mm_event_identity(r);
290
+ if (div == 0.0) buf[0] = '0', buf[1] = 0;
291
+ else snprintf(buf, 16, "%.4f", 1.0 - mm_event_identity(r));
292
+ mm_sprintf_lite(s, "\tde:f:%s", buf);
293
+ } else if (r->div >= 0.0f && r->div <= 1.0f) {
294
+ char buf[16];
295
+ if (r->div == 0.0f) buf[0] = '0', buf[1] = 0;
296
+ else snprintf(buf, 16, "%.4f", r->div);
297
+ mm_sprintf_lite(s, "\tdv:f:%s", buf);
298
+ }
299
+ if (r->split) mm_sprintf_lite(s, "\tzd:i:%d", r->split);
300
+ }
301
+
302
+ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len)
303
+ {
304
+ s->l = 0;
305
+ if (r == 0) {
306
+ mm_sprintf_lite(s, "%s\t%d\t0\t0\t*\t*\t0\t0\t0\t0\t0\t0", t->name, t->l_seq);
307
+ if (rep_len >= 0) mm_sprintf_lite(s, "\trl:i:%d", rep_len);
308
+ return;
309
+ }
310
+ mm_sprintf_lite(s, "%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]);
311
+ if (mi->seq[r->rid].name) mm_sprintf_lite(s, "%s", mi->seq[r->rid].name);
312
+ else mm_sprintf_lite(s, "%d", r->rid);
313
+ mm_sprintf_lite(s, "\t%d", mi->seq[r->rid].len);
314
+ if ((opt_flag & MM_F_QSTRAND) && r->rev)
315
+ mm_sprintf_lite(s, "\t%d\t%d", mi->seq[r->rid].len - r->re, mi->seq[r->rid].len - r->rs);
316
+ else
317
+ mm_sprintf_lite(s, "\t%d\t%d", r->rs, r->re);
318
+ mm_sprintf_lite(s, "\t%d\t%d", r->mlen, r->blen);
319
+ mm_sprintf_lite(s, "\t%d", r->mapq);
320
+ write_tags(s, r);
321
+ if (rep_len >= 0) mm_sprintf_lite(s, "\trl:i:%d", rep_len);
322
+ if (r->p && (opt_flag & MM_F_OUT_CG)) {
323
+ uint32_t k;
324
+ mm_sprintf_lite(s, "\tcg:Z:");
325
+ for (k = 0; k < r->p->n_cigar; ++k)
326
+ mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
327
+ }
328
+ if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
329
+ write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1, !!(opt_flag&MM_F_QSTRAND));
330
+ if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
331
+ mm_sprintf_lite(s, "\t%s", t->comment);
332
+ }
333
+
334
+ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag)
335
+ {
336
+ mm_write_paf3(s, mi, t, r, km, opt_flag, -1);
337
+ }
338
+
339
+ static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp)
340
+ {
341
+ extern unsigned char seq_comp_table[256];
342
+ if (rev) {
343
+ int i;
344
+ str_enlarge(s, l);
345
+ for (i = 0; i < l; ++i) {
346
+ int c = seq[l - 1 - i];
347
+ s->s[s->l + i] = c < 128 && comp? seq_comp_table[c] : c;
348
+ }
349
+ s->l += l;
350
+ } else str_copy(s, seq, seq + l);
351
+ }
352
+
353
+ static inline const mm_reg1_t *get_sam_pri(int n_regs, const mm_reg1_t *regs)
354
+ {
355
+ int i;
356
+ for (i = 0; i < n_regs; ++i)
357
+ if (regs[i].sam_pri)
358
+ return &regs[i];
359
+ assert(n_regs == 0);
360
+ return NULL;
361
+ }
362
+
363
+ static void write_sam_cigar(kstring_t *s, int sam_flag, int in_tag, int qlen, const mm_reg1_t *r, int64_t opt_flag)
364
+ {
365
+ if (r->p == 0) {
366
+ mm_sprintf_lite(s, "*");
367
+ } else {
368
+ uint32_t k, clip_len[2];
369
+ clip_len[0] = r->rev? qlen - r->qe : r->qs;
370
+ clip_len[1] = r->rev? r->qs : qlen - r->qe;
371
+ if (in_tag) {
372
+ int clip_char = (sam_flag&0x800) && !(opt_flag&MM_F_SOFTCLIP)? 5 : 4;
373
+ mm_sprintf_lite(s, "\tCG:B:I");
374
+ if (clip_len[0]) mm_sprintf_lite(s, ",%u", clip_len[0]<<4|clip_char);
375
+ for (k = 0; k < r->p->n_cigar; ++k)
376
+ mm_sprintf_lite(s, ",%u", r->p->cigar[k]);
377
+ if (clip_len[1]) mm_sprintf_lite(s, ",%u", clip_len[1]<<4|clip_char);
378
+ } else {
379
+ int clip_char = (sam_flag&0x800) && !(opt_flag&MM_F_SOFTCLIP)? 'H' : 'S';
380
+ assert(clip_len[0] < qlen && clip_len[1] < qlen);
381
+ if (clip_len[0]) mm_sprintf_lite(s, "%d%c", clip_len[0], clip_char);
382
+ for (k = 0; k < r->p->n_cigar; ++k)
383
+ mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
384
+ if (clip_len[1]) mm_sprintf_lite(s, "%d%c", clip_len[1], clip_char);
385
+ }
386
+ }
387
+ }
388
+
389
+ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regss, const mm_reg1_t *const* regss, void *km, int64_t opt_flag, int rep_len)
390
+ {
391
+ const int max_bam_cigar_op = 65535;
392
+ int flag, n_regs = n_regss[seg_idx], cigar_in_tag = 0;
393
+ int this_rid = -1, this_pos = -1;
394
+ const mm_reg1_t *regs = regss[seg_idx], *r_prev = NULL, *r_next;
395
+ const mm_reg1_t *r = n_regs > 0 && reg_idx < n_regs && reg_idx >= 0? &regs[reg_idx] : NULL;
396
+
397
+ // find the primary of the previous and the next segments, if they are mapped
398
+ if (n_seg > 1) {
399
+ int i, next_sid = (seg_idx + 1) % n_seg;
400
+ r_next = get_sam_pri(n_regss[next_sid], regss[next_sid]);
401
+ if (n_seg > 2) {
402
+ for (i = 1; i <= n_seg - 1; ++i) {
403
+ int prev_sid = (seg_idx + n_seg - i) % n_seg;
404
+ if (n_regss[prev_sid] > 0) {
405
+ r_prev = get_sam_pri(n_regss[prev_sid], regss[prev_sid]);
406
+ break;
407
+ }
408
+ }
409
+ } else r_prev = r_next;
410
+ } else r_prev = r_next = NULL;
411
+
412
+ // write QNAME
413
+ s->l = 0;
414
+ mm_sprintf_lite(s, "%s", t->name);
415
+ if (n_seg > 1) s->l = mm_qname_len(t->name); // trim the suffix like /1 or /2
416
+
417
+ // write flag
418
+ flag = n_seg > 1? 0x1 : 0x0;
419
+ if (r == 0) {
420
+ flag |= 0x4;
421
+ } else {
422
+ if (r->rev) flag |= 0x10;
423
+ if (r->parent != r->id) flag |= 0x100;
424
+ else if (!r->sam_pri) flag |= 0x800;
425
+ }
426
+ if (n_seg > 1) {
427
+ if (r && r->proper_frag) flag |= 0x2; // TODO: this doesn't work when there are more than 2 segments
428
+ if (seg_idx == 0) flag |= 0x40;
429
+ else if (seg_idx == n_seg - 1) flag |= 0x80;
430
+ if (r_next == NULL) flag |= 0x8;
431
+ else if (r_next->rev) flag |= 0x20;
432
+ }
433
+ mm_sprintf_lite(s, "\t%d", flag);
434
+
435
+ // write coordinate, MAPQ and CIGAR
436
+ if (r == 0) {
437
+ if (r_prev) {
438
+ this_rid = r_prev->rid, this_pos = r_prev->rs;
439
+ mm_sprintf_lite(s, "\t%s\t%d\t0\t*", mi->seq[this_rid].name, this_pos+1);
440
+ } else mm_sprintf_lite(s, "\t*\t0\t0\t*");
441
+ } else {
442
+ this_rid = r->rid, this_pos = r->rs;
443
+ mm_sprintf_lite(s, "\t%s\t%d\t%d\t", mi->seq[r->rid].name, r->rs+1, r->mapq);
444
+ if ((opt_flag & MM_F_LONG_CIGAR) && r->p && r->p->n_cigar > max_bam_cigar_op - 2) {
445
+ int n_cigar = r->p->n_cigar;
446
+ if (r->qs != 0) ++n_cigar;
447
+ if (r->qe != t->l_seq) ++n_cigar;
448
+ if (n_cigar > max_bam_cigar_op)
449
+ cigar_in_tag = 1;
450
+ }
451
+ if (cigar_in_tag) {
452
+ int slen;
453
+ if ((flag & 0x900) == 0 || (opt_flag & MM_F_SOFTCLIP)) slen = t->l_seq;
454
+ else if (flag & 0x100) slen = 0;
455
+ else slen = r->qe - r->qs;
456
+ mm_sprintf_lite(s, "%dS%dN", slen, r->re - r->rs);
457
+ } else write_sam_cigar(s, flag, 0, t->l_seq, r, opt_flag);
458
+ }
459
+
460
+ // write mate positions
461
+ if (n_seg > 1) {
462
+ int tlen = 0;
463
+ if (this_rid >= 0 && r_next) {
464
+ if (this_rid == r_next->rid) {
465
+ if (r) {
466
+ int this_pos5 = r->rev? r->re - 1 : this_pos;
467
+ int next_pos5 = r_next->rev? r_next->re - 1 : r_next->rs;
468
+ tlen = next_pos5 - this_pos5;
469
+ }
470
+ mm_sprintf_lite(s, "\t=\t");
471
+ } else mm_sprintf_lite(s, "\t%s\t", mi->seq[r_next->rid].name);
472
+ mm_sprintf_lite(s, "%d\t", r_next->rs + 1);
473
+ } else if (r_next) { // && this_rid < 0
474
+ mm_sprintf_lite(s, "\t%s\t%d\t", mi->seq[r_next->rid].name, r_next->rs + 1);
475
+ } else if (this_rid >= 0) { // && r_next == NULL
476
+ mm_sprintf_lite(s, "\t=\t%d\t", this_pos + 1); // next segment will take r's coordinate
477
+ } else mm_sprintf_lite(s, "\t*\t0\t"); // neither has coordinates
478
+ if (tlen > 0) ++tlen;
479
+ else if (tlen < 0) --tlen;
480
+ mm_sprintf_lite(s, "%d\t", tlen);
481
+ } else mm_sprintf_lite(s, "\t*\t0\t0\t");
482
+
483
+ // write SEQ and QUAL
484
+ if (r == 0) {
485
+ sam_write_sq(s, t->seq, t->l_seq, 0, 0);
486
+ mm_sprintf_lite(s, "\t");
487
+ if (t->qual) sam_write_sq(s, t->qual, t->l_seq, 0, 0);
488
+ else mm_sprintf_lite(s, "*");
489
+ } else {
490
+ if ((flag & 0x900) == 0 || (opt_flag & MM_F_SOFTCLIP)) {
491
+ sam_write_sq(s, t->seq, t->l_seq, r->rev, r->rev);
492
+ mm_sprintf_lite(s, "\t");
493
+ if (t->qual) sam_write_sq(s, t->qual, t->l_seq, r->rev, 0);
494
+ else mm_sprintf_lite(s, "*");
495
+ } else if (flag & 0x100) {
496
+ mm_sprintf_lite(s, "*\t*");
497
+ } else {
498
+ sam_write_sq(s, t->seq + r->qs, r->qe - r->qs, r->rev, r->rev);
499
+ mm_sprintf_lite(s, "\t");
500
+ if (t->qual) sam_write_sq(s, t->qual + r->qs, r->qe - r->qs, r->rev, 0);
501
+ else mm_sprintf_lite(s, "*");
502
+ }
503
+ }
504
+
505
+ // write tags
506
+ if (mm_rg_id[0]) mm_sprintf_lite(s, "\tRG:Z:%s", mm_rg_id);
507
+ if (n_seg > 2) mm_sprintf_lite(s, "\tFI:i:%d", seg_idx);
508
+ if (r) {
509
+ write_tags(s, r);
510
+ if (r->parent == r->id && r->p && n_regs > 1 && regs && r >= regs && r - regs < n_regs) { // supplementary aln may exist
511
+ int i, n_sa = 0; // n_sa: number of SA fields
512
+ for (i = 0; i < n_regs; ++i)
513
+ if (i != r - regs && regs[i].parent == regs[i].id && regs[i].p)
514
+ ++n_sa;
515
+ if (n_sa > 0) {
516
+ mm_sprintf_lite(s, "\tSA:Z:");
517
+ for (i = 0; i < n_regs; ++i) {
518
+ const mm_reg1_t *q = &regs[i];
519
+ int l_M, l_I = 0, l_D = 0, clip5 = 0, clip3 = 0;
520
+ if (r == q || q->parent != q->id || q->p == 0) continue;
521
+ if (q->qe - q->qs < q->re - q->rs) l_M = q->qe - q->qs, l_D = (q->re - q->rs) - l_M;
522
+ else l_M = q->re - q->rs, l_I = (q->qe - q->qs) - l_M;
523
+ clip5 = q->rev? t->l_seq - q->qe : q->qs;
524
+ clip3 = q->rev? q->qs : t->l_seq - q->qe;
525
+ mm_sprintf_lite(s, "%s,%d,%c,", mi->seq[q->rid].name, q->rs+1, "+-"[q->rev]);
526
+ if (clip5) mm_sprintf_lite(s, "%dS", clip5);
527
+ if (l_M) mm_sprintf_lite(s, "%dM", l_M);
528
+ if (l_I) mm_sprintf_lite(s, "%dI", l_I);
529
+ if (l_D) mm_sprintf_lite(s, "%dD", l_D);
530
+ if (clip3) mm_sprintf_lite(s, "%dS", clip3);
531
+ mm_sprintf_lite(s, ",%d,%d;", q->mapq, q->blen - q->mlen + q->p->n_ambi);
532
+ }
533
+ }
534
+ }
535
+ if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
536
+ write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1, 0);
537
+ if (cigar_in_tag)
538
+ write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag);
539
+ }
540
+ if (rep_len >= 0) mm_sprintf_lite(s, "\trl:i:%d", rep_len);
541
+
542
+ if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
543
+ mm_sprintf_lite(s, "\t%s", t->comment);
544
+
545
+ s->s[s->l] = 0; // we always have room for an extra byte (see str_enlarge)
546
+ }
547
+
548
+ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regss, const mm_reg1_t *const* regss, void *km, int64_t opt_flag)
549
+ {
550
+ mm_write_sam3(s, mi, t, seg_idx, reg_idx, n_seg, n_regss, regss, km, opt_flag, -1);
551
+ }
552
+
553
+ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs)
554
+ {
555
+ int i;
556
+ for (i = 0; i < n_regs; ++i)
557
+ if (r == &regs[i]) break;
558
+ mm_write_sam2(s, mi, t, 0, i, 1, &n_regs, &regs, NULL, 0);
559
+ }