minimap2 0.0.4 → 0.2.23.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +113 -98
  3. data/ext/Rakefile +41 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +807 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +344 -0
  41. data/ext/minimap2/main.c +455 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +409 -0
  44. data/ext/minimap2/minimap2.1 +722 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +131 -0
  50. data/ext/minimap2/options.c +233 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/ext/vendor/libminimap2.so +0 -0
  93. data/lib/minimap2/aligner.rb +16 -5
  94. data/lib/minimap2/alignment.rb +6 -2
  95. data/lib/minimap2/ffi/constants.rb +74 -53
  96. data/lib/minimap2/ffi/functions.rb +5 -0
  97. data/lib/minimap2/ffi.rb +1 -2
  98. data/lib/minimap2/version.rb +2 -1
  99. data/lib/minimap2.rb +67 -22
  100. metadata +98 -64
  101. data/lib/minimap2/ffi_helper.rb +0 -53
@@ -0,0 +1,559 @@
1
+ #include <stdarg.h>
2
+ #include <stdlib.h>
3
+ #include <string.h>
4
+ #include <assert.h>
5
+ #include <stdio.h>
6
+ #include "kalloc.h"
7
+ #include "mmpriv.h"
8
+
9
// Read-group ID shared by the SAM writers in this file: sam_write_rg_line()
// zeroes it and then fills it from the "ID:" field of the @RG line, and
// mm_write_sam3() emits it as an RG:Z: tag whenever its first byte is non-zero.
+ static char mm_rg_id[256];
10
+
11
+ static inline void str_enlarge(kstring_t *s, int l)
12
+ {
13
+ if (s->l + l + 1 > s->m) {
14
+ s->m = s->l + l + 1;
15
+ kroundup32(s->m);
16
+ s->s = (char*)realloc(s->s, s->m);
17
+ }
18
+ }
19
+
20
+ static inline void str_copy(kstring_t *s, const char *st, const char *en)
21
+ {
22
+ str_enlarge(s, en - st);
23
+ memcpy(&s->s[s->l], st, en - st);
24
+ s->l += en - st;
25
+ }
26
+
27
+ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...)
28
+ {
29
+ char buf[16]; // for integer to string conversion
30
+ const char *p, *q;
31
+ va_list ap;
32
+ va_start(ap, fmt);
33
+ for (q = p = fmt; *p; ++p) {
34
+ if (*p == '%') {
35
+ if (p > q) str_copy(s, q, p);
36
+ ++p;
37
+ if (*p == 'd') {
38
+ int c, i, l = 0;
39
+ unsigned int x;
40
+ c = va_arg(ap, int);
41
+ x = c >= 0? c : -c;
42
+ do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
43
+ if (c < 0) buf[l++] = '-';
44
+ str_enlarge(s, l);
45
+ for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
46
+ } else if (*p == 'u') {
47
+ int i, l = 0;
48
+ uint32_t x;
49
+ x = va_arg(ap, uint32_t);
50
+ do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
51
+ str_enlarge(s, l);
52
+ for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
53
+ } else if (*p == 's') {
54
+ char *r = va_arg(ap, char*);
55
+ str_copy(s, r, r + strlen(r));
56
+ } else if (*p == 'c') {
57
+ str_enlarge(s, 1);
58
+ s->s[s->l++] = va_arg(ap, int);
59
+ } else abort();
60
+ q = p + 1;
61
+ }
62
+ }
63
+ if (p > q) str_copy(s, q, p);
64
+ va_end(ap);
65
+ s->s[s->l] = 0;
66
+ }
67
+
68
/*
 * Decode backslash escapes in `s` in place and return `s`.
 *
 *   "\t"  (backslash, 't')        -> a TAB character
 *   "\\"  (backslash, backslash)  -> a single backslash
 *   backslash + any other char    -> both characters are dropped
 *   backslash at the very end     -> dropped
 *
 * The decoded string is never longer than the input, so no reallocation is
 * needed. The end-of-string check after a backslash is essential: without it
 * the loop increment would step past the terminating NUL and keep reading
 * (and copying) whatever follows the string in memory.
 */
static char *mm_escape(char *s)
{
	char *p, *q; // p: read cursor, q: write cursor (q <= p always)
	for (p = q = s; *p; ++p) {
		if (*p == '\\') {
			++p;
			if (*p == '\0') break; // dangling backslash: stop at the terminator
			if (*p == 't') *q++ = '\t';
			else if (*p == '\\') *q++ = '\\';
		} else *q++ = *p;
	}
	*q = '\0';
	return s;
}
81
+
82
+ static int sam_write_rg_line(kstring_t *str, const char *s)
83
+ {
84
+ char *p, *q, *r, *rg_line = 0;
85
+ memset(mm_rg_id, 0, 256);
86
+ if (s == 0) return 0;
87
+ if (strstr(s, "@RG") != s) {
88
+ if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line is not started with @RG\n");
89
+ goto err_set_rg;
90
+ }
91
+ if (strstr(s, "\t") != NULL) {
92
+ if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line contained literal <tab> characters -- replace with escaped tabs: \\t\n");
93
+ goto err_set_rg;
94
+ }
95
+ rg_line = (char*)malloc(strlen(s) + 1);
96
+ strcpy(rg_line, s);
97
+ mm_escape(rg_line);
98
+ if ((p = strstr(rg_line, "\tID:")) == 0) {
99
+ if (mm_verbose >= 1) fprintf(stderr, "[ERROR] no ID within the read group line\n");
100
+ goto err_set_rg;
101
+ }
102
+ p += 4;
103
+ for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
104
+ if (q - p + 1 > 256) {
105
+ if (mm_verbose >= 1) fprintf(stderr, "[ERROR] @RG:ID is longer than 255 characters\n");
106
+ goto err_set_rg;
107
+ }
108
+ for (q = p, r = mm_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
109
+ *r++ = *q;
110
+ mm_sprintf_lite(str, "%s\n", rg_line);
111
+ return 0;
112
+
113
+ err_set_rg:
114
+ free(rg_line);
115
+ return -1;
116
+ }
117
+
118
+ int mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int argc, char *argv[])
119
+ {
120
+ kstring_t str = {0,0,0};
121
+ int ret = 0;
122
+ if (idx) {
123
+ uint32_t i;
124
+ for (i = 0; i < idx->n_seq; ++i)
125
+ mm_sprintf_lite(&str, "@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
126
+ }
127
+ if (rg) ret = sam_write_rg_line(&str, rg);
128
+ mm_sprintf_lite(&str, "@PG\tID:minimap2\tPN:minimap2");
129
+ if (ver) mm_sprintf_lite(&str, "\tVN:%s", ver);
130
+ if (argc > 1) {
131
+ int i;
132
+ mm_sprintf_lite(&str, "\tCL:minimap2");
133
+ for (i = 1; i < argc; ++i)
134
+ mm_sprintf_lite(&str, " %s", argv[i]);
135
+ }
136
+ mm_err_puts(str.s);
137
+ free(str.s);
138
+ return ret;
139
+ }
140
+
141
+ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int write_tag)
142
+ {
143
+ int i, q_off, t_off;
144
+ if (write_tag) mm_sprintf_lite(s, "\tcs:Z:");
145
+ for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
146
+ int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
147
+ assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
148
+ if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) {
149
+ int l_tmp = 0;
150
+ for (j = 0; j < len; ++j) {
151
+ if (qseq[q_off + j] != tseq[t_off + j]) {
152
+ if (l_tmp > 0) {
153
+ if (!no_iden) {
154
+ tmp[l_tmp] = 0;
155
+ mm_sprintf_lite(s, "=%s", tmp);
156
+ } else mm_sprintf_lite(s, ":%d", l_tmp);
157
+ l_tmp = 0;
158
+ }
159
+ mm_sprintf_lite(s, "*%c%c", "acgtn"[tseq[t_off + j]], "acgtn"[qseq[q_off + j]]);
160
+ } else tmp[l_tmp++] = "ACGTN"[qseq[q_off + j]];
161
+ }
162
+ if (l_tmp > 0) {
163
+ if (!no_iden) {
164
+ tmp[l_tmp] = 0;
165
+ mm_sprintf_lite(s, "=%s", tmp);
166
+ } else mm_sprintf_lite(s, ":%d", l_tmp);
167
+ }
168
+ q_off += len, t_off += len;
169
+ } else if (op == MM_CIGAR_INS) {
170
+ for (j = 0, tmp[len] = 0; j < len; ++j)
171
+ tmp[j] = "acgtn"[qseq[q_off + j]];
172
+ mm_sprintf_lite(s, "+%s", tmp);
173
+ q_off += len;
174
+ } else if (op == MM_CIGAR_DEL) {
175
+ for (j = 0, tmp[len] = 0; j < len; ++j)
176
+ tmp[j] = "acgtn"[tseq[t_off + j]];
177
+ mm_sprintf_lite(s, "-%s", tmp);
178
+ t_off += len;
179
+ } else { // intron
180
+ assert(len >= 2);
181
+ mm_sprintf_lite(s, "~%c%c%d%c%c", "acgtn"[tseq[t_off]], "acgtn"[tseq[t_off+1]],
182
+ len, "acgtn"[tseq[t_off+len-2]], "acgtn"[tseq[t_off+len-1]]);
183
+ t_off += len;
184
+ }
185
+ }
186
+ assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
187
+ }
188
+
189
+ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int write_tag)
190
+ {
191
+ int i, q_off, t_off, l_MD = 0;
192
+ if (write_tag) mm_sprintf_lite(s, "\tMD:Z:");
193
+ for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
194
+ int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
195
+ assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
196
+ if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) {
197
+ for (j = 0; j < len; ++j) {
198
+ if (qseq[q_off + j] != tseq[t_off + j]) {
199
+ mm_sprintf_lite(s, "%d%c", l_MD, "ACGTN"[tseq[t_off + j]]);
200
+ l_MD = 0;
201
+ } else ++l_MD;
202
+ }
203
+ q_off += len, t_off += len;
204
+ } else if (op == MM_CIGAR_INS) {
205
+ q_off += len;
206
+ } else if (op == MM_CIGAR_DEL) {
207
+ for (j = 0, tmp[len] = 0; j < len; ++j)
208
+ tmp[j] = "ACGTN"[tseq[t_off + j]];
209
+ mm_sprintf_lite(s, "%d^%s", l_MD, tmp);
210
+ l_MD = 0;
211
+ t_off += len;
212
+ } else if (op == MM_CIGAR_N_SKIP) {
213
+ t_off += len;
214
+ }
215
+ }
216
+ if (l_MD > 0) mm_sprintf_lite(s, "%d", l_MD);
217
+ assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
218
+ }
219
+
220
+ static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int write_tag, int is_qstrand)
221
+ {
222
+ extern unsigned char seq_nt4_table[256];
223
+ int i;
224
+ uint8_t *qseq, *tseq;
225
+ char *tmp;
226
+ if (r->p == 0) return;
227
+ qseq = (uint8_t*)kmalloc(km, r->qe - r->qs);
228
+ tseq = (uint8_t*)kmalloc(km, r->re - r->rs);
229
+ tmp = (char*)kmalloc(km, r->re - r->rs > r->qe - r->qs? r->re - r->rs + 1 : r->qe - r->qs + 1);
230
+ if (is_qstrand) {
231
+ mm_idx_getseq2(mi, r->rev, r->rid, r->rs, r->re, tseq);
232
+ for (i = r->qs; i < r->qe; ++i)
233
+ qseq[i - r->qs] = seq_nt4_table[(uint8_t)t->seq[i]];
234
+ } else {
235
+ mm_idx_getseq(mi, r->rid, r->rs, r->re, tseq);
236
+ if (!r->rev) {
237
+ for (i = r->qs; i < r->qe; ++i)
238
+ qseq[i - r->qs] = seq_nt4_table[(uint8_t)t->seq[i]];
239
+ } else {
240
+ for (i = r->qs; i < r->qe; ++i) {
241
+ uint8_t c = seq_nt4_table[(uint8_t)t->seq[i]];
242
+ qseq[r->qe - i - 1] = c >= 4? 4 : 3 - c;
243
+ }
244
+ }
245
+ }
246
+ if (is_MD) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
247
+ else write_cs_core(s, tseq, qseq, r, tmp, no_iden, write_tag);
248
+ kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
249
+ }
250
+
251
+ int mm_gen_cs_or_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int is_MD, int no_iden, int is_qstrand)
252
+ {
253
+ mm_bseq1_t t;
254
+ kstring_t str;
255
+ str.s = *buf, str.l = 0, str.m = *max_len;
256
+ t.l_seq = strlen(seq);
257
+ t.seq = (char*)seq;
258
+ write_cs_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0, is_qstrand);
259
+ *max_len = str.m;
260
+ *buf = str.s;
261
+ return str.l;
262
+ }
263
+
264
+ int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden)
265
+ {
266
+ return mm_gen_cs_or_MD(km, buf, max_len, mi, r, seq, 0, no_iden, 0);
267
+ }
268
+
269
+ int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq)
270
+ {
271
+ return mm_gen_cs_or_MD(km, buf, max_len, mi, r, seq, 1, 0, 0);
272
+ }
273
+
274
+ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
275
+ {
276
+ int type;
277
+ if (r->id == r->parent) type = r->inv? 'I' : 'P';
278
+ else type = r->inv? 'i' : 'S';
279
+ if (r->p) {
280
+ mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->dp_max, r->p->dp_score, r->p->n_ambi);
281
+ if (r->p->trans_strand == 1 || r->p->trans_strand == 2)
282
+ mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]);
283
+ }
284
+ mm_sprintf_lite(s, "\ttp:A:%c\tcm:i:%d\ts1:i:%d", type, r->cnt, r->score);
285
+ if (r->parent == r->id) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc);
286
+ if (r->p) {
287
+ char buf[16];
288
+ double div;
289
+ div = 1.0 - mm_event_identity(r);
290
+ if (div == 0.0) buf[0] = '0', buf[1] = 0;
291
+ else snprintf(buf, 16, "%.4f", 1.0 - mm_event_identity(r));
292
+ mm_sprintf_lite(s, "\tde:f:%s", buf);
293
+ } else if (r->div >= 0.0f && r->div <= 1.0f) {
294
+ char buf[16];
295
+ if (r->div == 0.0f) buf[0] = '0', buf[1] = 0;
296
+ else snprintf(buf, 16, "%.4f", r->div);
297
+ mm_sprintf_lite(s, "\tdv:f:%s", buf);
298
+ }
299
+ if (r->split) mm_sprintf_lite(s, "\tzd:i:%d", r->split);
300
+ }
301
+
302
+ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len)
303
+ {
304
+ s->l = 0;
305
+ if (r == 0) {
306
+ mm_sprintf_lite(s, "%s\t%d\t0\t0\t*\t*\t0\t0\t0\t0\t0\t0", t->name, t->l_seq);
307
+ if (rep_len >= 0) mm_sprintf_lite(s, "\trl:i:%d", rep_len);
308
+ return;
309
+ }
310
+ mm_sprintf_lite(s, "%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]);
311
+ if (mi->seq[r->rid].name) mm_sprintf_lite(s, "%s", mi->seq[r->rid].name);
312
+ else mm_sprintf_lite(s, "%d", r->rid);
313
+ mm_sprintf_lite(s, "\t%d", mi->seq[r->rid].len);
314
+ if ((opt_flag & MM_F_QSTRAND) && r->rev)
315
+ mm_sprintf_lite(s, "\t%d\t%d", mi->seq[r->rid].len - r->re, mi->seq[r->rid].len - r->rs);
316
+ else
317
+ mm_sprintf_lite(s, "\t%d\t%d", r->rs, r->re);
318
+ mm_sprintf_lite(s, "\t%d\t%d", r->mlen, r->blen);
319
+ mm_sprintf_lite(s, "\t%d", r->mapq);
320
+ write_tags(s, r);
321
+ if (rep_len >= 0) mm_sprintf_lite(s, "\trl:i:%d", rep_len);
322
+ if (r->p && (opt_flag & MM_F_OUT_CG)) {
323
+ uint32_t k;
324
+ mm_sprintf_lite(s, "\tcg:Z:");
325
+ for (k = 0; k < r->p->n_cigar; ++k)
326
+ mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
327
+ }
328
+ if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
329
+ write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1, !!(opt_flag&MM_F_QSTRAND));
330
+ if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
331
+ mm_sprintf_lite(s, "\t%s", t->comment);
332
+ }
333
+
334
+ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag)
335
+ {
336
+ mm_write_paf3(s, mi, t, r, km, opt_flag, -1);
337
+ }
338
+
339
+ static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp)
340
+ {
341
+ extern unsigned char seq_comp_table[256];
342
+ if (rev) {
343
+ int i;
344
+ str_enlarge(s, l);
345
+ for (i = 0; i < l; ++i) {
346
+ int c = seq[l - 1 - i];
347
+ s->s[s->l + i] = c < 128 && comp? seq_comp_table[c] : c;
348
+ }
349
+ s->l += l;
350
+ } else str_copy(s, seq, seq + l);
351
+ }
352
+
353
+ static inline const mm_reg1_t *get_sam_pri(int n_regs, const mm_reg1_t *regs)
354
+ {
355
+ int i;
356
+ for (i = 0; i < n_regs; ++i)
357
+ if (regs[i].sam_pri)
358
+ return &regs[i];
359
+ assert(n_regs == 0);
360
+ return NULL;
361
+ }
362
+
363
+ static void write_sam_cigar(kstring_t *s, int sam_flag, int in_tag, int qlen, const mm_reg1_t *r, int64_t opt_flag)
364
+ {
365
+ if (r->p == 0) {
366
+ mm_sprintf_lite(s, "*");
367
+ } else {
368
+ uint32_t k, clip_len[2];
369
+ clip_len[0] = r->rev? qlen - r->qe : r->qs;
370
+ clip_len[1] = r->rev? r->qs : qlen - r->qe;
371
+ if (in_tag) {
372
+ int clip_char = (sam_flag&0x800) && !(opt_flag&MM_F_SOFTCLIP)? 5 : 4;
373
+ mm_sprintf_lite(s, "\tCG:B:I");
374
+ if (clip_len[0]) mm_sprintf_lite(s, ",%u", clip_len[0]<<4|clip_char);
375
+ for (k = 0; k < r->p->n_cigar; ++k)
376
+ mm_sprintf_lite(s, ",%u", r->p->cigar[k]);
377
+ if (clip_len[1]) mm_sprintf_lite(s, ",%u", clip_len[1]<<4|clip_char);
378
+ } else {
379
+ int clip_char = (sam_flag&0x800) && !(opt_flag&MM_F_SOFTCLIP)? 'H' : 'S';
380
+ assert(clip_len[0] < qlen && clip_len[1] < qlen);
381
+ if (clip_len[0]) mm_sprintf_lite(s, "%d%c", clip_len[0], clip_char);
382
+ for (k = 0; k < r->p->n_cigar; ++k)
383
+ mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
384
+ if (clip_len[1]) mm_sprintf_lite(s, "%d%c", clip_len[1], clip_char);
385
+ }
386
+ }
387
+ }
388
+
389
+ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regss, const mm_reg1_t *const* regss, void *km, int64_t opt_flag, int rep_len)
390
+ {
391
+ const int max_bam_cigar_op = 65535;
392
+ int flag, n_regs = n_regss[seg_idx], cigar_in_tag = 0;
393
+ int this_rid = -1, this_pos = -1;
394
+ const mm_reg1_t *regs = regss[seg_idx], *r_prev = NULL, *r_next;
395
+ const mm_reg1_t *r = n_regs > 0 && reg_idx < n_regs && reg_idx >= 0? &regs[reg_idx] : NULL;
396
+
397
+ // find the primary of the previous and the next segments, if they are mapped
398
+ if (n_seg > 1) {
399
+ int i, next_sid = (seg_idx + 1) % n_seg;
400
+ r_next = get_sam_pri(n_regss[next_sid], regss[next_sid]);
401
+ if (n_seg > 2) {
402
+ for (i = 1; i <= n_seg - 1; ++i) {
403
+ int prev_sid = (seg_idx + n_seg - i) % n_seg;
404
+ if (n_regss[prev_sid] > 0) {
405
+ r_prev = get_sam_pri(n_regss[prev_sid], regss[prev_sid]);
406
+ break;
407
+ }
408
+ }
409
+ } else r_prev = r_next;
410
+ } else r_prev = r_next = NULL;
411
+
412
+ // write QNAME
413
+ s->l = 0;
414
+ mm_sprintf_lite(s, "%s", t->name);
415
+ if (n_seg > 1) s->l = mm_qname_len(t->name); // trim the suffix like /1 or /2
416
+
417
+ // write flag
418
+ flag = n_seg > 1? 0x1 : 0x0;
419
+ if (r == 0) {
420
+ flag |= 0x4;
421
+ } else {
422
+ if (r->rev) flag |= 0x10;
423
+ if (r->parent != r->id) flag |= 0x100;
424
+ else if (!r->sam_pri) flag |= 0x800;
425
+ }
426
+ if (n_seg > 1) {
427
+ if (r && r->proper_frag) flag |= 0x2; // TODO: this doesn't work when there are more than 2 segments
428
+ if (seg_idx == 0) flag |= 0x40;
429
+ else if (seg_idx == n_seg - 1) flag |= 0x80;
430
+ if (r_next == NULL) flag |= 0x8;
431
+ else if (r_next->rev) flag |= 0x20;
432
+ }
433
+ mm_sprintf_lite(s, "\t%d", flag);
434
+
435
+ // write coordinate, MAPQ and CIGAR
436
+ if (r == 0) {
437
+ if (r_prev) {
438
+ this_rid = r_prev->rid, this_pos = r_prev->rs;
439
+ mm_sprintf_lite(s, "\t%s\t%d\t0\t*", mi->seq[this_rid].name, this_pos+1);
440
+ } else mm_sprintf_lite(s, "\t*\t0\t0\t*");
441
+ } else {
442
+ this_rid = r->rid, this_pos = r->rs;
443
+ mm_sprintf_lite(s, "\t%s\t%d\t%d\t", mi->seq[r->rid].name, r->rs+1, r->mapq);
444
+ if ((opt_flag & MM_F_LONG_CIGAR) && r->p && r->p->n_cigar > max_bam_cigar_op - 2) {
445
+ int n_cigar = r->p->n_cigar;
446
+ if (r->qs != 0) ++n_cigar;
447
+ if (r->qe != t->l_seq) ++n_cigar;
448
+ if (n_cigar > max_bam_cigar_op)
449
+ cigar_in_tag = 1;
450
+ }
451
+ if (cigar_in_tag) {
452
+ int slen;
453
+ if ((flag & 0x900) == 0 || (opt_flag & MM_F_SOFTCLIP)) slen = t->l_seq;
454
+ else if (flag & 0x100) slen = 0;
455
+ else slen = r->qe - r->qs;
456
+ mm_sprintf_lite(s, "%dS%dN", slen, r->re - r->rs);
457
+ } else write_sam_cigar(s, flag, 0, t->l_seq, r, opt_flag);
458
+ }
459
+
460
+ // write mate positions
461
+ if (n_seg > 1) {
462
+ int tlen = 0;
463
+ if (this_rid >= 0 && r_next) {
464
+ if (this_rid == r_next->rid) {
465
+ if (r) {
466
+ int this_pos5 = r->rev? r->re - 1 : this_pos;
467
+ int next_pos5 = r_next->rev? r_next->re - 1 : r_next->rs;
468
+ tlen = next_pos5 - this_pos5;
469
+ }
470
+ mm_sprintf_lite(s, "\t=\t");
471
+ } else mm_sprintf_lite(s, "\t%s\t", mi->seq[r_next->rid].name);
472
+ mm_sprintf_lite(s, "%d\t", r_next->rs + 1);
473
+ } else if (r_next) { // && this_rid < 0
474
+ mm_sprintf_lite(s, "\t%s\t%d\t", mi->seq[r_next->rid].name, r_next->rs + 1);
475
+ } else if (this_rid >= 0) { // && r_next == NULL
476
+ mm_sprintf_lite(s, "\t=\t%d\t", this_pos + 1); // next segment will take r's coordinate
477
+ } else mm_sprintf_lite(s, "\t*\t0\t"); // neither has coordinates
478
+ if (tlen > 0) ++tlen;
479
+ else if (tlen < 0) --tlen;
480
+ mm_sprintf_lite(s, "%d\t", tlen);
481
+ } else mm_sprintf_lite(s, "\t*\t0\t0\t");
482
+
483
+ // write SEQ and QUAL
484
+ if (r == 0) {
485
+ sam_write_sq(s, t->seq, t->l_seq, 0, 0);
486
+ mm_sprintf_lite(s, "\t");
487
+ if (t->qual) sam_write_sq(s, t->qual, t->l_seq, 0, 0);
488
+ else mm_sprintf_lite(s, "*");
489
+ } else {
490
+ if ((flag & 0x900) == 0 || (opt_flag & MM_F_SOFTCLIP)) {
491
+ sam_write_sq(s, t->seq, t->l_seq, r->rev, r->rev);
492
+ mm_sprintf_lite(s, "\t");
493
+ if (t->qual) sam_write_sq(s, t->qual, t->l_seq, r->rev, 0);
494
+ else mm_sprintf_lite(s, "*");
495
+ } else if (flag & 0x100) {
496
+ mm_sprintf_lite(s, "*\t*");
497
+ } else {
498
+ sam_write_sq(s, t->seq + r->qs, r->qe - r->qs, r->rev, r->rev);
499
+ mm_sprintf_lite(s, "\t");
500
+ if (t->qual) sam_write_sq(s, t->qual + r->qs, r->qe - r->qs, r->rev, 0);
501
+ else mm_sprintf_lite(s, "*");
502
+ }
503
+ }
504
+
505
+ // write tags
506
+ if (mm_rg_id[0]) mm_sprintf_lite(s, "\tRG:Z:%s", mm_rg_id);
507
+ if (n_seg > 2) mm_sprintf_lite(s, "\tFI:i:%d", seg_idx);
508
+ if (r) {
509
+ write_tags(s, r);
510
+ if (r->parent == r->id && r->p && n_regs > 1 && regs && r >= regs && r - regs < n_regs) { // supplementary aln may exist
511
+ int i, n_sa = 0; // n_sa: number of SA fields
512
+ for (i = 0; i < n_regs; ++i)
513
+ if (i != r - regs && regs[i].parent == regs[i].id && regs[i].p)
514
+ ++n_sa;
515
+ if (n_sa > 0) {
516
+ mm_sprintf_lite(s, "\tSA:Z:");
517
+ for (i = 0; i < n_regs; ++i) {
518
+ const mm_reg1_t *q = &regs[i];
519
+ int l_M, l_I = 0, l_D = 0, clip5 = 0, clip3 = 0;
520
+ if (r == q || q->parent != q->id || q->p == 0) continue;
521
+ if (q->qe - q->qs < q->re - q->rs) l_M = q->qe - q->qs, l_D = (q->re - q->rs) - l_M;
522
+ else l_M = q->re - q->rs, l_I = (q->qe - q->qs) - l_M;
523
+ clip5 = q->rev? t->l_seq - q->qe : q->qs;
524
+ clip3 = q->rev? q->qs : t->l_seq - q->qe;
525
+ mm_sprintf_lite(s, "%s,%d,%c,", mi->seq[q->rid].name, q->rs+1, "+-"[q->rev]);
526
+ if (clip5) mm_sprintf_lite(s, "%dS", clip5);
527
+ if (l_M) mm_sprintf_lite(s, "%dM", l_M);
528
+ if (l_I) mm_sprintf_lite(s, "%dI", l_I);
529
+ if (l_D) mm_sprintf_lite(s, "%dD", l_D);
530
+ if (clip3) mm_sprintf_lite(s, "%dS", clip3);
531
+ mm_sprintf_lite(s, ",%d,%d;", q->mapq, q->blen - q->mlen + q->p->n_ambi);
532
+ }
533
+ }
534
+ }
535
+ if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
536
+ write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1, 0);
537
+ if (cigar_in_tag)
538
+ write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag);
539
+ }
540
+ if (rep_len >= 0) mm_sprintf_lite(s, "\trl:i:%d", rep_len);
541
+
542
+ if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
543
+ mm_sprintf_lite(s, "\t%s", t->comment);
544
+
545
+ s->s[s->l] = 0; // we always have room for an extra byte (see str_enlarge)
546
+ }
547
+
548
+ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regss, const mm_reg1_t *const* regss, void *km, int64_t opt_flag)
549
+ {
550
+ mm_write_sam3(s, mi, t, seg_idx, reg_idx, n_seg, n_regss, regss, km, opt_flag, -1);
551
+ }
552
+
553
+ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs)
554
+ {
555
+ int i;
556
+ for (i = 0; i < n_regs; ++i)
557
+ if (r == &regs[i]) break;
558
+ mm_write_sam2(s, mi, t, 0, i, 1, &n_regs, &regs, NULL, 0);
559
+ }