ruby-minigraph 0.0.20.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89)
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,133 @@
1
+ #include <zlib.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <assert.h>
5
+ #define __STDC_LIMIT_MACROS
6
+ #include "bseq.h"
7
+ #include "kvec-km.h"
8
+ #include "kseq.h"
9
+ KSEQ_INIT(gzFile, gzread)
10
+
11
+ #define CHECK_PAIR_THRES 1000000
12
+
13
+ struct mg_bseq_file_s {
14
+ gzFile fp;
15
+ kseq_t *ks;
16
+ mg_bseq1_t s;
17
+ };
18
+
19
+ mg_bseq_file_t *mg_bseq_open(const char *fn)
20
+ {
21
+ mg_bseq_file_t *fp;
22
+ gzFile f;
23
+ f = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(0, "r");
24
+ if (f == 0) return 0;
25
+ fp = (mg_bseq_file_t*)calloc(1, sizeof(mg_bseq_file_t));
26
+ fp->fp = f;
27
+ fp->ks = kseq_init(fp->fp);
28
+ return fp;
29
+ }
30
+
31
+ void mg_bseq_close(mg_bseq_file_t *fp)
32
+ {
33
+ kseq_destroy(fp->ks);
34
+ gzclose(fp->fp);
35
+ free(fp);
36
+ }
37
+
38
+ static inline char *kstrdup(const kstring_t *s)
39
+ {
40
+ char *t;
41
+ t = (char*)malloc(s->l + 1);
42
+ memcpy(t, s->s, s->l + 1);
43
+ return t;
44
+ }
45
+
46
+ static inline void kseq2bseq(kseq_t *ks, mg_bseq1_t *s, int with_qual, int with_comment)
47
+ {
48
+ int i;
49
+ if (ks->name.l == 0)
50
+ fprintf(stderr, "[WARNING]\033[1;31m empty sequence name in the input.\033[0m\n");
51
+ s->name = kstrdup(&ks->name);
52
+ s->seq = kstrdup(&ks->seq);
53
+ for (i = 0; i < (int)ks->seq.l; ++i) // convert U to T
54
+ if (s->seq[i] == 'u' || s->seq[i] == 'U')
55
+ --s->seq[i];
56
+ s->qual = with_qual && ks->qual.l? kstrdup(&ks->qual) : 0;
57
+ s->comment = with_comment && ks->comment.l? kstrdup(&ks->comment) : 0;
58
+ s->l_seq = ks->seq.l;
59
+ }
60
+
61
+ mg_bseq1_t *mg_bseq_read(mg_bseq_file_t *fp, int64_t chunk_size, int with_qual, int with_comment, int frag_mode, int *n_)
62
+ {
63
+ int64_t size = 0;
64
+ int ret;
65
+ kvec_t(mg_bseq1_t) a = {0,0,0};
66
+ kseq_t *ks = fp->ks;
67
+ *n_ = 0;
68
+ if (fp->s.seq) {
69
+ kv_resize(mg_bseq1_t, 0, a, 256);
70
+ kv_push(mg_bseq1_t, 0, a, fp->s);
71
+ size = fp->s.l_seq;
72
+ memset(&fp->s, 0, sizeof(mg_bseq1_t));
73
+ }
74
+ while ((ret = kseq_read(ks)) >= 0) {
75
+ mg_bseq1_t *s;
76
+ assert(ks->seq.l <= INT32_MAX);
77
+ if (a.m == 0) kv_resize(mg_bseq1_t, 0, a, 256);
78
+ kv_pushp(mg_bseq1_t, 0, a, &s);
79
+ kseq2bseq(ks, s, with_qual, with_comment);
80
+ size += s->l_seq;
81
+ if (size >= chunk_size) {
82
+ if (frag_mode && a.a[a.n-1].l_seq < CHECK_PAIR_THRES) {
83
+ while (kseq_read(ks) >= 0) {
84
+ kseq2bseq(ks, &fp->s, with_qual, with_comment);
85
+ if (mg_qname_same(fp->s.name, a.a[a.n-1].name)) {
86
+ kv_push(mg_bseq1_t, 0, a, fp->s);
87
+ memset(&fp->s, 0, sizeof(mg_bseq1_t));
88
+ } else break;
89
+ }
90
+ }
91
+ break;
92
+ }
93
+ }
94
+ if (ret < -1)
95
+ fprintf(stderr, "[WARNING]\033[1;31m wrong FASTA/FASTQ record. Continue anyway.\033[0m\n");
96
+ *n_ = a.n;
97
+ return a.a;
98
+ }
99
+
100
+ mg_bseq1_t *mg_bseq_read_frag(int n_fp, mg_bseq_file_t **fp, int64_t chunk_size, int with_qual, int with_comment, int *n_)
101
+ {
102
+ int i;
103
+ int64_t size = 0;
104
+ kvec_t(mg_bseq1_t) a = {0,0,0};
105
+ *n_ = 0;
106
+ if (n_fp < 1) return 0;
107
+ while (1) {
108
+ int n_read = 0;
109
+ for (i = 0; i < n_fp; ++i)
110
+ if (kseq_read(fp[i]->ks) >= 0)
111
+ ++n_read;
112
+ if (n_read < n_fp) {
113
+ if (n_read > 0)
114
+ fprintf(stderr, "[W::%s]\033[1;31m query files have different number of records; extra records skipped.\033[0m\n", __func__);
115
+ break; // some file reaches the end
116
+ }
117
+ if (a.m == 0) kv_resize(mg_bseq1_t, 0, a, 256);
118
+ for (i = 0; i < n_fp; ++i) {
119
+ mg_bseq1_t *s;
120
+ kv_pushp(mg_bseq1_t, 0, a, &s);
121
+ kseq2bseq(fp[i]->ks, s, with_qual, with_comment);
122
+ size += s->l_seq;
123
+ }
124
+ if (size >= chunk_size) break;
125
+ }
126
+ *n_ = a.n;
127
+ return a.a;
128
+ }
129
+
130
+ int mg_bseq_eof(mg_bseq_file_t *fp)
131
+ {
132
+ return (ks_eof(fp->ks->f) && fp->s.seq == 0);
133
+ }
@@ -0,0 +1,76 @@
1
+ #ifndef MM_BSEQ_H
2
+ #define MM_BSEQ_H
3
+
4
+ #include <stdint.h>
5
+ #include <string.h>
6
+
7
+ #ifdef __cplusplus
8
+ extern "C" {
9
+ #endif
10
+
11
+ extern unsigned char gfa_comp_table[256];
12
+
13
+ struct mg_bseq_file_s;
14
+ typedef struct mg_bseq_file_s mg_bseq_file_t;
15
+
16
// One sequence record.
typedef struct {
	int32_t l_seq; // sequence length
	int32_t rid;
	char *name;    // record name
	char *seq;     // sequence
	char *qual;    // quality string; may be 0
	char *comment; // comment; may be 0
} mg_bseq1_t;
20
+
21
+ mg_bseq_file_t *mg_bseq_open(const char *fn);
22
+ void mg_bseq_close(mg_bseq_file_t *fp);
23
+ mg_bseq1_t *mg_bseq_read(mg_bseq_file_t *fp, int64_t chunk_size, int with_qual, int with_comment, int frag_mode, int *n_);
24
+ mg_bseq1_t *mg_bseq_read_frag(int n_fp, mg_bseq_file_t **fp, int64_t chunk_size, int with_qual, int with_comment, int *n_);
25
+ int mg_bseq_eof(mg_bseq_file_t *fp);
26
+
27
+ extern unsigned char seq_nt4_table[256];
28
+ extern unsigned char gfa_comp_table[256];
29
+
30
// Length of a query name with a trailing "/<digit>" suffix excluded.
// The suffix is only dropped when the name is at least 3 characters long;
// otherwise the full strlen() is returned.
static inline int32_t mg_qname_len(const char *s)
{
	int32_t n;
	n = strlen(s);
	if (n < 3) return n;
	if (s[n-2] != '/') return n;
	if (s[n-1] < '0' || s[n-1] > '9') return n;
	return n - 2;
}
36
+
37
// Return 1 if two query names are equal once each has been trimmed to
// mg_qname_len(); 0 otherwise.
static inline int32_t mg_qname_same(const char *s1, const char *s2)
{
	const int32_t n1 = mg_qname_len(s1);
	const int32_t n2 = mg_qname_len(s2);
	if (n1 != n2) return 0;
	return strncmp(s1, s2, n1) == 0;
}
44
+
45
// Upper-case the first len characters of seq in place. Only 'a'..'z' are
// changed (by subtracting 32); every other byte is left as is.
static inline void mg_toupper(int32_t len, char *seq)
{
	int32_t pos;
	for (pos = 0; pos < len; ++pos) {
		if (seq[pos] >= 'a' && seq[pos] <= 'z')
			seq[pos] = seq[pos] - 32;
	}
}
51
+
52
+ static inline void mg_revcomp_seq(int32_t len, char *seq)
53
+ {
54
+ int32_t i;
55
+ for (i = 0; i < len>>1; ++i) {
56
+ int32_t t = seq[len - i - 1];
57
+ seq[len - i - 1] = gfa_comp_table[(uint8_t)seq[i]];
58
+ seq[i] = gfa_comp_table[t];
59
+ }
60
+ if (len&1) seq[len>>1] = gfa_comp_table[(uint8_t)seq[len>>1]];
61
+ }
62
+
63
+ static inline void mg_revcomp_bseq(mg_bseq1_t *s)
64
+ {
65
+ int32_t i, t, l = s->l_seq;
66
+ mg_revcomp_seq(s->l_seq, s->seq);
67
+ if (s->qual)
68
+ for (i = 0; i < l>>1; ++i)
69
+ t = s->qual[l - i - 1], s->qual[l - i - 1] = s->qual[i], s->qual[i] = t;
70
+ }
71
+
72
+ #ifdef __cplusplus
73
+ }
74
+ #endif
75
+
76
+ #endif
@@ -0,0 +1,139 @@
1
+ #include <assert.h>
2
+ #include <string.h>
3
+ #include "mgpriv.h"
4
+ #include "gfa-priv.h"
5
+ #include "algo.h"
6
+ #include "kalloc.h"
7
+
8
+ void mg_cov_map(const gfa_t *g, const mg_gchains_t *gt, int32_t min_mapq, int32_t min_blen, double *c_seg, double *c_link, const char *qname)
9
+ {
10
+ int32_t i, j;
11
+ if (c_seg == 0 && c_link == 0) return;
12
+ if (gt == 0 || gt->n_gc == 0) return;
13
+ for (i = 0; i < gt->n_gc; ++i) {
14
+ const mg_gchain_t *gc = &gt->gc[i];
15
+ const mg128_t *last_an;
16
+ assert(gc->cnt > 0 && gc->n_anchor > 0);
17
+ if (gc->mapq < min_mapq || gc->blen < min_blen) continue;
18
+ // count segment coverage
19
+ for (j = 0; j < gc->cnt; ++j) {
20
+ const mg_llchain_t *lc = &gt->lc[gc->off + j];
21
+ int32_t s, e;
22
+ s = 0, e = g->seg[lc->v>>1].len;
23
+ if (j == 0) s = (int32_t)gt->a[lc->off].x + 1 - (int32_t)(gt->a[lc->off].y>>32&0xff);
24
+ if (j == gc->cnt - 1) e = (int32_t)gt->a[lc->off + lc->cnt - 1].x + 1;
25
+ if (c_seg) c_seg[lc->v>>1] += (double)(e - s) / g->seg[lc->v>>1].len;
26
+ }
27
+ // count link
28
+ assert(gt->lc[gc->off].cnt > 0);
29
+ last_an = &gt->a[gt->lc[gc->off].off + gt->lc[gc->off].cnt - 1];
30
+ for (j = 1; j < gc->cnt; ++j) {
31
+ const mg_llchain_t *lc0 = &gt->lc[gc->off + j - 1];
32
+ const mg_llchain_t *lc1 = &gt->lc[gc->off + j];
33
+ int64_t a01, a10;
34
+ if (lc1->cnt > 0) {
35
+ const mg128_t *curr_an = &gt->a[lc1->off];
36
+ int32_t is_skip = (mg_seg_id(*curr_an) != mg_seg_id(*last_an));
37
+ last_an = &gt->a[lc1->off + lc1->cnt - 1];
38
+ if (is_skip) continue;
39
+ }
40
+ a01 = gfa_find_arc(g, lc0->v, lc1->v);
41
+ a10 = gfa_find_arc(g, lc1->v^1, lc0->v^1);
42
+ if (a01 < 0 || a10 < 0) {
43
+ if (mg_verbose >= 2)
44
+ fprintf(stderr, "[W] Multi/disconnected link: %c%s[%d] -> %c%s[%d] (%s, %ld, %ld). Continue anyway!\n",
45
+ "><"[lc0->v&1], g->seg[lc0->v>>1].name, lc0->v,
46
+ "><"[lc1->v&1], g->seg[lc1->v>>1].name, lc1->v, qname, (long)a01, (long)a10);
47
+ continue;
48
+ }
49
+ assert((g->arc[a01].comp ^ g->arc[a10].comp) == 1);
50
+ if (c_link) c_link[a01] += 1.0, c_link[a10] += 1.0;
51
+ }
52
+ }
53
+ }
54
+
55
+ void mg_cov_asm(const gfa_t *g, int32_t n_seq, mg_gchains_t *const *gcs, int32_t min_mapq, int32_t min_blen, double *cov_seg, double *cov_link)
56
+ {
57
+ int32_t i, j, t, *soff, *scnt, *cnt_link;
58
+ int64_t k;
59
+ mg_intv_t *sintv = 0;
60
+ void *km = 0;
61
+
62
+ // precalculate the size of sintv[] for each segment
63
+ KCALLOC(km, scnt, g->n_seg);
64
+ for (t = 0; t < n_seq; ++t) {
65
+ const mg_gchains_t *gt = gcs[t];
66
+ if (gt == 0 || gt->n_gc == 0) continue;
67
+ for (i = 0; i < gt->n_gc; ++i) {
68
+ const mg_gchain_t *gc = &gt->gc[i];
69
+ assert(gc->cnt > 0 && gc->n_anchor > 0);
70
+ if (gc->mapq < min_mapq || gc->blen < min_blen) continue;
71
+ for (j = 0; j < gc->cnt; ++j) {
72
+ const mg_llchain_t *lc = &gt->lc[gc->off + j];
73
+ ++scnt[lc->v>>1];
74
+ }
75
+ }
76
+ }
77
+ KMALLOC(km, soff, g->n_seg + 1);
78
+ for (soff[0] = 0, i = 1; i <= g->n_seg; ++i)
79
+ soff[i] = soff[i - 1] + scnt[i - 1];
80
+ memset(scnt, 0, 4 * g->n_seg);
81
+ KMALLOC(km, sintv, soff[g->n_seg]);
82
+
83
+ // fill sintv[]
84
+ KCALLOC(km, cnt_link, g->n_arc);
85
+ for (t = 0; t < n_seq; ++t) {
86
+ const mg_gchains_t *gt = gcs[t];
87
+ if (gt == 0 || gt->n_gc == 0) continue;
88
+ for (i = 0; i < gt->n_gc;) {
89
+ const mg_gchain_t *gc = &gt->gc[i];
90
+ if (gc->mapq < min_mapq || gc->blen < min_blen) continue;
91
+ // count segment coverage
92
+ for (j = 0; j < gc->cnt; ++j) {
93
+ const mg_llchain_t *lc = &gt->lc[gc->off + j];
94
+ int32_t s, e, tmp;
95
+ mg_intv_t *p;
96
+ s = 0, e = g->seg[lc->v>>1].len;
97
+ if (j == 0) s = (int32_t)gt->a[lc->off].x + 1 - (int32_t)(gt->a[lc->off].y>>32&0xff);
98
+ if (j == gc->cnt - 1) e = (int32_t)gt->a[lc->off + lc->cnt - 1].x + 1;
99
+ if (lc->v&1) // convert to the forward strand of segment lc->v>>1
100
+ tmp = g->seg[lc->v>>1].len - s, s = g->seg[lc->v>>1].len - e, e = tmp;
101
+ p = &sintv[soff[lc->v>>1] + scnt[lc->v>>1]];
102
+ ++scnt[lc->v>>1];
103
+ p->st = s, p->en = e, p->rev = lc->v&1, p->far = -1, p->i = -1;
104
+ }
105
+ // count link
106
+ for (j = 1; j < gc->cnt; ++j) {
107
+ const mg_llchain_t *lc0 = &gt->lc[gc->off + j - 1];
108
+ const mg_llchain_t *lc1 = &gt->lc[gc->off + j];
109
+ int64_t a01, a10;
110
+ a01 = gfa_find_arc(g, lc0->v, lc1->v);
111
+ a10 = gfa_find_arc(g, lc1->v^1, lc0->v^1);
112
+ assert(a01 >= 0 && a10 >= 0);
113
+ assert((g->arc[a01].comp ^ g->arc[a10].comp) == 1);
114
+ ++cnt_link[a01];
115
+ ++cnt_link[a10];
116
+ }
117
+ }
118
+ }
119
+
120
+ // update cov_link[] and cov_seg[]
121
+ for (k = 0; k < g->n_arc; ++k)
122
+ if (cnt_link[k] > 0) cov_link[k] += 1.0;
123
+ for (i = 0; i < g->n_seg; ++i) {
124
+ int32_t st = 0, en = 0, cov = 0;
125
+ assert(scnt[i] == soff[i+1] - soff[i]);
126
+ radix_sort_mg_intv(&sintv[soff[i]], &sintv[soff[i+1]]);
127
+ for (j = soff[i]; j < soff[i+1]; ++j) {
128
+ if (sintv[j].st > en)
129
+ cov += en - st, st = sintv[j].st, en = sintv[j].en;
130
+ else en = sintv[j].en > en? sintv[j].en : en;
131
+ }
132
+ cov += en - st;
133
+ cov_seg[i] += (double)cov / g->seg[i].len;
134
+ }
135
+
136
+ // free
137
+ kfree(km, cnt_link);
138
+ kfree(km, sintv); kfree(km, soff); kfree(km, scnt);
139
+ }
Binary file
Binary file
Binary file
@@ -0,0 +1,241 @@
1
+ #include <stdarg.h>
2
+ #include <stdlib.h>
3
+ #include <string.h>
4
+ #include <assert.h>
5
+ #include <stdio.h>
6
+ #include <math.h>
7
+ #include "kalloc.h"
8
+ #include "mgpriv.h"
9
+
10
+ static inline void str_enlarge(kstring_t *s, int l)
11
+ {
12
+ if (s->l + l + 1 > s->m) {
13
+ s->m = s->l + l + 1;
14
+ kroundup32(s->m);
15
+ s->s = (char*)realloc(s->s, s->m);
16
+ }
17
+ }
18
+
19
+ static inline void str_copy(kstring_t *s, const char *st, const char *en)
20
+ {
21
+ str_enlarge(s, en - st);
22
+ memcpy(&s->s[s->l], st, en - st);
23
+ s->l += en - st;
24
+ }
25
+
26
+ void mg_sprintf_lite(kstring_t *s, const char *fmt, ...)
27
+ {
28
+ char buf[16]; // for integer to string conversion
29
+ const char *p, *q;
30
+ va_list ap;
31
+ va_start(ap, fmt);
32
+ for (q = p = fmt; *p; ++p) {
33
+ if (*p == '%') {
34
+ if (p > q) str_copy(s, q, p);
35
+ ++p;
36
+ if (*p == 'd') {
37
+ int c, i, l = 0;
38
+ unsigned int x;
39
+ c = va_arg(ap, int);
40
+ x = c >= 0? c : -c;
41
+ do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
42
+ if (c < 0) buf[l++] = '-';
43
+ str_enlarge(s, l);
44
+ for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
45
+ } else if (*p == 'u') {
46
+ int i, l = 0;
47
+ uint32_t x;
48
+ x = va_arg(ap, uint32_t);
49
+ do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
50
+ str_enlarge(s, l);
51
+ for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
52
+ } else if (*p == 's') {
53
+ char *r = va_arg(ap, char*);
54
+ str_copy(s, r, r + strlen(r));
55
+ } else if (*p == 'c') {
56
+ str_enlarge(s, 1);
57
+ s->s[s->l++] = va_arg(ap, int);
58
+ } else abort();
59
+ q = p + 1;
60
+ }
61
+ }
62
+ if (p > q) str_copy(s, q, p);
63
+ va_end(ap);
64
+ s->s[s->l] = 0;
65
+ }
66
+
67
+ void mg_print_lchain(FILE *fp, const mg_idx_t *gi, int n_lc, const mg_lchain_t *lc, const mg128_t *a, const char *qname)
68
+ {
69
+ kstring_t str = {0,0,0};
70
+ int i, j;
71
+ for (i = 0; i < n_lc; ++i) {
72
+ const mg_lchain_t *p = &lc[i];
73
+ int mlen, blen, span = a[p->off].y>>32&0xff;
74
+ mlen = blen = span;
75
+ for (j = 1; j < p->cnt; ++j) {
76
+ int ql = (int32_t)a[p->off + j].y - (int32_t)a[p->off + j - 1].y;
77
+ int pl = (int32_t)a[p->off + j].x - (int32_t)a[p->off + j - 1].x;
78
+ blen += pl > ql? pl : ql;
79
+ mlen += pl > span && ql > span? span : pl < ql? pl : ql;
80
+ }
81
+ str.l = 0;
82
+ mg_sprintf_lite(&str, "LC\t%s\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t", qname, p->qs, p->qe, "+-"[p->v&1], gi->g->seg[p->v>>1].name, gi->g->seg[p->v>>1].len,
83
+ p->rs, p->re, p->score, mlen, blen, p->cnt);
84
+ for (j = 0; j < p->cnt; ++j)
85
+ mg_sprintf_lite(&str, "%d,", (int32_t)a[p->off + j].y);
86
+ mg_sprintf_lite(&str, "\t");
87
+ for (j = 0; j < p->cnt; ++j)
88
+ mg_sprintf_lite(&str, "%d,", (int32_t)a[p->off + j].x);
89
+ mg_sprintf_lite(&str, "\t");
90
+ for (j = 0; j < p->cnt; ++j)
91
+ mg_sprintf_lite(&str, "%d,", (int32_t)(a[p->off + j].y>>MG_SEED_OCC_SHIFT));
92
+ mg_sprintf_lite(&str, "\n");
93
+ fwrite(str.s, 1, str.l, fp);
94
+ }
95
+ free(str.s);
96
+ }
97
+
98
+ void mg_write_gaf(kstring_t *s, const gfa_t *g, const mg_gchains_t *gs, int32_t n_seg, const int32_t *qlens, const char *qname, uint64_t flag, void *km)
99
+ {
100
+ int32_t i, j, qlen, rev_sign = 0;
101
+ s->l = 0;
102
+ for (i = 0, qlen = 0; i < n_seg; ++i) qlen += qlens[i];
103
+ if ((gs == 0 || gs->n_gc == 0) && (flag&MG_M_SHOW_UNMAP)) {
104
+ mg_sprintf_lite(s, "%s", qname);
105
+ if ((flag&MG_M_FRAG_MERGE) && n_seg == 2 && s->l > 2 && s->s[s->l-1] == '1' && s->s[s->l-2] == '/') s->l -= 2;
106
+ mg_sprintf_lite(s, "\t%d\t0\t0\t*\t*\t0\t0\t0\t0\t0\t0\n", qlen);
107
+ return;
108
+ }
109
+ if (gs == 0) return;
110
+ for (i = 0; i < gs->n_gc; ++i) {
111
+ const mg_gchain_t *p = &gs->gc[i];
112
+ int32_t sign_pos, compact;
113
+ if (p->id != p->parent && !(flag&MG_M_PRINT_2ND)) continue;
114
+ if (p->cnt == 0) continue;
115
+ mg_sprintf_lite(s, "%s", qname);
116
+ if ((flag&MG_M_FRAG_MERGE) && n_seg == 2 && s->l > 2 && s->s[s->l-1] == '1' && s->s[s->l-2] == '/') s->l -= 2;
117
+ mg_sprintf_lite(s, "\t%d\t%d\t%d\t+\t", qlen, p->qs, p->qe);
118
+ assert(p->cnt > 0);
119
+ sign_pos = s->l - 2;
120
+ if (flag & MG_M_VERTEX_COOR) {
121
+ compact = 0;
122
+ for (j = 0; j < p->cnt; ++j) {
123
+ const mg_llchain_t *q = &gs->lc[p->off + j];
124
+ mg_sprintf_lite(s, "%c%s", "><"[q->v&1], g->seg[q->v>>1].name);
125
+ }
126
+ } else {
127
+ int32_t last_pnid = -1, st = -1, en = -1, rev = -1;
128
+ compact = flag&MG_M_NO_COMP_PATH? 0 : 1;
129
+ for (j = 0; j < p->cnt; ++j) {
130
+ const mg_llchain_t *q;
131
+ const gfa_seg_t *t;
132
+ assert(p->off + j < gs->n_lc);
133
+ q = &gs->lc[p->off + j];
134
+ t = &g->seg[q->v>>1];
135
+ if (t->snid < 0) { // no stable ID; write the vertex coordinate
136
+ compact = 0;
137
+ if (last_pnid >= 0) mg_sprintf_lite(s, "%c%s:%d-%d", "><"[rev], g->sseq[last_pnid].name, st, en);
138
+ last_pnid = -1, st = -1, en = -1, rev = -1;
139
+ mg_sprintf_lite(s, "%c%s", "><"[q->v&1], g->seg[q->v>>1].name);
140
+ } else {
141
+ int cont = 0;
142
+ if (last_pnid >= 0 && t->snid == last_pnid && (q->v&1) == rev) { // same stable sequence and same strand
143
+ if (!(q->v&1)) { // forward strand
144
+ if (t->soff == en)
145
+ en = t->soff + t->len, cont = 1;
146
+ } else { // reverse strand
147
+ if (t->soff + t->len == st)
148
+ st = t->soff, cont = 1;
149
+ }
150
+ }
151
+ if (cont == 0) {
152
+ if (last_pnid >= 0) compact = 0;
153
+ if (last_pnid >= 0) mg_sprintf_lite(s, "%c%s:%d-%d", "><"[rev], g->sseq[last_pnid].name, st, en);
154
+ last_pnid = t->snid, rev = q->v&1, st = t->soff, en = st + t->len;
155
+ }
156
+ }
157
+ }
158
+ if (last_pnid >= 0) {
159
+ if (g->sseq[last_pnid].rank != 0 || g->sseq[last_pnid].min != 0)
160
+ compact = 0;
161
+ if (!compact) mg_sprintf_lite(s, "%c%s:%d-%d", "><"[rev], g->sseq[last_pnid].name, st, en);
162
+ } else compact = 0;
163
+ }
164
+ if (compact) {
165
+ int32_t rev = gs->lc[p->off].v&1;
166
+ const gfa_seg_t *t = &g->seg[gs->lc[rev? p->off + p->cnt - 1 : p->off].v>>1];
167
+ const gfa_sseq_t *ps = &g->sseq[t->snid];
168
+ mg_sprintf_lite(s, "%s\t%d\t", ps->name, ps->max);
169
+ if (rev) {
170
+ rev_sign = 1;
171
+ s->s[sign_pos] = '-';
172
+ mg_sprintf_lite(s, "%d\t%d", t->soff + (p->plen - p->pe), t->soff + (p->plen - p->ps));
173
+ } else {
174
+ mg_sprintf_lite(s, "%d\t%d", t->soff + p->ps, t->soff + p->pe);
175
+ }
176
+ } else mg_sprintf_lite(s, "\t%d\t%d\t%d", p->plen, p->ps, p->pe);
177
+ if (p->p) mg_sprintf_lite(s, "\t%d\t%d\t%d", p->p->mlen, p->p->blen, p->mapq);
178
+ else mg_sprintf_lite(s, "\t%d\t%d\t%d", p->mlen, p->blen, p->mapq);
179
+ mg_sprintf_lite(s, "\ttp:A:%c", p->id == p->parent? 'P' : 'S');
180
+ if (p->p) mg_sprintf_lite(s, "\tNM:i:%d", p->p->blen - p->p->mlen);
181
+ mg_sprintf_lite(s, "\tcm:i:%d\ts1:i:%d\ts2:i:%d", p->n_anchor, p->score, p->subsc);
182
+ if (p->div >= 0.0f && p->div <= 1.0f) {
183
+ char buf[16];
184
+ if (p->div == 0.0f) buf[0] = '0', buf[1] = 0;
185
+ else snprintf(buf, 16, "%.4f", p->div);
186
+ mg_sprintf_lite(s, "\tdv:f:%s", buf);
187
+ }
188
+ if (n_seg > 1) {
189
+ mg_sprintf_lite(s, "\tql:B:i");
190
+ for (j = 0; j < n_seg; ++j) mg_sprintf_lite(s, ",%d", qlens[j]);
191
+ }
192
+ if (p->p) {
193
+ mg_sprintf_lite(s, "\tcg:Z:");
194
+ if (rev_sign)
195
+ for (j = p->p->n_cigar - 1; j >= 0; --j)
196
+ mg_sprintf_lite(s, "%d%c", (int32_t)(p->p->cigar[j]>>4), "MIDNSHP=XB"[p->p->cigar[j]&0xf]);
197
+ else
198
+ for (j = 0; j < p->p->n_cigar; ++j)
199
+ mg_sprintf_lite(s, "%d%c", (int32_t)(p->p->cigar[j]>>4), "MIDNSHP=XB"[p->p->cigar[j]&0xf]);
200
+ }
201
+ mg_sprintf_lite(s, "\n");
202
+ if ((mg_dbg_flag & MG_DBG_LCHAIN) || (flag & MG_M_WRITE_LCHAIN)) {
203
+ char buf[16];
204
+ for (j = 0; j < p->cnt; ++j) {
205
+ const mg_llchain_t *lc = &gs->lc[p->off + j];
206
+ mg_sprintf_lite(s, "*\t%c%s\t%d\t%d", "><"[lc->v&1], g->seg[lc->v>>1].name, g->seg[lc->v>>1].len, lc->cnt);
207
+ if (lc->cnt > 0) {
208
+ double div;
209
+ int32_t q_span = (int32_t)(gs->a[lc->off].y>>32&0xff);
210
+ int32_t n = (int32_t)(gs->a[lc->off + lc->cnt - 1].x>>32) - (int32_t)(gs->a[lc->off].x>>32) + 1;
211
+ div = n == lc->cnt? 0.0 : (n > lc->cnt? log((double)n / lc->cnt) : log((double)lc->cnt / n)) / q_span;
212
+ if (div == 0.0) buf[0] = '0', buf[1] = 0;
213
+ else snprintf(buf, 16, "%.4f", div);
214
+ mg_sprintf_lite(s, "\t%s", buf);
215
+ mg_sprintf_lite(s, "\t%d\t%d", (int32_t)gs->a[lc->off].x + 1 - q_span, (int32_t)gs->a[lc->off + lc->cnt - 1].x + 1);
216
+ mg_sprintf_lite(s, "\t%d\t%d", (int32_t)gs->a[lc->off].y + 1 - q_span, (int32_t)gs->a[lc->off + lc->cnt - 1].y + 1);
217
+ if (flag & MG_M_WRITE_MZ) {
218
+ int32_t i, last;
219
+ last = (int32_t)gs->a[lc->off].x + 1 - q_span;
220
+ mg_sprintf_lite(s, "\t%d\t", q_span);
221
+ for (i = 1; i < lc->cnt; ++i) {
222
+ int32_t x = (int32_t)gs->a[lc->off + i].x + 1 - q_span;
223
+ if (i > 1) mg_sprintf_lite(s, ",");
224
+ mg_sprintf_lite(s, "%d", x - last);
225
+ last = x;
226
+ }
227
+ last = (int32_t)gs->a[lc->off].y + 1 - q_span;
228
+ mg_sprintf_lite(s, "\t");
229
+ for (i = 1; i < lc->cnt; ++i) {
230
+ int32_t x = (int32_t)gs->a[lc->off + i].y + 1 - q_span;
231
+ if (i > 1) mg_sprintf_lite(s, ",");
232
+ mg_sprintf_lite(s, "%d", x - last);
233
+ last = x;
234
+ }
235
+ }
236
+ }
237
+ mg_sprintf_lite(s, "\n");
238
+ }
239
+ }
240
+ }
241
+ }