ruby-minigraph 0.0.20.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/ext/Rakefile +56 -0
- data/ext/cmappy/cmappy.c +7 -0
- data/ext/cmappy/cmappy.h +8 -0
- data/ext/minigraph/LICENSE.txt +23 -0
- data/ext/minigraph/Makefile +66 -0
- data/ext/minigraph/NEWS.md +317 -0
- data/ext/minigraph/README.md +207 -0
- data/ext/minigraph/algo.c +194 -0
- data/ext/minigraph/algo.h +33 -0
- data/ext/minigraph/asm-call.c +147 -0
- data/ext/minigraph/bseq.c +133 -0
- data/ext/minigraph/bseq.h +76 -0
- data/ext/minigraph/cal_cov.c +139 -0
- data/ext/minigraph/doc/example1.png +0 -0
- data/ext/minigraph/doc/example2.png +0 -0
- data/ext/minigraph/doc/examples.graffle +0 -0
- data/ext/minigraph/format.c +241 -0
- data/ext/minigraph/galign.c +140 -0
- data/ext/minigraph/gchain1.c +532 -0
- data/ext/minigraph/gcmisc.c +223 -0
- data/ext/minigraph/gfa-aug.c +260 -0
- data/ext/minigraph/gfa-base.c +526 -0
- data/ext/minigraph/gfa-bbl.c +372 -0
- data/ext/minigraph/gfa-ed.c +617 -0
- data/ext/minigraph/gfa-io.c +395 -0
- data/ext/minigraph/gfa-priv.h +154 -0
- data/ext/minigraph/gfa.h +166 -0
- data/ext/minigraph/ggen.c +182 -0
- data/ext/minigraph/ggen.h +21 -0
- data/ext/minigraph/ggsimple.c +570 -0
- data/ext/minigraph/gmap.c +211 -0
- data/ext/minigraph/index.c +230 -0
- data/ext/minigraph/kalloc.c +224 -0
- data/ext/minigraph/kalloc.h +82 -0
- data/ext/minigraph/kavl.h +414 -0
- data/ext/minigraph/kdq.h +134 -0
- data/ext/minigraph/ketopt.h +116 -0
- data/ext/minigraph/khashl.h +348 -0
- data/ext/minigraph/krmq.h +474 -0
- data/ext/minigraph/kseq.h +256 -0
- data/ext/minigraph/ksort.h +164 -0
- data/ext/minigraph/kstring.h +165 -0
- data/ext/minigraph/kthread.c +159 -0
- data/ext/minigraph/kthread.h +15 -0
- data/ext/minigraph/kvec-km.h +105 -0
- data/ext/minigraph/kvec.h +110 -0
- data/ext/minigraph/lchain.c +441 -0
- data/ext/minigraph/main.c +301 -0
- data/ext/minigraph/map-algo.c +500 -0
- data/ext/minigraph/mgpriv.h +128 -0
- data/ext/minigraph/minigraph.1 +359 -0
- data/ext/minigraph/minigraph.h +176 -0
- data/ext/minigraph/miniwfa.c +834 -0
- data/ext/minigraph/miniwfa.h +95 -0
- data/ext/minigraph/misc/mgutils.js +1451 -0
- data/ext/minigraph/misc.c +12 -0
- data/ext/minigraph/options.c +134 -0
- data/ext/minigraph/shortk.c +251 -0
- data/ext/minigraph/sketch.c +109 -0
- data/ext/minigraph/sys.c +147 -0
- data/ext/minigraph/sys.h +20 -0
- data/ext/minigraph/test/MT-chimp.fa +277 -0
- data/ext/minigraph/test/MT-human.fa +239 -0
- data/ext/minigraph/test/MT-orangA.fa +276 -0
- data/ext/minigraph/test/MT.gfa +19 -0
- data/ext/minigraph/tex/Makefile +13 -0
- data/ext/minigraph/tex/minigraph.bib +676 -0
- data/ext/minigraph/tex/minigraph.tex +986 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
- data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
- data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
- data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
- data/ext/minigraph/tex/plots/bedutils.js +367 -0
- data/ext/minigraph/tex/plots/chr-plot.js +130 -0
- data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
- data/ext/minigraph.patch +21 -0
- data/lib/minigraph/ffi/constants.rb +230 -0
- data/lib/minigraph/ffi/functions.rb +70 -0
- data/lib/minigraph/ffi/mappy.rb +8 -0
- data/lib/minigraph/ffi.rb +27 -0
- data/lib/minigraph/version.rb +5 -0
- data/lib/minigraph.rb +72 -0
- metadata +159 -0
@@ -0,0 +1,133 @@
|
|
1
|
+
#include <zlib.h>
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <assert.h>
|
5
|
+
#define __STDC_LIMIT_MACROS
|
6
|
+
#include "bseq.h"
|
7
|
+
#include "kvec-km.h"
|
8
|
+
#include "kseq.h"
|
9
|
+
KSEQ_INIT(gzFile, gzread)
|
10
|
+
|
11
|
+
#define CHECK_PAIR_THRES 1000000
|
12
|
+
|
13
|
+
/* State of one open sequence input, shared by the mg_bseq_*() readers in this file. */
struct mg_bseq_file_s {
|
14
|
+
gzFile fp; // zlib stream obtained from gzopen()/gzdopen() in mg_bseq_open()
|
15
|
+
kseq_t *ks; // kseq parser created over fp by kseq_init()
|
16
|
+
mg_bseq1_t s; // record read ahead by mg_bseq_read() and kept for its next call; s.seq == 0 when nothing is pending
|
17
|
+
};
|
18
|
+
|
19
|
+
mg_bseq_file_t *mg_bseq_open(const char *fn)
|
20
|
+
{
|
21
|
+
mg_bseq_file_t *fp;
|
22
|
+
gzFile f;
|
23
|
+
f = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(0, "r");
|
24
|
+
if (f == 0) return 0;
|
25
|
+
fp = (mg_bseq_file_t*)calloc(1, sizeof(mg_bseq_file_t));
|
26
|
+
fp->fp = f;
|
27
|
+
fp->ks = kseq_init(fp->fp);
|
28
|
+
return fp;
|
29
|
+
}
|
30
|
+
|
31
|
+
/**
 * Release a handle created by mg_bseq_open(): destroys the kseq parser,
 * closes the zlib stream and frees the handle itself.
 *
 * @param fp  handle to close; a NULL handle is accepted and ignored, so the
 *            result of a failed mg_bseq_open() can be passed safely
 */
void mg_bseq_close(mg_bseq_file_t *fp)
{
	if (fp == 0) return;
	kseq_destroy(fp->ks);
	gzclose(fp->fp);
	free(fp);
}
|
37
|
+
|
38
|
+
static inline char *kstrdup(const kstring_t *s)
|
39
|
+
{
|
40
|
+
char *t;
|
41
|
+
t = (char*)malloc(s->l + 1);
|
42
|
+
memcpy(t, s->s, s->l + 1);
|
43
|
+
return t;
|
44
|
+
}
|
45
|
+
|
46
|
+
/*
 * Copy the record currently held by the kseq parser `ks` into `s`.
 *
 * name and seq are always duplicated; qual and comment are duplicated only
 * when requested by with_qual / with_comment AND non-empty, otherwise they
 * are set to 0. Every 'U'/'u' in the copied sequence becomes 'T'/'t'.
 * An empty record name only triggers a warning on stderr.
 */
static inline void kseq2bseq(kseq_t *ks, mg_bseq1_t *s, int with_qual, int with_comment)
{
	const int n_base = (int)ks->seq.l;
	int k;
	if (ks->name.l == 0)
		fprintf(stderr, "[WARNING]\033[1;31m empty sequence name in the input.\033[0m\n");
	s->name = kstrdup(&ks->name);
	s->seq = kstrdup(&ks->seq);
	for (k = 0; k < n_base; ++k) {
		char *base = &s->seq[k];
		if (*base == 'u' || *base == 'U')
			*base -= 1; // 'U' - 1 == 'T' and 'u' - 1 == 't'
	}
	if (with_qual && ks->qual.l) s->qual = kstrdup(&ks->qual);
	else s->qual = 0;
	if (with_comment && ks->comment.l) s->comment = kstrdup(&ks->comment);
	else s->comment = 0;
	s->l_seq = ks->seq.l;
}
|
60
|
+
|
61
|
+
/**
 * Read records from `fp` until their total sequence length reaches chunk_size
 * or the input ends.
 *
 * @param fp            open handle from mg_bseq_open()
 * @param chunk_size    stop once the accumulated sequence length is >= this value
 * @param with_qual     non-zero to keep quality strings
 * @param with_comment  non-zero to keep comments
 * @param frag_mode     non-zero to keep reading after the chunk is full while the
 *                      following records share the last record's name (as judged
 *                      by mg_qname_same()); only attempted when the last record is
 *                      shorter than CHECK_PAIR_THRES
 * @param n_            out: number of records returned
 * @return              array of *n_ records (0 when nothing was read); the array and
 *                      the strings inside each record are heap-allocated
 *
 * In frag_mode the first record that does NOT match is parked in fp->s and
 * becomes the first record of the next call.
 */
mg_bseq1_t *mg_bseq_read(mg_bseq_file_t *fp, int64_t chunk_size, int with_qual, int with_comment, int frag_mode, int *n_)
{
	int64_t size = 0;
	int ret;
	kvec_t(mg_bseq1_t) a = {0,0,0};
	kseq_t *ks = fp->ks;
	*n_ = 0;
	if (fp->s.seq) { // a record parked by the previous call opens this chunk
		kv_resize(mg_bseq1_t, 0, a, 256);
		kv_push(mg_bseq1_t, 0, a, fp->s);
		size = fp->s.l_seq;
		memset(&fp->s, 0, sizeof(mg_bseq1_t));
	}
	while ((ret = kseq_read(ks)) >= 0) {
		mg_bseq1_t *s;
		assert(ks->seq.l <= INT32_MAX);
		if (a.m == 0) kv_resize(mg_bseq1_t, 0, a, 256);
		kv_pushp(mg_bseq1_t, 0, a, &s);
		kseq2bseq(ks, s, with_qual, with_comment);
		size += s->l_seq;
		if (size >= chunk_size) {
			if (frag_mode && a.a[a.n-1].l_seq < CHECK_PAIR_THRES) {
				// Keep `ret` current so that a malformed record met here is
				// still reported by the check after the loop.
				while ((ret = kseq_read(ks)) >= 0) {
					assert(ks->seq.l <= INT32_MAX); // l_seq is only 32 bits wide
					kseq2bseq(ks, &fp->s, with_qual, with_comment);
					if (mg_qname_same(fp->s.name, a.a[a.n-1].name)) {
						kv_push(mg_bseq1_t, 0, a, fp->s);
						memset(&fp->s, 0, sizeof(mg_bseq1_t));
					} else break; // different name: leave it parked in fp->s
				}
			}
			break;
		}
	}
	if (ret < -1) // the code treats -1 as normal end of input and anything lower as a bad record
		fprintf(stderr, "[WARNING]\033[1;31m wrong FASTA/FASTQ record. Continue anyway.\033[0m\n");
	*n_ = a.n;
	return a.a;
}
|
99
|
+
|
100
|
+
/**
 * Read records in lock-step from n_fp inputs, interleaving them so that each
 * round contributes one record per input, in input order.
 *
 * Reading stops when the accumulated sequence length reaches chunk_size or as
 * soon as any input fails to deliver a record. If only some inputs deliver a
 * record in that final round, a warning is printed and those records are dropped.
 *
 * @param n_fp          number of inputs; values < 1 return 0 immediately
 * @param fp            array of n_fp open handles
 * @param chunk_size    stop once the accumulated sequence length is >= this value
 * @param with_qual     non-zero to keep quality strings
 * @param with_comment  non-zero to keep comments
 * @param n_            out: number of records returned
 * @return              array of *n_ records, or 0 when nothing was read
 */
mg_bseq1_t *mg_bseq_read_frag(int n_fp, mg_bseq_file_t **fp, int64_t chunk_size, int with_qual, int with_comment, int *n_)
{
	kvec_t(mg_bseq1_t) recs = {0,0,0};
	int64_t n_bases = 0;
	int k;
	*n_ = 0;
	if (n_fp < 1) return 0;
	do {
		int n_ok = 0;
		// advance every parser by one record, even if an earlier one already failed
		for (k = 0; k < n_fp; ++k)
			n_ok += (kseq_read(fp[k]->ks) >= 0);
		if (n_ok < n_fp) { // some file reaches the end
			if (n_ok > 0)
				fprintf(stderr, "[W::%s]\033[1;31m query files have different number of records; extra records skipped.\033[0m\n", __func__);
			break;
		}
		if (recs.m == 0) kv_resize(mg_bseq1_t, 0, recs, 256);
		for (k = 0; k < n_fp; ++k) {
			mg_bseq1_t *rec;
			kv_pushp(mg_bseq1_t, 0, recs, &rec);
			kseq2bseq(fp[k]->ks, rec, with_qual, with_comment);
			n_bases += rec->l_seq;
		}
	} while (n_bases < chunk_size);
	*n_ = recs.n;
	return recs.a;
}
|
129
|
+
|
130
|
+
/*
 * Return 1 when the input is exhausted: the underlying kseq stream reports
 * end of file AND no read-ahead record is parked in fp->s. Return 0 otherwise.
 */
int mg_bseq_eof(mg_bseq_file_t *fp)
{
	if (fp->s.seq != 0) return 0; // a parked record is still waiting to be returned
	return ks_eof(fp->ks->f)? 1 : 0;
}
|
@@ -0,0 +1,76 @@
|
|
1
|
+
#ifndef MM_BSEQ_H
|
2
|
+
#define MM_BSEQ_H
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
#include <string.h>
|
6
|
+
|
7
|
+
#ifdef __cplusplus
|
8
|
+
extern "C" {
|
9
|
+
#endif
|
10
|
+
|
11
|
+
extern unsigned char gfa_comp_table[256];
|
12
|
+
|
13
|
+
struct mg_bseq_file_s;
|
14
|
+
typedef struct mg_bseq_file_s mg_bseq_file_t;
|
15
|
+
|
16
|
+
/* One sequence record, as filled in by kseq2bseq() in bseq.c. */
typedef struct {
|
17
|
+
int32_t l_seq, rid; // l_seq: length of seq; rid: not assigned by the readers shown here -- NOTE(review): confirm its meaning with callers
|
18
|
+
char *name, *seq, *qual, *comment; // heap copies made by the reader; qual/comment are 0 when not requested or empty
|
19
|
+
} mg_bseq1_t;
|
20
|
+
|
21
|
+
mg_bseq_file_t *mg_bseq_open(const char *fn);
|
22
|
+
void mg_bseq_close(mg_bseq_file_t *fp);
|
23
|
+
mg_bseq1_t *mg_bseq_read(mg_bseq_file_t *fp, int64_t chunk_size, int with_qual, int with_comment, int frag_mode, int *n_);
|
24
|
+
mg_bseq1_t *mg_bseq_read_frag(int n_fp, mg_bseq_file_t **fp, int64_t chunk_size, int with_qual, int with_comment, int *n_);
|
25
|
+
int mg_bseq_eof(mg_bseq_file_t *fp);
|
26
|
+
|
27
|
+
extern unsigned char seq_nt4_table[256];
|
28
|
+
extern unsigned char gfa_comp_table[256];
|
29
|
+
|
30
|
+
/*
 * Length of a query name with a trailing "/<digit>" suffix (e.g. "/1", "/2")
 * excluded. The suffix is only stripped when the name is at least 3
 * characters long; otherwise the full strlen() is returned.
 */
static inline int32_t mg_qname_len(const char *s)
{
	const int32_t n = strlen(s);
	int has_suffix;
	if (n < 3) return n;
	has_suffix = s[n-2] == '/' && s[n-1] >= '0' && s[n-1] <= '9';
	return has_suffix? n - 2 : n;
}
|
36
|
+
|
37
|
+
/*
 * Return 1 when two query names are equal once mg_qname_len() has discarded
 * a trailing "/<digit>" suffix from each of them, 0 otherwise.
 */
static inline int32_t mg_qname_same(const char *s1, const char *s2)
{
	const int32_t stem1 = mg_qname_len(s1);
	const int32_t stem2 = mg_qname_len(s2);
	if (stem1 != stem2) return 0;
	return strncmp(s1, s2, stem1) == 0;
}
|
44
|
+
|
45
|
+
/*
 * Upper-case the first `len` characters of `seq` in place. Only the ASCII
 * letters 'a'..'z' are changed; every other byte is left untouched.
 */
static inline void mg_toupper(int32_t len, char *seq)
{
	int32_t k;
	for (k = 0; k < len; ++k) {
		const char c = seq[k];
		if (c >= 'a' && c <= 'z')
			seq[k] = c - 32; // 'a' - 'A' == 32 in ASCII
	}
}
|
51
|
+
|
52
|
+
static inline void mg_revcomp_seq(int32_t len, char *seq)
|
53
|
+
{
|
54
|
+
int32_t i;
|
55
|
+
for (i = 0; i < len>>1; ++i) {
|
56
|
+
int32_t t = seq[len - i - 1];
|
57
|
+
seq[len - i - 1] = gfa_comp_table[(uint8_t)seq[i]];
|
58
|
+
seq[i] = gfa_comp_table[t];
|
59
|
+
}
|
60
|
+
if (len&1) seq[len>>1] = gfa_comp_table[(uint8_t)seq[len>>1]];
|
61
|
+
}
|
62
|
+
|
63
|
+
/*
 * Reverse-complement a record in place: the sequence goes through
 * mg_revcomp_seq(); the quality string, when present, is simply reversed.
 */
static inline void mg_revcomp_bseq(mg_bseq1_t *s)
{
	int32_t lo, hi;
	mg_revcomp_seq(s->l_seq, s->seq);
	if (s->qual == 0) return;
	for (lo = 0, hi = s->l_seq - 1; lo < hi; ++lo, --hi) {
		const char tmp = s->qual[hi];
		s->qual[hi] = s->qual[lo];
		s->qual[lo] = tmp;
	}
}
|
71
|
+
|
72
|
+
#ifdef __cplusplus
|
73
|
+
}
|
74
|
+
#endif
|
75
|
+
|
76
|
+
#endif
|
@@ -0,0 +1,139 @@
|
|
1
|
+
#include <assert.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include "mgpriv.h"
|
4
|
+
#include "gfa-priv.h"
|
5
|
+
#include "algo.h"
|
6
|
+
#include "kalloc.h"
|
7
|
+
|
8
|
+
void mg_cov_map(const gfa_t *g, const mg_gchains_t *gt, int32_t min_mapq, int32_t min_blen, double *c_seg, double *c_link, const char *qname)
|
9
|
+
{
|
10
|
+
int32_t i, j;
|
11
|
+
if (c_seg == 0 && c_link == 0) return;
|
12
|
+
if (gt == 0 || gt->n_gc == 0) return;
|
13
|
+
for (i = 0; i < gt->n_gc; ++i) {
|
14
|
+
const mg_gchain_t *gc = >->gc[i];
|
15
|
+
const mg128_t *last_an;
|
16
|
+
assert(gc->cnt > 0 && gc->n_anchor > 0);
|
17
|
+
if (gc->mapq < min_mapq || gc->blen < min_blen) continue;
|
18
|
+
// count segment coverage
|
19
|
+
for (j = 0; j < gc->cnt; ++j) {
|
20
|
+
const mg_llchain_t *lc = >->lc[gc->off + j];
|
21
|
+
int32_t s, e;
|
22
|
+
s = 0, e = g->seg[lc->v>>1].len;
|
23
|
+
if (j == 0) s = (int32_t)gt->a[lc->off].x + 1 - (int32_t)(gt->a[lc->off].y>>32&0xff);
|
24
|
+
if (j == gc->cnt - 1) e = (int32_t)gt->a[lc->off + lc->cnt - 1].x + 1;
|
25
|
+
if (c_seg) c_seg[lc->v>>1] += (double)(e - s) / g->seg[lc->v>>1].len;
|
26
|
+
}
|
27
|
+
// count link
|
28
|
+
assert(gt->lc[gc->off].cnt > 0);
|
29
|
+
last_an = >->a[gt->lc[gc->off].off + gt->lc[gc->off].cnt - 1];
|
30
|
+
for (j = 1; j < gc->cnt; ++j) {
|
31
|
+
const mg_llchain_t *lc0 = >->lc[gc->off + j - 1];
|
32
|
+
const mg_llchain_t *lc1 = >->lc[gc->off + j];
|
33
|
+
int64_t a01, a10;
|
34
|
+
if (lc1->cnt > 0) {
|
35
|
+
const mg128_t *curr_an = >->a[lc1->off];
|
36
|
+
int32_t is_skip = (mg_seg_id(*curr_an) != mg_seg_id(*last_an));
|
37
|
+
last_an = >->a[lc1->off + lc1->cnt - 1];
|
38
|
+
if (is_skip) continue;
|
39
|
+
}
|
40
|
+
a01 = gfa_find_arc(g, lc0->v, lc1->v);
|
41
|
+
a10 = gfa_find_arc(g, lc1->v^1, lc0->v^1);
|
42
|
+
if (a01 < 0 || a10 < 0) {
|
43
|
+
if (mg_verbose >= 2)
|
44
|
+
fprintf(stderr, "[W] Multi/disconnected link: %c%s[%d] -> %c%s[%d] (%s, %ld, %ld). Continue anyway!\n",
|
45
|
+
"><"[lc0->v&1], g->seg[lc0->v>>1].name, lc0->v,
|
46
|
+
"><"[lc1->v&1], g->seg[lc1->v>>1].name, lc1->v, qname, (long)a01, (long)a10);
|
47
|
+
continue;
|
48
|
+
}
|
49
|
+
assert((g->arc[a01].comp ^ g->arc[a10].comp) == 1);
|
50
|
+
if (c_link) c_link[a01] += 1.0, c_link[a10] += 1.0;
|
51
|
+
}
|
52
|
+
}
|
53
|
+
}
|
54
|
+
|
55
|
+
/**
 * Add the coverage contributed by one assembly (n_seq sequences) to the
 * per-segment and per-arc accumulators.
 *
 * Unlike mg_cov_map(), overlapping chains are not counted twice: per segment
 * the covered intervals of all chains are merged and the covered fraction of
 * the segment length is added to cov_seg[]; every arc traversed at least once
 * adds exactly 1.0 to cov_link[].
 *
 * @param g         graph the chains refer to
 * @param n_seq     number of entries in gcs
 * @param gcs       chains of each sequence; NULL or empty entries are skipped
 * @param min_mapq  chains with mapq below this value are ignored
 * @param min_blen  chains with blen below this value are ignored
 * @param cov_seg   per-segment accumulator, g->n_seg entries, updated in place
 * @param cov_link  per-arc accumulator, g->n_arc entries, updated in place
 */
void mg_cov_asm(const gfa_t *g, int32_t n_seq, mg_gchains_t *const *gcs, int32_t min_mapq, int32_t min_blen, double *cov_seg, double *cov_link)
{
	int32_t i, j, t, *soff, *scnt, *cnt_link;
	int64_t k;
	mg_intv_t *sintv = 0;
	void *km = 0;

	// precalculate the size of sintv[] for each segment
	KCALLOC(km, scnt, g->n_seg);
	for (t = 0; t < n_seq; ++t) {
		const mg_gchains_t *gt = gcs[t];
		if (gt == 0 || gt->n_gc == 0) continue;
		for (i = 0; i < gt->n_gc; ++i) {
			const mg_gchain_t *gc = &gt->gc[i];
			assert(gc->cnt > 0 && gc->n_anchor > 0);
			if (gc->mapq < min_mapq || gc->blen < min_blen) continue;
			for (j = 0; j < gc->cnt; ++j) {
				const mg_llchain_t *lc = &gt->lc[gc->off + j];
				++scnt[lc->v>>1];
			}
		}
	}
	// soff[i] is the first slot of segment i in sintv[]; soff[n_seg] is the total
	KMALLOC(km, soff, g->n_seg + 1);
	for (soff[0] = 0, i = 1; i <= g->n_seg; ++i)
		soff[i] = soff[i - 1] + scnt[i - 1];
	memset(scnt, 0, sizeof(*scnt) * g->n_seg); // reused below as a per-segment fill cursor
	KMALLOC(km, sintv, soff[g->n_seg]);

	// fill sintv[]
	KCALLOC(km, cnt_link, g->n_arc);
	for (t = 0; t < n_seq; ++t) {
		const mg_gchains_t *gt = gcs[t];
		if (gt == 0 || gt->n_gc == 0) continue;
		// the increment must live in the loop header: the filter below uses `continue`
		for (i = 0; i < gt->n_gc; ++i) {
			const mg_gchain_t *gc = &gt->gc[i];
			if (gc->mapq < min_mapq || gc->blen < min_blen) continue;
			// count segment coverage
			for (j = 0; j < gc->cnt; ++j) {
				const mg_llchain_t *lc = &gt->lc[gc->off + j];
				int32_t s, e, tmp;
				mg_intv_t *p;
				s = 0, e = g->seg[lc->v>>1].len;
				if (j == 0) s = (int32_t)gt->a[lc->off].x + 1 - (int32_t)(gt->a[lc->off].y>>32&0xff);
				if (j == gc->cnt - 1) e = (int32_t)gt->a[lc->off + lc->cnt - 1].x + 1;
				if (lc->v&1) // convert to the forward strand of segment lc->v>>1
					tmp = g->seg[lc->v>>1].len - s, s = g->seg[lc->v>>1].len - e, e = tmp;
				p = &sintv[soff[lc->v>>1] + scnt[lc->v>>1]];
				++scnt[lc->v>>1];
				p->st = s, p->en = e, p->rev = lc->v&1, p->far = -1, p->i = -1;
			}
			// count link
			for (j = 1; j < gc->cnt; ++j) {
				const mg_llchain_t *lc0 = &gt->lc[gc->off + j - 1];
				const mg_llchain_t *lc1 = &gt->lc[gc->off + j];
				int64_t a01, a10;
				a01 = gfa_find_arc(g, lc0->v, lc1->v);
				a10 = gfa_find_arc(g, lc1->v^1, lc0->v^1);
				assert(a01 >= 0 && a10 >= 0);
				assert((g->arc[a01].comp ^ g->arc[a10].comp) == 1);
				++cnt_link[a01];
				++cnt_link[a10];
			}
		}
	}

	// update cov_link[] and cov_seg[]
	for (k = 0; k < g->n_arc; ++k)
		if (cnt_link[k] > 0) cov_link[k] += 1.0;
	for (i = 0; i < g->n_seg; ++i) {
		int32_t st = 0, en = 0, cov = 0;
		assert(scnt[i] == soff[i+1] - soff[i]);
		radix_sort_mg_intv(&sintv[soff[i]], &sintv[soff[i+1]]);
		// sweep the sorted intervals, summing the length of their union
		for (j = soff[i]; j < soff[i+1]; ++j) {
			if (sintv[j].st > en)
				cov += en - st, st = sintv[j].st, en = sintv[j].en;
			else en = sintv[j].en > en? sintv[j].en : en;
		}
		cov += en - st;
		cov_seg[i] += (double)cov / g->seg[i].len;
	}

	// free
	kfree(km, cnt_link);
	kfree(km, sintv); kfree(km, soff); kfree(km, scnt);
}
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,241 @@
|
|
1
|
+
#include <stdarg.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <string.h>
|
4
|
+
#include <assert.h>
|
5
|
+
#include <stdio.h>
|
6
|
+
#include <math.h>
|
7
|
+
#include "kalloc.h"
|
8
|
+
#include "mgpriv.h"
|
9
|
+
|
10
|
+
/*
 * Make sure `s` can take `l` more bytes plus one extra byte after them.
 * When the buffer is too small, the required capacity is passed through
 * kroundup32() and the buffer is reallocated to that size.
 */
static inline void str_enlarge(kstring_t *s, int l)
{
	if (s->l + l + 1 <= s->m) return; // enough room already
	s->m = s->l + l + 1;
	kroundup32(s->m);
	s->s = (char*)realloc(s->s, s->m);
}
|
18
|
+
|
19
|
+
/*
 * Append the bytes in [st, en) to `s`, growing the buffer first.
 * No terminator is written here; the caller adds it.
 */
static inline void str_copy(kstring_t *s, const char *st, const char *en)
{
	char *dst;
	str_enlarge(s, en - st);
	dst = s->s + s->l; // taken after str_enlarge(), which may move the buffer
	memcpy(dst, st, en - st);
	s->l += en - st;
}
|
25
|
+
|
26
|
+
/**
 * Minimal printf-style formatter that APPENDS to a kstring.
 *
 * Supported conversions are exactly "%d" (int), "%u" (uint32_t), "%s"
 * (NUL-terminated string) and "%c" (character); there are no flags, widths
 * or precisions. Any other character after '%' (including '%' itself or the
 * end of the format) calls abort().
 *
 * @param s    destination; text is appended at s->l and the result is always
 *             NUL-terminated
 * @param fmt  format string
 */
void mg_sprintf_lite(kstring_t *s, const char *fmt, ...)
{
	char buf[16]; // for integer to string conversion
	const char *p, *q;
	va_list ap;
	va_start(ap, fmt);
	for (q = p = fmt; *p; ++p) {
		if (*p == '%') {
			if (p > q) str_copy(s, q, p); // flush the literal text before this conversion
			++p;
			if (*p == 'd') {
				int c, i, l = 0;
				unsigned int x;
				c = va_arg(ap, int);
				// negate in unsigned arithmetic: -c overflows (undefined behaviour) for INT_MIN
				x = c >= 0? (unsigned int)c : 0u - (unsigned int)c;
				do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
				if (c < 0) buf[l++] = '-';
				str_enlarge(s, l);
				for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; // digits were produced least-significant first
			} else if (*p == 'u') {
				int i, l = 0;
				uint32_t x;
				x = va_arg(ap, uint32_t);
				do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
				str_enlarge(s, l);
				for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
			} else if (*p == 's') {
				char *r = va_arg(ap, char*);
				str_copy(s, r, r + strlen(r));
			} else if (*p == 'c') {
				str_enlarge(s, 1);
				s->s[s->l++] = va_arg(ap, int);
			} else abort();
			q = p + 1;
		}
	}
	if (p > q) str_copy(s, q, p);
	va_end(ap);
	str_enlarge(s, 0); // guarantees a buffer exists even when nothing was appended to an empty string
	s->s[s->l] = 0;
}
|
66
|
+
|
67
|
+
void mg_print_lchain(FILE *fp, const mg_idx_t *gi, int n_lc, const mg_lchain_t *lc, const mg128_t *a, const char *qname)
|
68
|
+
{
|
69
|
+
kstring_t str = {0,0,0};
|
70
|
+
int i, j;
|
71
|
+
for (i = 0; i < n_lc; ++i) {
|
72
|
+
const mg_lchain_t *p = &lc[i];
|
73
|
+
int mlen, blen, span = a[p->off].y>>32&0xff;
|
74
|
+
mlen = blen = span;
|
75
|
+
for (j = 1; j < p->cnt; ++j) {
|
76
|
+
int ql = (int32_t)a[p->off + j].y - (int32_t)a[p->off + j - 1].y;
|
77
|
+
int pl = (int32_t)a[p->off + j].x - (int32_t)a[p->off + j - 1].x;
|
78
|
+
blen += pl > ql? pl : ql;
|
79
|
+
mlen += pl > span && ql > span? span : pl < ql? pl : ql;
|
80
|
+
}
|
81
|
+
str.l = 0;
|
82
|
+
mg_sprintf_lite(&str, "LC\t%s\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t", qname, p->qs, p->qe, "+-"[p->v&1], gi->g->seg[p->v>>1].name, gi->g->seg[p->v>>1].len,
|
83
|
+
p->rs, p->re, p->score, mlen, blen, p->cnt);
|
84
|
+
for (j = 0; j < p->cnt; ++j)
|
85
|
+
mg_sprintf_lite(&str, "%d,", (int32_t)a[p->off + j].y);
|
86
|
+
mg_sprintf_lite(&str, "\t");
|
87
|
+
for (j = 0; j < p->cnt; ++j)
|
88
|
+
mg_sprintf_lite(&str, "%d,", (int32_t)a[p->off + j].x);
|
89
|
+
mg_sprintf_lite(&str, "\t");
|
90
|
+
for (j = 0; j < p->cnt; ++j)
|
91
|
+
mg_sprintf_lite(&str, "%d,", (int32_t)(a[p->off + j].y>>MG_SEED_OCC_SHIFT));
|
92
|
+
mg_sprintf_lite(&str, "\n");
|
93
|
+
fwrite(str.s, 1, str.l, fp);
|
94
|
+
}
|
95
|
+
free(str.s);
|
96
|
+
}
|
97
|
+
|
98
|
+
/**
 * Format the graph chains of one query into `s`, one line per chain.
 *
 * `s` is reset first. When there are no chains and MG_M_SHOW_UNMAP is set, a
 * single placeholder line with "*" path and zeroed fields is written.
 * Otherwise each chain with at least one linear chain produces a line with
 * the query name/length/interval, strand, path, path length/interval, match
 * and block lengths, mapq and optional tags (tp, NM, cm, s1, s2, dv, ql, cg).
 * Non-primary chains (id != parent) are skipped unless MG_M_PRINT_2ND is set.
 * With MG_DBG_LCHAIN or MG_M_WRITE_LCHAIN, one extra "*" line per linear
 * chain follows.
 *
 * The path is written per vertex (MG_M_VERTEX_COOR, or segments without a
 * stable ID), as stable-sequence intervals, or, when the whole chain is one
 * contiguous run on a stable sequence with rank 0 and min 0, in compact form
 * using the stable sequence name and coordinates.
 *
 * @param s      output string (overwritten)
 * @param g      graph the chains refer to
 * @param gs     chains of the query; may be 0
 * @param n_seg  number of query segments
 * @param qlens  length of each query segment
 * @param qname  query name; a trailing "/1" is dropped when MG_M_FRAG_MERGE is
 *               set and n_seg == 2
 * @param flag   MG_M_* option bits
 * @param km     unused in this function
 */
void mg_write_gaf(kstring_t *s, const gfa_t *g, const mg_gchains_t *gs, int32_t n_seg, const int32_t *qlens, const char *qname, uint64_t flag, void *km)
{
	int32_t i, j, qlen;
	s->l = 0;
	for (i = 0, qlen = 0; i < n_seg; ++i) qlen += qlens[i];
	if ((gs == 0 || gs->n_gc == 0) && (flag&MG_M_SHOW_UNMAP)) {
		mg_sprintf_lite(s, "%s", qname);
		if ((flag&MG_M_FRAG_MERGE) && n_seg == 2 && s->l > 2 && s->s[s->l-1] == '1' && s->s[s->l-2] == '/') s->l -= 2;
		mg_sprintf_lite(s, "\t%d\t0\t0\t*\t*\t0\t0\t0\t0\t0\t0\n", qlen);
		return;
	}
	if (gs == 0) return;
	for (i = 0; i < gs->n_gc; ++i) {
		const mg_gchain_t *p = &gs->gc[i];
		// rev_sign is per chain: a '-' strand on an earlier chain must not
		// reverse the CIGAR of the chains that follow it.
		int32_t sign_pos, compact, rev_sign = 0;
		if (p->id != p->parent && !(flag&MG_M_PRINT_2ND)) continue;
		if (p->cnt == 0) continue;
		mg_sprintf_lite(s, "%s", qname);
		if ((flag&MG_M_FRAG_MERGE) && n_seg == 2 && s->l > 2 && s->s[s->l-1] == '1' && s->s[s->l-2] == '/') s->l -= 2;
		mg_sprintf_lite(s, "\t%d\t%d\t%d\t+\t", qlen, p->qs, p->qe);
		assert(p->cnt > 0);
		sign_pos = s->l - 2; // position of the '+' just written, patched to '-' below if needed
		if (flag & MG_M_VERTEX_COOR) {
			compact = 0;
			for (j = 0; j < p->cnt; ++j) {
				const mg_llchain_t *q = &gs->lc[p->off + j];
				mg_sprintf_lite(s, "%c%s", "><"[q->v&1], g->seg[q->v>>1].name);
			}
		} else {
			int32_t last_pnid = -1, st = -1, en = -1, rev = -1;
			compact = flag&MG_M_NO_COMP_PATH? 0 : 1;
			for (j = 0; j < p->cnt; ++j) {
				const mg_llchain_t *q;
				const gfa_seg_t *t;
				assert(p->off + j < gs->n_lc);
				q = &gs->lc[p->off + j];
				t = &g->seg[q->v>>1];
				if (t->snid < 0) { // no stable ID; write the vertex coordinate
					compact = 0;
					if (last_pnid >= 0) mg_sprintf_lite(s, "%c%s:%d-%d", "><"[rev], g->sseq[last_pnid].name, st, en);
					last_pnid = -1, st = -1, en = -1, rev = -1;
					mg_sprintf_lite(s, "%c%s", "><"[q->v&1], g->seg[q->v>>1].name);
				} else {
					int cont = 0;
					if (last_pnid >= 0 && t->snid == last_pnid && (q->v&1) == rev) { // same stable sequence and same strand
						if (!(q->v&1)) { // forward strand
							if (t->soff == en)
								en = t->soff + t->len, cont = 1;
						} else { // reverse strand
							if (t->soff + t->len == st)
								st = t->soff, cont = 1;
						}
					}
					if (cont == 0) { // the run is broken: flush the pending interval and start a new one
						if (last_pnid >= 0) compact = 0;
						if (last_pnid >= 0) mg_sprintf_lite(s, "%c%s:%d-%d", "><"[rev], g->sseq[last_pnid].name, st, en);
						last_pnid = t->snid, rev = q->v&1, st = t->soff, en = st + t->len;
					}
				}
			}
			if (last_pnid >= 0) {
				if (g->sseq[last_pnid].rank != 0 || g->sseq[last_pnid].min != 0)
					compact = 0;
				if (!compact) mg_sprintf_lite(s, "%c%s:%d-%d", "><"[rev], g->sseq[last_pnid].name, st, en);
			} else compact = 0;
		}
		if (compact) {
			int32_t rev = gs->lc[p->off].v&1;
			const gfa_seg_t *t = &g->seg[gs->lc[rev? p->off + p->cnt - 1 : p->off].v>>1];
			const gfa_sseq_t *ps = &g->sseq[t->snid];
			mg_sprintf_lite(s, "%s\t%d\t", ps->name, ps->max);
			if (rev) {
				rev_sign = 1;
				s->s[sign_pos] = '-';
				mg_sprintf_lite(s, "%d\t%d", t->soff + (p->plen - p->pe), t->soff + (p->plen - p->ps));
			} else {
				mg_sprintf_lite(s, "%d\t%d", t->soff + p->ps, t->soff + p->pe);
			}
		} else mg_sprintf_lite(s, "\t%d\t%d\t%d", p->plen, p->ps, p->pe);
		if (p->p) mg_sprintf_lite(s, "\t%d\t%d\t%d", p->p->mlen, p->p->blen, p->mapq);
		else mg_sprintf_lite(s, "\t%d\t%d\t%d", p->mlen, p->blen, p->mapq);
		mg_sprintf_lite(s, "\ttp:A:%c", p->id == p->parent? 'P' : 'S');
		if (p->p) mg_sprintf_lite(s, "\tNM:i:%d", p->p->blen - p->p->mlen);
		mg_sprintf_lite(s, "\tcm:i:%d\ts1:i:%d\ts2:i:%d", p->n_anchor, p->score, p->subsc);
		if (p->div >= 0.0f && p->div <= 1.0f) {
			char buf[16];
			if (p->div == 0.0f) buf[0] = '0', buf[1] = 0;
			else snprintf(buf, 16, "%.4f", p->div);
			mg_sprintf_lite(s, "\tdv:f:%s", buf);
		}
		if (n_seg > 1) {
			mg_sprintf_lite(s, "\tql:B:i");
			for (j = 0; j < n_seg; ++j) mg_sprintf_lite(s, ",%d", qlens[j]);
		}
		if (p->p) {
			mg_sprintf_lite(s, "\tcg:Z:");
			if (rev_sign) // path reported on the '-' strand: emit the CIGAR back to front
				for (j = p->p->n_cigar - 1; j >= 0; --j)
					mg_sprintf_lite(s, "%d%c", (int32_t)(p->p->cigar[j]>>4), "MIDNSHP=XB"[p->p->cigar[j]&0xf]);
			else
				for (j = 0; j < p->p->n_cigar; ++j)
					mg_sprintf_lite(s, "%d%c", (int32_t)(p->p->cigar[j]>>4), "MIDNSHP=XB"[p->p->cigar[j]&0xf]);
		}
		mg_sprintf_lite(s, "\n");
		if ((mg_dbg_flag & MG_DBG_LCHAIN) || (flag & MG_M_WRITE_LCHAIN)) {
			char buf[16];
			for (j = 0; j < p->cnt; ++j) {
				const mg_llchain_t *lc = &gs->lc[p->off + j];
				mg_sprintf_lite(s, "*\t%c%s\t%d\t%d", "><"[lc->v&1], g->seg[lc->v>>1].name, g->seg[lc->v>>1].len, lc->cnt);
				if (lc->cnt > 0) {
					double div;
					int32_t q_span = (int32_t)(gs->a[lc->off].y>>32&0xff);
					int32_t n = (int32_t)(gs->a[lc->off + lc->cnt - 1].x>>32) - (int32_t)(gs->a[lc->off].x>>32) + 1;
					div = n == lc->cnt? 0.0 : (n > lc->cnt? log((double)n / lc->cnt) : log((double)lc->cnt / n)) / q_span;
					if (div == 0.0) buf[0] = '0', buf[1] = 0;
					else snprintf(buf, 16, "%.4f", div);
					mg_sprintf_lite(s, "\t%s", buf);
					mg_sprintf_lite(s, "\t%d\t%d", (int32_t)gs->a[lc->off].x + 1 - q_span, (int32_t)gs->a[lc->off + lc->cnt - 1].x + 1);
					mg_sprintf_lite(s, "\t%d\t%d", (int32_t)gs->a[lc->off].y + 1 - q_span, (int32_t)gs->a[lc->off + lc->cnt - 1].y + 1);
					if (flag & MG_M_WRITE_MZ) {
						// anchor positions as deltas from the previous anchor: first .x, then .y
						int32_t k, last;
						last = (int32_t)gs->a[lc->off].x + 1 - q_span;
						mg_sprintf_lite(s, "\t%d\t", q_span);
						for (k = 1; k < lc->cnt; ++k) {
							int32_t x = (int32_t)gs->a[lc->off + k].x + 1 - q_span;
							if (k > 1) mg_sprintf_lite(s, ",");
							mg_sprintf_lite(s, "%d", x - last);
							last = x;
						}
						last = (int32_t)gs->a[lc->off].y + 1 - q_span;
						mg_sprintf_lite(s, "\t");
						for (k = 1; k < lc->cnt; ++k) {
							int32_t x = (int32_t)gs->a[lc->off + k].y + 1 - q_span;
							if (k > 1) mg_sprintf_lite(s, ",");
							mg_sprintf_lite(s, "%d", x - last);
							last = x;
						}
					}
				}
				mg_sprintf_lite(s, "\n");
			}
		}
	}
}
|