ruby-minigraph 0.0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/ext/Rakefile +56 -0
- data/ext/cmappy/cmappy.c +7 -0
- data/ext/cmappy/cmappy.h +8 -0
- data/ext/minigraph/LICENSE.txt +23 -0
- data/ext/minigraph/Makefile +66 -0
- data/ext/minigraph/NEWS.md +317 -0
- data/ext/minigraph/README.md +207 -0
- data/ext/minigraph/algo.c +194 -0
- data/ext/minigraph/algo.h +33 -0
- data/ext/minigraph/asm-call.c +147 -0
- data/ext/minigraph/bseq.c +133 -0
- data/ext/minigraph/bseq.h +76 -0
- data/ext/minigraph/cal_cov.c +139 -0
- data/ext/minigraph/doc/example1.png +0 -0
- data/ext/minigraph/doc/example2.png +0 -0
- data/ext/minigraph/doc/examples.graffle +0 -0
- data/ext/minigraph/format.c +241 -0
- data/ext/minigraph/galign.c +140 -0
- data/ext/minigraph/gchain1.c +532 -0
- data/ext/minigraph/gcmisc.c +223 -0
- data/ext/minigraph/gfa-aug.c +260 -0
- data/ext/minigraph/gfa-base.c +526 -0
- data/ext/minigraph/gfa-bbl.c +372 -0
- data/ext/minigraph/gfa-ed.c +617 -0
- data/ext/minigraph/gfa-io.c +395 -0
- data/ext/minigraph/gfa-priv.h +154 -0
- data/ext/minigraph/gfa.h +166 -0
- data/ext/minigraph/ggen.c +182 -0
- data/ext/minigraph/ggen.h +21 -0
- data/ext/minigraph/ggsimple.c +570 -0
- data/ext/minigraph/gmap.c +211 -0
- data/ext/minigraph/index.c +230 -0
- data/ext/minigraph/kalloc.c +224 -0
- data/ext/minigraph/kalloc.h +82 -0
- data/ext/minigraph/kavl.h +414 -0
- data/ext/minigraph/kdq.h +134 -0
- data/ext/minigraph/ketopt.h +116 -0
- data/ext/minigraph/khashl.h +348 -0
- data/ext/minigraph/krmq.h +474 -0
- data/ext/minigraph/kseq.h +256 -0
- data/ext/minigraph/ksort.h +164 -0
- data/ext/minigraph/kstring.h +165 -0
- data/ext/minigraph/kthread.c +159 -0
- data/ext/minigraph/kthread.h +15 -0
- data/ext/minigraph/kvec-km.h +105 -0
- data/ext/minigraph/kvec.h +110 -0
- data/ext/minigraph/lchain.c +441 -0
- data/ext/minigraph/main.c +301 -0
- data/ext/minigraph/map-algo.c +500 -0
- data/ext/minigraph/mgpriv.h +128 -0
- data/ext/minigraph/minigraph.1 +359 -0
- data/ext/minigraph/minigraph.h +176 -0
- data/ext/minigraph/miniwfa.c +834 -0
- data/ext/minigraph/miniwfa.h +95 -0
- data/ext/minigraph/misc/mgutils.js +1451 -0
- data/ext/minigraph/misc.c +12 -0
- data/ext/minigraph/options.c +134 -0
- data/ext/minigraph/shortk.c +251 -0
- data/ext/minigraph/sketch.c +109 -0
- data/ext/minigraph/sys.c +147 -0
- data/ext/minigraph/sys.h +20 -0
- data/ext/minigraph/test/MT-chimp.fa +277 -0
- data/ext/minigraph/test/MT-human.fa +239 -0
- data/ext/minigraph/test/MT-orangA.fa +276 -0
- data/ext/minigraph/test/MT.gfa +19 -0
- data/ext/minigraph/tex/Makefile +13 -0
- data/ext/minigraph/tex/minigraph.bib +676 -0
- data/ext/minigraph/tex/minigraph.tex +986 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
- data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
- data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
- data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
- data/ext/minigraph/tex/plots/bedutils.js +367 -0
- data/ext/minigraph/tex/plots/chr-plot.js +130 -0
- data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
- data/ext/minigraph.patch +21 -0
- data/lib/minigraph/ffi/constants.rb +230 -0
- data/lib/minigraph/ffi/functions.rb +70 -0
- data/lib/minigraph/ffi/mappy.rb +8 -0
- data/lib/minigraph/ffi.rb +27 -0
- data/lib/minigraph/version.rb +5 -0
- data/lib/minigraph.rb +72 -0
- metadata +159 -0
@@ -0,0 +1,133 @@
|
|
1
|
+
#include <zlib.h>
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <assert.h>
|
5
|
+
#define __STDC_LIMIT_MACROS
|
6
|
+
#include "bseq.h"
|
7
|
+
#include "kvec-km.h"
|
8
|
+
#include "kseq.h"
|
9
|
+
KSEQ_INIT(gzFile, gzread)
|
10
|
+
|
11
|
+
#define CHECK_PAIR_THRES 1000000
|
12
|
+
|
13
|
+
struct mg_bseq_file_s {
|
14
|
+
gzFile fp;
|
15
|
+
kseq_t *ks;
|
16
|
+
mg_bseq1_t s;
|
17
|
+
};
|
18
|
+
|
19
|
+
mg_bseq_file_t *mg_bseq_open(const char *fn)
|
20
|
+
{
|
21
|
+
mg_bseq_file_t *fp;
|
22
|
+
gzFile f;
|
23
|
+
f = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(0, "r");
|
24
|
+
if (f == 0) return 0;
|
25
|
+
fp = (mg_bseq_file_t*)calloc(1, sizeof(mg_bseq_file_t));
|
26
|
+
fp->fp = f;
|
27
|
+
fp->ks = kseq_init(fp->fp);
|
28
|
+
return fp;
|
29
|
+
}
|
30
|
+
|
31
|
+
/**
 * Close a reader created by mg_bseq_open() and release everything it owns.
 *
 * @param fp  reader to close; 0 is accepted and ignored, like free()
 */
void mg_bseq_close(mg_bseq_file_t *fp)
{
	if (fp == 0) return;
	// A record parked by mg_bseq_read() but never handed out is still owned
	// by the reader. Its fields are either malloc'd copies or 0.
	free(fp->s.name);
	free(fp->s.seq);
	free(fp->s.qual);
	free(fp->s.comment);
	kseq_destroy(fp->ks);
	gzclose(fp->fp);
	free(fp);
}
|
37
|
+
|
38
|
+
static inline char *kstrdup(const kstring_t *s)
|
39
|
+
{
|
40
|
+
char *t;
|
41
|
+
t = (char*)malloc(s->l + 1);
|
42
|
+
memcpy(t, s->s, s->l + 1);
|
43
|
+
return t;
|
44
|
+
}
|
45
|
+
|
46
|
+
/*
 * Copy the record currently held by the parser ks into *s.
 *
 * name and seq are always duplicated; qual and comment are duplicated only
 * when requested and non-empty, otherwise set to 0. 'U'/'u' in the sequence
 * is rewritten to 'T'/'t'. s->rid is not touched.
 */
static inline void kseq2bseq(kseq_t *ks, mg_bseq1_t *s, int with_qual, int with_comment)
{
	int k;
	const int n_base = (int)ks->seq.l;

	if (ks->name.l == 0)
		fprintf(stderr, "[WARNING]\033[1;31m empty sequence name in the input.\033[0m\n");

	s->name = kstrdup(&ks->name);
	s->seq = kstrdup(&ks->seq);
	for (k = 0; k < n_base; ++k) {
		char *base = &s->seq[k];
		if (*base == 'u' || *base == 'U')
			*base -= 1; // the preceding letter: 'U' -> 'T', 'u' -> 't'
	}

	if (with_qual && ks->qual.l) s->qual = kstrdup(&ks->qual);
	else s->qual = 0;

	if (with_comment && ks->comment.l) s->comment = kstrdup(&ks->comment);
	else s->comment = 0;

	s->l_seq = ks->seq.l;
}
|
60
|
+
|
61
|
+
/**
 * Read the next batch of records from one input.
 *
 * Records are appended until their summed sequence length reaches
 * chunk_size or the input ends. In frag_mode, when the last record of the
 * batch is shorter than CHECK_PAIR_THRES, the records that follow are
 * appended as long as mg_qname_same() says they share its name; the first
 * record with a different name is parked in fp->s and opens the next batch.
 *
 * @param fp            reader from mg_bseq_open()
 * @param chunk_size    target total sequence length of the batch
 * @param with_qual     non-zero to copy quality strings
 * @param with_comment  non-zero to copy comments
 * @param frag_mode     non-zero to keep same-name records in one batch
 * @param n_            out: number of records in the returned array
 * @return the record array built with the kvec-km macros and a null km
 *         (0 when nothing was read). The rid field of the records is not
 *         assigned here.
 */
mg_bseq1_t *mg_bseq_read(mg_bseq_file_t *fp, int64_t chunk_size, int with_qual, int with_comment, int frag_mode, int *n_)
{
	int64_t size = 0;
	int ret;
	kvec_t(mg_bseq1_t) a = {0,0,0};
	kseq_t *ks = fp->ks;
	*n_ = 0;
	if (fp->s.seq) { // a record parked by the previous call opens this batch
		kv_resize(mg_bseq1_t, 0, a, 256);
		kv_push(mg_bseq1_t, 0, a, fp->s);
		size = fp->s.l_seq;
		memset(&fp->s, 0, sizeof(mg_bseq1_t)); // its strings now belong to a.a[]
	}
	while ((ret = kseq_read(ks)) >= 0) {
		mg_bseq1_t *s;
		assert(ks->seq.l <= INT32_MAX); // l_seq is int32_t
		if (a.m == 0) kv_resize(mg_bseq1_t, 0, a, 256);
		kv_pushp(mg_bseq1_t, 0, a, &s);
		kseq2bseq(ks, s, with_qual, with_comment);
		size += s->l_seq;
		if (size >= chunk_size) {
			if (frag_mode && a.a[a.n-1].l_seq < CHECK_PAIR_THRES) {
				// Keep the return value: a malformed record met while looking
				// ahead must reach the warning below as well.
				while ((ret = kseq_read(ks)) >= 0) {
					assert(ks->seq.l <= INT32_MAX);
					kseq2bseq(ks, &fp->s, with_qual, with_comment);
					if (mg_qname_same(fp->s.name, a.a[a.n-1].name)) {
						kv_push(mg_bseq1_t, 0, a, fp->s);
						memset(&fp->s, 0, sizeof(mg_bseq1_t));
					} else break; // different name: leave it parked in fp->s
				}
			}
			break;
		}
	}
	if (ret < -1)
		fprintf(stderr, "[WARNING]\033[1;31m wrong FASTA/FASTQ record. Continue anyway.\033[0m\n");
	*n_ = a.n;
	return a.a;
}
|
99
|
+
|
100
|
+
/**
 * Read a batch from n_fp inputs in lock-step: one record from every file
 * per round, stored consecutively in the output array.
 *
 * Reading stops when some file yields no record (a warning is printed if
 * only part of the files ran out) or when the summed sequence length
 * reaches chunk_size.
 *
 * @param n_fp          number of readers in fp[]; fewer than 1 returns 0
 * @param fp            readers from mg_bseq_open()
 * @param chunk_size    target total sequence length of the batch
 * @param with_qual     non-zero to copy quality strings
 * @param with_comment  non-zero to copy comments
 * @param n_            out: number of records in the returned array
 * @return the record array (0 when nothing was read)
 */
mg_bseq1_t *mg_bseq_read_frag(int n_fp, mg_bseq_file_t **fp, int64_t chunk_size, int with_qual, int with_comment, int *n_)
{
	kvec_t(mg_bseq1_t) recs = {0,0,0};
	int64_t total = 0;
	int k;

	*n_ = 0;
	if (n_fp < 1) return 0;

	for (;;) {
		int n_ok = 0;
		// every file is advanced in every round, even if an earlier one failed
		for (k = 0; k < n_fp; ++k) {
			if (kseq_read(fp[k]->ks) >= 0)
				n_ok += 1;
		}
		if (n_ok < n_fp) { // at least one file has no more records
			if (n_ok > 0)
				fprintf(stderr, "[W::%s]\033[1;31m query files have different number of records; extra records skipped.\033[0m\n", __func__);
			break;
		}
		if (recs.m == 0) kv_resize(mg_bseq1_t, 0, recs, 256);
		for (k = 0; k < n_fp; ++k) {
			mg_bseq1_t *r;
			kv_pushp(mg_bseq1_t, 0, recs, &r);
			kseq2bseq(fp[k]->ks, r, with_qual, with_comment);
			total += r->l_seq;
		}
		if (total >= chunk_size) break;
	}

	*n_ = recs.n;
	return recs.a;
}
|
129
|
+
|
130
|
+
/**
 * Tell whether a reader is exhausted.
 *
 * @return 1 when the underlying stream reports end-of-file and no
 *         look-ahead record is parked in fp->s; 0 otherwise
 */
int mg_bseq_eof(mg_bseq_file_t *fp)
{
	if (!ks_eof(fp->ks->f)) return 0;
	return fp->s.seq == 0;
}
|
@@ -0,0 +1,76 @@
|
|
1
|
+
#ifndef MM_BSEQ_H
|
2
|
+
#define MM_BSEQ_H
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
#include <string.h>
|
6
|
+
|
7
|
+
#ifdef __cplusplus
|
8
|
+
extern "C" {
|
9
|
+
#endif
|
10
|
+
|
11
|
+
extern unsigned char gfa_comp_table[256];
|
12
|
+
|
13
|
+
struct mg_bseq_file_s;
|
14
|
+
typedef struct mg_bseq_file_s mg_bseq_file_t;
|
15
|
+
|
16
|
+
/* One sequence record as returned by mg_bseq_read() / mg_bseq_read_frag(). */
typedef struct {
	int32_t l_seq;   /* length of seq */
	int32_t rid;     /* not assigned by the readers in bseq.c */
	char *name;      /* record name */
	char *seq;       /* sequence */
	char *qual;      /* quality string, or 0 when absent or not requested */
	char *comment;   /* comment, or 0 when absent or not requested */
} mg_bseq1_t;
|
20
|
+
|
21
|
+
mg_bseq_file_t *mg_bseq_open(const char *fn);
|
22
|
+
void mg_bseq_close(mg_bseq_file_t *fp);
|
23
|
+
mg_bseq1_t *mg_bseq_read(mg_bseq_file_t *fp, int64_t chunk_size, int with_qual, int with_comment, int frag_mode, int *n_);
|
24
|
+
mg_bseq1_t *mg_bseq_read_frag(int n_fp, mg_bseq_file_t **fp, int64_t chunk_size, int with_qual, int with_comment, int *n_);
|
25
|
+
int mg_bseq_eof(mg_bseq_file_t *fp);
|
26
|
+
|
27
|
+
extern unsigned char seq_nt4_table[256];
|
28
|
+
extern unsigned char gfa_comp_table[256];
|
29
|
+
|
30
|
+
/*
 * Length of a query name with a trailing "/<digit>" suffix (e.g. "/1")
 * left out. The suffix is only dropped from names of at least 3 characters.
 */
static inline int32_t mg_qname_len(const char *s)
{
	int32_t n = strlen(s);
	if (n < 3) return n;
	if (s[n-2] != '/') return n;
	if (s[n-1] < '0' || s[n-1] > '9') return n;
	return n - 2;
}
|
36
|
+
|
37
|
+
/*
 * Return 1 if two query names are equal once mg_qname_len() has trimmed
 * each of them, 0 otherwise.
 */
static inline int32_t mg_qname_same(const char *s1, const char *s2)
{
	const int32_t n1 = mg_qname_len(s1);
	const int32_t n2 = mg_qname_len(s2);
	if (n1 != n2) return 0;
	return strncmp(s1, s2, n1) == 0;
}
|
44
|
+
|
45
|
+
/* Upper-case the letters 'a'..'z' in the first len bytes of seq, in place. */
static inline void mg_toupper(int32_t len, char *seq)
{
	int32_t k = 0;
	while (k < len) {
		const char c = seq[k];
		const int is_lower = (c >= 'a' && c <= 'z');
		seq[k] = is_lower? c - 32 : c; // 'a' - 'A' == 32
		++k;
	}
}
|
51
|
+
|
52
|
+
static inline void mg_revcomp_seq(int32_t len, char *seq)
|
53
|
+
{
|
54
|
+
int32_t i;
|
55
|
+
for (i = 0; i < len>>1; ++i) {
|
56
|
+
int32_t t = seq[len - i - 1];
|
57
|
+
seq[len - i - 1] = gfa_comp_table[(uint8_t)seq[i]];
|
58
|
+
seq[i] = gfa_comp_table[t];
|
59
|
+
}
|
60
|
+
if (len&1) seq[len>>1] = gfa_comp_table[(uint8_t)seq[len>>1]];
|
61
|
+
}
|
62
|
+
|
63
|
+
/*
 * Reverse-complement a record in place: the sequence goes through
 * mg_revcomp_seq() and the quality string, if present, is reversed.
 */
static inline void mg_revcomp_bseq(mg_bseq1_t *s)
{
	const int32_t n = s->l_seq;
	int32_t lo, hi;
	mg_revcomp_seq(s->l_seq, s->seq);
	if (s->qual == 0) return;
	for (lo = 0, hi = n - 1; lo < n>>1; ++lo, --hi) {
		const char tmp = s->qual[hi];
		s->qual[hi] = s->qual[lo];
		s->qual[lo] = tmp;
	}
}
|
71
|
+
|
72
|
+
#ifdef __cplusplus
|
73
|
+
}
|
74
|
+
#endif
|
75
|
+
|
76
|
+
#endif
|
@@ -0,0 +1,139 @@
|
|
1
|
+
#include <assert.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include "mgpriv.h"
|
4
|
+
#include "gfa-priv.h"
|
5
|
+
#include "algo.h"
|
6
|
+
#include "kalloc.h"
|
7
|
+
|
8
|
+
void mg_cov_map(const gfa_t *g, const mg_gchains_t *gt, int32_t min_mapq, int32_t min_blen, double *c_seg, double *c_link, const char *qname)
|
9
|
+
{
|
10
|
+
int32_t i, j;
|
11
|
+
if (c_seg == 0 && c_link == 0) return;
|
12
|
+
if (gt == 0 || gt->n_gc == 0) return;
|
13
|
+
for (i = 0; i < gt->n_gc; ++i) {
|
14
|
+
const mg_gchain_t *gc = >->gc[i];
|
15
|
+
const mg128_t *last_an;
|
16
|
+
assert(gc->cnt > 0 && gc->n_anchor > 0);
|
17
|
+
if (gc->mapq < min_mapq || gc->blen < min_blen) continue;
|
18
|
+
// count segment coverage
|
19
|
+
for (j = 0; j < gc->cnt; ++j) {
|
20
|
+
const mg_llchain_t *lc = >->lc[gc->off + j];
|
21
|
+
int32_t s, e;
|
22
|
+
s = 0, e = g->seg[lc->v>>1].len;
|
23
|
+
if (j == 0) s = (int32_t)gt->a[lc->off].x + 1 - (int32_t)(gt->a[lc->off].y>>32&0xff);
|
24
|
+
if (j == gc->cnt - 1) e = (int32_t)gt->a[lc->off + lc->cnt - 1].x + 1;
|
25
|
+
if (c_seg) c_seg[lc->v>>1] += (double)(e - s) / g->seg[lc->v>>1].len;
|
26
|
+
}
|
27
|
+
// count link
|
28
|
+
assert(gt->lc[gc->off].cnt > 0);
|
29
|
+
last_an = >->a[gt->lc[gc->off].off + gt->lc[gc->off].cnt - 1];
|
30
|
+
for (j = 1; j < gc->cnt; ++j) {
|
31
|
+
const mg_llchain_t *lc0 = >->lc[gc->off + j - 1];
|
32
|
+
const mg_llchain_t *lc1 = >->lc[gc->off + j];
|
33
|
+
int64_t a01, a10;
|
34
|
+
if (lc1->cnt > 0) {
|
35
|
+
const mg128_t *curr_an = >->a[lc1->off];
|
36
|
+
int32_t is_skip = (mg_seg_id(*curr_an) != mg_seg_id(*last_an));
|
37
|
+
last_an = >->a[lc1->off + lc1->cnt - 1];
|
38
|
+
if (is_skip) continue;
|
39
|
+
}
|
40
|
+
a01 = gfa_find_arc(g, lc0->v, lc1->v);
|
41
|
+
a10 = gfa_find_arc(g, lc1->v^1, lc0->v^1);
|
42
|
+
if (a01 < 0 || a10 < 0) {
|
43
|
+
if (mg_verbose >= 2)
|
44
|
+
fprintf(stderr, "[W] Multi/disconnected link: %c%s[%d] -> %c%s[%d] (%s, %ld, %ld). Continue anyway!\n",
|
45
|
+
"><"[lc0->v&1], g->seg[lc0->v>>1].name, lc0->v,
|
46
|
+
"><"[lc1->v&1], g->seg[lc1->v>>1].name, lc1->v, qname, (long)a01, (long)a10);
|
47
|
+
continue;
|
48
|
+
}
|
49
|
+
assert((g->arc[a01].comp ^ g->arc[a10].comp) == 1);
|
50
|
+
if (c_link) c_link[a01] += 1.0, c_link[a10] += 1.0;
|
51
|
+
}
|
52
|
+
}
|
53
|
+
}
|
54
|
+
|
55
|
+
/**
 * Add the coverage contributed by one assembly (a set of mapped sequences).
 *
 * Unlike mg_cov_map(), contributions are merged per assembly: an arc
 * traversed by any chain adds exactly 1.0 to cov_link[], and a segment adds
 * the fraction of its length covered by the merged intervals of all chains
 * to cov_seg[].
 *
 * @param g         graph
 * @param n_seq     number of entries in gcs[]
 * @param gcs       graph chains of each sequence; 0 or empty entries are skipped
 * @param min_mapq  chains with a lower mapq are skipped
 * @param min_blen  chains with a lower blen are skipped
 * @param cov_seg   per-segment accumulator, may be 0
 * @param cov_link  per-arc accumulator, may be 0
 */
void mg_cov_asm(const gfa_t *g, int32_t n_seq, mg_gchains_t *const *gcs, int32_t min_mapq, int32_t min_blen, double *cov_seg, double *cov_link)
{
	int32_t i, j, t, *soff, *scnt, *cnt_link;
	int64_t k;
	mg_intv_t *sintv = 0;
	void *km = 0;

	// precalculate the size of sintv[] for each segment
	KCALLOC(km, scnt, g->n_seg);
	for (t = 0; t < n_seq; ++t) {
		const mg_gchains_t *gt = gcs[t];
		if (gt == 0 || gt->n_gc == 0) continue;
		for (i = 0; i < gt->n_gc; ++i) {
			const mg_gchain_t *gc = gt->gc + i;
			assert(gc->cnt > 0 && gc->n_anchor > 0);
			if (gc->mapq < min_mapq || gc->blen < min_blen) continue;
			for (j = 0; j < gc->cnt; ++j) {
				const mg_llchain_t *lc = gt->lc + (gc->off + j);
				++scnt[lc->v>>1];
			}
		}
	}
	// soff[i] is the start of segment i's slice of sintv[]
	KMALLOC(km, soff, g->n_seg + 1);
	for (soff[0] = 0, i = 1; i <= g->n_seg; ++i)
		soff[i] = soff[i - 1] + scnt[i - 1];
	memset(scnt, 0, sizeof(*scnt) * g->n_seg); // reused below as the per-segment fill cursor
	KMALLOC(km, sintv, soff[g->n_seg]);

	// fill sintv[]
	KCALLOC(km, cnt_link, g->n_arc);
	for (t = 0; t < n_seq; ++t) {
		const mg_gchains_t *gt = gcs[t];
		if (gt == 0 || gt->n_gc == 0) continue;
		// The ++i here is essential: the filter below uses `continue`, and this
		// pass must visit exactly the chains counted in the first pass.
		for (i = 0; i < gt->n_gc; ++i) {
			const mg_gchain_t *gc = gt->gc + i;
			if (gc->mapq < min_mapq || gc->blen < min_blen) continue;
			// count segment coverage
			for (j = 0; j < gc->cnt; ++j) {
				const mg_llchain_t *lc = gt->lc + (gc->off + j);
				int32_t s, e, tmp;
				mg_intv_t *p;
				s = 0, e = g->seg[lc->v>>1].len;
				if (j == 0) s = (int32_t)gt->a[lc->off].x + 1 - (int32_t)(gt->a[lc->off].y>>32&0xff);
				if (j == gc->cnt - 1) e = (int32_t)gt->a[lc->off + lc->cnt - 1].x + 1;
				if (lc->v&1) // convert to the forward strand of segment lc->v>>1
					tmp = g->seg[lc->v>>1].len - s, s = g->seg[lc->v>>1].len - e, e = tmp;
				p = &sintv[soff[lc->v>>1] + scnt[lc->v>>1]];
				++scnt[lc->v>>1];
				p->st = s, p->en = e, p->rev = lc->v&1, p->far = -1, p->i = -1;
			}
			// count link
			for (j = 1; j < gc->cnt; ++j) {
				const mg_llchain_t *lc0 = gt->lc + (gc->off + j - 1);
				const mg_llchain_t *lc1 = gt->lc + (gc->off + j);
				int64_t a01, a10;
				a01 = gfa_find_arc(g, lc0->v, lc1->v);
				a10 = gfa_find_arc(g, lc1->v^1, lc0->v^1);
				assert(a01 >= 0 && a10 >= 0);
				assert((g->arc[a01].comp ^ g->arc[a10].comp) == 1);
				++cnt_link[a01];
				++cnt_link[a10];
			}
		}
	}

	// update cov_link[] and cov_seg[]
	if (cov_link)
		for (k = 0; k < g->n_arc; ++k)
			if (cnt_link[k] > 0) cov_link[k] += 1.0; // once per assembly, however many chains used the arc
	for (i = 0; i < g->n_seg; ++i) {
		int32_t st = 0, en = 0, cov = 0;
		assert(scnt[i] == soff[i+1] - soff[i]);
		// NOTE(review): the merge below relies on radix_sort_mg_intv() ordering by st -- confirm
		radix_sort_mg_intv(&sintv[soff[i]], &sintv[soff[i+1]]);
		for (j = soff[i]; j < soff[i+1]; ++j) {
			if (sintv[j].st > en) // gap: close the current run and start a new one
				cov += en - st, st = sintv[j].st, en = sintv[j].en;
			else en = sintv[j].en > en? sintv[j].en : en; // overlap: extend the current run
		}
		cov += en - st;
		if (cov_seg) cov_seg[i] += (double)cov / g->seg[i].len;
	}

	// free
	kfree(km, cnt_link);
	kfree(km, sintv); kfree(km, soff); kfree(km, scnt);
}
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,241 @@
|
|
1
|
+
#include <stdarg.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <string.h>
|
4
|
+
#include <assert.h>
|
5
|
+
#include <stdio.h>
|
6
|
+
#include <math.h>
|
7
|
+
#include "kalloc.h"
|
8
|
+
#include "mgpriv.h"
|
9
|
+
|
10
|
+
/*
 * Make sure s can take l more characters plus a terminating NUL, growing
 * the buffer with realloc() to the kroundup32() of the required size.
 */
static inline void str_enlarge(kstring_t *s, int l)
{
	if (s->l + l + 1 <= s->m) return; // already large enough
	s->m = s->l + l + 1;
	kroundup32(s->m);
	s->s = (char*)realloc(s->s, s->m);
}
|
18
|
+
|
19
|
+
/* Append the bytes in [st, en) to s. No NUL terminator is written. */
static inline void str_copy(kstring_t *s, const char *st, const char *en)
{
	const size_t n = (size_t)(en - st);
	str_enlarge(s, (int)n);
	memcpy(s->s + s->l, st, n);
	s->l += n;
}
|
25
|
+
|
26
|
+
/**
 * Minimal printf-like formatter that appends to a kstring.
 *
 * Only the bare conversions %d (int), %u (uint32_t), %s (char*) and %c are
 * understood; no flags, widths or length modifiers. Any other character
 * after '%', including the end of the format, calls abort(). The result is
 * always NUL-terminated; s->l does not count the terminator.
 *
 * @param s    destination; text is appended at s->l
 * @param fmt  format string
 */
void mg_sprintf_lite(kstring_t *s, const char *fmt, ...)
{
	char buf[16]; // for integer to string conversion
	const char *p, *q; // q marks the start of the literal text not copied yet
	va_list ap;
	va_start(ap, fmt);
	for (q = p = fmt; *p; ++p) {
		if (*p == '%') {
			if (p > q) str_copy(s, q, p);
			++p;
			if (*p == 'd') {
				int c, i, l = 0;
				unsigned int x;
				c = va_arg(ap, int);
				// negate in unsigned arithmetic: -c overflows when c == INT_MIN
				x = c >= 0? (unsigned int)c : 0u - (unsigned int)c;
				do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); // digits come out least significant first
				if (c < 0) buf[l++] = '-';
				str_enlarge(s, l);
				for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
			} else if (*p == 'u') {
				int i, l = 0;
				uint32_t x;
				x = va_arg(ap, uint32_t);
				do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
				str_enlarge(s, l);
				for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
			} else if (*p == 's') {
				char *r = va_arg(ap, char*);
				str_copy(s, r, r + strlen(r));
			} else if (*p == 'c') {
				str_enlarge(s, 1);
				s->s[s->l++] = va_arg(ap, int);
			} else abort();
			q = p + 1;
		}
	}
	if (p > q) str_copy(s, q, p);
	va_end(ap);
	str_enlarge(s, 0); // an empty format on a never-used kstring would otherwise write through a null s->s
	s->s[s->l] = 0;
}
|
66
|
+
|
67
|
+
void mg_print_lchain(FILE *fp, const mg_idx_t *gi, int n_lc, const mg_lchain_t *lc, const mg128_t *a, const char *qname)
|
68
|
+
{
|
69
|
+
kstring_t str = {0,0,0};
|
70
|
+
int i, j;
|
71
|
+
for (i = 0; i < n_lc; ++i) {
|
72
|
+
const mg_lchain_t *p = &lc[i];
|
73
|
+
int mlen, blen, span = a[p->off].y>>32&0xff;
|
74
|
+
mlen = blen = span;
|
75
|
+
for (j = 1; j < p->cnt; ++j) {
|
76
|
+
int ql = (int32_t)a[p->off + j].y - (int32_t)a[p->off + j - 1].y;
|
77
|
+
int pl = (int32_t)a[p->off + j].x - (int32_t)a[p->off + j - 1].x;
|
78
|
+
blen += pl > ql? pl : ql;
|
79
|
+
mlen += pl > span && ql > span? span : pl < ql? pl : ql;
|
80
|
+
}
|
81
|
+
str.l = 0;
|
82
|
+
mg_sprintf_lite(&str, "LC\t%s\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t", qname, p->qs, p->qe, "+-"[p->v&1], gi->g->seg[p->v>>1].name, gi->g->seg[p->v>>1].len,
|
83
|
+
p->rs, p->re, p->score, mlen, blen, p->cnt);
|
84
|
+
for (j = 0; j < p->cnt; ++j)
|
85
|
+
mg_sprintf_lite(&str, "%d,", (int32_t)a[p->off + j].y);
|
86
|
+
mg_sprintf_lite(&str, "\t");
|
87
|
+
for (j = 0; j < p->cnt; ++j)
|
88
|
+
mg_sprintf_lite(&str, "%d,", (int32_t)a[p->off + j].x);
|
89
|
+
mg_sprintf_lite(&str, "\t");
|
90
|
+
for (j = 0; j < p->cnt; ++j)
|
91
|
+
mg_sprintf_lite(&str, "%d,", (int32_t)(a[p->off + j].y>>MG_SEED_OCC_SHIFT));
|
92
|
+
mg_sprintf_lite(&str, "\n");
|
93
|
+
fwrite(str.s, 1, str.l, fp);
|
94
|
+
}
|
95
|
+
free(str.s);
|
96
|
+
}
|
97
|
+
|
98
|
+
/**
 * Format the graph chains of one query into s, one line per chain.
 *
 * s is reset first. With no chains, a single unmapped line is written if
 * MG_M_SHOW_UNMAP is set, otherwise s stays empty. Secondary chains
 * (id != parent) are skipped unless MG_M_PRINT_2ND is set. The path column
 * is written as oriented segment names (MG_M_VERTEX_COOR, or segments
 * without a stable ID), as stable intervals "name:st-en", or, when the
 * whole path is one run on a rank-0 stable sequence and MG_M_NO_COMP_PATH
 * is unset, in compact form: the stable name alone, with the strand column
 * flipped to '-' and the CIGAR emitted back to front for a reverse path.
 * With MG_DBG_LCHAIN or MG_M_WRITE_LCHAIN, one "*" line per linear chain
 * follows each record.
 *
 * @param s      output string (overwritten)
 * @param g      graph
 * @param gs     graph chains of the query, may be 0
 * @param n_seg  number of query segments
 * @param qlens  length of each query segment
 * @param qname  query name; a trailing "/1" is dropped when MG_M_FRAG_MERGE
 *               is set and n_seg == 2
 * @param flag   MG_M_* option bits
 * @param km     not used by this function
 */
void mg_write_gaf(kstring_t *s, const gfa_t *g, const mg_gchains_t *gs, int32_t n_seg, const int32_t *qlens, const char *qname, uint64_t flag, void *km)
{
	int32_t i, j, qlen;
	s->l = 0;
	for (i = 0, qlen = 0; i < n_seg; ++i) qlen += qlens[i];
	if ((gs == 0 || gs->n_gc == 0) && (flag&MG_M_SHOW_UNMAP)) {
		mg_sprintf_lite(s, "%s", qname);
		if ((flag&MG_M_FRAG_MERGE) && n_seg == 2 && s->l > 2 && s->s[s->l-1] == '1' && s->s[s->l-2] == '/') s->l -= 2;
		mg_sprintf_lite(s, "\t%d\t0\t0\t*\t*\t0\t0\t0\t0\t0\t0\n", qlen);
		return;
	}
	if (gs == 0) return;
	for (i = 0; i < gs->n_gc; ++i) {
		const mg_gchain_t *p = &gs->gc[i];
		int32_t sign_pos, compact;
		// Scoped to one chain on purpose: a reverse-strand compact path must not
		// make the chains printed after it emit their CIGAR back to front.
		int32_t rev_sign = 0;
		if (p->id != p->parent && !(flag&MG_M_PRINT_2ND)) continue;
		if (p->cnt == 0) continue;
		mg_sprintf_lite(s, "%s", qname);
		if ((flag&MG_M_FRAG_MERGE) && n_seg == 2 && s->l > 2 && s->s[s->l-1] == '1' && s->s[s->l-2] == '/') s->l -= 2;
		mg_sprintf_lite(s, "\t%d\t%d\t%d\t+\t", qlen, p->qs, p->qe);
		assert(p->cnt > 0);
		sign_pos = s->l - 2; // position of the '+' just written, patched to '-' below if needed
		if (flag & MG_M_VERTEX_COOR) {
			compact = 0;
			for (j = 0; j < p->cnt; ++j) {
				const mg_llchain_t *q = &gs->lc[p->off + j];
				mg_sprintf_lite(s, "%c%s", "><"[q->v&1], g->seg[q->v>>1].name);
			}
		} else {
			// last_pnid/st/en/rev describe the pending run of adjacent segments on one stable sequence
			int32_t last_pnid = -1, st = -1, en = -1, rev = -1;
			compact = flag&MG_M_NO_COMP_PATH? 0 : 1;
			for (j = 0; j < p->cnt; ++j) {
				const mg_llchain_t *q;
				const gfa_seg_t *t;
				assert(p->off + j < gs->n_lc);
				q = &gs->lc[p->off + j];
				t = &g->seg[q->v>>1];
				if (t->snid < 0) { // no stable ID; write the vertex coordinate
					compact = 0;
					if (last_pnid >= 0) mg_sprintf_lite(s, "%c%s:%d-%d", "><"[rev], g->sseq[last_pnid].name, st, en);
					last_pnid = -1, st = -1, en = -1, rev = -1;
					mg_sprintf_lite(s, "%c%s", "><"[q->v&1], g->seg[q->v>>1].name);
				} else {
					int cont = 0;
					if (last_pnid >= 0 && t->snid == last_pnid && (q->v&1) == rev) { // same stable sequence and same strand
						if (!(q->v&1)) { // forward strand
							if (t->soff == en)
								en = t->soff + t->len, cont = 1;
						} else { // reverse strand
							if (t->soff + t->len == st)
								st = t->soff, cont = 1;
						}
					}
					if (cont == 0) { // the run is broken: flush it and start a new one
						if (last_pnid >= 0) compact = 0;
						if (last_pnid >= 0) mg_sprintf_lite(s, "%c%s:%d-%d", "><"[rev], g->sseq[last_pnid].name, st, en);
						last_pnid = t->snid, rev = q->v&1, st = t->soff, en = st + t->len;
					}
				}
			}
			if (last_pnid >= 0) {
				if (g->sseq[last_pnid].rank != 0 || g->sseq[last_pnid].min != 0)
					compact = 0;
				if (!compact) mg_sprintf_lite(s, "%c%s:%d-%d", "><"[rev], g->sseq[last_pnid].name, st, en);
			} else compact = 0;
		}
		if (compact) {
			int32_t rev = gs->lc[p->off].v&1;
			const gfa_seg_t *t = &g->seg[gs->lc[rev? p->off + p->cnt - 1 : p->off].v>>1];
			const gfa_sseq_t *ps = &g->sseq[t->snid];
			mg_sprintf_lite(s, "%s\t%d\t", ps->name, ps->max);
			if (rev) {
				rev_sign = 1;
				s->s[sign_pos] = '-';
				mg_sprintf_lite(s, "%d\t%d", t->soff + (p->plen - p->pe), t->soff + (p->plen - p->ps));
			} else {
				mg_sprintf_lite(s, "%d\t%d", t->soff + p->ps, t->soff + p->pe);
			}
		} else mg_sprintf_lite(s, "\t%d\t%d\t%d", p->plen, p->ps, p->pe);
		if (p->p) mg_sprintf_lite(s, "\t%d\t%d\t%d", p->p->mlen, p->p->blen, p->mapq);
		else mg_sprintf_lite(s, "\t%d\t%d\t%d", p->mlen, p->blen, p->mapq);
		mg_sprintf_lite(s, "\ttp:A:%c", p->id == p->parent? 'P' : 'S');
		if (p->p) mg_sprintf_lite(s, "\tNM:i:%d", p->p->blen - p->p->mlen);
		mg_sprintf_lite(s, "\tcm:i:%d\ts1:i:%d\ts2:i:%d", p->n_anchor, p->score, p->subsc);
		if (p->div >= 0.0f && p->div <= 1.0f) {
			char dv[16];
			if (p->div == 0.0f) dv[0] = '0', dv[1] = 0;
			else snprintf(dv, 16, "%.4f", p->div);
			mg_sprintf_lite(s, "\tdv:f:%s", dv);
		}
		if (n_seg > 1) {
			mg_sprintf_lite(s, "\tql:B:i");
			for (j = 0; j < n_seg; ++j) mg_sprintf_lite(s, ",%d", qlens[j]);
		}
		if (p->p) {
			mg_sprintf_lite(s, "\tcg:Z:");
			if (rev_sign)
				for (j = p->p->n_cigar - 1; j >= 0; --j)
					mg_sprintf_lite(s, "%d%c", (int32_t)(p->p->cigar[j]>>4), "MIDNSHP=XB"[p->p->cigar[j]&0xf]);
			else
				for (j = 0; j < p->p->n_cigar; ++j)
					mg_sprintf_lite(s, "%d%c", (int32_t)(p->p->cigar[j]>>4), "MIDNSHP=XB"[p->p->cigar[j]&0xf]);
		}
		mg_sprintf_lite(s, "\n");
		if ((mg_dbg_flag & MG_DBG_LCHAIN) || (flag & MG_M_WRITE_LCHAIN)) {
			char buf[16];
			for (j = 0; j < p->cnt; ++j) {
				const mg_llchain_t *lc = &gs->lc[p->off + j];
				mg_sprintf_lite(s, "*\t%c%s\t%d\t%d", "><"[lc->v&1], g->seg[lc->v>>1].name, g->seg[lc->v>>1].len, lc->cnt);
				if (lc->cnt > 0) {
					double div;
					int32_t q_span = (int32_t)(gs->a[lc->off].y>>32&0xff);
					int32_t n = (int32_t)(gs->a[lc->off + lc->cnt - 1].x>>32) - (int32_t)(gs->a[lc->off].x>>32) + 1;
					div = n == lc->cnt? 0.0 : (n > lc->cnt? log((double)n / lc->cnt) : log((double)lc->cnt / n)) / q_span;
					if (div == 0.0) buf[0] = '0', buf[1] = 0;
					else snprintf(buf, 16, "%.4f", div);
					mg_sprintf_lite(s, "\t%s", buf);
					mg_sprintf_lite(s, "\t%d\t%d", (int32_t)gs->a[lc->off].x + 1 - q_span, (int32_t)gs->a[lc->off + lc->cnt - 1].x + 1);
					mg_sprintf_lite(s, "\t%d\t%d", (int32_t)gs->a[lc->off].y + 1 - q_span, (int32_t)gs->a[lc->off + lc->cnt - 1].y + 1);
					if (flag & MG_M_WRITE_MZ) {
						int32_t k, last; // k, not i: the chain index of the outer loop stays visible
						last = (int32_t)gs->a[lc->off].x + 1 - q_span;
						mg_sprintf_lite(s, "\t%d\t", q_span);
						for (k = 1; k < lc->cnt; ++k) { // x start positions, written as deltas
							int32_t x = (int32_t)gs->a[lc->off + k].x + 1 - q_span;
							if (k > 1) mg_sprintf_lite(s, ",");
							mg_sprintf_lite(s, "%d", x - last);
							last = x;
						}
						last = (int32_t)gs->a[lc->off].y + 1 - q_span;
						mg_sprintf_lite(s, "\t");
						for (k = 1; k < lc->cnt; ++k) { // y start positions, written as deltas
							int32_t x = (int32_t)gs->a[lc->off + k].y + 1 - q_span;
							if (k > 1) mg_sprintf_lite(s, ",");
							mg_sprintf_lite(s, "%d", x - last);
							last = x;
						}
					}
				}
				mg_sprintf_lite(s, "\n");
			}
		}
	}
}
|