ruby-minigraph 0.0.20.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/ext/Rakefile +56 -0
- data/ext/cmappy/cmappy.c +7 -0
- data/ext/cmappy/cmappy.h +8 -0
- data/ext/minigraph/LICENSE.txt +23 -0
- data/ext/minigraph/Makefile +66 -0
- data/ext/minigraph/NEWS.md +317 -0
- data/ext/minigraph/README.md +207 -0
- data/ext/minigraph/algo.c +194 -0
- data/ext/minigraph/algo.h +33 -0
- data/ext/minigraph/asm-call.c +147 -0
- data/ext/minigraph/bseq.c +133 -0
- data/ext/minigraph/bseq.h +76 -0
- data/ext/minigraph/cal_cov.c +139 -0
- data/ext/minigraph/doc/example1.png +0 -0
- data/ext/minigraph/doc/example2.png +0 -0
- data/ext/minigraph/doc/examples.graffle +0 -0
- data/ext/minigraph/format.c +241 -0
- data/ext/minigraph/galign.c +140 -0
- data/ext/minigraph/gchain1.c +532 -0
- data/ext/minigraph/gcmisc.c +223 -0
- data/ext/minigraph/gfa-aug.c +260 -0
- data/ext/minigraph/gfa-base.c +526 -0
- data/ext/minigraph/gfa-bbl.c +372 -0
- data/ext/minigraph/gfa-ed.c +617 -0
- data/ext/minigraph/gfa-io.c +395 -0
- data/ext/minigraph/gfa-priv.h +154 -0
- data/ext/minigraph/gfa.h +166 -0
- data/ext/minigraph/ggen.c +182 -0
- data/ext/minigraph/ggen.h +21 -0
- data/ext/minigraph/ggsimple.c +570 -0
- data/ext/minigraph/gmap.c +211 -0
- data/ext/minigraph/index.c +230 -0
- data/ext/minigraph/kalloc.c +224 -0
- data/ext/minigraph/kalloc.h +82 -0
- data/ext/minigraph/kavl.h +414 -0
- data/ext/minigraph/kdq.h +134 -0
- data/ext/minigraph/ketopt.h +116 -0
- data/ext/minigraph/khashl.h +348 -0
- data/ext/minigraph/krmq.h +474 -0
- data/ext/minigraph/kseq.h +256 -0
- data/ext/minigraph/ksort.h +164 -0
- data/ext/minigraph/kstring.h +165 -0
- data/ext/minigraph/kthread.c +159 -0
- data/ext/minigraph/kthread.h +15 -0
- data/ext/minigraph/kvec-km.h +105 -0
- data/ext/minigraph/kvec.h +110 -0
- data/ext/minigraph/lchain.c +441 -0
- data/ext/minigraph/main.c +301 -0
- data/ext/minigraph/map-algo.c +500 -0
- data/ext/minigraph/mgpriv.h +128 -0
- data/ext/minigraph/minigraph.1 +359 -0
- data/ext/minigraph/minigraph.h +176 -0
- data/ext/minigraph/miniwfa.c +834 -0
- data/ext/minigraph/miniwfa.h +95 -0
- data/ext/minigraph/misc/mgutils.js +1451 -0
- data/ext/minigraph/misc.c +12 -0
- data/ext/minigraph/options.c +134 -0
- data/ext/minigraph/shortk.c +251 -0
- data/ext/minigraph/sketch.c +109 -0
- data/ext/minigraph/sys.c +147 -0
- data/ext/minigraph/sys.h +20 -0
- data/ext/minigraph/test/MT-chimp.fa +277 -0
- data/ext/minigraph/test/MT-human.fa +239 -0
- data/ext/minigraph/test/MT-orangA.fa +276 -0
- data/ext/minigraph/test/MT.gfa +19 -0
- data/ext/minigraph/tex/Makefile +13 -0
- data/ext/minigraph/tex/minigraph.bib +676 -0
- data/ext/minigraph/tex/minigraph.tex +986 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
- data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
- data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
- data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
- data/ext/minigraph/tex/plots/bedutils.js +367 -0
- data/ext/minigraph/tex/plots/chr-plot.js +130 -0
- data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
- data/ext/minigraph.patch +21 -0
- data/lib/minigraph/ffi/constants.rb +230 -0
- data/lib/minigraph/ffi/functions.rb +70 -0
- data/lib/minigraph/ffi/mappy.rb +8 -0
- data/lib/minigraph/ffi.rb +27 -0
- data/lib/minigraph/version.rb +5 -0
- data/lib/minigraph.rb +72 -0
- metadata +159 -0
@@ -0,0 +1,207 @@
|
|
1
|
+
[![Build Status](https://travis-ci.org/lh3/minigraph.svg?branch=master)](https://travis-ci.org/lh3/minigraph)
|
2
|
+
## <a name="started"></a>Getting Started
|
3
|
+
|
4
|
+
```sh
|
5
|
+
git clone https://github.com/lh3/minigraph
|
6
|
+
cd minigraph && make
|
7
|
+
# Map sequence to sequence, similar to minimap2 without base alignment
|
8
|
+
./minigraph test/MT-human.fa test/MT-orangA.fa > out.paf
|
9
|
+
# Map sequence to graph
|
10
|
+
./minigraph test/MT.gfa test/MT-orangA.fa > out.gaf
|
11
|
+
# Incremental graph generation (-l10k necessary for this toy example)
|
12
|
+
./minigraph -cxggs -l10k test/MT.gfa test/MT-chimp.fa test/MT-orangA.fa > out.gfa
|
13
|
+
# Call per-sample path in each bubble/variation (-c not needed for this)
|
14
|
+
./minigraph -xasm -l10k --call test/MT.gfa test/MT-orangA.fa > orangA.call.bed
|
15
|
+
|
16
|
+
# The lossy FASTA representation (requiring https://github.com/lh3/gfatools)
|
17
|
+
gfatools gfa2fa -s out.gfa > out.fa
|
18
|
+
# Extract localized structural variations
|
19
|
+
gfatools bubble out.gfa > SV.bed
|
20
|
+
```
|
21
|
+
|
22
|
+
## Table of Contents
|
23
|
+
|
24
|
+
<img align="right" width="278" src="doc/example1.png"/>
|
25
|
+
|
26
|
+
- [Getting Started](#started)
|
27
|
+
- [Introduction](#intro)
|
28
|
+
- [Users' Guide](#uguide)
|
29
|
+
- [Installation](#install)
|
30
|
+
- [Sequence-to-graph mapping](#map)
|
31
|
+
- [Graph generation](#ggen)
|
32
|
+
- [Calling structural variations](#callsv)
|
33
|
+
- [Prebuilt graphs](#prebuilt)
|
34
|
+
- [Algorithm overview](#algo)
|
35
|
+
- [Limitations](#limit)
|
36
|
+
|
37
|
+
## <a name="intro"></a>Introduction
|
38
|
+
|
39
|
+
Minigraph is a sequence-to-graph mapper and graph constructor. For graph
|
40
|
+
generation, it aligns a query sequence against a sequence graph and
|
41
|
+
incrementally augments an existing graph with long query subsequences diverged
|
42
|
+
from the graph. The figure on the right briefly explains the procedure.
|
43
|
+
|
44
|
+
Minigraph borrows ideas and code from [minimap2][minimap2]. It is fairly
|
45
|
+
efficient and can construct a graph from 90 human assemblies in a couple of
|
46
|
+
days using 24 CPU cores. Older versions of minigraph were unable to produce
|
47
|
+
base alignment. The latest version can. **Please add option `-c` for graph
|
48
|
+
generation** as it generally improves the quality of graphs.
|
49
|
+
|
50
|
+
## <a name="uguide"></a>Users' Guide
|
51
|
+
|
52
|
+
### <a name="install"></a>Installation
|
53
|
+
|
54
|
+
To install minigraph, type `make` in the source code directory. The only
|
55
|
+
non-standard dependency is [zlib][zlib]. For better performance, it is
|
56
|
+
recommended to compile with recent compilers.
|
57
|
+
|
58
|
+
### <a name="map"></a>Sequence-to-graph mapping
|
59
|
+
|
60
|
+
To map sequences against a graph, you should prepare the graph in the [GFA
|
61
|
+
format][gfa1], or preferably the [rGFA format][rgfa]. If you don't have
|
62
|
+
a graph, you can generate a graph from multiple samples (see the [Graph
|
63
|
+
generation section](#ggen) below). The typical command line for mapping is
|
64
|
+
```sh
|
65
|
+
minigraph -cx lr graph.gfa query.fa > out.gaf
|
66
|
+
```
|
67
|
+
You may choose the right preset option `-x` according to input. Minigraph
|
68
|
+
outputs mappings in the [GAF format][gaf], which is a strict superset of the
|
69
|
+
[PAF format][paf]. The only visual difference between GAF and PAF is that the
|
70
|
+
6th column in GAF may encode a graph path like
|
71
|
+
`>MT_human:0-4001<MT_orang:3426-3927` instead of a contig/chromosome name.
|
72
|
+
|
73
|
+
The minigraph GFA parser seamlessly parses FASTA and converts it to GFA
|
74
|
+
internally, so you can also provide sequences in FASTA as the reference. In
|
75
|
+
this case, minigraph will behave like minimap2, though likely producing
|
76
|
+
different alignments due to differences between the two implementations.
|
77
|
+
|
78
|
+
### <a name="ggen"></a>Graph generation
|
79
|
+
|
80
|
+
The following command-line generates a graph in rGFA:
|
81
|
+
```sh
|
82
|
+
minigraph -cxggs -t16 ref.fa sample1.fa sample2.fa > out.gfa
|
83
|
+
```
|
84
|
+
which is equivalent to
|
85
|
+
```sh
|
86
|
+
minigraph -cxggs -t16 ref.fa sample1.fa > sample1.gfa
|
87
|
+
minigraph -cxggs -t16 sample1.gfa sample2.fa > out.gfa
|
88
|
+
```
|
89
|
+
File `ref.fa` is typically the reference genome (e.g. GRCh38 for human).
|
90
|
+
It can also be replaced by a graph in rGFA. Minigraph assumes `sample1.fa` to
|
91
|
+
be the whole-genome assembly of an individual. This is an important assumption:
|
92
|
+
minigraph only considers 1-to-1 orthologous regions between the graph and the
|
93
|
+
individual FASTA. If you use raw reads or put multiple individual genomes in
|
94
|
+
one file, minigraph will filter out most alignments as they cover the input
|
95
|
+
graph multiple times.
|
96
|
+
|
97
|
+
The output rGFA can be converted to a FASTA file with [gfatools][gfatools]:
|
98
|
+
```sh
|
99
|
+
gfatools gfa2fa -s graph.gfa > out.stable.fa
|
100
|
+
```
|
101
|
+
The output `out.stable.fa` will always include the initial reference `ref.fa`
|
102
|
+
and may additionally add new segments diverged from the initial reference.
|
103
|
+
|
104
|
+
### <a name="callsv"></a>Calling structural variations
|
105
|
+
|
106
|
+
A minigraph graph is composed of chains of bubbles with the reference as the
|
107
|
+
backbone. Each *bubble* represents a structural variation. It can be
|
108
|
+
multi-allelic if there are multiple paths through the bubble. You can extract
|
109
|
+
these bubbles with
|
110
|
+
```sh
|
111
|
+
gfatools bubble graph.gfa > var.bed
|
112
|
+
```
|
113
|
+
The output is a BED-like file. The first three columns give the position of a
|
114
|
+
bubble/variation and the rest of the columns are:
|
115
|
+
|
116
|
+
* (4) \# GFA segments in the bubble including the source and the sink of the bubble
|
117
|
+
* (5) \# all possible paths through the bubble (not all paths present in input samples)
|
118
|
+
* (6) 1 if the bubble involves an inversion; 0 otherwise
|
119
|
+
* (7) length of the shortest path (i.e. allele) through the bubble
|
120
|
+
* (8) length of the longest path/allele through the bubble
|
121
|
+
* (9-11) please ignore
|
122
|
+
* (12) list of segments in the bubble; first for the source and last for the sink
|
123
|
+
* (13) sequence of the shortest path (`*` if zero length)
|
124
|
+
* (14) sequence of the longest path (NB: it may not be present in the input samples)
|
125
|
+
|
126
|
+
Given an assembly, you can find the path/allele of this assembly in each bubble with
|
127
|
+
```sh
|
128
|
+
minigraph -cxasm --call graph.gfa sample-asm.fa > sample.bed
|
129
|
+
```
|
130
|
+
On each line in the BED-like output, the last colon-separated field gives the
|
131
|
+
alignment path through the bubble, the path length in the graph, the mapping
|
132
|
+
strand of sample contig, the contig name, the approximate contig start and
|
133
|
+
contig end. The number of lines in the file is the same as the number of lines
|
134
|
+
in the output of `gfatools bubble`. You can use the `paste` Unix command to
|
135
|
+
piece multiple samples together.
|
136
|
+
|
137
|
+
### <a name="prebuilt"></a>Prebuilt graphs
|
138
|
+
|
139
|
+
Prebuilt human graphs in the rGFA format can be found [at Zenodo][human-zenodo].
|
140
|
+
|
141
|
+
### <a name="algo"></a>Algorithm overview
|
142
|
+
|
143
|
+
<img align="right" width="278" src="doc/example2.png"/>
|
144
|
+
|
145
|
+
In the following, minigraph command line options have a dash ahead and are
|
146
|
+
highlighted in bold. The description may help to tune minigraph parameters.
|
147
|
+
|
148
|
+
1. Read all reference bases, extract (**-k**,**-w**)-minimizers and index them
|
149
|
+
in a hash table.
|
150
|
+
|
151
|
+
2. Read **-K** [=*500M*] query bases in the mapping mode, or read all query
|
152
|
+
bases in the graph construction mode. For each query sequence, do steps 3
|
153
|
+
through 5:
|
154
|
+
|
155
|
+
3. Find colinear minimizer chains using the [minimap2][minimap2] algorithm,
|
156
|
+
assuming segments in the graph are disconnected. These are called *linear
|
157
|
+
chains*.
|
158
|
+
|
159
|
+
4. Perform another round of chaining, taking each linear chain as an anchor.
|
160
|
+
For a pair of linear chains, minigraph tries to connect them by running the graph
|
161
|
+
wavefront alignment algorithm (GWFA). If minigraph fails to find an
|
162
|
+
alignment within an edit distance threshold, it will find up to 15 shortest
|
163
|
+
paths between the two linear chains and choose the path of length closest
|
164
|
+
to the distance on the query sequence. Chains found at this step are called
|
165
|
+
*graph chains*.
|
166
|
+
|
167
|
+
5. Identify primary chains and estimate mapping quality with a method similar
|
168
|
+
to the one used in minimap2. Perform base alignment.
|
169
|
+
|
170
|
+
6. In the graph construction mode, collect all mappings longer than **-d**
|
171
|
+
[=*10k*] and keep their query and graph segment intervals in two lists,
|
172
|
+
respectively.
|
173
|
+
|
174
|
+
7. For each mapping longer than **-l** [=*100k*], find poorly aligned regions.
|
175
|
+
A region is filtered if it overlaps two or more intervals collected at step
|
176
|
+
6.
|
177
|
+
|
178
|
+
8. Insert the remaining poorly aligned regions into the input graph. This
|
179
|
+
constructs a new graph.
|
180
|
+
|
181
|
+
## <a name="limit"></a>Limitations
|
182
|
+
|
183
|
+
* A complex minigraph subgraph is often suboptimal and may vary with the order
|
184
|
+
of input samples. It may not represent the evolution history
|
185
|
+
or the functional relevance at the locus. Please *do not overinterpret*
|
186
|
+
complex subgraphs. If you are interested in a particular subgraph, it is
|
187
|
+
recommended to extract the input contig subsequences involved in the subgraph
|
188
|
+
with the `--call` option and manually curate the results.
|
189
|
+
|
190
|
+
* Minigraph needs to find strong colinear chains first. For a graph consisting
|
191
|
+
of many short segments (e.g. one generated from rare SNPs in large
|
192
|
+
populations), minigraph will fail to map query sequences.
|
193
|
+
|
194
|
+
* The base alignment in the current version of minigraph is slow for species of
|
195
|
+
high diversity.
|
196
|
+
|
197
|
+
|
198
|
+
[zlib]: http://zlib.net/
|
199
|
+
[minimap2]: https://github.com/lh3/minimap2
|
200
|
+
[rgfa]: https://github.com/lh3/gfatools/blob/master/doc/rGFA.md
|
201
|
+
[gfa1]: https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md
|
202
|
+
[gaf]: https://github.com/lh3/gfatools/blob/master/doc/rGFA.md#the-graph-alignment-format-gaf
|
203
|
+
[paf]: https://github.com/lh3/miniasm/blob/master/PAF.md
|
204
|
+
[gfatools]: https://github.com/lh3/gfatools
|
205
|
+
[bandage]: https://rrwick.github.io/Bandage/
|
206
|
+
[gfaviz]: https://github.com/ggonnella/gfaviz
|
207
|
+
[human-zenodo]: https://zenodo.org/record/6499594
|
@@ -0,0 +1,194 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include "kalloc.h"
|
4
|
+
#define __STDC_LIMIT_MACROS
|
5
|
+
#include "algo.h"
|
6
|
+
#include "miniwfa.h"
|
7
|
+
|
8
|
+
/************************
|
9
|
+
* Max-scoring segments *
|
10
|
+
************************/
|
11
|
+
|
12
|
+
#include "kvec-km.h"
|
13
|
+
|
14
|
+
#define MSS_NEG_INF INT32_MIN
|
15
|
+
|
16
|
+
typedef struct {
|
17
|
+
int32_t st, en;
|
18
|
+
MG_MSS_TYPE L, R;
|
19
|
+
int32_t pre;
|
20
|
+
} msseg_aux_t;
|
21
|
+
|
22
|
+
typedef kvec_t(mg_msseg_t) msseg_v;
|
23
|
+
typedef kvec_t(msseg_aux_t) msseg_aux_v;
|
24
|
+
|
25
|
+
/*
 * Flush the working stack into the result vector.
 *
 * Every pending record whose score (R - L) is at least min_sc is appended to
 * *ret as a (st, en, sc) triple; lower-scoring records are dropped. The
 * working stack is left empty. `km` is passed through to the kvec allocator.
 */
static void move_segs(void *km, msseg_v *ret, msseg_aux_v *seg, MG_MSS_TYPE min_sc)
{
	int32_t idx;
	for (idx = 0; idx < seg->n; ++idx) {
		const msseg_aux_t *cur = &seg->a[idx];
		MG_MSS_TYPE score = cur->R - cur->L;
		mg_msseg_t *dst;
		if (score < min_sc) continue; /* below the reporting threshold */
		kv_pushp(mg_msseg_t, km, *ret, &dst);
		dst->st = cur->st;
		dst->en = cur->en;
		dst->sc = score;
	}
	seg->n = 0;
}
|
38
|
+
|
39
|
+
// Reference: Ruzzo and Tompa (1999) A linear time algorithm for finding all maximal scoring subsequences
|
40
|
+
/*
 * Find all maximal-scoring segments of the score array S[0..n).
 *
 * @param km      allocator handle forwarded to kvec/kalloc
 * @param n       number of scores in S
 * @param S       per-position scores; runs of positive scores are processed as one unit
 * @param min_sc  only segments scoring at least this much are reported
 * @param xdrop   if > 0, the scan state is reset once the running cumulative
 *                score falls more than xdrop below the maximum seen so far
 * @param n_seg   (out) number of segments returned
 * @return array of *n_seg segments allocated through `km`; the caller owns it
 */
mg_msseg_t *mg_mss_all(void *km, int32_t n, const MG_MSS_TYPE *S, MG_MSS_TYPE min_sc, MG_MSS_TYPE xdrop, int32_t *n_seg)
{
	msseg_v out = {0,0,0};
	msseg_aux_v stk = {0,0,0};
	msseg_aux_t cur;
	MG_MSS_TYPE cum = 0, best = MSS_NEG_INF;
	int32_t pos = 0, top;

	kv_resize(mg_msseg_t, km, out, 16);
	kv_resize(msseg_aux_t, km, stk, 16);
	while (pos < n) {
		int32_t run_end;
		MG_MSS_TYPE cum_end;

		if (S[pos] <= 0) { // non-positive score: only advances the cumulative sum
			if (xdrop > 0 && cum + S[pos] + xdrop < best) { // reset
				move_segs(km, &out, &stk, min_sc);
				cum = 0, best = MSS_NEG_INF;
			}
			cum += S[pos++];
			continue;
		}

		// collapse the whole run of positive scores starting at pos
		cum_end = cum + S[pos];
		for (run_end = pos + 1; run_end < n && S[run_end] > 0; ++run_end)
			cum_end += S[run_end];
		if (cum_end > best) best = cum_end;
		cur.st = pos, cur.en = run_end, cur.L = cum, cur.R = cum_end;

		for (;;) {
			// walk back to the closest stack entry whose L is smaller than cur.L
			top = stk.n - 1;
			while (top >= 0) {
				const msseg_aux_t *e = &stk.a[top];
				if (e->L < cur.L) break;
				top = e->pre >= 0? e->pre : top - 1;
			}
			if (top < 0 || stk.a[top].R >= cur.R) { // nothing to merge with: push cur
				if (top < 0) { // no entry with a smaller L: everything pending is final
					move_segs(km, &out, &stk, min_sc);
					best = cum_end;
				}
				cur.pre = top;
				kv_push(msseg_aux_t, km, stk, cur);
				break;
			}
			// extend cur leftwards to the start of entry `top`, drop entries from `top` on, retry
			cur.st = stk.a[top].st, cur.L = stk.a[top].L, cur.pre = stk.a[top].pre;
			stk.n = top;
		}
		cum = cum_end, pos = run_end;
	}
	move_segs(km, &out, &stk, min_sc);
	kfree(km, stk.a);
	KREALLOC(km, out.a, out.n); // shrink the result to its final size
	*n_seg = out.n;
	return out.a;
}
|
94
|
+
|
95
|
+
/**************************
|
96
|
+
* Interval overlap query *
|
97
|
+
**************************/
|
98
|
+
|
99
|
+
#include <assert.h>
|
100
|
+
#include "ksort.h"
|
101
|
+
|
102
|
+
#define sort_key_intv(a) ((a).st)
|
103
|
+
KRADIX_SORT_INIT(mg_intv, mg_intv_t, sort_key_intv, 4)
|
104
|
+
|
105
|
+
/*
 * Sort intervals by start and fill in the `far` field of every element so that
 * mg_intv_overlap() can prune subtrees of the implicit binary tree laid over
 * the sorted array.
 *
 * The array is reordered in place. Level-0 nodes sit at even indices; a node
 * at level k has its children `1<<(k-1)` positions to its left and right.
 * `far` of a node is the largest `en` found in the node and its children.
 *
 * @param n  number of intervals
 * @param a  intervals; sorted and annotated in place
 * @return   the highest level populated (floor(log2(n))), or -1 if n <= 0
 */
int32_t mg_intv_index(int32_t n, mg_intv_t *a)
{
	int32_t idx, tail_idx, tail_far, level;

	if (n <= 0) return -1;
	radix_sort_mg_intv(a, a + n);

	// level 0: each leaf's far is its own end; remember the last leaf
	for (idx = 0; idx < n; idx += 2) {
		a[idx].far = a[idx].en;
		tail_far = a[idx].far;
		tail_idx = idx;
	}

	for (level = 1; 1LL<<level <= n; ++level) {
		const int64_t half = 1LL<<(level-1);
		const int64_t first = (half<<1) - 1, stride = half<<2;
		for (idx = first; idx < n; idx += stride) {
			int32_t far_l = a[idx - half].far;
			// the right child may fall outside the array; use the tracked tail value then
			int32_t far_r = idx + half < n? a[idx + half].far : tail_far;
			int32_t far_max = a[idx].en;
			if (far_l > far_max) far_max = far_l;
			if (far_r > far_max) far_max = far_r;
			a[idx].far = far_max;
		}
		// move the tail tracker up one level and refresh the tail maximum
		tail_idx = (tail_idx>>level & 1)? tail_idx - half : tail_idx + half;
		if (tail_idx < n && a[tail_idx].far > tail_far)
			tail_far = a[tail_idx].far;
	}
	return level - 1;
}
|
127
|
+
|
128
|
+
/* One pending node on the explicit traversal stack used by mg_intv_overlap(). */
typedef struct {
	int64_t x;  /* array index of the node */
	int32_t k;  /* level of the node in the implicit tree */
	int32_t w;  /* 0: left child not yet processed; 1: left child already handled */
} istack_t;
|
132
|
+
|
133
|
+
int32_t mg_intv_overlap(void *km, int32_t n_a, const mg_intv_t *a, int32_t st, int32_t en, int32_t **b_, int32_t *m_b_)
|
134
|
+
{
|
135
|
+
int32_t t = 0, h, *b = *b_, m_b = *m_b_, n = 0;
|
136
|
+
istack_t stack[64], *p;
|
137
|
+
|
138
|
+
for (h = 0; 1<<h <= n_a; ++h);
|
139
|
+
--h;
|
140
|
+
p = &stack[t++];
|
141
|
+
p->k = h, p->x = (1LL<<p->k) - 1, p->w = 0; // push the root into the stack
|
142
|
+
while (t) { // stack is not empyt
|
143
|
+
istack_t z = stack[--t];
|
144
|
+
if (z.k <= 3) { // the subtree is no larger than (1<<(z.k+1))-1; do a linear scan
|
145
|
+
int32_t i, i0 = z.x >> z.k << z.k, i1 = i0 + (1LL<<(z.k+1)) - 1;
|
146
|
+
if (i1 >= n_a) i1 = n_a;
|
147
|
+
for (i = i0; i < i1 && a[i].st < en; ++i)
|
148
|
+
if (st < a[i].en) {
|
149
|
+
if (n == m_b) KEXPAND(km, b, m_b);
|
150
|
+
b[n++] = i;
|
151
|
+
}
|
152
|
+
} else if (z.w == 0) { // if left child not processed
|
153
|
+
int32_t y = z.x - (1LL<<(z.k-1));
|
154
|
+
p = &stack[t++];
|
155
|
+
p->k = z.k, p->x = z.x, p->w = 1;
|
156
|
+
if (y >= n_a || a[y].far > st) {
|
157
|
+
p = &stack[t++];
|
158
|
+
p->k = z.k - 1, p->x = y, p->w = 0; // push the left child to the stack
|
159
|
+
}
|
160
|
+
} else if (z.x < n_a && a[z.x].st < en) {
|
161
|
+
if (st < a[z.x].en) { // then z.x overlaps the query; write to the output array
|
162
|
+
if (n == m_b) KEXPAND(km, b, m_b);
|
163
|
+
b[n++] = z.x;
|
164
|
+
}
|
165
|
+
p = &stack[t++];
|
166
|
+
p->k = z.k - 1, p->x = z.x + (1LL<<(z.k-1)), p->w = 0; // push the right child
|
167
|
+
}
|
168
|
+
}
|
169
|
+
*b_ = b, *m_b_ = m_b;
|
170
|
+
return n;
|
171
|
+
}
|
172
|
+
|
173
|
+
/********************
|
174
|
+
* Global alignment *
|
175
|
+
********************/
|
176
|
+
|
177
|
+
int32_t mg_wfa_cmp(void *km, int32_t l1, const char *s1, int32_t l2, const char *s2, int32_t max_pen, int32_t *mlen, int32_t *blen)
|
178
|
+
{
|
179
|
+
mwf_opt_t opt;
|
180
|
+
mwf_rst_t r;
|
181
|
+
int32_t i;
|
182
|
+
mwf_opt_init(&opt);
|
183
|
+
opt.max_s = max_pen;
|
184
|
+
opt.flag |= MWF_F_CIGAR;
|
185
|
+
mwf_wfa_exact(km, &opt, l1, s1, l2, s2, &r);
|
186
|
+
*mlen = *blen = 0;
|
187
|
+
for (i = 0; i < r.n_cigar; ++i) {
|
188
|
+
int32_t op = r.cigar[i]&0xf, len = r.cigar[i]>>4;
|
189
|
+
*blen += len;
|
190
|
+
if (op == 7) *mlen += len;
|
191
|
+
}
|
192
|
+
kfree(km, r.cigar);
|
193
|
+
return r.s < 0? -(l1 + l2) : (l1 + l2) / 2 - r.s;
|
194
|
+
}
|
@@ -0,0 +1,33 @@
|
|
1
|
+
#ifndef MG_ALGO_H
|
2
|
+
#define MG_ALGO_H
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
|
6
|
+
#define MG_MSS_TYPE int32_t
|
7
|
+
#define MG_LIS_TYPE uint64_t
|
8
|
+
|
9
|
+
typedef struct {
|
10
|
+
int32_t st, en;
|
11
|
+
MG_MSS_TYPE sc;
|
12
|
+
} mg_msseg_t;
|
13
|
+
|
14
|
+
/* An interval handled by mg_intv_index() / mg_intv_overlap(). */
typedef struct {
	uint32_t st;            /* interval start; also the radix-sort key */
	uint32_t en:31, rev:1;  /* interval end; rev is presumably a strand flag - TODO confirm with callers */
	int32_t far;            /* filled in by mg_intv_index(): largest end within the node's subtree */
	int32_t i;              /* caller-defined payload - NOTE(review): meaning not visible here */
} mg_intv_t;
|
18
|
+
|
19
|
+
#ifdef __cplusplus
|
20
|
+
extern "C" {
|
21
|
+
#endif
|
22
|
+
|
23
|
+
mg_msseg_t *mg_mss_all(void *km, int32_t n, const MG_MSS_TYPE *S, MG_MSS_TYPE min_sc, MG_MSS_TYPE xdrop, int32_t *n_seg);
|
24
|
+
int32_t mg_intv_index(int32_t n, mg_intv_t *a);
|
25
|
+
int32_t mg_intv_overlap(void *km, int32_t n_a, const mg_intv_t *a, int32_t st, int32_t en, int32_t **b_, int32_t *m_b_);
|
26
|
+
void radix_sort_mg_intv(mg_intv_t *st, mg_intv_t *en);
|
27
|
+
int32_t mg_wfa_cmp(void *km, int32_t l1, const char *s1, int32_t l2, const char *s2, int32_t max_pen, int32_t *mlen, int32_t *blen);
|
28
|
+
|
29
|
+
#ifdef __cplusplus
|
30
|
+
}
|
31
|
+
#endif
|
32
|
+
|
33
|
+
#endif
|
@@ -0,0 +1,147 @@
|
|
1
|
+
#include <assert.h>
|
2
|
+
#include "mgpriv.h"
|
3
|
+
#include "ggen.h"
|
4
|
+
#include "gfa-priv.h"
|
5
|
+
#include "algo.h"
|
6
|
+
|
7
|
+
int32_t mg_gc_index(void *km, int min_mapq, int min_map_len, int min_depth_len, const gfa_t *g, int32_t n_seq, mg_gchains_t *const* gcs,
|
8
|
+
double *a_dens, int32_t **soff_, int32_t **qoff_, mg_intv_t **sintv_, mg_intv_t **qintv_);
|
9
|
+
|
10
|
+
/* Per-segment annotation built from the bubble list in mg_call_asm(). */
typedef struct {
	int32_t bid;        /* index of the bubble that lists this segment */
	uint8_t is_stem:4;  /* 1 if the segment is the first or last segment of a bubble */
	uint8_t is_src:4;   /* 1 if the segment is the first segment of a bubble */
} callaux_t;
|
14
|
+
|
15
|
+
/* Per-bubble record of the alignment path chosen for it in mg_call_asm(). */
typedef struct {
	int32_t t;       /* index of the query sequence, or -1 if no path was recorded */
	int32_t i;       /* index of the graph chain within that query's chains */
	int32_t st;      /* first linear-chain index of the path inside the bubble */
	int32_t en;      /* linear-chain index one past the path (the closing stem) */
	int32_t strand;  /* 1 or -1: orientation of the path relative to the bubble */
	int32_t qs;      /* query start of the region between the two stems */
	int32_t qe;      /* query end of the region between the two stems */
	int32_t glen;    /* summed length of the graph segments on the path */
} bbaux_t;
|
20
|
+
|
21
|
+
/*
 * For every bubble of graph `g`, print one line to stdout describing the path
 * that the aligned assembly takes through the bubble.
 *
 * Each line starts with the bubble's stable sequence name, start, end, and its
 * first and last segments. It is followed by
 * "<path>:<glen>:<strand>:<query name>:<qs>:<qe>" when a path was recorded
 * ("*" as the path when the two stems are adjacent), or "." otherwise.
 * Candidate paths whose query interval or graph segments overlap more than one
 * indexed interval are skipped.
 *
 * @param g         graph the sequences were mapped to
 * @param n_seq     number of query sequences
 * @param seq       query sequences (only names are read here)
 * @param gcs       graph chains, one entry per query sequence
 * @param min_mapq  forwarded to mg_gc_index()
 * @param min_blen  forwarded to mg_gc_index() (also halved as min_map_len)
 */
void mg_call_asm(const gfa_t *g, int32_t n_seq, const mg_bseq1_t *seq, mg_gchains_t *const *gcs, int32_t min_mapq, int32_t min_blen)
{
	int32_t i, j, t, max_acnt, *soff, *qoff, n_bb, m_ovlp = 0, *ovlp = 0;
	mg_intv_t *sintv, *qintv;
	double a_dens;
	gfa_bubble_t *bb;
	callaux_t *ca;
	bbaux_t *ba;
	kstring_t out = {0,0,0};

	max_acnt = mg_gc_index(0, min_mapq, min_blen>>1, min_blen, g, n_seq, gcs, &a_dens, &soff, &qoff, &sintv, &qintv);
	// NOTE(review): whether mg_gc_index() leaves anything allocated when it returns 0 is not visible here
	if (max_acnt == 0) return;

	// label every segment with its bubble and mark the bubble ends (stems)
	bb = gfa_bubble(g, &n_bb);
	GFA_CALLOC(ba, n_bb);
	GFA_CALLOC(ca, g->n_seg);
	for (i = 0; i < n_bb; ++i) {
		gfa_bubble_t *b = &bb[i];
		assert(b->n_seg >= 2);
		for (j = 0; j < b->n_seg; ++j)
			ca[b->v[j]>>1].bid = i;
		ca[b->v[0]>>1].is_stem = ca[b->v[b->n_seg-1]>>1].is_stem = 1;
		ca[b->v[0]>>1].is_src = 1;
		ba[i].t = -1; // no path recorded yet
	}

	for (t = 0; t < n_seq; ++t) {
		const mg_gchains_t *gt = gcs[t];
		for (i = 0; i < gt->n_gc; ++i) {
			const mg_gchain_t *gc = &gt->gc[i];
			int32_t st = -1;
			for (j = 1; j < gc->cnt; ++j) {
				const mg_llchain_t *lc = &gt->lc[gc->off + j];
				if (!ca[lc->v>>1].is_stem && ca[(lc-1)->v>>1].is_stem) {
					st = gc->off + j; // stepping from a stem into a bubble
				} else if ((ca[lc->v>>1].is_stem && !ca[(lc-1)->v>>1].is_stem && st > 0) || (ca[lc->v>>1].is_stem && ca[(lc-1)->v>>1].is_stem)) {
					int32_t n_ovlp, k, en = gc->off + j, qs, qe, span, bid, strand, glen;
					bbaux_t *p;

					// determine the source and sink nodes
					if (ca[lc->v>>1].is_stem && ca[(lc-1)->v>>1].is_stem) { // two adjacent stems: this is a deletion
						st = gc->off + j;
					} else {
						assert(en > st);
					}

					// test overlap on the query
					span = gt->a[gt->lc[st].off].y >> 32 & 0xff;
					qs = (int32_t)gt->a[gt->lc[st - 1].off + gt->lc[st - 1].cnt - 1].y + 1; // NB: it is fine even if .cnt==0
					qe = (int32_t)gt->a[gt->lc[en].off].y + 1 - span;
					n_ovlp = mg_intv_overlap(0, qoff[t+1] - qoff[t], &qintv[qoff[t]], qs, qe, &ovlp, &m_ovlp);
					if (n_ovlp > 1) continue; // overlap on the query - not orthologous

					// test overlap on the graph
					for (k = st, glen = 0; k < en; ++k) {
						const mg_llchain_t *lk = &gt->lc[k];
						int32_t seg = lk->v>>1;
						n_ovlp = mg_intv_overlap(0, soff[seg+1] - soff[seg], &sintv[soff[seg]], 0, g->seg[seg].len, &ovlp, &m_ovlp);
						glen += g->seg[seg].len;
						if (n_ovlp > 1) break; // overlap on the graph - not orthologous
					}
					if (k < en) continue;

					// determine the bubble ID
					assert(ca[gt->lc[st-1].v>>1].is_stem && ca[gt->lc[en].v>>1].is_stem);
					if (ca[gt->lc[st-1].v>>1].bid < ca[gt->lc[en].v>>1].bid)
						strand = 1;
					else if (ca[gt->lc[st-1].v>>1].bid > ca[gt->lc[en].v>>1].bid)
						strand = -1;
					else {
						if (ca[gt->lc[st-1].v>>1].is_src + ca[gt->lc[en].v>>1].is_src != 1) {
							fprintf(stderr, "[W::%s] type-1 folded inversion alignment around %c%s <=> %s:%d-%d\n",
									__func__, "><"[gt->lc[st].v&1], g->seg[gt->lc[st].v>>1].name, seq[t].name, qs, qe);
							continue;
						}
						if (ca[gt->lc[st-1].v>>1].is_src) strand = 1;
						else strand = -1;
					}
					bid = strand > 0? ca[gt->lc[st-1].v>>1].bid : ca[gt->lc[en].v>>1].bid;

					// attach the bubble
					for (k = st; k < en; ++k) // check consistency
						if (ca[gt->lc[k].v>>1].bid != bid)
							break;
					if (k != en) { // this may happen around an inversion towards the end of an alignment chain
						fprintf(stderr, "[W::%s] type-2 folded inversion alignment around %c%s <=> %s:%d-%d\n",
								__func__, "><"[gt->lc[st].v&1], g->seg[gt->lc[st].v>>1].name, seq[t].name, qs, qe);
						continue;
					}
					p = &ba[bid];
					p->t = t, p->i = i, p->st = st, p->en = en, p->strand = strand, p->qs = qs, p->qe = qe, p->glen = glen;
				}
			}
		}
	}

	for (i = 0; i < n_bb; ++i) {
		gfa_bubble_t *b = &bb[i];
		bbaux_t *a = &ba[i];
		out.l = 0;
		mg_sprintf_lite(&out, "%s\t%d\t%d\t%c%s\t%c%s\t", g->sseq[b->snid].name, b->ss, b->se, "><"[b->v[0]&1], g->seg[b->v[0]>>1].name,
						"><"[b->v[b->n_seg-1]&1], g->seg[b->v[b->n_seg-1]>>1].name);
		if (a->t >= 0) {
			// only look up the chains once a->t is known to be valid; a->t is -1 for bubbles without a path
			const mg_gchains_t *gt = gcs[a->t];
			assert(a->strand != 0);
			if (a->st == a->en) {
				mg_sprintf_lite(&out, "*");
			} else if (a->strand > 0) {
				for (j = a->st; j < a->en; ++j)
					mg_sprintf_lite(&out, "%c%s", "><"[gt->lc[j].v&1], g->seg[gt->lc[j].v>>1].name);
			} else { // reverse strand: walk backwards and flip each segment's orientation
				for (j = a->en - 1; j >= a->st; --j)
					mg_sprintf_lite(&out, "%c%s", "<>"[gt->lc[j].v&1], g->seg[gt->lc[j].v>>1].name);
			}
			mg_sprintf_lite(&out, ":%d:%c:%s:%d:%d", a->glen, a->strand > 0? '+' : '-', seq[a->t].name, a->qs, a->qe);
		} else {
			mg_sprintf_lite(&out, ".");
		}
		puts(out.s);
	}

	// scratch buffer grown by mg_intv_overlap() with a NULL allocator handle;
	// presumably heap-backed like the other km==0 buffers freed below - confirm against kalloc.c
	free(ovlp);
	free(ba); free(ca);
	free(soff); free(qoff); free(sintv); free(qintv);
	for (i = 0; i < n_bb; ++i) free(bb[i].v);
	free(bb);
	free(out.s);
}
|