ruby-minigraph 0.0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/ext/Rakefile +56 -0
- data/ext/cmappy/cmappy.c +7 -0
- data/ext/cmappy/cmappy.h +8 -0
- data/ext/minigraph/LICENSE.txt +23 -0
- data/ext/minigraph/Makefile +66 -0
- data/ext/minigraph/NEWS.md +317 -0
- data/ext/minigraph/README.md +207 -0
- data/ext/minigraph/algo.c +194 -0
- data/ext/minigraph/algo.h +33 -0
- data/ext/minigraph/asm-call.c +147 -0
- data/ext/minigraph/bseq.c +133 -0
- data/ext/minigraph/bseq.h +76 -0
- data/ext/minigraph/cal_cov.c +139 -0
- data/ext/minigraph/doc/example1.png +0 -0
- data/ext/minigraph/doc/example2.png +0 -0
- data/ext/minigraph/doc/examples.graffle +0 -0
- data/ext/minigraph/format.c +241 -0
- data/ext/minigraph/galign.c +140 -0
- data/ext/minigraph/gchain1.c +532 -0
- data/ext/minigraph/gcmisc.c +223 -0
- data/ext/minigraph/gfa-aug.c +260 -0
- data/ext/minigraph/gfa-base.c +526 -0
- data/ext/minigraph/gfa-bbl.c +372 -0
- data/ext/minigraph/gfa-ed.c +617 -0
- data/ext/minigraph/gfa-io.c +395 -0
- data/ext/minigraph/gfa-priv.h +154 -0
- data/ext/minigraph/gfa.h +166 -0
- data/ext/minigraph/ggen.c +182 -0
- data/ext/minigraph/ggen.h +21 -0
- data/ext/minigraph/ggsimple.c +570 -0
- data/ext/minigraph/gmap.c +211 -0
- data/ext/minigraph/index.c +230 -0
- data/ext/minigraph/kalloc.c +224 -0
- data/ext/minigraph/kalloc.h +82 -0
- data/ext/minigraph/kavl.h +414 -0
- data/ext/minigraph/kdq.h +134 -0
- data/ext/minigraph/ketopt.h +116 -0
- data/ext/minigraph/khashl.h +348 -0
- data/ext/minigraph/krmq.h +474 -0
- data/ext/minigraph/kseq.h +256 -0
- data/ext/minigraph/ksort.h +164 -0
- data/ext/minigraph/kstring.h +165 -0
- data/ext/minigraph/kthread.c +159 -0
- data/ext/minigraph/kthread.h +15 -0
- data/ext/minigraph/kvec-km.h +105 -0
- data/ext/minigraph/kvec.h +110 -0
- data/ext/minigraph/lchain.c +441 -0
- data/ext/minigraph/main.c +301 -0
- data/ext/minigraph/map-algo.c +500 -0
- data/ext/minigraph/mgpriv.h +128 -0
- data/ext/minigraph/minigraph.1 +359 -0
- data/ext/minigraph/minigraph.h +176 -0
- data/ext/minigraph/miniwfa.c +834 -0
- data/ext/minigraph/miniwfa.h +95 -0
- data/ext/minigraph/misc/mgutils.js +1451 -0
- data/ext/minigraph/misc.c +12 -0
- data/ext/minigraph/options.c +134 -0
- data/ext/minigraph/shortk.c +251 -0
- data/ext/minigraph/sketch.c +109 -0
- data/ext/minigraph/sys.c +147 -0
- data/ext/minigraph/sys.h +20 -0
- data/ext/minigraph/test/MT-chimp.fa +277 -0
- data/ext/minigraph/test/MT-human.fa +239 -0
- data/ext/minigraph/test/MT-orangA.fa +276 -0
- data/ext/minigraph/test/MT.gfa +19 -0
- data/ext/minigraph/tex/Makefile +13 -0
- data/ext/minigraph/tex/minigraph.bib +676 -0
- data/ext/minigraph/tex/minigraph.tex +986 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
- data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
- data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
- data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
- data/ext/minigraph/tex/plots/bedutils.js +367 -0
- data/ext/minigraph/tex/plots/chr-plot.js +130 -0
- data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
- data/ext/minigraph.patch +21 -0
- data/lib/minigraph/ffi/constants.rb +230 -0
- data/lib/minigraph/ffi/functions.rb +70 -0
- data/lib/minigraph/ffi/mappy.rb +8 -0
- data/lib/minigraph/ffi.rb +27 -0
- data/lib/minigraph/version.rb +5 -0
- data/lib/minigraph.rb +72 -0
- metadata +159 -0
@@ -0,0 +1,207 @@
|
|
1
|
+
[Build Status](https://travis-ci.org/lh3/minigraph)
|
2
|
+
## <a name="started"></a>Getting Started
|
3
|
+
|
4
|
+
```sh
|
5
|
+
git clone https://github.com/lh3/minigraph
|
6
|
+
cd minigraph && make
|
7
|
+
# Map sequence to sequence, similar to minimap2 without base alignment
|
8
|
+
./minigraph test/MT-human.fa test/MT-orangA.fa > out.paf
|
9
|
+
# Map sequence to graph
|
10
|
+
./minigraph test/MT.gfa test/MT-orangA.fa > out.gaf
|
11
|
+
# Incremental graph generation (-l10k necessary for this toy example)
|
12
|
+
./minigraph -cxggs -l10k test/MT.gfa test/MT-chimp.fa test/MT-orangA.fa > out.gfa
|
13
|
+
# Call per-sample path in each bubble/variation (-c not needed for this)
|
14
|
+
./minigraph -xasm -l10k --call test/MT.gfa test/MT-orangA.fa > orangA.call.bed
|
15
|
+
|
16
|
+
# The lossy FASTA representation (requiring https://github.com/lh3/gfatools)
|
17
|
+
gfatools gfa2fa -s out.gfa > out.fa
|
18
|
+
# Extract localized structural variations
|
19
|
+
gfatools bubble out.gfa > SV.bed
|
20
|
+
```
|
21
|
+
|
22
|
+
## Table of Contents
|
23
|
+
|
24
|
+
<img align="right" width="278" src="doc/example1.png"/>
|
25
|
+
|
26
|
+
- [Getting Started](#started)
|
27
|
+
- [Introduction](#intro)
|
28
|
+
- [Users' Guide](#uguide)
|
29
|
+
- [Installation](#install)
|
30
|
+
- [Sequence-to-graph mapping](#map)
|
31
|
+
- [Graph generation](#ggen)
|
32
|
+
- [Calling structural variations](#callsv)
|
33
|
+
- [Prebuilt graphs](#prebuilt)
|
34
|
+
- [Algorithm overview](#algo)
|
35
|
+
- [Limitations](#limit)
|
36
|
+
|
37
|
+
## <a name="intro"></a>Introduction
|
38
|
+
|
39
|
+
Minigraph is a sequence-to-graph mapper and graph constructor. For graph
|
40
|
+
generation, it aligns a query sequence against a sequence graph and
|
41
|
+
incrementally augments an existing graph with long query subsequences diverged
|
42
|
+
from the graph. The figure on the right briefly explains the procedure.
|
43
|
+
|
44
|
+
Minigraph borrows ideas and code from [minimap2][minimap2]. It is fairly
|
45
|
+
efficient and can construct a graph from 90 human assemblies in a couple of
|
46
|
+
days using 24 CPU cores. Older versions of minigraph were unable to produce
|
47
|
+
base alignment. The latest version can. **Please add option `-c` for graph
|
48
|
+
generation** as it generally improves the quality of graphs.
|
49
|
+
|
50
|
+
## <a name="uguide"></a>Users' Guide
|
51
|
+
|
52
|
+
### <a name="install"></a>Installation
|
53
|
+
|
54
|
+
To install minigraph, type `make` in the source code directory. The only
|
55
|
+
non-standard dependency is [zlib][zlib]. For better performance, it is
|
56
|
+
recommended to compile with recent compilers.
|
57
|
+
|
58
|
+
### <a name="map"></a>Sequence-to-graph mapping
|
59
|
+
|
60
|
+
To map sequences against a graph, you should prepare the graph in the [GFA
|
61
|
+
format][gfa1], or preferably the [rGFA format][rgfa]. If you don't have
|
62
|
+
a graph, you can generate a graph from multiple samples (see the [Graph
|
63
|
+
generation section](#ggen) below). The typical command line for mapping is
|
64
|
+
```sh
|
65
|
+
minigraph -cx lr graph.gfa query.fa > out.gaf
|
66
|
+
```
|
67
|
+
You may choose the right preset option `-x` according to input. Minigraph
|
68
|
+
outputs mappings in the [GAF format][gaf], which is a strict superset of the
|
69
|
+
[PAF format][paf]. The only visual difference between GAF and PAF is that the
|
70
|
+
6th column in GAF may encode a graph path like
|
71
|
+
`>MT_human:0-4001<MT_orang:3426-3927` instead of a contig/chromosome name.
|
72
|
+
|
73
|
+
The minigraph GFA parser seamlessly parses FASTA and converts it to GFA
|
74
|
+
internally, so you can also provide sequences in FASTA as the reference. In
|
75
|
+
this case, minigraph will behave like minimap2, though likely producing
|
76
|
+
different alignments due to differences between the two implementations.
|
77
|
+
|
78
|
+
### <a name="ggen"></a>Graph generation
|
79
|
+
|
80
|
+
The following command-line generates a graph in rGFA:
|
81
|
+
```sh
|
82
|
+
minigraph -cxggs -t16 ref.fa sample1.fa sample2.fa > out.gfa
|
83
|
+
```
|
84
|
+
which is equivalent to
|
85
|
+
```sh
|
86
|
+
minigraph -cxggs -t16 ref.fa sample1.fa > sample1.gfa
|
87
|
+
minigraph -cxggs -t16 sample1.gfa sample2.fa > out.gfa
|
88
|
+
```
|
89
|
+
File `ref.fa` is typically the reference genome (e.g. GRCh38 for human).
|
90
|
+
It can also be replaced by a graph in rGFA. Minigraph assumes `sample1.fa` to
|
91
|
+
be the whole-genome assembly of an individual. This is an important assumption:
|
92
|
+
minigraph only considers 1-to-1 orthologous regions between the graph and the
|
93
|
+
individual FASTA. If you use raw reads or put multiple individual genomes in
|
94
|
+
one file, minigraph will filter out most alignments as they cover the input
|
95
|
+
graph multiple times.
|
96
|
+
|
97
|
+
The output rGFA can be converted to a FASTA file with [gfatools][gfatools]:
|
98
|
+
```sh
|
99
|
+
gfatools gfa2fa -s graph.gfa > out.stable.fa
|
100
|
+
```
|
101
|
+
The output `out.stable.fa` will always include the initial reference `ref.fa`
|
102
|
+
and may additionally add new segments diverged from the initial reference.
|
103
|
+
|
104
|
+
### <a name="callsv"></a>Calling structural variations
|
105
|
+
|
106
|
+
A minigraph graph is composed of chains of bubbles with the reference as the
|
107
|
+
backbone. Each *bubble* represents a structural variation. It can be
|
108
|
+
multi-allelic if there are multiple paths through the bubble. You can extract
|
109
|
+
these bubbles with
|
110
|
+
```sh
|
111
|
+
gfatools bubble graph.gfa > var.bed
|
112
|
+
```
|
113
|
+
The output is a BED-like file. The first three columns give the position of a
|
114
|
+
bubble/variation and the rest of the columns are:
|
115
|
+
|
116
|
+
* (4) \# GFA segments in the bubble including the source and the sink of the bubble
|
117
|
+
* (5) \# all possible paths through the bubble (not all paths present in input samples)
|
118
|
+
* (6) 1 if the bubble involves an inversion; 0 otherwise
|
119
|
+
* (7) length of the shortest path (i.e. allele) through the bubble
|
120
|
+
* (8) length of the longest path/allele through the bubble
|
121
|
+
* (9-11) please ignore
|
122
|
+
* (12) list of segments in the bubble; first for the source and last for the sink
|
123
|
+
* (13) sequence of the shortest path (`*` if zero length)
|
124
|
+
* (14) sequence of the longest path (NB: it may not be present in the input samples)
|
125
|
+
|
126
|
+
Given an assembly, you can find the path/allele of this assembly in each bubble with
|
127
|
+
```sh
|
128
|
+
minigraph -cxasm --call graph.gfa sample-asm.fa > sample.bed
|
129
|
+
```
|
130
|
+
On each line in the BED-like output, the last colon-separated field gives the
|
131
|
+
alignment path through the bubble, the path length in the graph, the mapping
|
132
|
+
strand of sample contig, the contig name, the approximate contig start and
|
133
|
+
contig end. The number of lines in the file is the same as the number of lines
|
134
|
+
in the output of `gfatools bubble`. You can use the `paste` Unix command to
|
135
|
+
piece multiple samples together.
|
136
|
+
|
137
|
+
### <a name="prebuilt"></a>Prebuilt graphs
|
138
|
+
|
139
|
+
Prebuilt human graphs in the rGFA format can be found [at Zenodo][human-zenodo].
|
140
|
+
|
141
|
+
### <a name="algo"></a>Algorithm overview
|
142
|
+
|
143
|
+
<img align="right" width="278" src="doc/example2.png"/>
|
144
|
+
|
145
|
+
In the following, minigraph command line options have a dash ahead and are
|
146
|
+
highlighted in bold. The description may help to tune minigraph parameters.
|
147
|
+
|
148
|
+
1. Read all reference bases, extract (**-k**,**-w**)-minimizers and index them
|
149
|
+
in a hash table.
|
150
|
+
|
151
|
+
2. Read **-K** [=*500M*] query bases in the mapping mode, or read all query
|
152
|
+
bases in the graph construction mode. For each query sequence, do steps 3
|
153
|
+
through 5:
|
154
|
+
|
155
|
+
3. Find colinear minimizer chains using the [minimap2][minimap2] algorithm,
|
156
|
+
assuming segments in the graph are disconnected. These are called *linear
|
157
|
+
chains*.
|
158
|
+
|
159
|
+
4. Perform another round of chaining, taking each linear chain as an anchor.
|
160
|
+
For a pair of linear chains, minigraph tries to connect them using the graph
|
161
|
+
wavefront alignment algorithm (GWFA). If minigraph fails to find an
|
162
|
+
alignment within an edit distance threshold, it will find up to 15 shortest
|
163
|
+
paths between the two linear chains and choose the path of length closest
|
164
|
+
to the distance on the query sequence. Chains found at this step are called
|
165
|
+
*graph chains*.
|
166
|
+
|
167
|
+
5. Identify primary chains and estimate mapping quality with a method similar
|
168
|
+
to the one used in minimap2. Perform base alignment.
|
169
|
+
|
170
|
+
6. In the graph construction mode, collect all mappings longer than **-d**
|
171
|
+
[=*10k*] and keep their query and graph segment intervals in two lists,
|
172
|
+
respectively.
|
173
|
+
|
174
|
+
7. For each mapping longer than **-l** [=*100k*], find poorly aligned regions.
|
175
|
+
A region is filtered if it overlaps two or more intervals collected at step
|
176
|
+
6.
|
177
|
+
|
178
|
+
8. Insert the remaining poorly aligned regions into the input graph. This
|
179
|
+
constructs a new graph.
|
180
|
+
|
181
|
+
## <a name="limit"></a>Limitations
|
182
|
+
|
183
|
+
* A complex minigraph subgraph is often suboptimal and may vary with the order
|
184
|
+
of input samples. It may not represent the evolutionary history
|
185
|
+
or the functional relevance at the locus. Please *do not overinterpret*
|
186
|
+
complex subgraphs. If you are interested in a particular subgraph, it is
|
187
|
+
recommended to extract the input contig subsequences involved in the subgraph
|
188
|
+
with the `--call` option and manually curate the results.
|
189
|
+
|
190
|
+
* Minigraph needs to find strong colinear chains first. For a graph consisting
|
191
|
+
of many short segments (e.g. one generated from rare SNPs in large
|
192
|
+
populations), minigraph will fail to map query sequences.
|
193
|
+
|
194
|
+
* The base alignment in the current version of minigraph is slow for species of
|
195
|
+
high diversity.
|
196
|
+
|
197
|
+
|
198
|
+
[zlib]: http://zlib.net/
|
199
|
+
[minimap2]: https://github.com/lh3/minimap2
|
200
|
+
[rgfa]: https://github.com/lh3/gfatools/blob/master/doc/rGFA.md
|
201
|
+
[gfa1]: https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md
|
202
|
+
[gaf]: https://github.com/lh3/gfatools/blob/master/doc/rGFA.md#the-graph-alignment-format-gaf
|
203
|
+
[paf]: https://github.com/lh3/miniasm/blob/master/PAF.md
|
204
|
+
[gfatools]: https://github.com/lh3/gfatools
|
205
|
+
[bandage]: https://rrwick.github.io/Bandage/
|
206
|
+
[gfaviz]: https://github.com/ggonnella/gfaviz
|
207
|
+
[human-zenodo]: https://zenodo.org/record/6499594
|
@@ -0,0 +1,194 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include "kalloc.h"
|
4
|
+
#define __STDC_LIMIT_MACROS
|
5
|
+
#include "algo.h"
|
6
|
+
#include "miniwfa.h"
|
7
|
+
|
8
|
+
/************************
|
9
|
+
* Max-scoring segments *
|
10
|
+
************************/
|
11
|
+
|
12
|
+
#include "kvec-km.h"
|
13
|
+
|
14
|
+
#define MSS_NEG_INF INT32_MIN
|
15
|
+
|
16
|
+
typedef struct {
|
17
|
+
int32_t st, en;
|
18
|
+
MG_MSS_TYPE L, R;
|
19
|
+
int32_t pre;
|
20
|
+
} msseg_aux_t;
|
21
|
+
|
22
|
+
typedef kvec_t(mg_msseg_t) msseg_v;
|
23
|
+
typedef kvec_t(msseg_aux_t) msseg_aux_v;
|
24
|
+
|
25
|
+
/*
 * Flush the candidate list into the result vector.
 *
 * Each candidate in `seg` whose score (R - L) is at least `min_sc` is appended
 * to `ret`; afterwards `seg` is emptied (its storage is kept for reuse).
 * `km` is forwarded to the kvec macro for allocation.
 */
static void move_segs(void *km, msseg_v *ret, msseg_aux_v *seg, MG_MSS_TYPE min_sc)
{
	int32_t idx = 0;
	while (idx < seg->n) {
		const msseg_aux_t *cand = &seg->a[idx++];
		MG_MSS_TYPE score = cand->R - cand->L;
		mg_msseg_t *dst;
		if (score < min_sc) continue; /* too weak to report */
		kv_pushp(mg_msseg_t, km, *ret, &dst);
		dst->st = cand->st;
		dst->en = cand->en;
		dst->sc = score;
	}
	seg->n = 0;
}
|
38
|
+
|
39
|
+
// Reference: Ruzzo and Tompa (1999) A linear time algorithm for finding all maximal scoring subsequences
|
40
|
+
/*
 * Find maximal scoring segments in the score array S[0..n).
 *
 * Runs of positive scores are merged with earlier candidates following the
 * Ruzzo-Tompa procedure; finished candidates scoring at least `min_sc` are
 * reported. When `xdrop` is positive, the scan is restarted (candidates are
 * flushed and the cumulative score is reset) once the cumulative score falls
 * more than `xdrop` below the running maximum.
 *
 * @param km      allocator handle passed to the kvec/kalloc macros
 * @param n       number of scores
 * @param S       score array
 * @param min_sc  minimum segment score to report
 * @param xdrop   reset threshold; values <= 0 disable resetting
 * @param n_seg   receives the number of segments returned
 * @return array of *n_seg segments allocated through `km`; the caller is
 *         presumably expected to release it with kfree(km, ...)
 */
mg_msseg_t *mg_mss_all(void *km, int32_t n, const MG_MSS_TYPE *S, MG_MSS_TYPE min_sc, MG_MSS_TYPE xdrop, int32_t *n_seg)
{
	msseg_v out = {0,0,0};
	msseg_aux_v cand = {0,0,0};
	MG_MSS_TYPE cum = 0, best = MSS_NEG_INF;
	int32_t pos = 0;

	kv_resize(mg_msseg_t, km, out, 16);
	kv_resize(msseg_aux_t, km, cand, 16);
	while (pos < n) {
		msseg_aux_t cur;
		MG_MSS_TYPE right;
		int32_t run_end, prev;

		if (S[pos] <= 0) { /* non-positive score: only the cumulative sum changes */
			if (xdrop > 0 && cum + S[pos] + xdrop < best) { // reset
				move_segs(km, &out, &cand, min_sc);
				cum = 0;
				best = MSS_NEG_INF;
			}
			cum += S[pos];
			++pos;
			continue;
		}

		/* collect the whole run of positive scores starting at pos */
		right = cum + S[pos];
		run_end = pos + 1;
		while (run_end < n && S[run_end] > 0)
			right += S[run_end++];
		if (right > best) best = right;
		cur.st = pos;
		cur.en = run_end;
		cur.L = cum;
		cur.R = right;

		for (;;) {
			/* walk back to the closest candidate whose L is below cur.L */
			prev = cand.n - 1;
			while (prev >= 0) {
				const msseg_aux_t *q = &cand.a[prev];
				if (q->L < cur.L) break;
				prev = q->pre >= 0 ? q->pre : prev - 1;
			}
			if (prev >= 0 && cand.a[prev].R < cur.R) {
				/* absorb that candidate and everything after it, then retry */
				cur.st = cand.a[prev].st;
				cur.L = cand.a[prev].L;
				cur.pre = cand.a[prev].pre;
				cand.n = prev;
				continue;
			}
			if (prev < 0) { /* nothing can extend leftwards any more: flush */
				move_segs(km, &out, &cand, min_sc);
				best = right;
			}
			cur.pre = prev;
			kv_push(msseg_aux_t, km, cand, cur);
			break;
		}
		cum = right;
		pos = run_end;
	}
	move_segs(km, &out, &cand, min_sc);
	kfree(km, cand.a);
	KREALLOC(km, out.a, out.n); /* shrink the result to its final size */
	*n_seg = out.n;
	return out.a;
}
|
94
|
+
|
95
|
+
/**************************
|
96
|
+
* Interval overlap query *
|
97
|
+
**************************/
|
98
|
+
|
99
|
+
#include <assert.h>
|
100
|
+
#include "ksort.h"
|
101
|
+
|
102
|
+
#define sort_key_intv(a) ((a).st)
|
103
|
+
KRADIX_SORT_INIT(mg_intv, mg_intv_t, sort_key_intv, 4)
|
104
|
+
|
105
|
+
/*
 * Sort intervals by start and fill in the `far` field of every element so the
 * array can be searched as an implicit binary tree by mg_intv_overlap().
 *
 * After the call, a[i].far holds the largest `en` found in the subtree rooted
 * at i (elements beyond the end of the array are represented by the running
 * value tracked for the last element).
 *
 * @param n  number of intervals
 * @param a  interval array; reordered in place
 * @return   the level of the tree root, or -1 when n <= 0
 */
int32_t mg_intv_index(int32_t n, mg_intv_t *a)
{
	int32_t idx, tail_idx, tail_far, level;

	if (n <= 0)
		return -1;
	radix_sort_mg_intv(a, a + n);

	/* level 0: even-indexed elements are leaves; remember the last one */
	for (idx = 0; idx < n; idx += 2) {
		a[idx].far = a[idx].en;
		tail_idx = idx;
		tail_far = a[idx].far;
	}

	for (level = 1; 1LL << level <= n; ++level) {
		const int64_t half = 1LL << (level - 1);  /* distance from a node to its children */
		const int64_t first = (half << 1) - 1;    /* first node at this level */
		const int64_t stride = half << 2;         /* spacing between nodes at this level */

		for (idx = first; idx < n; idx += stride) {
			const int32_t left_far = a[idx - half].far;
			const int32_t right_far = idx + half < n ? a[idx + half].far : tail_far;
			int32_t far = a[idx].en;
			if (left_far > far) far = left_far;
			if (right_far > far) far = right_far;
			a[idx].far = far;
		}

		/* move to the parent of the last element's node */
		if ((tail_idx >> level) & 1) tail_idx -= half;
		else tail_idx += half;
		if (tail_idx < n && a[tail_idx].far > tail_far)
			tail_far = a[tail_idx].far;
	}
	return level - 1;
}
|
127
|
+
|
128
|
+
/* One entry of the explicit traversal stack used by mg_intv_overlap(). */
typedef struct {
	int64_t x;  /* index of the tree node */
	int32_t k;  /* level of the node */
	int32_t w;  /* 0 until the left child has been handled, 1 afterwards */
} istack_t;
|
132
|
+
|
133
|
+
int32_t mg_intv_overlap(void *km, int32_t n_a, const mg_intv_t *a, int32_t st, int32_t en, int32_t **b_, int32_t *m_b_)
|
134
|
+
{
|
135
|
+
int32_t t = 0, h, *b = *b_, m_b = *m_b_, n = 0;
|
136
|
+
istack_t stack[64], *p;
|
137
|
+
|
138
|
+
for (h = 0; 1<<h <= n_a; ++h);
|
139
|
+
--h;
|
140
|
+
p = &stack[t++];
|
141
|
+
p->k = h, p->x = (1LL<<p->k) - 1, p->w = 0; // push the root into the stack
|
142
|
+
while (t) { // stack is not empyt
|
143
|
+
istack_t z = stack[--t];
|
144
|
+
if (z.k <= 3) { // the subtree is no larger than (1<<(z.k+1))-1; do a linear scan
|
145
|
+
int32_t i, i0 = z.x >> z.k << z.k, i1 = i0 + (1LL<<(z.k+1)) - 1;
|
146
|
+
if (i1 >= n_a) i1 = n_a;
|
147
|
+
for (i = i0; i < i1 && a[i].st < en; ++i)
|
148
|
+
if (st < a[i].en) {
|
149
|
+
if (n == m_b) KEXPAND(km, b, m_b);
|
150
|
+
b[n++] = i;
|
151
|
+
}
|
152
|
+
} else if (z.w == 0) { // if left child not processed
|
153
|
+
int32_t y = z.x - (1LL<<(z.k-1));
|
154
|
+
p = &stack[t++];
|
155
|
+
p->k = z.k, p->x = z.x, p->w = 1;
|
156
|
+
if (y >= n_a || a[y].far > st) {
|
157
|
+
p = &stack[t++];
|
158
|
+
p->k = z.k - 1, p->x = y, p->w = 0; // push the left child to the stack
|
159
|
+
}
|
160
|
+
} else if (z.x < n_a && a[z.x].st < en) {
|
161
|
+
if (st < a[z.x].en) { // then z.x overlaps the query; write to the output array
|
162
|
+
if (n == m_b) KEXPAND(km, b, m_b);
|
163
|
+
b[n++] = z.x;
|
164
|
+
}
|
165
|
+
p = &stack[t++];
|
166
|
+
p->k = z.k - 1, p->x = z.x + (1LL<<(z.k-1)), p->w = 0; // push the right child
|
167
|
+
}
|
168
|
+
}
|
169
|
+
*b_ = b, *m_b_ = m_b;
|
170
|
+
return n;
|
171
|
+
}
|
172
|
+
|
173
|
+
/********************
|
174
|
+
* Global alignment *
|
175
|
+
********************/
|
176
|
+
|
177
|
+
int32_t mg_wfa_cmp(void *km, int32_t l1, const char *s1, int32_t l2, const char *s2, int32_t max_pen, int32_t *mlen, int32_t *blen)
|
178
|
+
{
|
179
|
+
mwf_opt_t opt;
|
180
|
+
mwf_rst_t r;
|
181
|
+
int32_t i;
|
182
|
+
mwf_opt_init(&opt);
|
183
|
+
opt.max_s = max_pen;
|
184
|
+
opt.flag |= MWF_F_CIGAR;
|
185
|
+
mwf_wfa_exact(km, &opt, l1, s1, l2, s2, &r);
|
186
|
+
*mlen = *blen = 0;
|
187
|
+
for (i = 0; i < r.n_cigar; ++i) {
|
188
|
+
int32_t op = r.cigar[i]&0xf, len = r.cigar[i]>>4;
|
189
|
+
*blen += len;
|
190
|
+
if (op == 7) *mlen += len;
|
191
|
+
}
|
192
|
+
kfree(km, r.cigar);
|
193
|
+
return r.s < 0? -(l1 + l2) : (l1 + l2) / 2 - r.s;
|
194
|
+
}
|
@@ -0,0 +1,33 @@
|
|
1
|
+
#ifndef MG_ALGO_H
#define MG_ALGO_H

#include <stdint.h>

/* Score type used by the max-scoring-segment routine. */
#define MG_MSS_TYPE int32_t
#define MG_LIS_TYPE uint64_t

/* One scoring segment reported by mg_mss_all(). */
typedef struct {
	int32_t st;      /* index of the first score in the segment */
	int32_t en;      /* index one past the last score in the segment */
	MG_MSS_TYPE sc;  /* score of the segment */
} mg_msseg_t;

/* An interval that can be indexed by mg_intv_index() and queried by mg_intv_overlap(). */
typedef struct {
	uint32_t st;      /* start; also the sort key */
	uint32_t en:31;   /* end */
	uint32_t rev:1;   /* NOTE(review): presumably a reverse-strand flag - set by callers */
	int32_t far;      /* largest end in the subtree rooted here; filled by mg_intv_index() */
	int32_t i;        /* NOTE(review): caller-defined payload - not touched by the routines declared here */
} mg_intv_t;

#ifdef __cplusplus
extern "C" {
#endif

/* Max-scoring segments */
mg_msseg_t *mg_mss_all(void *km, int32_t n, const MG_MSS_TYPE *S, MG_MSS_TYPE min_sc, MG_MSS_TYPE xdrop, int32_t *n_seg);

/* Interval overlap query */
int32_t mg_intv_index(int32_t n, mg_intv_t *a);
int32_t mg_intv_overlap(void *km, int32_t n_a, const mg_intv_t *a, int32_t st, int32_t en, int32_t **b_, int32_t *m_b_);
void radix_sort_mg_intv(mg_intv_t *st, mg_intv_t *en);

/* Global alignment */
int32_t mg_wfa_cmp(void *km, int32_t l1, const char *s1, int32_t l2, const char *s2, int32_t max_pen, int32_t *mlen, int32_t *blen);

#ifdef __cplusplus
}
#endif

#endif
|
@@ -0,0 +1,147 @@
|
|
1
|
+
#include <assert.h>
|
2
|
+
#include "mgpriv.h"
|
3
|
+
#include "ggen.h"
|
4
|
+
#include "gfa-priv.h"
|
5
|
+
#include "algo.h"
|
6
|
+
|
7
|
+
int32_t mg_gc_index(void *km, int min_mapq, int min_map_len, int min_depth_len, const gfa_t *g, int32_t n_seq, mg_gchains_t *const* gcs,
|
8
|
+
double *a_dens, int32_t **soff_, int32_t **qoff_, mg_intv_t **sintv_, mg_intv_t **qintv_);
|
9
|
+
|
10
|
+
/* Per-segment annotation built by mg_call_asm() from the bubble list. */
typedef struct {
	int32_t bid;        /* index of the bubble the segment was last assigned to */
	uint8_t is_stem:4;  /* 1 if the segment is the first or last segment of a bubble */
	uint8_t is_src:4;   /* 1 if the segment is the first segment of a bubble */
} callaux_t;
|
14
|
+
|
15
|
+
/* Per-bubble record of the alignment chosen by mg_call_asm(). */
typedef struct {
	int32_t t;       /* index of the query sequence, or -1 when the bubble has no call */
	int32_t i;       /* index of the graph chain within that sequence */
	int32_t st;      /* first linear chain inside the bubble */
	int32_t en;      /* linear chain of the stem that closes the bubble (exclusive end) */
	int32_t strand;  /* +1 or -1 relative to the bubble orientation */
	int32_t qs;      /* start on the query */
	int32_t qe;      /* end on the query */
	int32_t glen;    /* summed length of the graph segments in [st, en) */
} bbaux_t;
|
20
|
+
|
21
|
+
/*
 * For every bubble of graph `g`, print the path that the aligned sequences
 * take through it.
 *
 * One line per bubble is written to stdout: the stable sequence name, the
 * bubble start and end, the oriented source and sink segments, and then either
 * "." (no call) or "<path>:<graph length>:<strand>:<query name>:<qs>:<qe>",
 * where <path> is "*" when source and sink are adjacent in the alignment.
 * A candidate is discarded when its query interval or any of its graph
 * segments overlaps more than one indexed interval. Warnings about folded
 * inversion alignments go to stderr.
 *
 * @param g         graph
 * @param n_seq     number of query sequences
 * @param seq       query sequences; only the names are used here
 * @param gcs       graph chains, one entry per query sequence
 * @param min_mapq  forwarded to mg_gc_index()
 * @param min_blen  forwarded to mg_gc_index() (halved for its min_map_len argument)
 */
void mg_call_asm(const gfa_t *g, int32_t n_seq, const mg_bseq1_t *seq, mg_gchains_t *const *gcs, int32_t min_mapq, int32_t min_blen)
{
	int32_t i, j, t, max_acnt, *soff, *qoff, n_bb, m_ovlp = 0, *ovlp = 0;
	mg_intv_t *sintv, *qintv;
	double a_dens;
	gfa_bubble_t *bb;
	callaux_t *ca;
	bbaux_t *ba;
	kstring_t out = {0,0,0};

	max_acnt = mg_gc_index(0, min_mapq, min_blen>>1, min_blen, g, n_seq, gcs, &a_dens, &soff, &qoff, &sintv, &qintv);
	if (max_acnt == 0) return;

	// annotate every segment with its bubble and mark the two ends of each bubble
	bb = gfa_bubble(g, &n_bb);
	GFA_CALLOC(ba, n_bb);
	GFA_CALLOC(ca, g->n_seg);
	for (i = 0; i < n_bb; ++i) {
		gfa_bubble_t *b = &bb[i];
		assert(b->n_seg >= 2);
		for (j = 0; j < b->n_seg; ++j)
			ca[b->v[j]>>1].bid = i;
		ca[b->v[0]>>1].is_stem = ca[b->v[b->n_seg-1]>>1].is_stem = 1;
		ca[b->v[0]>>1].is_src = 1;
		ba[i].t = -1; // no call yet
	}

	for (t = 0; t < n_seq; ++t) {
		const mg_gchains_t *gt = gcs[t];
		for (i = 0; i < gt->n_gc; ++i) {
			const mg_gchain_t *gc = &gt->gc[i];
			int32_t st = -1;
			for (j = 1; j < gc->cnt; ++j) {
				const mg_llchain_t *lc = &gt->lc[gc->off + j];
				if (!ca[lc->v>>1].is_stem && ca[(lc-1)->v>>1].is_stem) { // leaving a stem: a bubble starts here
					st = gc->off + j;
				} else if ((ca[lc->v>>1].is_stem && !ca[(lc-1)->v>>1].is_stem && st > 0) || (ca[lc->v>>1].is_stem && ca[(lc-1)->v>>1].is_stem)) {
					int32_t n_ovlp, k, en = gc->off + j, qs, qe, span, bid, strand, glen;
					bbaux_t *p;

					// determine the source and sink nodes
					if (ca[lc->v>>1].is_stem && ca[(lc-1)->v>>1].is_stem) { // two adjacent stems: this is a deletion
						st = gc->off + j;
					} else {
						assert(en > st);
					}

					// test overlap on the query
					span = gt->a[gt->lc[st].off].y >> 32 & 0xff;
					qs = (int32_t)gt->a[gt->lc[st - 1].off + gt->lc[st - 1].cnt - 1].y + 1; // NB: it is fine even if .cnt==0
					qe = (int32_t)gt->a[gt->lc[en].off].y + 1 - span;
					n_ovlp = mg_intv_overlap(0, qoff[t+1] - qoff[t], &qintv[qoff[t]], qs, qe, &ovlp, &m_ovlp);
					if (n_ovlp > 1) continue; // overlap on the query - not orthologous

					// test overlap on the graph
					for (k = st, glen = 0; k < en; ++k) {
						const mg_llchain_t *lk = &gt->lc[k];
						int32_t seg = lk->v>>1;
						n_ovlp = mg_intv_overlap(0, soff[seg+1] - soff[seg], &sintv[soff[seg]], 0, g->seg[seg].len, &ovlp, &m_ovlp);
						glen += g->seg[seg].len;
						if (n_ovlp > 1) break; // overlap on the graph - not orthologous
					}
					if (k < en) continue;

					// determine the bubble ID
					assert(ca[gt->lc[st-1].v>>1].is_stem && ca[gt->lc[en].v>>1].is_stem);
					if (ca[gt->lc[st-1].v>>1].bid < ca[gt->lc[en].v>>1].bid)
						strand = 1;
					else if (ca[gt->lc[st-1].v>>1].bid > ca[gt->lc[en].v>>1].bid)
						strand = -1;
					else {
						if (ca[gt->lc[st-1].v>>1].is_src + ca[gt->lc[en].v>>1].is_src != 1) {
							fprintf(stderr, "[W::%s] type-1 folded inversion alignment around %c%s <=> %s:%d-%d\n",
									__func__, "><"[gt->lc[st].v&1], g->seg[gt->lc[st].v>>1].name, seq[t].name, qs, qe);
							continue;
						}
						if (ca[gt->lc[st-1].v>>1].is_src) strand = 1;
						else strand = -1;
					}
					bid = strand > 0? ca[gt->lc[st-1].v>>1].bid : ca[gt->lc[en].v>>1].bid;

					// attach the bubble
					for (k = st; k < en; ++k) // check consistency
						if (ca[gt->lc[k].v>>1].bid != bid)
							break;
					if (k != en) { // this may happen around an inversion towards the end of an alignment chain
						fprintf(stderr, "[W::%s] type-2 folded inversion alignment around %c%s <=> %s:%d-%d\n",
								__func__, "><"[gt->lc[st].v&1], g->seg[gt->lc[st].v>>1].name, seq[t].name, qs, qe);
						continue;
					}
					p = &ba[bid];
					p->t = t, p->i = i, p->st = st, p->en = en, p->strand = strand, p->qs = qs, p->qe = qe, p->glen = glen;
				}
			}
		}
	}

	// one output line per bubble
	for (i = 0; i < n_bb; ++i) {
		gfa_bubble_t *b = &bb[i];
		bbaux_t *a = &ba[i];
		out.l = 0;
		mg_sprintf_lite(&out, "%s\t%d\t%d\t%c%s\t%c%s\t", g->sseq[b->snid].name, b->ss, b->se, "><"[b->v[0]&1], g->seg[b->v[0]>>1].name,
						"><"[b->v[b->n_seg-1]&1], g->seg[b->v[b->n_seg-1]>>1].name);
		if (a->t >= 0) {
			// look up the chains only for called bubbles: a->t is -1 otherwise and must not index gcs[]
			const mg_gchains_t *gt = gcs[a->t];
			assert(a->strand != 0);
			if (a->st == a->en) {
				mg_sprintf_lite(&out, "*");
			} else if (a->strand > 0) {
				for (j = a->st; j < a->en; ++j)
					mg_sprintf_lite(&out, "%c%s", "><"[gt->lc[j].v&1], g->seg[gt->lc[j].v>>1].name);
			} else { // reverse strand: walk backwards and flip each orientation
				for (j = a->en - 1; j >= a->st; --j)
					mg_sprintf_lite(&out, "%c%s", "<>"[gt->lc[j].v&1], g->seg[gt->lc[j].v>>1].name);
			}
			mg_sprintf_lite(&out, ":%d:%c:%s:%d:%d", a->glen, a->strand > 0? '+' : '-', seq[a->t].name, a->qs, a->qe);
		} else {
			mg_sprintf_lite(&out, ".");
		}
		puts(out.s);
	}

	free(ba); free(ca);
	free(soff); free(qoff); free(sintv); free(qintv);
	// ovlp was grown by mg_intv_overlap() with km == 0; released like the other km == 0 buffers above
	// (assumes the allocator falls back to the libc heap for a NULL km - confirm against kalloc)
	free(ovlp);
	for (i = 0; i < n_bb; ++i) free(bb[i].v);
	free(bb);
	free(out.s);
}
|