ruby-minigraph 0.0.20.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,207 @@
1
+ [![Build Status](https://travis-ci.org/lh3/minigraph.svg?branch=master)](https://travis-ci.org/lh3/minigraph)
2
+ ## <a name="started"></a>Getting Started
3
+
4
+ ```sh
5
+ git clone https://github.com/lh3/minigraph
6
+ cd minigraph && make
7
+ # Map sequence to sequence, similar to minimap2 without base alignment
8
+ ./minigraph test/MT-human.fa test/MT-orangA.fa > out.paf
9
+ # Map sequence to graph
10
+ ./minigraph test/MT.gfa test/MT-orangA.fa > out.gaf
11
+ # Incremental graph generation (-l10k necessary for this toy example)
12
+ ./minigraph -cxggs -l10k test/MT.gfa test/MT-chimp.fa test/MT-orangA.fa > out.gfa
13
+ # Call per-sample path in each bubble/variation (-c not needed for this)
14
+ ./minigraph -xasm -l10k --call test/MT.gfa test/MT-orangA.fa > orangA.call.bed
15
+
16
+ # The lossy FASTA representation (requiring https://github.com/lh3/gfatools)
17
+ gfatools gfa2fa -s out.gfa > out.fa
18
+ # Extract localized structural variations
19
+ gfatools bubble out.gfa > SV.bed
20
+ ```
21
+
22
+ ## Table of Contents
23
+
24
+ <img align="right" width="278" src="doc/example1.png"/>
25
+
26
+ - [Getting Started](#started)
27
+ - [Introduction](#intro)
28
+ - [Users' Guide](#uguide)
29
+ - [Installation](#install)
30
+ - [Sequence-to-graph mapping](#map)
31
+ - [Graph generation](#ggen)
32
+ - [Calling structural variations](#callsv)
33
+ - [Prebuilt graphs](#prebuilt)
34
+ - [Algorithm overview](#algo)
35
+ - [Limitations](#limit)
36
+
37
+ ## <a name="intro"></a>Introduction
38
+
39
+ Minigraph is a sequence-to-graph mapper and graph constructor. For graph
40
+ generation, it aligns a query sequence against a sequence graph and
41
+ incrementally augments an existing graph with long query subsequences diverged
42
+ from the graph. The figure on the right briefly explains the procedure.
43
+
44
+ Minigraph borrows ideas and code from [minimap2][minimap2]. It is fairly
45
+ efficient and can construct a graph from 90 human assemblies in a couple of
46
+ days using 24 CPU cores. Older versions of minigraph were unable to produce
47
+ base alignment. The latest version can. **Please add option `-c` for graph
48
+ generation** as it generally improves the quality of graphs.
49
+
50
+ ## <a name="uguide"></a>Users' Guide
51
+
52
+ ### <a name="install"></a>Installation
53
+
54
+ To install minigraph, type `make` in the source code directory. The only
55
+ non-standard dependency is [zlib][zlib]. For better performance, it is
56
+ recommended to compile with recent compilers.
57
+
58
+ ### <a name="map"></a>Sequence-to-graph mapping
59
+
60
+ To map sequences against a graph, you should prepare the graph in the [GFA
61
+ format][gfa1], or preferably the [rGFA format][rgfa]. If you don't have
62
+ a graph, you can generate a graph from multiple samples (see the [Graph
63
+ generation section](#ggen) below). The typical command line for mapping is
64
+ ```sh
65
+ minigraph -cx lr graph.gfa query.fa > out.gaf
66
+ ```
67
+ You may choose the right preset option `-x` according to input. Minigraph
68
+ outputs mappings in the [GAF format][gaf], which is a strict superset of the
69
+ [PAF format][paf]. The only visual difference between GAF and PAF is that the
70
+ 6th column in GAF may encode a graph path like
71
+ `>MT_human:0-4001<MT_orang:3426-3927` instead of a contig/chromosome name.
72
+
73
+ The minigraph GFA parser seamlessly parses FASTA and converts it to GFA
74
+ internally, so you can also provide sequences in FASTA as the reference. In
75
+ this case, minigraph will behave like minimap2, though likely producing
76
+ different alignments due to differences between the two implementations.
77
+
78
+ ### <a name="ggen"></a>Graph generation
79
+
80
+ The following command-line generates a graph in rGFA:
81
+ ```sh
82
+ minigraph -cxggs -t16 ref.fa sample1.fa sample2.fa > out.gfa
83
+ ```
84
+ which is equivalent to
85
+ ```sh
86
+ minigraph -cxggs -t16 ref.fa sample1.fa > sample1.gfa
87
+ minigraph -cxggs -t16 sample1.gfa sample2.fa > out.gfa
88
+ ```
89
+ File `ref.fa` is typically the reference genome (e.g. GRCh38 for human).
90
+ It can also be replaced by a graph in rGFA. Minigraph assumes `sample1.fa` to
91
+ be the whole-genome assembly of an individual. This is an important assumption:
92
+ minigraph only considers 1-to-1 orthologous regions between the graph and the
93
+ individual FASTA. If you use raw reads or put multiple individual genomes in
94
+ one file, minigraph will filter out most alignments as they cover the input
95
+ graph multiple times.
96
+
97
+ The output rGFA can be converted to a FASTA file with [gfatools][gfatools]:
98
+ ```sh
99
+ gfatools gfa2fa -s graph.gfa > out.stable.fa
100
+ ```
101
+ The output `out.stable.fa` will always include the initial reference `ref.fa`
102
+ and may additionally add new segments diverged from the initial reference.
103
+
104
+ ### <a name="callsv"></a>Calling structural variations
105
+
106
+ A minigraph graph is composed of chains of bubbles with the reference as the
107
+ backbone. Each *bubble* represents a structural variation. It can be
108
+ multi-allelic if there are multiple paths through the bubble. You can extract
109
+ these bubbles with
110
+ ```sh
111
+ gfatools bubble graph.gfa > var.bed
112
+ ```
113
+ The output is a BED-like file. The first three columns give the position of a
114
+ bubble/variation and the rest of columns are:
115
+
116
+ * (4) \# GFA segments in the bubble including the source and the sink of the bubble
117
+ * (5) \# all possible paths through the bubble (not all paths present in input samples)
118
+ * (6) 1 if the bubble involves an inversion; 0 otherwise
119
+ * (7) length of the shortest path (i.e. allele) through the bubble
120
+ * (8) length of the longest path/allele through the bubble
121
+ * (9-11) please ignore
122
+ * (12) list of segments in the bubble; first for the source and last for the sink
123
+ * (13) sequence of the shortest path (`*` if zero length)
124
+ * (14) sequence of the longest path (NB: it may not be present in the input samples)
125
+
126
+ Given an assembly, you can find the path/allele of this assembly in each bubble with
127
+ ```sh
128
+ minigraph -cxasm --call graph.gfa sample-asm.fa > sample.bed
129
+ ```
130
+ On each line in the BED-like output, the last colon separated field gives the
131
+ alignment path through the bubble, the path length in the graph, the mapping
132
+ strand of sample contig, the contig name, the approximate contig start and
133
+ contig end. The number of lines in the file is the same as the number of lines
134
+ in the output of `gfatools bubble`. You can use the `paste` Unix command to
135
+ piece multiple samples together.
136
+
137
+ ### <a name="prebuilt"></a>Prebuilt graphs
138
+
139
+ Prebuilt human graphs in the rGFA format can be found [at Zenodo][human-zenodo].
140
+
141
+ ### <a name="algo"></a>Algorithm overview
142
+
143
+ <img align="right" width="278" src="doc/example2.png"/>
144
+
145
+ In the following, minigraph command line options have a dash ahead and are
146
+ highlighted in bold. The description may help to tune minigraph parameters.
147
+
148
+ 1. Read all reference bases, extract (**-k**,**-w**)-minimizers and index them
149
+ in a hash table.
150
+
151
+ 2. Read **-K** [=*500M*] query bases in the mapping mode, or read all query
152
+ bases in the graph construction mode. For each query sequence, do step 3
153
+ through 5:
154
+
155
+ 3. Find colinear minimizer chains using the [minimap2][minimap2] algorithm,
156
+ assuming segments in the graph are disconnected. These are called *linear
157
+ chains*.
158
+
159
+ 4. Perform another round of chaining, taking each linear chain as an anchor.
160
+ For a pair of linear chains, minigraph tries to connect them by doing graph
161
+ wavefront alignment algorithm (GWFA). If minigraph fails to find an
162
+ alignment within an edit distance threshold, it will find up to 15 shortest
163
+ paths between the two linear chains and chooses the path of length closest
164
+ to the distance on the query sequence. Chains found at this step are called
165
+ *graph chains*.
166
+
167
+ 5. Identify primary chains and estimate mapping quality with a method similar
168
+ to the one used in minimap2. Perform base alignment.
169
+
170
+ 6. In the graph construction mode, collect all mappings longer than **-d**
171
+ [=*10k*] and keep their query and graph segment intervals in two lists,
172
+ respectively.
173
+
174
+ 7. For each mapping longer than **-l** [=*100k*], find poorly aligned regions.
175
+ A region is filtered if it overlaps two or more intervals collected at step
176
+ 6.
177
+
178
+ 8. Insert the remaining poorly aligned regions into the input graph. This
179
+ constructs a new graph.
180
+
181
+ ## <a name="limit"></a>Limitations
182
+
183
+ * A complex minigraph subgraph is often suboptimal and may vary with the order
184
+ of input samples. It may not represent the evolution history
185
+ or the functional relevance at the locus. Please *do not overinterpret*
186
+ complex subgraphs. If you are interested in a particular subgraph, it is
187
+ recommended to extract the input contig subsequences involved in the subgraph
188
+ with the `--call` option and manually curate the results.
189
+
190
+ * Minigraph needs to find strong colinear chains first. For a graph consisting
191
+ of many short segments (e.g. one generated from rare SNPs in large
192
+ populations), minigraph will fail to map query sequences.
193
+
194
+ * The base alignment in the current version of minigraph is slow for species of
195
+ high diversity.
196
+
197
+
198
+ [zlib]: http://zlib.net/
199
+ [minimap2]: https://github.com/lh3/minimap2
200
+ [rgfa]: https://github.com/lh3/gfatools/blob/master/doc/rGFA.md
201
+ [gfa1]: https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md
202
+ [gaf]: https://github.com/lh3/gfatools/blob/master/doc/rGFA.md#the-graph-alignment-format-gaf
203
+ [paf]: https://github.com/lh3/miniasm/blob/master/PAF.md
204
+ [gfatools]: https://github.com/lh3/gfatools
205
+ [bandage]: https://rrwick.github.io/Bandage/
206
+ [gfaviz]: https://github.com/ggonnella/gfaviz
207
+ [human-zenodo]: https://zenodo.org/record/6499594
@@ -0,0 +1,194 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+ #include "kalloc.h"
4
+ #define __STDC_LIMIT_MACROS
5
+ #include "algo.h"
6
+ #include "miniwfa.h"
7
+
8
+ /************************
9
+ * Max-scoring segments *
10
+ ************************/
11
+
12
+ #include "kvec-km.h"
13
+
14
+ #define MSS_NEG_INF INT32_MIN
15
+
16
+ typedef struct { // candidate segment kept by mg_mss_all() while it scans the score array
17
+ int32_t st, en; // index range [st,en) into the score array
18
+ MG_MSS_TYPE L, R; // cumulative score just before st and at en; the segment score is R - L
19
+ int32_t pre; // index of the earlier candidate found with a smaller L, or negative if none
20
+ } msseg_aux_t;
21
+
22
+ typedef kvec_t(mg_msseg_t) msseg_v; // growable array of reported segments
23
+ typedef kvec_t(msseg_aux_t) msseg_aux_v; // growable array of pending candidates
24
+
25
+ static void move_segs(void *km, msseg_v *ret, msseg_aux_v *seg, MG_MSS_TYPE min_sc)
26
+ {
27
+ int32_t i;
28
+ for (i = 0; i < seg->n; ++i) {
29
+ msseg_aux_t *p = &seg->a[i];
30
+ if (p->R - p->L >= min_sc) {
31
+ mg_msseg_t *q;
32
+ kv_pushp(mg_msseg_t, km, *ret, &q);
33
+ q->st = p->st, q->en = p->en, q->sc = p->R - p->L;
34
+ }
35
+ }
36
+ seg->n = 0;
37
+ }
38
+
39
+ // Find maximal-scoring segments of S[0..n). On return *n_seg holds the number of segments and the
// returned array (allocated through km and not freed here) holds the segments scoring at least min_sc.
// When xdrop > 0, the scan is restarted once the running sum falls more than xdrop below `max`.
// Reference: Ruzzo and Tompa (1999) A linear time algorithm for finding all maximal scoring subsequences
40
+ mg_msseg_t *mg_mss_all(void *km, int32_t n, const MG_MSS_TYPE *S, MG_MSS_TYPE min_sc, MG_MSS_TYPE xdrop, int32_t *n_seg)
41
+ {
42
+ int32_t i, j;
43
+ MG_MSS_TYPE L, max;
44
+ msseg_v ret = {0,0,0};
45
+ msseg_aux_v seg = {0,0,0};
46
+ msseg_aux_t t;
47
+
48
+ kv_resize(mg_msseg_t, km, ret, 16);
49
+ kv_resize(msseg_aux_t, km, seg, 16);
50
+ for (i = 0, L = 0, max = MSS_NEG_INF; i < n;) { // L: cumulative score before S[i]; max: tracked maximum cumulative score
51
+ if (S[i] > 0) { // positive score: start a new candidate
52
+ int32_t k;
53
+ MG_MSS_TYPE R = L + S[i];
54
+ for (k = i + 1; k < n && S[k] > 0; ++k) // absorb the whole run of consecutive positive scores
55
+ R += S[k];
56
+ if (R > max) max = R;
57
+ t.st = i, t.en = k, t.L = L, t.R = R; // candidate covers [i,k) with cumulative scores L (before) and R (after)
58
+ while (1) {
59
+ msseg_aux_t *p;
60
+ for (j = seg.n - 1; j >= 0;) { // walk back through the candidates to one whose L is smaller than t.L
61
+ p = &seg.a[j];
62
+ if (p->L < t.L) break;
63
+ j = p->pre >= 0? p->pre : j - 1; // follow the stored pre link when there is one
64
+ }
65
+ if (j >= 0 && seg.a[j].R < t.R) { // found one with a smaller R: merge it into t, drop it and all later candidates, search again
66
+ p = &seg.a[j];
67
+ t.st = p->st, t.L = p->L, t.pre = p->pre;
68
+ seg.n = j;
69
+ } else {
70
+ if (j < 0) { // no candidate has a smaller L: flush the pending candidates to the result
71
+ move_segs(km, &ret, &seg, min_sc);
72
+ max = R;
73
+ }
74
+ t.pre = j; // remember the index found by the search (negative if none)
75
+ kv_push(msseg_aux_t, km, seg, t);
76
+ break;
77
+ }
78
+ }
79
+ L = R, i = k; // continue after the positive run
80
+ } else {
81
+ if (xdrop > 0 && L + S[i] + xdrop < max) { // reset: the running sum dropped more than xdrop below max
82
+ move_segs(km, &ret, &seg, min_sc);
83
+ L = 0, max = MSS_NEG_INF;
84
+ }
85
+ L += S[i++]; // a non-positive score only changes the running sum
86
+ }
87
+ }
88
+ move_segs(km, &ret, &seg, min_sc); // flush whatever is still pending
89
+ kfree(km, seg.a);
90
+ KREALLOC(km, ret.a, ret.n); // presumably shrinks the result array to ret.n entries - confirm against kalloc.h
91
+ *n_seg = ret.n;
92
+ return ret.a;
93
+ }
94
+
95
+ /**************************
96
+ * Interval overlap query *
97
+ **************************/
98
+
99
+ #include <assert.h>
100
+ #include "ksort.h"
101
+
102
+ #define sort_key_intv(a) ((a).st) // sort key: the interval start
103
+ KRADIX_SORT_INIT(mg_intv, mg_intv_t, sort_key_intv, 4) // generates radix_sort_mg_intv() used below
104
+ // Sort a[0..n) by start and fill every .far so that mg_intv_overlap() can prune subtrees. Returns -1 when n <= 0, otherwise k - 1 where k is the first level with 2^k > n.
105
+ int32_t mg_intv_index(int32_t n, mg_intv_t *a)
106
+ {
107
+ int32_t i, last_i, last, k;
108
+ if (n <= 0) return -1;
109
+ radix_sort_mg_intv(a, a + n);
110
+ for (i = 0; i < n; i += 2) last_i = i, last = a[i].far = a[i].en; // even indices form the lowest level: far is the interval's own end; last_i/last end up at the final such entry
111
+ for (k = 1; 1LL<<k <= n; ++k) { // build level k from level k-1
112
+ int64_t x = 1LL<<(k-1), i0 = (x<<1) - 1, step = x<<2; // x: distance to a child; i0: first node of this level; step: spacing between nodes of this level
113
+ for (i = i0; i < n; i += step) {
114
+ int32_t el = a[i - x].far; // far of the left child
115
+ int32_t er = i + x < n? a[i + x].far : last; // far of the right child, or `last` when that index is past the array
116
+ int32_t e = a[i].en;
117
+ e = e > el? e : el;
118
+ e = e > er? e : er;
119
+ a[i].far = e; // largest of the node's own end and both children's far
120
+ }
121
+ last_i = last_i>>k&1? last_i - x : last_i + x; // step last_i by x towards the level-k position
122
+ if (last_i < n && a[last_i].far > last) // keep `last` as the largest far seen along that path
123
+ last = a[last_i].far;
124
+ }
125
+ return k - 1;
126
+ }
127
+
128
+ typedef struct { // traversal stack entry used by mg_intv_overlap()
129
+ int64_t x; // index of the node in the interval array
130
+ int32_t k, w; // k: level of the node; w: 0 until the left child has been handled, then 1
131
+ } istack_t;
132
+
133
+ int32_t mg_intv_overlap(void *km, int32_t n_a, const mg_intv_t *a, int32_t st, int32_t en, int32_t **b_, int32_t *m_b_)
134
+ {
135
+ int32_t t = 0, h, *b = *b_, m_b = *m_b_, n = 0;
136
+ istack_t stack[64], *p;
137
+
138
+ for (h = 0; 1<<h <= n_a; ++h);
139
+ --h;
140
+ p = &stack[t++];
141
+ p->k = h, p->x = (1LL<<p->k) - 1, p->w = 0; // push the root into the stack
142
+ while (t) { // stack is not empyt
143
+ istack_t z = stack[--t];
144
+ if (z.k <= 3) { // the subtree is no larger than (1<<(z.k+1))-1; do a linear scan
145
+ int32_t i, i0 = z.x >> z.k << z.k, i1 = i0 + (1LL<<(z.k+1)) - 1;
146
+ if (i1 >= n_a) i1 = n_a;
147
+ for (i = i0; i < i1 && a[i].st < en; ++i)
148
+ if (st < a[i].en) {
149
+ if (n == m_b) KEXPAND(km, b, m_b);
150
+ b[n++] = i;
151
+ }
152
+ } else if (z.w == 0) { // if left child not processed
153
+ int32_t y = z.x - (1LL<<(z.k-1));
154
+ p = &stack[t++];
155
+ p->k = z.k, p->x = z.x, p->w = 1;
156
+ if (y >= n_a || a[y].far > st) {
157
+ p = &stack[t++];
158
+ p->k = z.k - 1, p->x = y, p->w = 0; // push the left child to the stack
159
+ }
160
+ } else if (z.x < n_a && a[z.x].st < en) {
161
+ if (st < a[z.x].en) { // then z.x overlaps the query; write to the output array
162
+ if (n == m_b) KEXPAND(km, b, m_b);
163
+ b[n++] = z.x;
164
+ }
165
+ p = &stack[t++];
166
+ p->k = z.k - 1, p->x = z.x + (1LL<<(z.k-1)), p->w = 0; // push the right child
167
+ }
168
+ }
169
+ *b_ = b, *m_b_ = m_b;
170
+ return n;
171
+ }
172
+
173
+ /********************
174
+ * Global alignment *
175
+ ********************/
176
+
177
+ int32_t mg_wfa_cmp(void *km, int32_t l1, const char *s1, int32_t l2, const char *s2, int32_t max_pen, int32_t *mlen, int32_t *blen)
178
+ {
179
+ mwf_opt_t opt;
180
+ mwf_rst_t r;
181
+ int32_t i;
182
+ mwf_opt_init(&opt);
183
+ opt.max_s = max_pen;
184
+ opt.flag |= MWF_F_CIGAR;
185
+ mwf_wfa_exact(km, &opt, l1, s1, l2, s2, &r);
186
+ *mlen = *blen = 0;
187
+ for (i = 0; i < r.n_cigar; ++i) {
188
+ int32_t op = r.cigar[i]&0xf, len = r.cigar[i]>>4;
189
+ *blen += len;
190
+ if (op == 7) *mlen += len;
191
+ }
192
+ kfree(km, r.cigar);
193
+ return r.s < 0? -(l1 + l2) : (l1 + l2) / 2 - r.s;
194
+ }
@@ -0,0 +1,33 @@
1
+ #ifndef MG_ALGO_H
2
+ #define MG_ALGO_H
3
+
4
+ #include <stdint.h>
5
+
6
+ #define MG_MSS_TYPE int32_t
7
+ #define MG_LIS_TYPE uint64_t
8
+
9
+ typedef struct { // one segment reported by mg_mss_all()
10
+ int32_t st, en; // index range [st,en) in the input score array
11
+ MG_MSS_TYPE sc; // score of the segment
12
+ } mg_msseg_t;
13
+
14
+ typedef struct { // interval record indexed by mg_intv_index() and queried by mg_intv_overlap()
15
+ uint32_t st, en:31, rev:1; // start (the sort key), end (31 bits) and a 1-bit rev flag
16
+ int32_t far, i; // far: filled by mg_intv_index() and used to prune overlap queries; i: presumably a caller-defined payload - confirm
17
+ } mg_intv_t;
18
+
19
+ #ifdef __cplusplus
20
+ extern "C" {
21
+ #endif
22
+
23
+ mg_msseg_t *mg_mss_all(void *km, int32_t n, const MG_MSS_TYPE *S, MG_MSS_TYPE min_sc, MG_MSS_TYPE xdrop, int32_t *n_seg); // maximal-scoring segments of S[0..n); their count is returned via *n_seg
24
+ int32_t mg_intv_index(int32_t n, mg_intv_t *a); // sort a[0..n) by start and fill .far for overlap queries
25
+ int32_t mg_intv_overlap(void *km, int32_t n_a, const mg_intv_t *a, int32_t st, int32_t en, int32_t **b_, int32_t *m_b_); // write indices of intervals overlapping [st,en) into *b_; returns how many
26
+ void radix_sort_mg_intv(mg_intv_t *st, mg_intv_t *en); // radix sort of the range [st,en) keyed on the interval start
27
+ int32_t mg_wfa_cmp(void *km, int32_t l1, const char *s1, int32_t l2, const char *s2, int32_t max_pen, int32_t *mlen, int32_t *blen); // WFA comparison of s1 and s2; CIGAR length summaries via *mlen and *blen
28
+
29
+ #ifdef __cplusplus
30
+ }
31
+ #endif
32
+
33
+ #endif
@@ -0,0 +1,147 @@
1
+ #include <assert.h>
2
+ #include "mgpriv.h"
3
+ #include "ggen.h"
4
+ #include "gfa-priv.h"
5
+ #include "algo.h"
6
+
7
+ int32_t mg_gc_index(void *km, int min_mapq, int min_map_len, int min_depth_len, const gfa_t *g, int32_t n_seq, mg_gchains_t *const* gcs,
8
+ double *a_dens, int32_t **soff_, int32_t **qoff_, mg_intv_t **sintv_, mg_intv_t **qintv_);
9
+
10
+ typedef struct { // per-segment annotation built by mg_call_asm()
11
+ int32_t bid; // index of the bubble that lists this segment
12
+ uint8_t is_stem:4, is_src:4; // is_stem: segment is the first or last entry of a bubble; is_src: segment is the first entry
13
+ } callaux_t;
14
+
15
+ typedef struct { // call recorded for one bubble by mg_call_asm()
16
+ int32_t t, i; // t: index of the query sequence (-1 while no call is recorded); i: index of the graph chain within it
17
+ int32_t st, en, strand; // [st,en): range of linear chains forming the path through the bubble; strand: 1 or -1
18
+ int32_t qs, qe, glen; // qs,qe: interval on the query; glen: summed length of the graph segments on the path
19
+ } bbaux_t;
20
+
21
+ void mg_call_asm(const gfa_t *g, int32_t n_seq, const mg_bseq1_t *seq, mg_gchains_t *const *gcs, int32_t min_mapq, int32_t min_blen)
22
+ {
23
+ int32_t i, j, t, max_acnt, *soff, *qoff, n_bb, m_ovlp = 0, *ovlp = 0;
24
+ mg_intv_t *sintv, *qintv;
25
+ double a_dens;
26
+ gfa_bubble_t *bb;
27
+ callaux_t *ca;
28
+ bbaux_t *ba;
29
+ kstring_t out = {0,0,0};
30
+
31
+ max_acnt = mg_gc_index(0, min_mapq, min_blen>>1, min_blen, g, n_seq, gcs, &a_dens, &soff, &qoff, &sintv, &qintv);
32
+ if (max_acnt == 0) return;
33
+
34
+ bb = gfa_bubble(g, &n_bb);
35
+ GFA_CALLOC(ba, n_bb);
36
+ GFA_CALLOC(ca, g->n_seg);
37
+ for (i = 0; i < n_bb; ++i) {
38
+ gfa_bubble_t *b = &bb[i];
39
+ assert(b->n_seg >= 2);
40
+ for (j = 0; j < b->n_seg; ++j)
41
+ ca[b->v[j]>>1].bid = i;
42
+ ca[b->v[0]>>1].is_stem = ca[b->v[b->n_seg-1]>>1].is_stem = 1;
43
+ ca[b->v[0]>>1].is_src = 1;
44
+ ba[i].t = -1;
45
+ }
46
+
47
+ for (t = 0; t < n_seq; ++t) {
48
+ const mg_gchains_t *gt = gcs[t];
49
+ for (i = 0; i < gt->n_gc; ++i) {
50
+ const mg_gchain_t *gc = &gt->gc[i];
51
+ int32_t st = -1;
52
+ for (j = 1; j < gc->cnt; ++j) {
53
+ const mg_llchain_t *lc = &gt->lc[gc->off + j];
54
+ if (!ca[lc->v>>1].is_stem && ca[(lc-1)->v>>1].is_stem) {
55
+ st = gc->off + j;
56
+ } else if ((ca[lc->v>>1].is_stem && !ca[(lc-1)->v>>1].is_stem && st > 0) || (ca[lc->v>>1].is_stem && ca[(lc-1)->v>>1].is_stem)) {
57
+ int32_t n_ovlp, k, en = gc->off + j, qs, qe, span, bid, strand, glen;
58
+ bbaux_t *p;
59
+
60
+ // determine the source and sink nodes
61
+ if (ca[lc->v>>1].is_stem && ca[(lc-1)->v>>1].is_stem) { // two adjacent stems: this is a deletion
62
+ st = gc->off + j;
63
+ } else {
64
+ assert(en > st);
65
+ }
66
+
67
+ // test overlap on the query
68
+ span = gt->a[gt->lc[st].off].y >> 32 & 0xff;
69
+ qs = (int32_t)gt->a[gt->lc[st - 1].off + gt->lc[st - 1].cnt - 1].y + 1; // NB: it is fine even if .cnt==0
70
+ qe = (int32_t)gt->a[gt->lc[en].off].y + 1 - span;
71
+ n_ovlp = mg_intv_overlap(0, qoff[t+1] - qoff[t], &qintv[qoff[t]], qs, qe, &ovlp, &m_ovlp);
72
+ if (n_ovlp > 1) continue; // overlap on the query - not orthologous
73
+
74
+ // test overlap on the graph
75
+ for (k = st, glen = 0; k < en; ++k) {
76
+ const mg_llchain_t *lk = &gt->lc[k];
77
+ int32_t seg = lk->v>>1;
78
+ n_ovlp = mg_intv_overlap(0, soff[seg+1] - soff[seg], &sintv[soff[seg]], 0, g->seg[seg].len, &ovlp, &m_ovlp);
79
+ glen += g->seg[seg].len;
80
+ if (n_ovlp > 1) break; // overlap on the graph - not orthoologous
81
+ }
82
+ if (k < en) continue;
83
+
84
+ // determine the bubble ID
85
+ assert(ca[gt->lc[st-1].v>>1].is_stem && ca[gt->lc[en].v>>1].is_stem);
86
+ if (ca[gt->lc[st-1].v>>1].bid < ca[gt->lc[en].v>>1].bid)
87
+ strand = 1;
88
+ else if (ca[gt->lc[st-1].v>>1].bid > ca[gt->lc[en].v>>1].bid)
89
+ strand = -1;
90
+ else {
91
+ if (ca[gt->lc[st-1].v>>1].is_src + ca[gt->lc[en].v>>1].is_src != 1) {
92
+ fprintf(stderr, "[W::%s] type-1 folded inversion alignment around %c%s <=> %s:%d-%d\n",
93
+ __func__, "><"[gt->lc[st].v&1], g->seg[gt->lc[st].v>>1].name, seq[t].name, qs, qe);
94
+ continue;
95
+ }
96
+ if (ca[gt->lc[st-1].v>>1].is_src) strand = 1;
97
+ else strand = -1;
98
+ }
99
+ bid = strand > 0? ca[gt->lc[st-1].v>>1].bid : ca[gt->lc[en].v>>1].bid;
100
+
101
+ // attach the bubble
102
+ for (k = st; k < en; ++k) // check consistency
103
+ if (ca[gt->lc[k].v>>1].bid != bid)
104
+ break;
105
+ if (k != en) { // this may happen around an inversion towards the end of an alignment chain
106
+ fprintf(stderr, "[W::%s] type-2 folded inversion alignment around %c%s <=> %s:%d-%d\n",
107
+ __func__, "><"[gt->lc[st].v&1], g->seg[gt->lc[st].v>>1].name, seq[t].name, qs, qe);
108
+ continue;
109
+ }
110
+ p = &ba[bid];
111
+ p->t = t, p->i = i, p->st = st, p->en = en, p->strand = strand, p->qs = qs, p->qe = qe, p->glen = glen;
112
+ }
113
+ }
114
+ }
115
+ }
116
+
117
+ for (i = 0; i < n_bb; ++i) {
118
+ gfa_bubble_t *b = &bb[i];
119
+ bbaux_t *a = &ba[i];
120
+ const mg_gchains_t *gt = gcs[a->t];
121
+ out.l = 0;
122
+ mg_sprintf_lite(&out, "%s\t%d\t%d\t%c%s\t%c%s\t", g->sseq[b->snid].name, b->ss, b->se, "><"[b->v[0]&1], g->seg[b->v[0]>>1].name,
123
+ "><"[b->v[b->n_seg-1]&1], g->seg[b->v[b->n_seg-1]>>1].name);
124
+ if (a->t >= 0) {
125
+ assert(a->strand != 0);
126
+ if (a->st == a->en) {
127
+ mg_sprintf_lite(&out, "*");
128
+ } else if (a->strand > 0) {
129
+ for (j = a->st; j < a->en; ++j)
130
+ mg_sprintf_lite(&out, "%c%s", "><"[gt->lc[j].v&1], g->seg[gt->lc[j].v>>1].name);
131
+ } else {
132
+ for (j = a->en - 1; j >= a->st; --j)
133
+ mg_sprintf_lite(&out, "%c%s", "<>"[gt->lc[j].v&1], g->seg[gt->lc[j].v>>1].name);
134
+ }
135
+ mg_sprintf_lite(&out, ":%d:%c:%s:%d:%d", a->glen, a->strand > 0? '+' : '-', seq[a->t].name, a->qs, a->qe);
136
+ } else {
137
+ mg_sprintf_lite(&out, ".");
138
+ }
139
+ puts(out.s);
140
+ }
141
+
142
+ free(ba); free(ca);
143
+ free(soff); free(qoff); free(sintv); free(qintv);
144
+ for (i = 0; i < n_bb; ++i) free(bb[i].v);
145
+ free(bb);
146
+ free(out.s);
147
+ }