ruby-minigraph 0.0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,207 @@
1
+ [![Build Status](https://travis-ci.org/lh3/minigraph.svg?branch=master)](https://travis-ci.org/lh3/minigraph)
2
+ ## <a name="started"></a>Getting Started
3
+
4
+ ```sh
5
+ git clone https://github.com/lh3/minigraph
6
+ cd minigraph && make
7
+ # Map sequence to sequence, similar to minimap2 without base alignment
8
+ ./minigraph test/MT-human.fa test/MT-orangA.fa > out.paf
9
+ # Map sequence to graph
10
+ ./minigraph test/MT.gfa test/MT-orangA.fa > out.gaf
11
+ # Incremental graph generation (-l10k necessary for this toy example)
12
+ ./minigraph -cxggs -l10k test/MT.gfa test/MT-chimp.fa test/MT-orangA.fa > out.gfa
13
+ # Call per-sample path in each bubble/variation (-c not needed for this)
14
+ ./minigraph -xasm -l10k --call test/MT.gfa test/MT-orangA.fa > orangA.call.bed
15
+
16
+ # The lossy FASTA representation (requiring https://github.com/lh3/gfatools)
17
+ gfatools gfa2fa -s out.gfa > out.fa
18
+ # Extract localized structural variations
19
+ gfatools bubble out.gfa > SV.bed
20
+ ```
21
+
22
+ ## Table of Contents
23
+
24
+ <img align="right" width="278" src="doc/example1.png"/>
25
+
26
+ - [Getting Started](#started)
27
+ - [Introduction](#intro)
28
+ - [Users' Guide](#uguide)
29
+ - [Installation](#install)
30
+ - [Sequence-to-graph mapping](#map)
31
+ - [Graph generation](#ggen)
32
+ - [Calling structural variations](#callsv)
33
+ - [Prebuilt graphs](#prebuilt)
34
+ - [Algorithm overview](#algo)
35
+ - [Limitations](#limit)
36
+
37
+ ## <a name="intro"></a>Introduction
38
+
39
+ Minigraph is a sequence-to-graph mapper and graph constructor. For graph
40
+ generation, it aligns a query sequence against a sequence graph and
41
+ incrementally augments an existing graph with long query subsequences diverged
42
+ from the graph. The figure on the right briefly explains the procedure.
43
+
44
+ Minigraph borrows ideas and code from [minimap2][minimap2]. It is fairly
45
+ efficient and can construct a graph from 90 human assemblies in a couple of
46
+ days using 24 CPU cores. Older versions of minigraph were unable to produce
47
+ base alignment. The latest version can. **Please add option `-c` for graph
48
+ generation** as it generally improves the quality of graphs.
49
+
50
+ ## <a name="uguide"></a>Users' Guide
51
+
52
+ ### <a name="install"></a>Installation
53
+
54
+ To install minigraph, type `make` in the source code directory. The only
55
+ non-standard dependency is [zlib][zlib]. For better performance, it is
56
+ recommended to compile with recent compilers.
57
+
58
+ ### <a name="map"></a>Sequence-to-graph mapping
59
+
60
+ To map sequences against a graph, you should prepare the graph in the [GFA
61
+ format][gfa1], or preferably the [rGFA format][rgfa]. If you don't have
62
+ a graph, you can generate a graph from multiple samples (see the [Graph
63
+ generation section](#ggen) below). The typical command line for mapping is
64
+ ```sh
65
+ minigraph -cx lr graph.gfa query.fa > out.gaf
66
+ ```
67
+ You may choose the right preset option `-x` according to input. Minigraph
68
+ outputs mappings in the [GAF format][gaf], which is a strict superset of the
69
+ [PAF format][paf]. The only visual difference between GAF and PAF is that the
70
+ 6th column in GAF may encode a graph path like
71
+ `>MT_human:0-4001<MT_orang:3426-3927` instead of a contig/chromosome name.
72
+
73
+ The minigraph GFA parser seamlessly parses FASTA and converts it to GFA
74
+ internally, so you can also provide sequences in FASTA as the reference. In
75
+ this case, minigraph will behave like minimap2, though likely producing
76
+ different alignments due to differences between the two implementations.
77
+
78
+ ### <a name="ggen"></a>Graph generation
79
+
80
+ The following command-line generates a graph in rGFA:
81
+ ```sh
82
+ minigraph -cxggs -t16 ref.fa sample1.fa sample2.fa > out.gfa
83
+ ```
84
+ which is equivalent to
85
+ ```sh
86
+ minigraph -cxggs -t16 ref.fa sample1.fa > sample1.gfa
87
+ minigraph -cxggs -t16 sample1.gfa sample2.fa > out.gfa
88
+ ```
89
+ File `ref.fa` is typically the reference genome (e.g. GRCh38 for human).
90
+ It can also be replaced by a graph in rGFA. Minigraph assumes `sample1.fa` to
91
+ be the whole-genome assembly of an individual. This is an important assumption:
92
+ minigraph only considers 1-to-1 orthologous regions between the graph and the
93
+ individual FASTA. If you use raw reads or put multiple individual genomes in
94
+ one file, minigraph will filter out most alignments as they cover the input
95
+ graph multiple times.
96
+
97
+ The output rGFA can be converted to a FASTA file with [gfatools][gfatools]:
98
+ ```sh
99
+ gfatools gfa2fa -s graph.gfa > out.stable.fa
100
+ ```
101
+ The output `out.stable.fa` will always include the initial reference `ref.fa`
102
+ and may additionally add new segments diverged from the initial reference.
103
+
104
+ ### <a name="callsv"></a>Calling structural variations
105
+
106
+ A minigraph graph is composed of chains of bubbles with the reference as the
107
+ backbone. Each *bubble* represents a structural variation. It can be
108
+ multi-allelic if there are multiple paths through the bubble. You can extract
109
+ these bubbles with
110
+ ```sh
111
+ gfatools bubble graph.gfa > var.bed
112
+ ```
113
+ The output is a BED-like file. The first three columns give the position of a
114
+ bubble/variation and the rest of the columns are:
115
+
116
+ * (4) \# GFA segments in the bubble including the source and the sink of the bubble
117
+ * (5) \# all possible paths through the bubble (not all paths present in input samples)
118
+ * (6) 1 if the bubble involves an inversion; 0 otherwise
119
+ * (7) length of the shortest path (i.e. allele) through the bubble
120
+ * (8) length of the longest path/allele through the bubble
121
+ * (9-11) please ignore
122
+ * (12) list of segments in the bubble; first for the source and last for the sink
123
+ * (13) sequence of the shortest path (`*` if zero length)
124
+ * (14) sequence of the longest path (NB: it may not be present in the input samples)
125
+
126
+ Given an assembly, you can find the path/allele of this assembly in each bubble with
127
+ ```sh
128
+ minigraph -cxasm --call graph.gfa sample-asm.fa > sample.bed
129
+ ```
130
+ On each line in the BED-like output, the last colon separated field gives the
131
+ alignment path through the bubble, the path length in the graph, the mapping
132
+ strand of sample contig, the contig name, the approximate contig start and
133
+ contig end. The number of lines in the file is the same as the number of lines
134
+ in the output of `gfatools bubble`. You can use the `paste` Unix command to
135
+ piece multiple samples together.
136
+
137
+ ### <a name="prebuilt"></a>Prebuilt graphs
138
+
139
+ Prebuilt human graphs in the rGFA format can be found [at Zenodo][human-zenodo].
140
+
141
+ ### <a name="algo"></a>Algorithm overview
142
+
143
+ <img align="right" width="278" src="doc/example2.png"/>
144
+
145
+ In the following, minigraph command line options have a dash ahead and are
146
+ highlighted in bold. The description may help to tune minigraph parameters.
147
+
148
+ 1. Read all reference bases, extract (**-k**,**-w**)-minimizers and index them
149
+ in a hash table.
150
+
151
+ 2. Read **-K** [=*500M*] query bases in the mapping mode, or read all query
152
+ bases in the graph construction mode. For each query sequence, do step 3
153
+ through 5:
154
+
155
+ 3. Find colinear minimizer chains using the [minimap2][minimap2] algorithm,
156
+ assuming segments in the graph are disconnected. These are called *linear
157
+ chains*.
158
+
159
+ 4. Perform another round of chaining, taking each linear chain as an anchor.
160
+ For a pair of linear chains, minigraph tries to connect them by doing graph
161
+ wavefront alignment algorithm (GWFA). If minigraph fails to find an
162
+ alignment within an edit distance threshold, it will find up to 15 shortest
163
+ paths between the two linear chains and choose the path of length closest
164
+ to the distance on the query sequence. Chains found at this step are called
165
+ *graph chains*.
166
+
167
+ 5. Identify primary chains and estimate mapping quality with a method similar
168
+ to the one used in minimap2. Perform base alignment.
169
+
170
+ 6. In the graph construction mode, collect all mappings longer than **-d**
171
+ [=*10k*] and keep their query and graph segment intervals in two lists,
172
+ respectively.
173
+
174
+ 7. For each mapping longer than **-l** [=*100k*], find poorly aligned regions.
175
+ A region is filtered if it overlaps two or more intervals collected at step
176
+ 6.
177
+
178
+ 8. Insert the remaining poorly aligned regions into the input graph. This
179
+ constructs a new graph.
180
+
181
+ ## <a name="limit"></a>Limitations
182
+
183
+ * A complex minigraph subgraph is often suboptimal and may vary with the order
184
+ of input samples. It may not represent the evolution history
185
+ or the functional relevance at the locus. Please *do not overinterpret*
186
+ complex subgraphs. If you are interested in a particular subgraph, it is
187
+ recommended to extract the input contig subsequences involved in the subgraph
188
+ with the `--call` option and manually curate the results.
189
+
190
+ * Minigraph needs to find strong colinear chains first. For a graph consisting
191
+ of many short segments (e.g. one generated from rare SNPs in large
192
+ populations), minigraph will fail to map query sequences.
193
+
194
+ * The base alignment in the current version of minigraph is slow for species of
195
+ high diversity.
196
+
197
+
198
+ [zlib]: http://zlib.net/
199
+ [minimap2]: https://github.com/lh3/minimap2
200
+ [rgfa]: https://github.com/lh3/gfatools/blob/master/doc/rGFA.md
201
+ [gfa1]: https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md
202
+ [gaf]: https://github.com/lh3/gfatools/blob/master/doc/rGFA.md#the-graph-alignment-format-gaf
203
+ [paf]: https://github.com/lh3/miniasm/blob/master/PAF.md
204
+ [gfatools]: https://github.com/lh3/gfatools
205
+ [bandage]: https://rrwick.github.io/Bandage/
206
+ [gfaviz]: https://github.com/ggonnella/gfaviz
207
+ [human-zenodo]: https://zenodo.org/record/6499594
@@ -0,0 +1,194 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+ #include "kalloc.h"
4
+ #define __STDC_LIMIT_MACROS
5
+ #include "algo.h"
6
+ #include "miniwfa.h"
7
+
8
+ /************************
9
+ * Max-scoring segments *
10
+ ************************/
11
+
12
+ #include "kvec-km.h"
13
+
14
+ #define MSS_NEG_INF INT32_MIN
15
+
16
+ typedef struct { /* candidate segment tracked by mg_mss_all() while it scans the score array */
17
+ int32_t st, en; /* the candidate covers score indices [st, en) */
18
+ MG_MSS_TYPE L, R; /* cumulative score just before st and at en; the candidate's score is R - L */
19
+ int32_t pre; /* index of the earlier candidate with a smaller L found by the backward search, negative if none */
20
+ } msseg_aux_t;
21
+
22
+ typedef kvec_t(mg_msseg_t) msseg_v; /* growable array of reported segments */
23
+ typedef kvec_t(msseg_aux_t) msseg_aux_v; /* growable array of candidate segments */
24
+
25
+ static void move_segs(void *km, msseg_v *ret, msseg_aux_v *seg, MG_MSS_TYPE min_sc)
26
+ {
27
+ int32_t i;
28
+ for (i = 0; i < seg->n; ++i) {
29
+ msseg_aux_t *p = &seg->a[i];
30
+ if (p->R - p->L >= min_sc) {
31
+ mg_msseg_t *q;
32
+ kv_pushp(mg_msseg_t, km, *ret, &q);
33
+ q->st = p->st, q->en = p->en, q->sc = p->R - p->L;
34
+ }
35
+ }
36
+ seg->n = 0;
37
+ }
38
+
39
+ // Reference: Ruzzo and Tompa (1999) A linear time algorithm for finding all maximal scoring subsequences
40
+ mg_msseg_t *mg_mss_all(void *km, int32_t n, const MG_MSS_TYPE *S, MG_MSS_TYPE min_sc, MG_MSS_TYPE xdrop, int32_t *n_seg)
41
+ {
42
+ int32_t i, j;
43
+ MG_MSS_TYPE L, max;
44
+ msseg_v ret = {0,0,0};
45
+ msseg_aux_v seg = {0,0,0};
46
+ msseg_aux_t t;
47
+
48
+ kv_resize(mg_msseg_t, km, ret, 16);
49
+ kv_resize(msseg_aux_t, km, seg, 16);
50
+ for (i = 0, L = 0, max = MSS_NEG_INF; i < n;) {
51
+ if (S[i] > 0) {
52
+ int32_t k;
53
+ MG_MSS_TYPE R = L + S[i];
54
+ for (k = i + 1; k < n && S[k] > 0; ++k)
55
+ R += S[k];
56
+ if (R > max) max = R;
57
+ t.st = i, t.en = k, t.L = L, t.R = R;
58
+ while (1) {
59
+ msseg_aux_t *p;
60
+ for (j = seg.n - 1; j >= 0;) {
61
+ p = &seg.a[j];
62
+ if (p->L < t.L) break;
63
+ j = p->pre >= 0? p->pre : j - 1;
64
+ }
65
+ if (j >= 0 && seg.a[j].R < t.R) {
66
+ p = &seg.a[j];
67
+ t.st = p->st, t.L = p->L, t.pre = p->pre;
68
+ seg.n = j;
69
+ } else {
70
+ if (j < 0) {
71
+ move_segs(km, &ret, &seg, min_sc);
72
+ max = R;
73
+ }
74
+ t.pre = j;
75
+ kv_push(msseg_aux_t, km, seg, t);
76
+ break;
77
+ }
78
+ }
79
+ L = R, i = k;
80
+ } else {
81
+ if (xdrop > 0 && L + S[i] + xdrop < max) { // reset
82
+ move_segs(km, &ret, &seg, min_sc);
83
+ L = 0, max = MSS_NEG_INF;
84
+ }
85
+ L += S[i++];
86
+ }
87
+ }
88
+ move_segs(km, &ret, &seg, min_sc);
89
+ kfree(km, seg.a);
90
+ KREALLOC(km, ret.a, ret.n);
91
+ *n_seg = ret.n;
92
+ return ret.a;
93
+ }
94
+
95
+ /**************************
96
+ * Interval overlap query *
97
+ **************************/
98
+
99
+ #include <assert.h>
100
+ #include "ksort.h"
101
+
102
+ #define sort_key_intv(a) ((a).st) /* sort key: the interval start */
103
+ KRADIX_SORT_INIT(mg_intv, mg_intv_t, sort_key_intv, 4) /* instantiates radix_sort_mg_intv(); NOTE(review): the 4 is presumably the key size in bytes -- confirm in ksort.h */
104
+
105
+ int32_t mg_intv_index(int32_t n, mg_intv_t *a) /* Sort a[0..n) by start and fill every element's .far for mg_intv_overlap(); returns -1 if n <= 0, else the largest K with 2^K <= n */
106
+ {
107
+ int32_t i, last_i, last, k; /* last_i/last: position and .far of the right-most node handled so far; always set because n >= 1 below */
108
+ if (n <= 0) return -1;
109
+ radix_sort_mg_intv(a, a + n); /* in-place sort keyed on .st (see sort_key_intv) */
110
+ for (i = 0; i < n; i += 2) last_i = i, last = a[i].far = a[i].en; /* even positions are handled first: far is simply the element's own end */
111
+ for (k = 1; 1LL<<k <= n; ++k) { /* process one level per iteration */
112
+ int64_t x = 1LL<<(k-1), i0 = (x<<1) - 1, step = x<<2; /* level-k nodes sit at i0, i0+step, ...; their children are at i - x and i + x */
113
+ for (i = i0; i < n; i += step) {
114
+ int32_t el = a[i - x].far; /* left child */
115
+ int32_t er = i + x < n? a[i + x].far : last; /* right child, or the running value `last` when that position is past the array */
116
+ int32_t e = a[i].en;
117
+ e = e > el? e : el;
118
+ e = e > er? e : er;
119
+ a[i].far = e; /* far = max of own end and both children */
120
+ }
121
+ last_i = last_i>>k&1? last_i - x : last_i + x; /* step last_i by x, direction chosen by bit k of last_i */
122
+ if (last_i < n && a[last_i].far > last)
123
+ last = a[last_i].far;
124
+ }
125
+ return k - 1;
126
+ }
127
+
128
+ typedef struct { /* one frame of the explicit traversal stack used by mg_intv_overlap() */
129
+ int64_t x; /* array index of the node */
130
+ int32_t k, w; /* k: level of the node; w: 0 until the left child has been dealt with, 1 afterwards */
131
+ } istack_t;
132
+
133
+ int32_t mg_intv_overlap(void *km, int32_t n_a, const mg_intv_t *a, int32_t st, int32_t en, int32_t **b_, int32_t *m_b_)
134
+ {
135
+ int32_t t = 0, h, *b = *b_, m_b = *m_b_, n = 0;
136
+ istack_t stack[64], *p;
137
+
138
+ for (h = 0; 1<<h <= n_a; ++h);
139
+ --h;
140
+ p = &stack[t++];
141
+ p->k = h, p->x = (1LL<<p->k) - 1, p->w = 0; // push the root into the stack
142
+ while (t) { // stack is not empyt
143
+ istack_t z = stack[--t];
144
+ if (z.k <= 3) { // the subtree is no larger than (1<<(z.k+1))-1; do a linear scan
145
+ int32_t i, i0 = z.x >> z.k << z.k, i1 = i0 + (1LL<<(z.k+1)) - 1;
146
+ if (i1 >= n_a) i1 = n_a;
147
+ for (i = i0; i < i1 && a[i].st < en; ++i)
148
+ if (st < a[i].en) {
149
+ if (n == m_b) KEXPAND(km, b, m_b);
150
+ b[n++] = i;
151
+ }
152
+ } else if (z.w == 0) { // if left child not processed
153
+ int32_t y = z.x - (1LL<<(z.k-1));
154
+ p = &stack[t++];
155
+ p->k = z.k, p->x = z.x, p->w = 1;
156
+ if (y >= n_a || a[y].far > st) {
157
+ p = &stack[t++];
158
+ p->k = z.k - 1, p->x = y, p->w = 0; // push the left child to the stack
159
+ }
160
+ } else if (z.x < n_a && a[z.x].st < en) {
161
+ if (st < a[z.x].en) { // then z.x overlaps the query; write to the output array
162
+ if (n == m_b) KEXPAND(km, b, m_b);
163
+ b[n++] = z.x;
164
+ }
165
+ p = &stack[t++];
166
+ p->k = z.k - 1, p->x = z.x + (1LL<<(z.k-1)), p->w = 0; // push the right child
167
+ }
168
+ }
169
+ *b_ = b, *m_b_ = m_b;
170
+ return n;
171
+ }
172
+
173
+ /********************
174
+ * Global alignment *
175
+ ********************/
176
+
177
+ int32_t mg_wfa_cmp(void *km, int32_t l1, const char *s1, int32_t l2, const char *s2, int32_t max_pen, int32_t *mlen, int32_t *blen)
178
+ {
179
+ mwf_opt_t opt;
180
+ mwf_rst_t r;
181
+ int32_t i;
182
+ mwf_opt_init(&opt);
183
+ opt.max_s = max_pen;
184
+ opt.flag |= MWF_F_CIGAR;
185
+ mwf_wfa_exact(km, &opt, l1, s1, l2, s2, &r);
186
+ *mlen = *blen = 0;
187
+ for (i = 0; i < r.n_cigar; ++i) {
188
+ int32_t op = r.cigar[i]&0xf, len = r.cigar[i]>>4;
189
+ *blen += len;
190
+ if (op == 7) *mlen += len;
191
+ }
192
+ kfree(km, r.cigar);
193
+ return r.s < 0? -(l1 + l2) : (l1 + l2) / 2 - r.s;
194
+ }
@@ -0,0 +1,33 @@
1
+ #ifndef MG_ALGO_H
2
+ #define MG_ALGO_H
3
+
4
+ #include <stdint.h>
5
+
6
+ #define MG_MSS_TYPE int32_t /* score type used by mg_mss_all() and mg_msseg_t */
7
+ #define MG_LIS_TYPE uint64_t
8
+
9
+ typedef struct { /* one segment returned by mg_mss_all() */
10
+ int32_t st, en; /* segment boundaries, as indices into the score array */
11
+ MG_MSS_TYPE sc; /* score of the segment */
12
+ } mg_msseg_t;
13
+
14
+ typedef struct { /* interval element consumed by mg_intv_index() and mg_intv_overlap() */
15
+ uint32_t st, en:31, rev:1; /* st/en: interval start and end (st is the sort key); rev: 1-bit flag -- NOTE(review): presumably the strand, confirm with callers */
16
+ int32_t far, i; /* far: filled in by mg_intv_index() with the largest end among the element and its children; i: not touched by the index/query code -- presumably a caller payload, confirm */
17
+ } mg_intv_t;
18
+
19
+ #ifdef __cplusplus
20
+ extern "C" {
21
+ #endif
22
+
23
+ mg_msseg_t *mg_mss_all(void *km, int32_t n, const MG_MSS_TYPE *S, MG_MSS_TYPE min_sc, MG_MSS_TYPE xdrop, int32_t *n_seg); /* maximal-scoring segments of S[0..n); returns the segment array and stores its length in *n_seg */
24
+ int32_t mg_intv_index(int32_t n, mg_intv_t *a); /* sorts a[] by start and fills .far; returns -1 when n <= 0 */
25
+ int32_t mg_intv_overlap(void *km, int32_t n_a, const mg_intv_t *a, int32_t st, int32_t en, int32_t **b_, int32_t *m_b_); /* writes indices of intervals overlapping [st,en) to *b_ (capacity *m_b_, may grow); returns the hit count */
26
+ void radix_sort_mg_intv(mg_intv_t *st, mg_intv_t *en); /* sorts the range [st, en) by interval start */
27
+ int32_t mg_wfa_cmp(void *km, int32_t l1, const char *s1, int32_t l2, const char *s2, int32_t max_pen, int32_t *mlen, int32_t *blen); /* aligns s1 to s2; stores the match length in *mlen and the alignment length in *blen */
28
+
29
+ #ifdef __cplusplus
30
+ }
31
+ #endif
32
+
33
+ #endif
@@ -0,0 +1,147 @@
1
+ #include <assert.h>
2
+ #include "mgpriv.h"
3
+ #include "ggen.h"
4
+ #include "gfa-priv.h"
5
+ #include "algo.h"
6
+
7
+ int32_t mg_gc_index(void *km, int min_mapq, int min_map_len, int min_depth_len, const gfa_t *g, int32_t n_seq, mg_gchains_t *const* gcs,
8
+ double *a_dens, int32_t **soff_, int32_t **qoff_, mg_intv_t **sintv_, mg_intv_t **qintv_); /* forward declaration; the definition is not in this file. mg_call_asm() treats a return value of 0 as "nothing to do" */
9
+
10
+ typedef struct { /* per-segment annotation used by mg_call_asm(), indexed by segment id (v>>1) */
11
+ int32_t bid; /* index of the bubble this segment was (last) assigned to */
12
+ uint8_t is_stem:4, is_src:4; /* is_stem: set for the first and last segment of a bubble; is_src: set for the first segment only */
13
+ } callaux_t;
14
+
15
+ typedef struct { /* path recorded for one bubble by mg_call_asm() */
16
+ int32_t t, i; /* t: index of the query sequence, -1 while no path has been recorded; i: index of the graph chain within that query */
17
+ int32_t st, en, strand; /* [st,en): range of linear-chain indices in gt->lc that lie inside the bubble; strand: 1 or -1 */
18
+ int32_t qs, qe, glen; /* qs/qe: query interval of the path; glen: summed length of the graph segments st..en-1 */
19
+ } bbaux_t;
20
+
21
+ void mg_call_asm(const gfa_t *g, int32_t n_seq, const mg_bseq1_t *seq, mg_gchains_t *const *gcs, int32_t min_mapq, int32_t min_blen)
22
+ {
23
+ int32_t i, j, t, max_acnt, *soff, *qoff, n_bb, m_ovlp = 0, *ovlp = 0;
24
+ mg_intv_t *sintv, *qintv;
25
+ double a_dens;
26
+ gfa_bubble_t *bb;
27
+ callaux_t *ca;
28
+ bbaux_t *ba;
29
+ kstring_t out = {0,0,0};
30
+
31
+ max_acnt = mg_gc_index(0, min_mapq, min_blen>>1, min_blen, g, n_seq, gcs, &a_dens, &soff, &qoff, &sintv, &qintv);
32
+ if (max_acnt == 0) return;
33
+
34
+ bb = gfa_bubble(g, &n_bb);
35
+ GFA_CALLOC(ba, n_bb);
36
+ GFA_CALLOC(ca, g->n_seg);
37
+ for (i = 0; i < n_bb; ++i) {
38
+ gfa_bubble_t *b = &bb[i];
39
+ assert(b->n_seg >= 2);
40
+ for (j = 0; j < b->n_seg; ++j)
41
+ ca[b->v[j]>>1].bid = i;
42
+ ca[b->v[0]>>1].is_stem = ca[b->v[b->n_seg-1]>>1].is_stem = 1;
43
+ ca[b->v[0]>>1].is_src = 1;
44
+ ba[i].t = -1;
45
+ }
46
+
47
+ for (t = 0; t < n_seq; ++t) {
48
+ const mg_gchains_t *gt = gcs[t];
49
+ for (i = 0; i < gt->n_gc; ++i) {
50
+ const mg_gchain_t *gc = &gt->gc[i];
51
+ int32_t st = -1;
52
+ for (j = 1; j < gc->cnt; ++j) {
53
+ const mg_llchain_t *lc = &gt->lc[gc->off + j];
54
+ if (!ca[lc->v>>1].is_stem && ca[(lc-1)->v>>1].is_stem) {
55
+ st = gc->off + j;
56
+ } else if ((ca[lc->v>>1].is_stem && !ca[(lc-1)->v>>1].is_stem && st > 0) || (ca[lc->v>>1].is_stem && ca[(lc-1)->v>>1].is_stem)) {
57
+ int32_t n_ovlp, k, en = gc->off + j, qs, qe, span, bid, strand, glen;
58
+ bbaux_t *p;
59
+
60
+ // determine the source and sink nodes
61
+ if (ca[lc->v>>1].is_stem && ca[(lc-1)->v>>1].is_stem) { // two adjacent stems: this is a deletion
62
+ st = gc->off + j;
63
+ } else {
64
+ assert(en > st);
65
+ }
66
+
67
+ // test overlap on the query
68
+ span = gt->a[gt->lc[st].off].y >> 32 & 0xff;
69
+ qs = (int32_t)gt->a[gt->lc[st - 1].off + gt->lc[st - 1].cnt - 1].y + 1; // NB: it is fine even if .cnt==0
70
+ qe = (int32_t)gt->a[gt->lc[en].off].y + 1 - span;
71
+ n_ovlp = mg_intv_overlap(0, qoff[t+1] - qoff[t], &qintv[qoff[t]], qs, qe, &ovlp, &m_ovlp);
72
+ if (n_ovlp > 1) continue; // overlap on the query - not orthologous
73
+
74
+ // test overlap on the graph
75
+ for (k = st, glen = 0; k < en; ++k) {
76
+ const mg_llchain_t *lk = &gt->lc[k];
77
+ int32_t seg = lk->v>>1;
78
+ n_ovlp = mg_intv_overlap(0, soff[seg+1] - soff[seg], &sintv[soff[seg]], 0, g->seg[seg].len, &ovlp, &m_ovlp);
79
+ glen += g->seg[seg].len;
80
+ if (n_ovlp > 1) break; // overlap on the graph - not orthoologous
81
+ }
82
+ if (k < en) continue;
83
+
84
+ // determine the bubble ID
85
+ assert(ca[gt->lc[st-1].v>>1].is_stem && ca[gt->lc[en].v>>1].is_stem);
86
+ if (ca[gt->lc[st-1].v>>1].bid < ca[gt->lc[en].v>>1].bid)
87
+ strand = 1;
88
+ else if (ca[gt->lc[st-1].v>>1].bid > ca[gt->lc[en].v>>1].bid)
89
+ strand = -1;
90
+ else {
91
+ if (ca[gt->lc[st-1].v>>1].is_src + ca[gt->lc[en].v>>1].is_src != 1) {
92
+ fprintf(stderr, "[W::%s] type-1 folded inversion alignment around %c%s <=> %s:%d-%d\n",
93
+ __func__, "><"[gt->lc[st].v&1], g->seg[gt->lc[st].v>>1].name, seq[t].name, qs, qe);
94
+ continue;
95
+ }
96
+ if (ca[gt->lc[st-1].v>>1].is_src) strand = 1;
97
+ else strand = -1;
98
+ }
99
+ bid = strand > 0? ca[gt->lc[st-1].v>>1].bid : ca[gt->lc[en].v>>1].bid;
100
+
101
+ // attach the bubble
102
+ for (k = st; k < en; ++k) // check consistency
103
+ if (ca[gt->lc[k].v>>1].bid != bid)
104
+ break;
105
+ if (k != en) { // this may happen around an inversion towards the end of an alignment chain
106
+ fprintf(stderr, "[W::%s] type-2 folded inversion alignment around %c%s <=> %s:%d-%d\n",
107
+ __func__, "><"[gt->lc[st].v&1], g->seg[gt->lc[st].v>>1].name, seq[t].name, qs, qe);
108
+ continue;
109
+ }
110
+ p = &ba[bid];
111
+ p->t = t, p->i = i, p->st = st, p->en = en, p->strand = strand, p->qs = qs, p->qe = qe, p->glen = glen;
112
+ }
113
+ }
114
+ }
115
+ }
116
+
117
+ for (i = 0; i < n_bb; ++i) {
118
+ gfa_bubble_t *b = &bb[i];
119
+ bbaux_t *a = &ba[i];
120
+ const mg_gchains_t *gt = gcs[a->t];
121
+ out.l = 0;
122
+ mg_sprintf_lite(&out, "%s\t%d\t%d\t%c%s\t%c%s\t", g->sseq[b->snid].name, b->ss, b->se, "><"[b->v[0]&1], g->seg[b->v[0]>>1].name,
123
+ "><"[b->v[b->n_seg-1]&1], g->seg[b->v[b->n_seg-1]>>1].name);
124
+ if (a->t >= 0) {
125
+ assert(a->strand != 0);
126
+ if (a->st == a->en) {
127
+ mg_sprintf_lite(&out, "*");
128
+ } else if (a->strand > 0) {
129
+ for (j = a->st; j < a->en; ++j)
130
+ mg_sprintf_lite(&out, "%c%s", "><"[gt->lc[j].v&1], g->seg[gt->lc[j].v>>1].name);
131
+ } else {
132
+ for (j = a->en - 1; j >= a->st; --j)
133
+ mg_sprintf_lite(&out, "%c%s", "<>"[gt->lc[j].v&1], g->seg[gt->lc[j].v>>1].name);
134
+ }
135
+ mg_sprintf_lite(&out, ":%d:%c:%s:%d:%d", a->glen, a->strand > 0? '+' : '-', seq[a->t].name, a->qs, a->qe);
136
+ } else {
137
+ mg_sprintf_lite(&out, ".");
138
+ }
139
+ puts(out.s);
140
+ }
141
+
142
+ free(ba); free(ca);
143
+ free(soff); free(qoff); free(sintv); free(qintv);
144
+ for (i = 0; i < n_bb; ++i) free(bb[i].v);
145
+ free(bb);
146
+ free(out.s);
147
+ }