ruby-minigraph 0.0.20.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/ext/Rakefile +56 -0
- data/ext/cmappy/cmappy.c +7 -0
- data/ext/cmappy/cmappy.h +8 -0
- data/ext/minigraph/LICENSE.txt +23 -0
- data/ext/minigraph/Makefile +66 -0
- data/ext/minigraph/NEWS.md +317 -0
- data/ext/minigraph/README.md +207 -0
- data/ext/minigraph/algo.c +194 -0
- data/ext/minigraph/algo.h +33 -0
- data/ext/minigraph/asm-call.c +147 -0
- data/ext/minigraph/bseq.c +133 -0
- data/ext/minigraph/bseq.h +76 -0
- data/ext/minigraph/cal_cov.c +139 -0
- data/ext/minigraph/doc/example1.png +0 -0
- data/ext/minigraph/doc/example2.png +0 -0
- data/ext/minigraph/doc/examples.graffle +0 -0
- data/ext/minigraph/format.c +241 -0
- data/ext/minigraph/galign.c +140 -0
- data/ext/minigraph/gchain1.c +532 -0
- data/ext/minigraph/gcmisc.c +223 -0
- data/ext/minigraph/gfa-aug.c +260 -0
- data/ext/minigraph/gfa-base.c +526 -0
- data/ext/minigraph/gfa-bbl.c +372 -0
- data/ext/minigraph/gfa-ed.c +617 -0
- data/ext/minigraph/gfa-io.c +395 -0
- data/ext/minigraph/gfa-priv.h +154 -0
- data/ext/minigraph/gfa.h +166 -0
- data/ext/minigraph/ggen.c +182 -0
- data/ext/minigraph/ggen.h +21 -0
- data/ext/minigraph/ggsimple.c +570 -0
- data/ext/minigraph/gmap.c +211 -0
- data/ext/minigraph/index.c +230 -0
- data/ext/minigraph/kalloc.c +224 -0
- data/ext/minigraph/kalloc.h +82 -0
- data/ext/minigraph/kavl.h +414 -0
- data/ext/minigraph/kdq.h +134 -0
- data/ext/minigraph/ketopt.h +116 -0
- data/ext/minigraph/khashl.h +348 -0
- data/ext/minigraph/krmq.h +474 -0
- data/ext/minigraph/kseq.h +256 -0
- data/ext/minigraph/ksort.h +164 -0
- data/ext/minigraph/kstring.h +165 -0
- data/ext/minigraph/kthread.c +159 -0
- data/ext/minigraph/kthread.h +15 -0
- data/ext/minigraph/kvec-km.h +105 -0
- data/ext/minigraph/kvec.h +110 -0
- data/ext/minigraph/lchain.c +441 -0
- data/ext/minigraph/main.c +301 -0
- data/ext/minigraph/map-algo.c +500 -0
- data/ext/minigraph/mgpriv.h +128 -0
- data/ext/minigraph/minigraph.1 +359 -0
- data/ext/minigraph/minigraph.h +176 -0
- data/ext/minigraph/miniwfa.c +834 -0
- data/ext/minigraph/miniwfa.h +95 -0
- data/ext/minigraph/misc/mgutils.js +1451 -0
- data/ext/minigraph/misc.c +12 -0
- data/ext/minigraph/options.c +134 -0
- data/ext/minigraph/shortk.c +251 -0
- data/ext/minigraph/sketch.c +109 -0
- data/ext/minigraph/sys.c +147 -0
- data/ext/minigraph/sys.h +20 -0
- data/ext/minigraph/test/MT-chimp.fa +277 -0
- data/ext/minigraph/test/MT-human.fa +239 -0
- data/ext/minigraph/test/MT-orangA.fa +276 -0
- data/ext/minigraph/test/MT.gfa +19 -0
- data/ext/minigraph/tex/Makefile +13 -0
- data/ext/minigraph/tex/minigraph.bib +676 -0
- data/ext/minigraph/tex/minigraph.tex +986 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
- data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
- data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
- data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
- data/ext/minigraph/tex/plots/bedutils.js +367 -0
- data/ext/minigraph/tex/plots/chr-plot.js +130 -0
- data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
- data/ext/minigraph.patch +21 -0
- data/lib/minigraph/ffi/constants.rb +230 -0
- data/lib/minigraph/ffi/functions.rb +70 -0
- data/lib/minigraph/ffi/mappy.rb +8 -0
- data/lib/minigraph/ffi.rb +27 -0
- data/lib/minigraph/version.rb +5 -0
- data/lib/minigraph.rb +72 -0
- metadata +159 -0
@@ -0,0 +1,359 @@
|
|
1
|
+
.TH minigraph 1 "20 November 2022" "minigraph-0.20 (r559)" "Bioinformatics tools"
|
2
|
+
|
3
|
+
.SH NAME
|
4
|
+
.PP
|
5
|
+
minigraph - sequence-to-graph mapping and incremental sequence graph generation
|
6
|
+
|
7
|
+
.SH SYNOPSIS
|
8
|
+
* Sequence-to-graph mapping:
|
9
|
+
.RS 4
|
10
|
+
.B minigraph
|
11
|
+
.RB [ -x
|
12
|
+
.IR preset ]
|
13
|
+
.RB [ -c ]
|
14
|
+
.RB [ -t
|
15
|
+
.IR nThreads ]
|
16
|
+
.I graph.gfa
|
17
|
+
.I query1.fa
|
18
|
+
.RI [ ... ]
|
19
|
+
.B >
|
20
|
+
.I out.gaf
|
21
|
+
.RE
|
22
|
+
|
23
|
+
* Incremental graph generation:
|
24
|
+
.RS 4
|
25
|
+
.B minigraph
|
26
|
+
.B -x ggs
|
27
|
+
.RB [ -c ]
|
28
|
+
.RB [ -t
|
29
|
+
.IR nThreads ]
|
30
|
+
.I initGraph.gfa
|
31
|
+
.I sample1Asm.fa
|
32
|
+
.RI [ ... ]
|
33
|
+
.B >
|
34
|
+
.I finalGraph.gfa
|
35
|
+
|
36
|
+
.SH DESCRIPTION
|
37
|
+
|
38
|
+
Minigraph is a
|
39
|
+
.I proof-of-concept
|
40
|
+
sequence-to-graph mapper and graph constructor. It finds approximate locations
|
41
|
+
of a query sequence in a sequence graph and incrementally augments an existing
|
42
|
+
graph with long query subsequences.
|
43
|
+
|
44
|
+
.SH OPTIONS
|
45
|
+
.SS Indexing options
|
46
|
+
.TP 10
|
47
|
+
.BI -k \ INT
|
48
|
+
Minimizer k-mer length [17]
|
49
|
+
.TP
|
50
|
+
.BI -w \ INT
|
51
|
+
Minimizer window size [11]. A minimizer is the smallest k-mer in a window of w
|
52
|
+
consecutive k-mers.
|
53
|
+
.SS Mapping options
|
54
|
+
.TP 10
|
55
|
+
.BI -c
|
56
|
+
Perform base alignment; recommended for graph generation
|
57
|
+
.TP 10
|
58
|
+
.BI -U \ INT1 [, INT2 ]
|
59
|
+
Choose the minimizer occurrence threshold within this interval [50,250]
|
60
|
+
.TP
|
61
|
+
.BI -f \ FLOAT
|
62
|
+
Ignore top
|
63
|
+
.I FLOAT
|
64
|
+
fraction of repetitive minimizers [0.0002]. If this threshold falls within the
|
65
|
+
interval set by
|
66
|
+
.BR -U ,
|
67
|
+
it will be the final threshold; otherwise the lower or the upper bound of
|
68
|
+
.B -U
|
69
|
+
will be applied.
|
70
|
+
.TP
|
71
|
+
.BI -j \ FLOAT
|
72
|
+
Expected query-graph sequence divergence [0.1]
|
73
|
+
.TP
|
74
|
+
.BI -g \ NUM
|
75
|
+
Stop chain elongation if there are no minimizers within
|
76
|
+
.IR NUM -bp
|
77
|
+
[10k]. K/k/M/m suffixes are recognized.
|
78
|
+
.TP
|
79
|
+
.BI -r \ NUM1 [, NUM2 ]
|
80
|
+
Bandwidth for the two rounds of chaining [500,20k].
|
81
|
+
.I NUM2
|
82
|
+
also controls bandwidth for graph chaining.
|
83
|
+
.TP
|
84
|
+
.BI -n \ INT1 [, INT2 ]
|
85
|
+
Drop graph chains consisting of
|
86
|
+
.RI < INT1
|
87
|
+
minimizers and drop linear chains consisting of
|
88
|
+
.RI < INT2
|
89
|
+
minimizers [5,3]
|
90
|
+
.TP
|
91
|
+
.BI -m \ INT1 [, INT2 ]
|
92
|
+
Drop graph chains with graph chaining score
|
93
|
+
.RI < INT1
|
94
|
+
and drop linear chains with linear chaining score
|
95
|
+
.RI < INT2
|
96
|
+
[50,30]. Linear chaining score equals the approximate number of matching bases
|
97
|
+
minus a weak concave gap penalty. Graph chaining score uses a linear gap
|
98
|
+
penalty.
|
99
|
+
.TP
|
100
|
+
.BI -p \ FLOAT
|
101
|
+
Minimal secondary-to-primary score ratio to output secondary mappings [0.8].
|
102
|
+
Between two chains overlapping over half of the shorter chain (controlled by
|
103
|
+
.BR -M ),
|
104
|
+
the chain with a lower score is secondary to the chain with a higher score.
|
105
|
+
.TP
|
106
|
+
.BI -N \ INT
|
107
|
+
Output at most
|
108
|
+
.I INT
|
109
|
+
secondary mappings [5]. This option has no effect when
|
110
|
+
.B -P
|
111
|
+
is applied.
|
112
|
+
.TP
|
113
|
+
.B -P
|
114
|
+
Retain all chains and don't attempt to set primary chains. Options
|
115
|
+
.B -p
|
116
|
+
and
|
117
|
+
.B -N
|
118
|
+
have no effect when this option is in use.
|
119
|
+
.TP
|
120
|
+
.BI -M \ FLOAT
|
121
|
+
Mark as secondary a chain that overlaps with a better chain by
|
122
|
+
.I FLOAT
|
123
|
+
or more of the shorter chain [0.5]
|
124
|
+
.TP
|
125
|
+
.BI --max-gap-pre \ NUM
|
126
|
+
Similar to
|
127
|
+
.B -g
|
128
|
+
but used for prefiltering [1000]
|
129
|
+
.TP
|
130
|
+
.BI --max-lc-iter \ NUM
|
131
|
+
Maximum number of iterations for linear chaining [10000]
|
132
|
+
.TP
|
133
|
+
.BI --max-rmq-size \ NUM
|
134
|
+
Maximum size of the RMQ tree [100000]
|
135
|
+
.TP
|
136
|
+
.BI --max-lc-skip \ INT
|
137
|
+
A heuristic that stops linear chaining early [25]
|
138
|
+
.TP
|
139
|
+
.BI --max-gc-skip \ INT
|
140
|
+
Similar to
|
141
|
+
.B --max-lc-skip
|
142
|
+
but applied to graph chaining [25]
|
143
|
+
.TP
|
144
|
+
.BI --ref-bonus \ INT
|
145
|
+
Bonus for a reference subwalk [0]
|
146
|
+
.TP
|
147
|
+
.BI --min-cov-blen \ NUM
|
148
|
+
Minimum alignment block length to count [1k]
|
149
|
+
.TP
|
150
|
+
.BI --min-cov-mapq \ INT
|
151
|
+
Minimum mapping quality to count [20]
|
152
|
+
.SS Graph generation options
|
153
|
+
.TP 10
|
154
|
+
.BR --ggen =[ simple ]
|
155
|
+
Graph generation algorithm. So far only a
|
156
|
+
.B simple
|
157
|
+
algorithm is implemented [simple]. With this option, all query sequences are
|
158
|
+
loaded into memory.
|
159
|
+
.TP
|
160
|
+
.B --call
|
161
|
+
Call the graph path in each bubble and output in a BED-based format:
|
162
|
+
.RS
|
163
|
+
ctg start end sourceNode sinkNode walk:strand:queryName:qStart:qEnd
|
164
|
+
.RE
|
165
|
+
.TP
|
166
|
+
.BI -q \ INT
|
167
|
+
Minimum mapping quality [5]
|
168
|
+
.TP
|
169
|
+
.BI -l \ NUM
|
170
|
+
Minimum chain length to consider [100k]
|
171
|
+
.TP
|
172
|
+
.BI -d \ NUM
|
173
|
+
Minimum chain length for depth calculation [20k]
|
174
|
+
.TP
|
175
|
+
.BI -L \ INT
|
176
|
+
Minimum insertion length [50]
|
177
|
+
.TP
|
178
|
+
.BI --gg-match-pen \ INT
|
179
|
+
Penalty for a pair of matching anchors [5]. Larger value for more fragmented inserts.
|
180
|
+
Effective without
|
181
|
+
.BR -c .
|
182
|
+
.TP
|
183
|
+
.BR --ins-qovlp = yes | no
|
184
|
+
Forcefully resolve query overlaps [no]. Effective without
|
185
|
+
.BR -c .
|
186
|
+
.TP
|
187
|
+
.BR --inv = yes | no
|
188
|
+
Generate graphs with inversions or not [yes]
|
189
|
+
.TP
|
190
|
+
.B --cov
|
191
|
+
Remap and generate segment and link use frequencies. This option triggers GFA
|
192
|
+
output. When used with
|
193
|
+
.BR --ggen ,
|
194
|
+
minigraph writes the frequency of link uses and the average breadth of coverage
|
195
|
+
of each segment to the
|
196
|
+
.B cf
|
197
|
+
tag. When used without
|
198
|
+
.BR --ggen ,
|
199
|
+
minigraph writes the count of link uses and the average depth of coverage of
|
200
|
+
each segment to the
|
201
|
+
.B dc
|
202
|
+
tag.
|
203
|
+
.B
|
204
|
+
WARNING:
|
205
|
+
THIS OPTION IS DEPRECATED AND MAY BE REMOVED IN THE FUTURE.
|
206
|
+
.SS Input/output options
|
207
|
+
.TP 10
|
208
|
+
.BI -o \ FILE
|
209
|
+
Output alignments to
|
210
|
+
.I FILE
|
211
|
+
[stdout].
|
212
|
+
.TP
|
213
|
+
.BI -t \ INT
|
214
|
+
Number of threads [4]. Minigraph uses at most three threads when indexing target
|
215
|
+
sequences, and uses up to
|
216
|
+
.IR INT +1
|
217
|
+
threads when mapping (the extra thread is for I/O, which is frequently idle and
|
218
|
+
takes little CPU time).
|
219
|
+
.TP
|
220
|
+
.BI -K \ NUM
|
221
|
+
Number of bases loaded into memory to process in a mini-batch [500M].
|
222
|
+
K/M/G/k/m/g suffix is accepted. A large
|
223
|
+
.I NUM
|
224
|
+
helps load balancing in the multi-threading mode, at the cost of increased
|
225
|
+
memory. This option has no effect if
|
226
|
+
.B --ggen
|
227
|
+
is applied.
|
228
|
+
.TP
|
229
|
+
.B --vc
|
230
|
+
In output GAF, show mapping paths in the unstable segment coordinate.
|
231
|
+
.TP
|
232
|
+
.B -S
|
233
|
+
Output linear chains in the format of: `*' segName segLen nMinimizer seqDiv segStart segEnd qStart qEnd
|
234
|
+
.TP
|
235
|
+
.B --write-mz
|
236
|
+
Output linear chains in the format of: `*' segName segLen nMinimizer seqDiv segStart segEnd qStart qEnd
|
237
|
+
k-mer segOffsets qOffsets. segOffsets and qOffsets are comma-separated lists
|
238
|
+
with each consisting of nMinimizer-1 integers which give the distance from the
|
239
|
+
previous minimizer on segments and query, respectively.
|
240
|
+
.TP
|
241
|
+
.BR --secondary = yes | no
|
242
|
+
Whether to output secondary alignments [no]
|
243
|
+
.TP
|
244
|
+
.BR --show-unmap = yes | no
|
245
|
+
Print unmapped query sequences in GAF [no]
|
246
|
+
.TP
|
247
|
+
.B --version
|
248
|
+
Print version number to stdout
|
249
|
+
.SS Preset options
|
250
|
+
.TP 10
|
251
|
+
.BI -x \ STR
|
252
|
+
Preset []. This option applies multiple options at the same time. Other options
|
253
|
+
on the command line will always override values set by
|
254
|
+
.BR -x .
|
255
|
+
Available
|
256
|
+
.I STR
|
257
|
+
are:
|
258
|
+
.RS
|
259
|
+
.TP 8
|
260
|
+
.B lr
|
261
|
+
Mapping noisy long reads. This is the same as the default setting.
|
262
|
+
.TP
|
263
|
+
.B sr
|
264
|
+
Mapping short single-end or paired-end reads
|
265
|
+
.RB ( -k21
|
266
|
+
.B -w10 -U1000,2500 -g100 -r100 -p.5 -n3,2 -m40,25 --heap-sort=yes -K50m --frag --ref-bonus=1
|
267
|
+
.BR --min-cov-blen=50 ).
|
268
|
+
Paired-end mapping is not supported.
|
269
|
+
.TP
|
270
|
+
.B asm
|
271
|
+
Mapping long contigs or high-quality CCS reads
|
272
|
+
.RB ( -k19
|
273
|
+
.B -w10 -U10,100 -j.01 -g10k -r1k,150k -n5,5 -m1000,40 -K4g --max-lc-skip=50 --max-gc-skip=50 --min-cov-mapq=5
|
274
|
+
.BR --min-cov-blen=100k ).
|
275
|
+
.TP
|
276
|
+
.B ggs
|
277
|
+
Incremental graph generation
|
278
|
+
.RB ( -xasm
|
279
|
+
.B -N0
|
280
|
+
.BR --ggen=simple ).
|
281
|
+
.RE
|
282
|
+
.SS Miscellaneous options
|
283
|
+
.TP 10
|
284
|
+
.B --no-kalloc
|
285
|
+
Use the libc default allocator instead of the kalloc thread-local allocator.
|
286
|
+
This debugging option is mostly used with Valgrind to detect invalid memory
|
287
|
+
accesses. Minigraph runs slower with this option, especially in the
|
288
|
+
multi-threading mode.
|
289
|
+
.SH OUTPUT FORMAT
|
290
|
+
.PP
|
291
|
+
Minigraph outputs mapping positions in the Graph mApping Format (GAF) by
|
292
|
+
default. GAF is a TAB-delimited text format with each line consisting of at
|
293
|
+
least 12 fields as are described in the following table:
|
294
|
+
.TS
|
295
|
+
center box;
|
296
|
+
cb | cb | cb
|
297
|
+
r | c | l .
|
298
|
+
Col Type Description
|
299
|
+
_
|
300
|
+
1 string Query sequence name
|
301
|
+
2 int Query sequence length
|
302
|
+
3 int Query start coordinate (0-based; closed)
|
303
|
+
4 int Query end coordinate (0-based; open)
|
304
|
+
5 char `+' if query/path on the same strand; `-' if opposite
|
305
|
+
6 string Path matching /([><][^\\s><]+(:\\d+-\\d+)?)+|([^\\s><]+)/
|
306
|
+
7 int Path sequence length
|
307
|
+
8 int Path start coordinate
|
308
|
+
9 int Path end coordinate
|
309
|
+
10 int Number of matching bases in the mapping
|
310
|
+
11 int Number of bases, including gaps, in the mapping
|
311
|
+
12 int Mapping quality (0-255 with 255 for missing)
|
312
|
+
.TE
|
313
|
+
|
314
|
+
.PP
|
315
|
+
When alignment is available, column 11 gives the total number of sequence
|
316
|
+
matches, mismatches and gaps in the alignment; column 10 divided by column 11
|
317
|
+
gives the BLAST-like alignment identity. When alignment is unavailable,
|
318
|
+
these two columns are approximate. GAF may optionally have additional fields in
|
319
|
+
the SAM-like typed key-value format. Minigraph may output the following tags:
|
320
|
+
.TS
|
321
|
+
center box;
|
322
|
+
cb | cb | cb
|
323
|
+
r | c | l .
|
324
|
+
Tag Type Description
|
325
|
+
_
|
326
|
+
tp A Type of aln: P/primary and S/secondary
|
327
|
+
cm i Number of minimizers on the chain
|
328
|
+
s1 i Chaining score
|
329
|
+
s2 i Chaining score of the best secondary chain
|
330
|
+
dv f Approximate per-base sequence divergence
|
331
|
+
cf f Avg. segment breadth of coverage and link use freq
|
332
|
+
dc f Avg. segment depth of coverage and link use counts
|
333
|
+
cg Z CIGAR string
|
334
|
+
ql B,i Lengths of single-end reads
|
335
|
+
.TE
|
336
|
+
|
337
|
+
.SH LIMITATIONS
|
338
|
+
.TP 2
|
339
|
+
*
|
340
|
+
Minigraph needs to find strong colinear chains first. For a graph consisting of
|
341
|
+
many short segments (e.g. one generated from rare SNPs in large populations),
|
342
|
+
minigraph will fail to map query sequences.
|
343
|
+
.TP
|
344
|
+
*
|
345
|
+
When connecting colinear chains on graphs, minigraph doesn't always take full
|
346
|
+
advantage of base sequences and may miss the optimal alignments.
|
347
|
+
.TP
|
348
|
+
*
|
349
|
+
Minigraph only inserts segments contained in long graph chains. This
|
350
|
+
conservative strategy helps to build a relatively accurate graph, but may miss
|
351
|
+
more complex events. Other strategies may be explored in the future.
|
352
|
+
.TP
|
353
|
+
*
|
354
|
+
Base alignment has only been evaluated for human. For more diverse genomes,
|
355
|
+
the performance may need to be improved.
|
356
|
+
|
357
|
+
.SH SEE ALSO
|
358
|
+
.PP
|
359
|
+
minimap2(1), gfatools(1).
|
@@ -0,0 +1,176 @@
|
|
1
|
+
#ifndef MINIGRAPH_H
|
2
|
+
#define MINIGRAPH_H
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
#include "gfa.h"
|
6
|
+
|
7
|
+
#define MG_VERSION "0.20-r559"
|
8
|
+
|
9
|
+
#define MG_M_SPLICE 0x10
|
10
|
+
#define MG_M_SR 0x20
|
11
|
+
#define MG_M_FRAG_MODE 0x40
|
12
|
+
#define MG_M_FRAG_MERGE 0x80
|
13
|
+
#define MG_M_FOR_ONLY 0x100
|
14
|
+
#define MG_M_REV_ONLY 0x200
|
15
|
+
#define MG_M_HEAP_SORT 0x400
|
16
|
+
#define MG_M_VERTEX_COOR 0x800
|
17
|
+
#define MG_M_ALL_CHAINS 0x1000
|
18
|
+
#define MG_M_PRINT_2ND 0x2000
|
19
|
+
#define MG_M_CAL_COV 0x4000
|
20
|
+
#define MG_M_RMQ 0x8000
|
21
|
+
#define MG_M_COPY_COMMENT 0x10000
|
22
|
+
#define MG_M_INDEPEND_SEG 0x20000
|
23
|
+
#define MG_M_NO_QUAL 0x40000
|
24
|
+
#define MG_M_2_IO_THREADS 0x80000
|
25
|
+
#define MG_M_SHOW_UNMAP 0x100000
|
26
|
+
#define MG_M_NO_COMP_PATH 0x200000
|
27
|
+
#define MG_M_NO_DIAG 0x400000
|
28
|
+
#define MG_M_WRITE_LCHAIN 0x800000
|
29
|
+
#define MG_M_WRITE_MZ 0x1000000
|
30
|
+
#define MG_M_SKIP_GCHECK 0x2000000
|
31
|
+
#define MG_M_CIGAR 0x4000000
|
32
|
+
|
33
|
+
#define MG_G_NONE 0
|
34
|
+
#define MG_G_GGSIMPLE 1
|
35
|
+
|
36
|
+
#define MG_G_NO_QOVLP 0x1
|
37
|
+
#define MG_G_CAL_COV 0x2
|
38
|
+
#define MG_G_NO_INV 0x4
|
39
|
+
#define MG_G_CALL 0x8
|
40
|
+
|
41
|
+
// Pair of unsigned 64-bit words, x first and y second.
typedef struct {
	uint64_t x;
	uint64_t y;
} mg128_t;
|
42
|
+
// Pointer to mg128_t elements plus two size_t bookkeeping fields.
// NOTE(review): n/m presumably mean used count / allocated capacity — confirm against users.
typedef struct {
	size_t n;
	size_t m;
	mg128_t *a;
} mg128_v;
|
43
|
+
// Pointer to uint32_t elements plus two int32_t bookkeeping fields.
// NOTE(review): n/m presumably mean used count / allocated capacity — confirm against users.
typedef struct {
	int32_t n;
	int32_t m;
	uint32_t *a;
} mg32_v;
|
44
|
+
// Pointer to uint64_t elements plus two int32_t bookkeeping fields.
// NOTE(review): n/m presumably mean used count / allocated capacity — confirm against users.
typedef struct {
	int32_t n;
	int32_t m;
	uint64_t *a;
} mg64_v;
|
45
|
+
|
46
|
+
typedef struct { // indexing options; taken by mg_opt_set(), mg_opt_check(), mg_index(), mg_map_files() and mg_ggen()
|
47
|
+
int w, k; // NOTE(review): presumably minimizer window size and k-mer length (cf. -w / -k in the manual) — confirm
|
48
|
+
int bucket_bits; // NOTE(review): meaning not visible in this header — confirm in the index implementation
|
49
|
+
} mg_idxopt_t;
|
50
|
+
|
51
|
+
typedef struct { // mapping options; taken by mg_opt_set(), mg_opt_check(), mg_opt_update(), mg_index(), mg_map(), mg_map_frag(), mg_map_files() and mg_ggen()
|
52
|
+
uint64_t flag; // NOTE(review): presumably a bitwise OR of the MG_M_* macros — confirm
|
53
|
+
int64_t mini_batch_size; // NOTE(review): presumably set by -K — confirm
|
54
|
+
int seed;
|
55
|
+
int max_qlen;
|
56
|
+
int pe_ori;
|
57
|
+
int occ_max1, occ_max1_cap; // NOTE(review): presumably the -U interval — confirm
|
58
|
+
float occ_max1_frac; // NOTE(review): presumably set by -f — confirm
|
59
|
+
int bw, bw_long; // NOTE(review): presumably the two -r bandwidths — confirm
|
60
|
+
int rmq_size_cap; // NOTE(review): presumably set by --max-rmq-size — confirm
|
61
|
+
int rmq_rescue_size;
|
62
|
+
float rmq_rescue_ratio;
|
63
|
+
int max_gap_pre, max_gap, max_gap_ref, max_frag_len; // NOTE(review): max_gap_pre / max_gap presumably map to --max-gap-pre / -g — confirm
|
64
|
+
float div; // NOTE(review): presumably set by -j — confirm
|
65
|
+
float chn_pen_gap, chn_pen_skip;
|
66
|
+
int max_lc_skip, max_lc_iter, max_gc_skip; // NOTE(review): presumably --max-lc-skip / --max-lc-iter / --max-gc-skip — confirm
|
67
|
+
int min_lc_cnt, min_lc_score; // NOTE(review): presumably the linear-chain parts of -n / -m — confirm
|
68
|
+
int min_gc_cnt, min_gc_score; // NOTE(review): presumably the graph-chain parts of -n / -m — confirm
|
69
|
+
int gdp_max_ed, lc_max_trim, lc_max_occ;
|
70
|
+
float mask_level; // NOTE(review): presumably set by -M — confirm
|
71
|
+
int sub_diff;
|
72
|
+
int best_n; // NOTE(review): presumably set by -N — confirm
|
73
|
+
float pri_ratio; // NOTE(review): presumably set by -p — confirm
|
74
|
+
int ref_bonus; // NOTE(review): presumably set by --ref-bonus — confirm
|
75
|
+
int64_t cap_kalloc;
|
76
|
+
int min_cov_mapq, min_cov_blen; // NOTE(review): presumably --min-cov-mapq / --min-cov-blen — confirm
|
77
|
+
} mg_mapopt_t;
|
78
|
+
|
79
|
+
typedef struct { // graph generation options; taken by mg_opt_set(), mg_opt_check(), mg_opt_update() and mg_ggen()
|
80
|
+
uint64_t flag; // NOTE(review): presumably a bitwise OR of the MG_G_NO_QOVLP/CAL_COV/NO_INV/CALL macros — confirm
|
81
|
+
int algo; // NOTE(review): presumably MG_G_NONE or MG_G_GGSIMPLE — confirm
|
82
|
+
int min_mapq; // NOTE(review): presumably set by -q — confirm
|
83
|
+
int min_map_len, min_depth_len; // NOTE(review): presumably set by -l / -d — confirm
|
84
|
+
int min_var_len, match_pen; // NOTE(review): presumably set by -L / --gg-match-pen — confirm
|
85
|
+
// parameters specific to ggsimple/ggs
|
86
|
+
int ggs_shrink_pen;
|
87
|
+
int ggs_min_end_cnt;
|
88
|
+
float ggs_min_end_frac;
|
89
|
+
// scoring for SW check
|
90
|
+
float ggs_max_iden, ggs_min_inv_iden;
|
91
|
+
} mg_ggopt_t;
|
92
|
+
|
93
|
+
typedef struct { // index handle: returned by mg_index(), accepted by mg_idx_destroy(), mg_opt_update(), mg_map() and mg_map_frag()
|
94
|
+
const gfa_t *g; // graph pointer, const-qualified in this struct
|
95
|
+
gfa_edseq_t *es;
|
96
|
+
int32_t b, w, k, flag, n_seg; // NOTE(review): w/k presumably mirror mg_idxopt_t.w/k and b its bucket_bits — confirm
|
97
|
+
struct mg_idx_bucket_s *B; // index (hidden); struct mg_idx_bucket_s is not defined in this header
|
98
|
+
} mg_idx_t;
|
99
|
+
|
100
|
+
typedef struct { // NOTE(review): presumably one linear chain record — confirm against lchain.c
|
101
|
+
int32_t off, cnt:31, inner_pre:1; // cnt and inner_pre are bit-fields (31 + 1 bits); off is a plain int32_t
|
102
|
+
uint32_t v;
|
103
|
+
int32_t rs, re, qs, qe; // NOTE(review): presumably reference start/end and query start/end — confirm
|
104
|
+
int32_t score, dist_pre;
|
105
|
+
uint32_t hash_pre;
|
106
|
+
} mg_lchain_t;
|
107
|
+
|
108
|
+
typedef struct { // element type of mg_gchains_t.lc
|
109
|
+
int32_t off, cnt; // NOTE(review): presumably an offset/count pair into an anchor array — confirm
|
110
|
+
uint32_t v;
|
111
|
+
int32_t score;
|
112
|
+
int32_t ed; // NOTE(review): presumably an edit distance — confirm
|
113
|
+
} mg_llchain_t;
|
114
|
+
|
115
|
+
typedef struct { // alignment record pointed to by mg_gchain_t.p; ends in a flexible array member
|
116
|
+
int32_t n_cigar, mlen, blen, aplen, ss, ee; // ss: start on the start vertex; ee: end on the end vertex
|
117
|
+
uint64_t cigar[]; // flexible array member: sizeof(mg_cigar_t) excludes it; NOTE(review): length presumably n_cigar — confirm
|
118
|
+
} mg_cigar_t;
|
119
|
+
|
120
|
+
typedef struct { // element type of mg_gchains_t.gc
|
121
|
+
int32_t id, parent;
|
122
|
+
int32_t off, cnt;
|
123
|
+
int32_t n_anchor, score;
|
124
|
+
int32_t qs, qe; // NOTE(review): presumably query start/end — confirm
|
125
|
+
int32_t plen, ps, pe; // NOTE(review): presumably path length and path start/end — confirm
|
126
|
+
int32_t blen, mlen;
|
127
|
+
float div;
|
128
|
+
uint32_t hash;
|
129
|
+
int32_t subsc, n_sub;
|
130
|
+
uint32_t mapq:8, flt:1, dummy:23; // three bit-fields totalling 32 bits (8 + 1 + 23)
|
131
|
+
mg_cigar_t *p; // NOTE(review): presumably NULL when no base alignment was produced — confirm
|
132
|
+
} mg_gchain_t;
|
133
|
+
|
134
|
+
typedef struct { // mapping result: returned by mg_map() and filled per segment through the gcs argument of mg_map_frag()
|
135
|
+
void *km; // NOTE(review): presumably the kalloc handle that owns the arrays below — confirm
|
136
|
+
int32_t n_gc, n_lc, n_a, rep_len; // NOTE(review): n_gc/n_lc/n_a presumably give the lengths of gc/lc/a — confirm
|
137
|
+
mg_gchain_t *gc;
|
138
|
+
mg_llchain_t *lc;
|
139
|
+
mg128_t *a; // minimizer positions; see comments above mg_update_anchors() for details
|
140
|
+
} mg_gchains_t;
|
141
|
+
|
142
|
+
typedef struct mg_tbuf_s mg_tbuf_t;
|
143
|
+
|
144
|
+
extern int mg_verbose, mg_dbg_flag;
|
145
|
+
extern double mg_realtime0;
|
146
|
+
|
147
|
+
#ifdef __cplusplus
|
148
|
+
extern "C" {
|
149
|
+
#endif
|
150
|
+
|
151
|
+
// options
|
152
|
+
int mg_opt_set(const char *preset, mg_idxopt_t *io, mg_mapopt_t *mo, mg_ggopt_t *go);
|
153
|
+
int mg_opt_check(const mg_idxopt_t *io, const mg_mapopt_t *mo, const mg_ggopt_t *go);
|
154
|
+
void mg_opt_update(const mg_idx_t *gi, mg_mapopt_t *mo, mg_ggopt_t *go);
|
155
|
+
|
156
|
+
// index operations
|
157
|
+
mg_idx_t *mg_index(gfa_t *g, const mg_idxopt_t *io, int n_threads, mg_mapopt_t *mo); // combine mg_index_core() and mg_opt_update()
|
158
|
+
void mg_idx_destroy(mg_idx_t *gi);
|
159
|
+
|
160
|
+
// mapping
|
161
|
+
mg_tbuf_t *mg_tbuf_init(void);
|
162
|
+
void mg_tbuf_destroy(mg_tbuf_t *b);
|
163
|
+
mg_gchains_t *mg_map(const mg_idx_t *gi, int qlen, const char *seq, mg_tbuf_t *b, const mg_mapopt_t *opt, const char *qname);
|
164
|
+
void mg_map_frag(const mg_idx_t *gi, int n_segs, const int *qlens, const char **seqs, mg_gchains_t **gcs, mg_tbuf_t *b, const mg_mapopt_t *opt, const char *qname);
|
165
|
+
|
166
|
+
// high-level mapping APIs
|
167
|
+
int mg_map_files(gfa_t *g, int n_fn, const char **fn, const mg_idxopt_t *ipt, const mg_mapopt_t *opt0, int n_threads);
|
168
|
+
|
169
|
+
// graph generation
|
170
|
+
int mg_ggen(gfa_t *g, int32_t n_fn, const char **fn, const mg_idxopt_t *ipt, const mg_mapopt_t *opt0, const mg_ggopt_t *go, int n_threads);
|
171
|
+
|
172
|
+
#ifdef __cplusplus
|
173
|
+
}
|
174
|
+
#endif
|
175
|
+
|
176
|
+
#endif
|