ruby-minigraph 0.0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/ext/Rakefile +56 -0
- data/ext/cmappy/cmappy.c +7 -0
- data/ext/cmappy/cmappy.h +8 -0
- data/ext/minigraph/LICENSE.txt +23 -0
- data/ext/minigraph/Makefile +66 -0
- data/ext/minigraph/NEWS.md +317 -0
- data/ext/minigraph/README.md +207 -0
- data/ext/minigraph/algo.c +194 -0
- data/ext/minigraph/algo.h +33 -0
- data/ext/minigraph/asm-call.c +147 -0
- data/ext/minigraph/bseq.c +133 -0
- data/ext/minigraph/bseq.h +76 -0
- data/ext/minigraph/cal_cov.c +139 -0
- data/ext/minigraph/doc/example1.png +0 -0
- data/ext/minigraph/doc/example2.png +0 -0
- data/ext/minigraph/doc/examples.graffle +0 -0
- data/ext/minigraph/format.c +241 -0
- data/ext/minigraph/galign.c +140 -0
- data/ext/minigraph/gchain1.c +532 -0
- data/ext/minigraph/gcmisc.c +223 -0
- data/ext/minigraph/gfa-aug.c +260 -0
- data/ext/minigraph/gfa-base.c +526 -0
- data/ext/minigraph/gfa-bbl.c +372 -0
- data/ext/minigraph/gfa-ed.c +617 -0
- data/ext/minigraph/gfa-io.c +395 -0
- data/ext/minigraph/gfa-priv.h +154 -0
- data/ext/minigraph/gfa.h +166 -0
- data/ext/minigraph/ggen.c +182 -0
- data/ext/minigraph/ggen.h +21 -0
- data/ext/minigraph/ggsimple.c +570 -0
- data/ext/minigraph/gmap.c +211 -0
- data/ext/minigraph/index.c +230 -0
- data/ext/minigraph/kalloc.c +224 -0
- data/ext/minigraph/kalloc.h +82 -0
- data/ext/minigraph/kavl.h +414 -0
- data/ext/minigraph/kdq.h +134 -0
- data/ext/minigraph/ketopt.h +116 -0
- data/ext/minigraph/khashl.h +348 -0
- data/ext/minigraph/krmq.h +474 -0
- data/ext/minigraph/kseq.h +256 -0
- data/ext/minigraph/ksort.h +164 -0
- data/ext/minigraph/kstring.h +165 -0
- data/ext/minigraph/kthread.c +159 -0
- data/ext/minigraph/kthread.h +15 -0
- data/ext/minigraph/kvec-km.h +105 -0
- data/ext/minigraph/kvec.h +110 -0
- data/ext/minigraph/lchain.c +441 -0
- data/ext/minigraph/main.c +301 -0
- data/ext/minigraph/map-algo.c +500 -0
- data/ext/minigraph/mgpriv.h +128 -0
- data/ext/minigraph/minigraph.1 +359 -0
- data/ext/minigraph/minigraph.h +176 -0
- data/ext/minigraph/miniwfa.c +834 -0
- data/ext/minigraph/miniwfa.h +95 -0
- data/ext/minigraph/misc/mgutils.js +1451 -0
- data/ext/minigraph/misc.c +12 -0
- data/ext/minigraph/options.c +134 -0
- data/ext/minigraph/shortk.c +251 -0
- data/ext/minigraph/sketch.c +109 -0
- data/ext/minigraph/sys.c +147 -0
- data/ext/minigraph/sys.h +20 -0
- data/ext/minigraph/test/MT-chimp.fa +277 -0
- data/ext/minigraph/test/MT-human.fa +239 -0
- data/ext/minigraph/test/MT-orangA.fa +276 -0
- data/ext/minigraph/test/MT.gfa +19 -0
- data/ext/minigraph/tex/Makefile +13 -0
- data/ext/minigraph/tex/minigraph.bib +676 -0
- data/ext/minigraph/tex/minigraph.tex +986 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
- data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
- data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
- data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
- data/ext/minigraph/tex/plots/bedutils.js +367 -0
- data/ext/minigraph/tex/plots/chr-plot.js +130 -0
- data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
- data/ext/minigraph.patch +21 -0
- data/lib/minigraph/ffi/constants.rb +230 -0
- data/lib/minigraph/ffi/functions.rb +70 -0
- data/lib/minigraph/ffi/mappy.rb +8 -0
- data/lib/minigraph/ffi.rb +27 -0
- data/lib/minigraph/version.rb +5 -0
- data/lib/minigraph.rb +72 -0
- metadata +159 -0
@@ -0,0 +1,359 @@
|
|
1
|
+
.TH minigraph 1 "20 November 2022" "minigraph-0.20 (r559)" "Bioinformatics tools"
|
2
|
+
|
3
|
+
.SH NAME
|
4
|
+
.PP
|
5
|
+
minigraph - sequence-to-graph mapping and incremental sequence graph generation
|
6
|
+
|
7
|
+
.SH SYNOPSIS
|
8
|
+
* Sequence-to-graph mapping:
|
9
|
+
.RS 4
|
10
|
+
.B minigraph
|
11
|
+
.RB [ -x
|
12
|
+
.IR preset ]
|
13
|
+
.RB [ -c ]
|
14
|
+
.RB [ -t
|
15
|
+
.IR nThreads ]
|
16
|
+
.I graph.gfa
|
17
|
+
.I query1.fa
|
18
|
+
.RI [ ... ]
|
19
|
+
.B >
|
20
|
+
.I out.gaf
|
21
|
+
.RE
|
22
|
+
|
23
|
+
* Incremental graph generation:
|
24
|
+
.RS 4
|
25
|
+
.B minigraph
|
26
|
+
.B -x ggs
|
27
|
+
.RB [ -c ]
|
28
|
+
.RB [ -t
|
29
|
+
.IR nThreads ]
|
30
|
+
.I initGraph.gfa
|
31
|
+
.I sample1Asm.fa
|
32
|
+
.RI [ ... ]
|
33
|
+
.B >
|
34
|
+
.I finalGraph.gfa
|
35
|
+
|
36
|
+
.SH DESCRIPTION
|
37
|
+
|
38
|
+
Minigraph is a
|
39
|
+
.I proof-of-concept
|
40
|
+
sequence-to-graph mapper and graph constructor. It finds approximate locations
|
41
|
+
of a query sequence in a sequence graph and incrementally augments an existing
|
42
|
+
graph with long query subsequences.
|
43
|
+
|
44
|
+
.SH OPTIONS
|
45
|
+
.SS Indexing options
|
46
|
+
.TP 10
|
47
|
+
.BI -k \ INT
|
48
|
+
Minimizer k-mer length [17]
|
49
|
+
.TP
|
50
|
+
.BI -w \ INT
|
51
|
+
Minimizer window size [11]. A minimizer is the smallest k-mer in a window of w
|
52
|
+
consecutive k-mers.
|
53
|
+
.SS Mapping options
|
54
|
+
.TP 10
|
55
|
+
.BI -c
|
56
|
+
Perform base alignment; recommended for graph generation
|
57
|
+
.TP 10
|
58
|
+
.BI -U \ INT1 [, INT2 ]
|
59
|
+
Choose the minimizer occurrence threshold within this interval [50,250]
|
60
|
+
.TP
|
61
|
+
.BI -f \ FLOAT
|
62
|
+
Ignore top
|
63
|
+
.I FLOAT
|
64
|
+
fraction of repetitive minimizers [0.0002]. If this threshold falls within the
|
65
|
+
interval set by
|
66
|
+
.BR -U ,
|
67
|
+
it will be the final threshold; otherwise the lower or the upper bound of
|
68
|
+
.B -U
|
69
|
+
will be applied.
|
70
|
+
.TP
|
71
|
+
.BI -j \ FLOAT
|
72
|
+
Expected query-graph sequence divergence [0.1]
|
73
|
+
.TP
|
74
|
+
.BI -g \ NUM
|
75
|
+
Stop chain elongation if there are no minimizers within
|
76
|
+
.IR NUM -bp
|
77
|
+
[10k]. K/k/M/m suffixes are recognized.
|
78
|
+
.TP
|
79
|
+
.BI -r \ NUM1 [, NUM2 ]
|
80
|
+
Bandwidth for the two rounds of chaining [500,20k].
|
81
|
+
.I NUM2
|
82
|
+
also controls bandwidth for graph chaining.
|
83
|
+
.TP
|
84
|
+
.BI -n \ INT1 [, INT2 ]
|
85
|
+
Drop graph chains consisting of
|
86
|
+
.RI < INT1
|
87
|
+
minimizers and drop linear chains consisting of
|
88
|
+
.RI < INT2
|
89
|
+
minimizers [5,3]
|
90
|
+
.TP
|
91
|
+
.BI -m \ INT1 [, INT2 ]
|
92
|
+
Drop graph chains with graph chaining score
|
93
|
+
.RI < INT1
|
94
|
+
and drop linear chains with linear chaining score
|
95
|
+
.RI < INT2
|
96
|
+
[50,30]. Linear chaining score equals the approximate number of matching bases
|
97
|
+
minus a weak concave gap penalty. Graph chaining score uses a linear gap
|
98
|
+
penalty.
|
99
|
+
.TP
|
100
|
+
.BI -p \ FLOAT
|
101
|
+
Minimal secondary-to-primary score ratio to output secondary mappings [0.8].
|
102
|
+
Between two chains overlapping over half of the shorter chain (controlled by
|
103
|
+
.BR -M ),
|
104
|
+
the chain with a lower score is secondary to the chain with a higher score.
|
105
|
+
.TP
|
106
|
+
.BI -N \ INT
|
107
|
+
Output at most
|
108
|
+
.I INT
|
109
|
+
secondary mappings [5]. This option has no effect when
|
110
|
+
.B -P
|
111
|
+
is applied.
|
112
|
+
.TP
|
113
|
+
.B -P
|
114
|
+
Retain all chains and don't attempt to set primary chains. Options
|
115
|
+
.B -p
|
116
|
+
and
|
117
|
+
.B -N
|
118
|
+
have no effect when this option is in use.
|
119
|
+
.TP
|
120
|
+
.BI -M \ FLOAT
|
121
|
+
Mark as secondary a chain that overlaps with a better chain by
|
122
|
+
.I FLOAT
|
123
|
+
or more of the shorter chain [0.5]
|
124
|
+
.TP
|
125
|
+
.BI --max-gap-pre \ NUM
|
126
|
+
Similar to
|
127
|
+
.B -g
|
128
|
+
but used for prefiltering [1000]
|
129
|
+
.TP
|
130
|
+
.BI --max-lc-iter \ NUM
|
131
|
+
max number of iterations for linear chaining [10000]
|
132
|
+
.TP
|
133
|
+
.BI --max-rmq-size \ NUM
|
134
|
+
max size of the RMQ tree [100000]
|
135
|
+
.TP
|
136
|
+
.BI --max-lc-skip \ INT
|
137
|
+
A heuristic that stops linear chaining early [25]
|
138
|
+
.TP
|
139
|
+
.BI --max-gc-skip \ INT
|
140
|
+
Similar to
|
141
|
+
.B --max-lc-skip
|
142
|
+
but applied to graph chaining [25]
|
143
|
+
.TP
|
144
|
+
.BI --ref-bonus \ INT
|
145
|
+
Bonus for a reference subwalk [0]
|
146
|
+
.TP
|
147
|
+
.BI --min-cov-blen \ NUM
|
148
|
+
Minimum alignment block length to count [1k]
|
149
|
+
.TP
|
150
|
+
.BI --min-cov-mapq \ INT
|
151
|
+
Minimum mapping quality to count [20]
|
152
|
+
.SS Graph generation options
|
153
|
+
.TP 10
|
154
|
+
.BR --ggen =[ simple ]
|
155
|
+
Graph generation algorithm. So far only a
|
156
|
+
.B simple
|
157
|
+
algorithm is implemented [simple]. With this option, all query sequences are
|
158
|
+
loaded into memory.
|
159
|
+
.TP
|
160
|
+
.B --call
|
161
|
+
Call the graph path in each bubble and output in a BED-based format:
|
162
|
+
.RS
|
163
|
+
ctg start end sourceNode sinkNode walk:strand:queryName:qStart:qEnd
|
164
|
+
.RE
|
165
|
+
.TP
|
166
|
+
.BI -q \ INT
|
167
|
+
Minimum mapping quality [5]
|
168
|
+
.TP
|
169
|
+
.BI -l \ NUM
|
170
|
+
Minimum chain length to consider [100k]
|
171
|
+
.TP
|
172
|
+
.BI -d \ NUM
|
173
|
+
Minimum chain length for depth calculation [20k]
|
174
|
+
.TP
|
175
|
+
.BI -L \ INT
|
176
|
+
Minimum insertion length [50]
|
177
|
+
.TP
|
178
|
+
.BI --gg-match-pen \ INT
|
179
|
+
Penalty for a pair of matching anchors [5]. Larger value for more fragmented inserts.
|
180
|
+
Effective without
|
181
|
+
.BR -c .
|
182
|
+
.TP
|
183
|
+
.BR --ins-qovlp = yes | no
|
184
|
+
Forcefully resolve query overlaps [no]. Effective without
|
185
|
+
.BR -c .
|
186
|
+
.TP
|
187
|
+
.BR --inv = yes | no
|
188
|
+
Generate graphs with inversions or not [yes]
|
189
|
+
.TP
|
190
|
+
.B --cov
|
191
|
+
Remap and generate segment and link use frequencies. This option triggers GFA
|
192
|
+
output. When used with
|
193
|
+
.BR --ggen ,
|
194
|
+
minigraph writes the frequency of link uses and the average breadth of coverage
|
195
|
+
of each segment to the
|
196
|
+
.B cf
|
197
|
+
tag. When used without
|
198
|
+
.BR --ggen ,
|
199
|
+
minigraph writes the count of link uses and the average depth of coverage of
|
200
|
+
each segment to the
|
201
|
+
.B dc
|
202
|
+
tag.
|
203
|
+
.B
|
204
|
+
WARNING:
|
205
|
+
THIS OPTION IS DEPRECATED AND MAY BE REMOVED IN THE FUTURE.
|
206
|
+
.SS Input/output options
|
207
|
+
.TP 10
|
208
|
+
.BI -o \ FILE
|
209
|
+
Output alignments to
|
210
|
+
.I FILE
|
211
|
+
[stdout].
|
212
|
+
.TP
|
213
|
+
.BI -t \ INT
|
214
|
+
Number of threads [4]. Minigraph uses at most three threads when indexing target
|
215
|
+
sequences, and uses up to
|
216
|
+
.IR INT +1
|
217
|
+
threads when mapping (the extra thread is for I/O, which is frequently idle and
|
218
|
+
takes little CPU time).
|
219
|
+
.TP
|
220
|
+
.BI -K \ NUM
|
221
|
+
Number of bases loaded into memory to process in a mini-batch [500M].
|
222
|
+
K/M/G/k/m/g suffix is accepted. A large
|
223
|
+
.I NUM
|
224
|
+
helps load balancing in the multi-threading mode, at the cost of increased
|
225
|
+
memory. This option has no effect if
|
226
|
+
.B --ggen
|
227
|
+
is applied.
|
228
|
+
.TP
|
229
|
+
.B --vc
|
230
|
+
In output GAF, show mapping paths in the unstable segment coordinate.
|
231
|
+
.TP
|
232
|
+
.B -S
|
233
|
+
Output linear chains in the format of: `*' segName segLen nMinimizer seqDiv segStart segEnd qStart qEnd
|
234
|
+
.TP
|
235
|
+
.B --write-mz
|
236
|
+
Output linear chains in the format of: `*' segName segLen nMinimizer seqDiv segStart segEnd qStart qEnd
|
237
|
+
k-mer segOffsets qOffsets. segOffsets and qOffsets are comma-separated lists
|
238
|
+
with each consisting of nMinimizer-1 integers which give the distance from the
|
239
|
+
previous minimizer on segments and query, respectively.
|
240
|
+
.TP
|
241
|
+
.BR --secondary = yes | no
|
242
|
+
Whether to output secondary alignments [no]
|
243
|
+
.TP
|
244
|
+
.BR --show-unmap = yes | no
|
245
|
+
Print unmapped query sequences in GAF [no]
|
246
|
+
.TP
|
247
|
+
.B --version
|
248
|
+
Print version number to stdout
|
249
|
+
.SS Preset options
|
250
|
+
.TP 10
|
251
|
+
.BI -x \ STR
|
252
|
+
Preset []. This option applies multiple options at the same time. Other options
|
253
|
+
on the command line will always override values set by
|
254
|
+
.BR -x .
|
255
|
+
Available
|
256
|
+
.I STR
|
257
|
+
are:
|
258
|
+
.RS
|
259
|
+
.TP 8
|
260
|
+
.B lr
|
261
|
+
Mapping noisy long reads. This is the same as the default setting.
|
262
|
+
.TP
|
263
|
+
.B sr
|
264
|
+
Mapping short single-end or paired-end reads
|
265
|
+
.RB ( -k21
|
266
|
+
.B -w10 -U1000,2500 -g100 -r100 -p.5 -n3,2 -m40,25 --heap-sort=yes -K50m --frag --ref-bonus=1
|
267
|
+
.BR --min-cov-blen=50 ).
|
268
|
+
Paired-end mapping is not supported.
|
269
|
+
.TP
|
270
|
+
.B asm
|
271
|
+
Mapping long contigs or high-quality CCS reads
|
272
|
+
.RB ( -k19
|
273
|
+
.B -w10 -U10,100 -j.01 -g10k -r1k,150k -n5,5 -m1000,40 -K4g --max-lc-skip=50 --max-gc-skip=50 --min-cov-mapq=5
|
274
|
+
.BR --min-cov-blen=100k ).
|
275
|
+
.TP
|
276
|
+
.B ggs
|
277
|
+
Incremental graph generation
|
278
|
+
.RB ( -xasm
|
279
|
+
.B -N0
|
280
|
+
.BR --ggen=simple ).
|
281
|
+
.RE
|
282
|
+
.SS Miscellaneous options
|
283
|
+
.TP 10
|
284
|
+
.B --no-kalloc
|
285
|
+
Use the libc default allocator instead of the kalloc thread-local allocator.
|
286
|
+
This debugging option is mostly used with Valgrind to detect invalid memory
|
287
|
+
accesses. Minigraph runs slower with this option, especially in the
|
288
|
+
multi-threading mode.
|
289
|
+
.SH OUTPUT FORMAT
|
290
|
+
.PP
|
291
|
+
Minigraph outputs mapping positions in the Graph mApping Format (GAF) by
|
292
|
+
default. GAF is a TAB-delimited text format with each line consisting of at
|
293
|
+
least 12 fields as are described in the following table:
|
294
|
+
.TS
|
295
|
+
center box;
|
296
|
+
cb | cb | cb
|
297
|
+
r | c | l .
|
298
|
+
Col Type Description
|
299
|
+
_
|
300
|
+
1 string Query sequence name
|
301
|
+
2 int Query sequence length
|
302
|
+
3 int Query start coordinate (0-based; closed)
|
303
|
+
4 int Query end coordinate (0-based; open)
|
304
|
+
5 char `+' if query/path on the same strand; `-' if opposite
|
305
|
+
6 string Path matching /([><][^\\s><]+(:\\d+-\\d+)?)+|([^\\s><]+)/
|
306
|
+
7 int Path sequence length
|
307
|
+
8 int Path start coordinate
|
308
|
+
9 int Path end coordinate
|
309
|
+
10 int Number of matching bases in the mapping
|
310
|
+
11 int Number bases, including gaps, in the mapping
|
311
|
+
12 int Mapping quality (0-255 with 255 for missing)
|
312
|
+
.TE
|
313
|
+
|
314
|
+
.PP
|
315
|
+
When alignment is available, column 11 gives the total number of sequence
|
316
|
+
matches, mismatches and gaps in the alignment; column 10 divided by column 11
|
317
|
+
gives the BLAST-like alignment identity. When alignment is unavailable,
|
318
|
+
these two columns are approximate. GAF may optionally have additional fields in
|
319
|
+
the SAM-like typed key-value format. Minigraph may output the following tags:
|
320
|
+
.TS
|
321
|
+
center box;
|
322
|
+
cb | cb | cb
|
323
|
+
r | c | l .
|
324
|
+
Tag Type Description
|
325
|
+
_
|
326
|
+
tp A Type of aln: P/primary and S/secondary
|
327
|
+
cm i Number of minimizers on the chain
|
328
|
+
s1 i Chaining score
|
329
|
+
s2 i Chaining score of the best secondary chain
|
330
|
+
dv f Approximate per-base sequence divergence
|
331
|
+
cf f Avg. segment breadth of coverage and link use freq
|
332
|
+
dc f Avg. segment depth of coverage and link use counts
|
333
|
+
cg Z CIGAR string
|
334
|
+
ql B,i Lengths of single-end reads
|
335
|
+
.TE
|
336
|
+
|
337
|
+
.SH LIMITATIONS
|
338
|
+
.TP 2
|
339
|
+
*
|
340
|
+
Minigraph needs to find strong colinear chains first. For a graph consisting of
|
341
|
+
many short segments (e.g. one generated from rare SNPs in large populations),
|
342
|
+
minigraph will fail to map query sequences.
|
343
|
+
.TP
|
344
|
+
*
|
345
|
+
When connecting colinear chains on graphs, minigraph doesn't always take full
|
346
|
+
advantage of base sequences and may miss the optimal alignments.
|
347
|
+
.TP
|
348
|
+
*
|
349
|
+
Minigraph only inserts segments contained in long graph chains. This
|
350
|
+
conservative strategy helps to build a relatively accurate graph, but may miss
|
351
|
+
more complex events. Other strategies may be explored in the future.
|
352
|
+
.TP
|
353
|
+
*
|
354
|
+
Base alignment has only been evaluated for human. For more diverse genomes,
|
355
|
+
the performance may need to be improved.
|
356
|
+
|
357
|
+
.SH SEE ALSO
|
358
|
+
.PP
|
359
|
+
minimap2(1), gfatools(1).
|
@@ -0,0 +1,176 @@
|
|
1
|
+
#ifndef MINIGRAPH_H
|
2
|
+
#define MINIGRAPH_H
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
#include "gfa.h"
|
6
|
+
|
7
|
+
#define MG_VERSION "0.20-r559"
|
8
|
+
|
9
|
+
#define MG_M_SPLICE 0x10
|
10
|
+
#define MG_M_SR 0x20
|
11
|
+
#define MG_M_FRAG_MODE 0x40
|
12
|
+
#define MG_M_FRAG_MERGE 0x80
|
13
|
+
#define MG_M_FOR_ONLY 0x100
|
14
|
+
#define MG_M_REV_ONLY 0x200
|
15
|
+
#define MG_M_HEAP_SORT 0x400
|
16
|
+
#define MG_M_VERTEX_COOR 0x800
|
17
|
+
#define MG_M_ALL_CHAINS 0x1000
|
18
|
+
#define MG_M_PRINT_2ND 0x2000
|
19
|
+
#define MG_M_CAL_COV 0x4000
|
20
|
+
#define MG_M_RMQ 0x8000
|
21
|
+
#define MG_M_COPY_COMMENT 0x10000
|
22
|
+
#define MG_M_INDEPEND_SEG 0x20000
|
23
|
+
#define MG_M_NO_QUAL 0x40000
|
24
|
+
#define MG_M_2_IO_THREADS 0x80000
|
25
|
+
#define MG_M_SHOW_UNMAP 0x100000
|
26
|
+
#define MG_M_NO_COMP_PATH 0x200000
|
27
|
+
#define MG_M_NO_DIAG 0x400000
|
28
|
+
#define MG_M_WRITE_LCHAIN 0x800000
|
29
|
+
#define MG_M_WRITE_MZ 0x1000000
|
30
|
+
#define MG_M_SKIP_GCHECK 0x2000000
|
31
|
+
#define MG_M_CIGAR 0x4000000
|
32
|
+
|
33
|
+
#define MG_G_NONE 0
|
34
|
+
#define MG_G_GGSIMPLE 1
|
35
|
+
|
36
|
+
#define MG_G_NO_QOVLP 0x1
|
37
|
+
#define MG_G_CAL_COV 0x2
|
38
|
+
#define MG_G_NO_INV 0x4
|
39
|
+
#define MG_G_CALL 0x8
|
40
|
+
|
41
|
+
/* A pair of unsigned 64-bit words: x is stored first, y second. */
typedef struct {
	uint64_t x;
	uint64_t y;
} mg128_t;
|
42
|
+
/*
 * Array descriptor over mg128_t elements.
 * NOTE(review): n and m are presumably the used count and the allocated
 * capacity (kvec style) -- confirm against the code that grows `a`.
 */
typedef struct {
	size_t n;
	size_t m;
	mg128_t *a;
} mg128_v;
|
43
|
+
/*
 * Array descriptor over uint32_t elements; both counters are signed 32-bit.
 * NOTE(review): n and m are presumably the used count and the allocated
 * capacity (kvec style) -- confirm against the code that grows `a`.
 */
typedef struct {
	int32_t n;
	int32_t m;
	uint32_t *a;
} mg32_v;
|
44
|
+
/*
 * Array descriptor over uint64_t elements; both counters are signed 32-bit.
 * NOTE(review): n and m are presumably the used count and the allocated
 * capacity (kvec style) -- confirm against the code that grows `a`.
 */
typedef struct {
	int32_t n;
	int32_t m;
	uint64_t *a;
} mg64_v;
|
45
|
+
|
46
|
+
// Indexing options. Taken by mg_opt_set(), mg_opt_check(), mg_index(), mg_map_files() and mg_ggen() declared in this header.
typedef struct {
|
47
|
+
int w, k; // NOTE(review): presumably the -w window size and -k k-mer length from minigraph.1 -- confirm in options.c
|
48
|
+
int bucket_bits; // NOTE(review): meaning not visible in this header -- confirm against index.c
|
49
|
+
} mg_idxopt_t;
|
50
|
+
|
51
|
+
// Mapping options. Taken by mg_opt_set(), mg_opt_check(), mg_opt_update(), mg_index(), mg_map(), mg_map_frag(), mg_map_files() and mg_ggen().
// NOTE(review): the option letters mentioned below are matched to fields by name against minigraph.1 only -- confirm in options.c/main.c.
typedef struct {
|
52
|
+
uint64_t flag; // NOTE(review): presumably a bitwise OR of the MG_M_* macros defined in this header -- confirm
|
53
|
+
int64_t mini_batch_size; // NOTE(review): likely the -K mini-batch size -- confirm
|
54
|
+
int seed;
|
55
|
+
int max_qlen;
|
56
|
+
int pe_ori;
|
57
|
+
int occ_max1, occ_max1_cap; // NOTE(review): likely the two -U occurrence bounds -- confirm
|
58
|
+
float occ_max1_frac; // NOTE(review): likely the -f fraction -- confirm
|
59
|
+
int bw, bw_long; // NOTE(review): likely the two -r chaining bandwidths -- confirm
|
60
|
+
int rmq_size_cap; // NOTE(review): possibly set by --max-rmq-size -- confirm
|
61
|
+
int rmq_rescue_size;
|
62
|
+
float rmq_rescue_ratio;
|
63
|
+
int max_gap_pre, max_gap, max_gap_ref, max_frag_len; // NOTE(review): max_gap_pre/max_gap look like --max-gap-pre and -g -- confirm
|
64
|
+
float div; // NOTE(review): likely the -j expected divergence -- confirm
|
65
|
+
float chn_pen_gap, chn_pen_skip;
|
66
|
+
int max_lc_skip, max_lc_iter, max_gc_skip; // names match --max-lc-skip, --max-lc-iter and --max-gc-skip in minigraph.1
|
67
|
+
int min_lc_cnt, min_lc_score; // NOTE(review): likely the linear-chain thresholds (INT2 of -n and -m) -- confirm
|
68
|
+
int min_gc_cnt, min_gc_score; // NOTE(review): likely the graph-chain thresholds (INT1 of -n and -m) -- confirm
|
69
|
+
int gdp_max_ed, lc_max_trim, lc_max_occ;
|
70
|
+
float mask_level; // NOTE(review): likely -M -- confirm
|
71
|
+
int sub_diff;
|
72
|
+
int best_n; // NOTE(review): likely -N -- confirm
|
73
|
+
float pri_ratio; // NOTE(review): likely -p -- confirm
|
74
|
+
int ref_bonus; // name matches --ref-bonus in minigraph.1
|
75
|
+
int64_t cap_kalloc;
|
76
|
+
int min_cov_mapq, min_cov_blen; // names match --min-cov-mapq and --min-cov-blen in minigraph.1
|
77
|
+
} mg_mapopt_t;
|
78
|
+
|
79
|
+
// Graph-generation options. Taken by mg_opt_set(), mg_opt_check(), mg_opt_update() and mg_ggen().
// NOTE(review): the option letters mentioned below are matched to fields by name against minigraph.1 only -- confirm in options.c/main.c.
typedef struct {
|
80
|
+
uint64_t flag; // NOTE(review): presumably a bitwise OR of the MG_G_NO_QOVLP/MG_G_CAL_COV/MG_G_NO_INV/MG_G_CALL bits -- confirm
|
81
|
+
int algo; // NOTE(review): presumably MG_G_NONE or MG_G_GGSIMPLE -- confirm
|
82
|
+
int min_mapq; // NOTE(review): likely -q -- confirm
|
83
|
+
int min_map_len, min_depth_len; // NOTE(review): likely -l and -d -- confirm
|
84
|
+
int min_var_len, match_pen; // NOTE(review): likely -L and --gg-match-pen -- confirm
|
85
|
+
// parameters specific to ggsimple/ggs
|
86
|
+
int ggs_shrink_pen;
|
87
|
+
int ggs_min_end_cnt;
|
88
|
+
float ggs_min_end_frac;
|
89
|
+
// scoring for SW check
|
90
|
+
float ggs_max_iden, ggs_min_inv_iden;
|
91
|
+
} mg_ggopt_t;
|
92
|
+
|
93
|
+
// Graph index handle: returned by mg_index(), passed to mg_idx_destroy(), and read by mg_opt_update(), mg_map() and mg_map_frag().
typedef struct {
|
94
|
+
const gfa_t *g; // graph being indexed; held through a const pointer
|
95
|
+
gfa_edseq_t *es; // NOTE(review): type comes from gfa.h; role not visible here -- confirm
|
96
|
+
int32_t b, w, k, flag, n_seg; // NOTE(review): w/k presumably mirror mg_idxopt_t and b its bucket_bits -- confirm in index.c
|
97
|
+
struct mg_idx_bucket_s *B; // index (hidden)
|
98
|
+
} mg_idx_t;
|
99
|
+
|
100
|
+
// Linear chain record.
// NOTE(review): field meanings are not documented in this header; the notes below are assumptions to confirm in lchain.c/gchain1.c.
typedef struct {
|
101
|
+
int32_t off, cnt:31, inner_pre:1; // NOTE(review): inner_pre is a 1-bit field on a signed base type, so a set bit may read back as -1 rather than 1 -- test it for zero/non-zero only
|
102
|
+
uint32_t v;
|
103
|
+
int32_t rs, re, qs, qe; // NOTE(review): presumably reference start/end and query start/end -- confirm
|
104
|
+
int32_t score, dist_pre;
|
105
|
+
uint32_t hash_pre;
|
106
|
+
} mg_lchain_t;
|
107
|
+
|
108
|
+
// Element type of the mg_gchains_t::lc array.
// NOTE(review): field meanings are not documented in this header -- confirm in gchain1.c.
typedef struct {
|
109
|
+
int32_t off, cnt; // NOTE(review): presumably an offset/count window into the mg_gchains_t::a array -- confirm
|
110
|
+
uint32_t v;
|
111
|
+
int32_t score;
|
112
|
+
int32_t ed; // NOTE(review): possibly an edit distance -- confirm
|
113
|
+
} mg_llchain_t;
|
114
|
+
|
115
|
+
// Variable-length alignment record: a fixed header followed by a flexible array of 64-bit entries.
typedef struct {
|
116
|
+
int32_t n_cigar, mlen, blen, aplen, ss, ee; // ss: start on the start vertex; ee: end on the end vertex
|
117
|
+
uint64_t cigar[]; // flexible array member; NOTE(review): n_cigar is presumably its element count -- confirm
|
118
|
+
} mg_cigar_t;
|
119
|
+
|
120
|
+
// Element type of the mg_gchains_t::gc array.
// NOTE(review): field meanings are not documented in this header; the notes below are assumptions to confirm in gchain1.c/format.c.
typedef struct {
|
121
|
+
int32_t id, parent;
|
122
|
+
int32_t off, cnt; // NOTE(review): presumably an offset/count window into the mg_gchains_t::lc array -- confirm
|
123
|
+
int32_t n_anchor, score;
|
124
|
+
int32_t qs, qe; // NOTE(review): presumably query start/end -- confirm
|
125
|
+
int32_t plen, ps, pe; // NOTE(review): presumably path length/start/end as in GAF columns 7-9 -- confirm
|
126
|
+
int32_t blen, mlen; // NOTE(review): presumably the GAF column 11/10 values -- confirm
|
127
|
+
float div;
|
128
|
+
uint32_t hash;
|
129
|
+
int32_t subsc, n_sub;
|
130
|
+
uint32_t mapq:8, flt:1, dummy:23; // three unsigned bit-fields totalling 32 bits
|
131
|
+
mg_cigar_t *p; // pointer to the variable-length mg_cigar_t record
|
132
|
+
} mg_gchain_t;
|
133
|
+
|
134
|
+
// Mapping result container: mg_map() returns a pointer to one, and mg_map_frag() fills an array of such pointers.
typedef struct {
|
135
|
+
void *km; // NOTE(review): presumably the kalloc handle that owns the arrays below -- confirm
|
136
|
+
int32_t n_gc, n_lc, n_a, rep_len; // NOTE(review): n_gc/n_lc/n_a are presumably the lengths of gc/lc/a -- confirm
|
137
|
+
mg_gchain_t *gc;
|
138
|
+
mg_llchain_t *lc;
|
139
|
+
mg128_t *a; // minimizer positions; see comments above mg_update_anchors() for details
|
140
|
+
} mg_gchains_t;
|
141
|
+
|
142
|
+
typedef struct mg_tbuf_s mg_tbuf_t;
|
143
|
+
|
144
|
+
extern int mg_verbose, mg_dbg_flag;
|
145
|
+
extern double mg_realtime0;
|
146
|
+
|
147
|
+
#ifdef __cplusplus
|
148
|
+
extern "C" {
|
149
|
+
#endif
|
150
|
+
|
151
|
+
// options
|
152
|
+
int mg_opt_set(const char *preset, mg_idxopt_t *io, mg_mapopt_t *mo, mg_ggopt_t *go);
|
153
|
+
int mg_opt_check(const mg_idxopt_t *io, const mg_mapopt_t *mo, const mg_ggopt_t *go);
|
154
|
+
void mg_opt_update(const mg_idx_t *gi, mg_mapopt_t *mo, mg_ggopt_t *go);
|
155
|
+
|
156
|
+
// index operations
|
157
|
+
mg_idx_t *mg_index(gfa_t *g, const mg_idxopt_t *io, int n_threads, mg_mapopt_t *mo); // combine mg_index_core() and mg_opt_update()
|
158
|
+
void mg_idx_destroy(mg_idx_t *gi);
|
159
|
+
|
160
|
+
// mapping
|
161
|
+
mg_tbuf_t *mg_tbuf_init(void);
|
162
|
+
void mg_tbuf_destroy(mg_tbuf_t *b);
|
163
|
+
mg_gchains_t *mg_map(const mg_idx_t *gi, int qlen, const char *seq, mg_tbuf_t *b, const mg_mapopt_t *opt, const char *qname);
|
164
|
+
void mg_map_frag(const mg_idx_t *gi, int n_segs, const int *qlens, const char **seqs, mg_gchains_t **gcs, mg_tbuf_t *b, const mg_mapopt_t *opt, const char *qname);
|
165
|
+
|
166
|
+
// high-level mapping APIs
|
167
|
+
int mg_map_files(gfa_t *g, int n_fn, const char **fn, const mg_idxopt_t *ipt, const mg_mapopt_t *opt0, int n_threads);
|
168
|
+
|
169
|
+
// graph generation
|
170
|
+
int mg_ggen(gfa_t *g, int32_t n_fn, const char **fn, const mg_idxopt_t *ipt, const mg_mapopt_t *opt0, const mg_ggopt_t *go, int n_threads);
|
171
|
+
|
172
|
+
#ifdef __cplusplus
|
173
|
+
}
|
174
|
+
#endif
|
175
|
+
|
176
|
+
#endif
|