minimap2 0.2.26.1 → 0.2.28.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +16 -16
- data/ext/Rakefile +1 -3
- data/ext/minimap2/NEWS.md +68 -1
- data/ext/minimap2/README.md +8 -5
- data/ext/minimap2/align.c +19 -5
- data/ext/minimap2/cookbook.md +2 -2
- data/ext/minimap2/format.c +83 -17
- data/ext/minimap2/index.c +1 -0
- data/ext/minimap2/lchain.c +5 -5
- data/ext/minimap2/main.c +14 -6
- data/ext/minimap2/minimap.h +4 -1
- data/ext/minimap2/minimap2.1 +60 -11
- data/ext/minimap2/misc/paftools.js +88 -36
- data/ext/minimap2/mmpriv.h +1 -2
- data/ext/minimap2/options.c +25 -7
- data/ext/minimap2/python/README.rst +3 -1
- data/ext/minimap2/python/cmappy.pxd +1 -0
- data/ext/minimap2/python/mappy.pyx +4 -2
- data/ext/minimap2/python/minimap2.py +5 -3
- data/ext/minimap2/seed.c +2 -1
- data/ext/minimap2/setup.py +1 -1
- data/lib/minimap2/aligner.rb +6 -3
- data/lib/minimap2/alignment.rb +2 -1
- data/lib/minimap2/ffi/constants.rb +5 -1
- data/lib/minimap2/ffi/functions.rb +16 -3
- data/lib/minimap2/ffi.rb +1 -0
- data/lib/minimap2/version.rb +1 -1
- data/lib/minimap2.rb +2 -2
- metadata +6 -6
data/ext/minimap2/minimap2.1
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
.TH minimap2 1 "
|
1
|
+
.TH minimap2 1 "12 March 2024" "minimap2-2.28 (r1209)" "Bioinformatics tools"
|
2
2
|
.SH NAME
|
3
3
|
.PP
|
4
4
|
minimap2 - mapping and alignment between collections of DNA sequences
|
@@ -268,6 +268,11 @@ or more of the shorter chain [0.5]
|
|
268
268
|
Use the minigraph chaining algorithm [no]. The minigraph algorithm is better
|
269
269
|
for aligning contigs through long INDELs.
|
270
270
|
.TP
|
271
|
+
.BI --rmq-inner \ NUM
|
272
|
+
Apply full dynamic programming for anchors within distance
|
273
|
+
.I NUM
|
274
|
+
[1000].
|
275
|
+
.TP
|
271
276
|
.B --hard-mask-level
|
272
277
|
Honor option
|
273
278
|
.B -M
|
@@ -343,6 +348,10 @@ Matching score [2]
|
|
343
348
|
.BI -B \ INT
|
344
349
|
Mismatching penalty [4]
|
345
350
|
.TP
|
351
|
+
.BI -b \ INT
|
352
|
+
Mismatching penalty for transitions [same as
|
353
|
+
.BR -B ].
|
354
|
+
.TP
|
346
355
|
.BI -O \ INT1[,INT2]
|
347
356
|
Gap open penalty [4,24]. If
|
348
357
|
.I INT2
|
@@ -356,10 +365,19 @@ costs
|
|
356
365
|
.RI min{ O1 + k * E1 , O2 + k * E2 }.
|
357
366
|
In the splice mode, the second gap penalties are not used.
|
358
367
|
.TP
|
368
|
+
.BI -J \ INT
|
369
|
+
Splice model [1]. 0 for the original minimap2 splice model that always penalizes non-GT-AG splicing;
|
370
|
+
1 for the miniprot model that considers non-GT-AG. Option
|
371
|
+
.B -C
|
372
|
+
has no effect with the default
|
373
|
+
.BR -J1 .
|
374
|
+
.BR -J0 .
|
375
|
+
.TP
|
359
376
|
.BI -C \ INT
|
360
377
|
Cost for a non-canonical GT-AG splicing (effective with
|
361
|
-
.
|
362
|
-
|
378
|
+
.B --splice
|
379
|
+
.BR -J0 )
|
380
|
+
[0].
|
363
381
|
.TP
|
364
382
|
.BI -z \ INT1[,INT2]
|
365
383
|
Truncate an alignment if the running alignment score drops too quickly along
|
@@ -450,7 +468,7 @@ Set 0 to disable [100m].
|
|
450
468
|
.BI --cap-kalloc \ NUM
|
451
469
|
Free thread-local kalloc memory reservoir if, after the alignment, the size of the reservoir is above
|
452
470
|
.IR NUM .
|
453
|
-
Set 0 to disable [
|
471
|
+
Set 0 to disable [500m].
|
454
472
|
.SS Input/output options
|
455
473
|
.TP 10
|
456
474
|
.B -a
|
@@ -506,6 +524,9 @@ Output =/X CIGAR operators for sequence match/mismatch.
|
|
506
524
|
.B -Y
|
507
525
|
In SAM output, use soft clipping for supplementary alignments.
|
508
526
|
.TP
|
527
|
+
.B --secondary-seq
|
528
|
+
In SAM output, show query sequences for secondary alignments.
|
529
|
+
.TP
|
509
530
|
.BI --seed \ INT
|
510
531
|
Integer seed for randomizing equally best hits. Minimap2 hashes
|
511
532
|
.I INT
|
@@ -566,15 +587,43 @@ are:
|
|
566
587
|
Align noisy long reads of ~10% error rate to a reference genome. This is the
|
567
588
|
default mode.
|
568
589
|
.TP
|
590
|
+
.B lr:hq
|
591
|
+
Align accurate long reads (error rate <1%) to a reference genome
|
592
|
+
.RB ( -k19
|
593
|
+
.B -w19 -U50,500
|
594
|
+
.BR -g10k ).
|
595
|
+
This was recommended by ONT developers for recent Nanopore reads
|
596
|
+
produced with chemistry v14 that can reach ~99% in accuracy.
|
597
|
+
It was shown to work better for accurate Nanopore reads
|
598
|
+
than
|
599
|
+
.BR map-hifi .
|
600
|
+
.TP
|
569
601
|
.B map-hifi
|
570
602
|
Align PacBio high-fidelity (HiFi) reads to a reference genome
|
571
|
-
.RB ( -
|
572
|
-
.B -
|
603
|
+
.RB ( -xlr:hq
|
604
|
+
.B -A1 -B4 -O6,26 -E2,1
|
573
605
|
.BR -s200 ).
|
606
|
+
It differs from
|
607
|
+
.B lr:hq
|
608
|
+
only in scoring. It has not been tested whether
|
609
|
+
.B lr:hq
|
610
|
+
would work better for PacBio HiFi reads.
|
574
611
|
.TP
|
575
612
|
.B map-pb
|
576
613
|
Align older PacBio continuous long (CLR) reads to a reference genome
|
577
614
|
.RB ( -Hk19 ).
|
615
|
+
Note that this data type is effectively deprecated by HiFi.
|
616
|
+
Unless you work on very old data, you probably want to use
|
617
|
+
.B map-hifi
|
618
|
+
or
|
619
|
+
.BR lr:hq .
|
620
|
+
.TP
|
621
|
+
.B map-iclr
|
622
|
+
Align Illumina Complete Long Reads (ICLR) to a reference genome
|
623
|
+
.RB ( -k19
|
624
|
+
.B -B6 -b4
|
625
|
+
.BR -O10,50 ).
|
626
|
+
This was recommended by Illumina developers.
|
578
627
|
.TP
|
579
628
|
.B asm5
|
580
629
|
Long assembly to reference mapping
|
@@ -582,21 +631,21 @@ Long assembly to reference mapping
|
|
582
631
|
.B -w19 -U50,500 --rmq -r1k,100k -g10k -A1 -B19 -O39,81 -E3,1 -s200 -z200
|
583
632
|
.BR -N50 ).
|
584
633
|
Typically, the alignment will not extend to regions with 5% or higher sequence
|
585
|
-
divergence.
|
634
|
+
divergence. Use this preset if the average divergence is not much higher than 0.1%.
|
586
635
|
.TP
|
587
636
|
.B asm10
|
588
637
|
Long assembly to reference mapping
|
589
638
|
.RB ( -k19
|
590
639
|
.B -w19 -U50,500 --rmq -r1k,100k -g10k -A1 -B9 -O16,41 -E2,1 -s200 -z200
|
591
640
|
.BR -N50 ).
|
592
|
-
|
641
|
+
Use this if the average divergence is around 1%.
|
593
642
|
.TP
|
594
643
|
.B asm20
|
595
644
|
Long assembly to reference mapping
|
596
645
|
.RB ( -k19
|
597
646
|
.B -w10 -U50,500 --rmq -r1k,100k -g10k -A1 -B4 -O6,26 -E2,1 -s200 -z200
|
598
647
|
.BR -N50 ).
|
599
|
-
|
648
|
+
Use this if the average divergence is around several percent.
|
600
649
|
.TP
|
601
650
|
.B splice
|
602
651
|
Long-read spliced alignment
|
@@ -612,13 +661,13 @@ costs are different during chaining; 4) the computation of the
|
|
612
661
|
tag ignores introns to demote hits to pseudogenes.
|
613
662
|
.TP
|
614
663
|
.B splice:hq
|
615
|
-
|
664
|
+
Spliced alignment for accurate long RNA-seq reads such as PacBio iso-seq
|
616
665
|
.RB ( -xsplice
|
617
666
|
.B -C5 -O6,24
|
618
667
|
.BR -B4 ).
|
619
668
|
.TP
|
620
669
|
.B sr
|
621
|
-
Short
|
670
|
+
Short-read alignment without splicing
|
622
671
|
.RB ( -k21
|
623
672
|
.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m25
|
624
673
|
.B -s40 -g100 -2K50m --heap-sort=yes
|
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env k8
|
2
2
|
|
3
|
-
var paftools_version = '2.
|
3
|
+
var paftools_version = '2.28-r1209';
|
4
4
|
|
5
5
|
/*****************************
|
6
6
|
***** Library functions *****
|
@@ -133,26 +133,50 @@ Interval.find_ovlp = function(a, st, en)
|
|
133
133
|
|
134
134
|
function fasta_read(fn)
|
135
135
|
{
|
136
|
-
var h = {},
|
136
|
+
var h = {}, seqlen = [];
|
137
|
+
var buf = new Bytes();
|
137
138
|
var file = fn == '-'? new File() : new File(fn);
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
if (
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
139
|
+
if (typeof k8_version == "undefined") { // for k8-0.x
|
140
|
+
var seq = null, name = null, gt = '>'.charCodeAt(0);
|
141
|
+
while (file.readline(buf) >= 0) {
|
142
|
+
if (buf[0] == gt) {
|
143
|
+
if (seq != null && name != null) {
|
144
|
+
seqlen.push([name, seq.length]);
|
145
|
+
h[name] = seq;
|
146
|
+
name = seq = null;
|
147
|
+
}
|
148
|
+
var m, line = buf.toString();
|
149
|
+
if ((m = /^>(\S+)/.exec(line)) != null) {
|
150
|
+
name = m[1];
|
151
|
+
seq = new Bytes();
|
152
|
+
}
|
153
|
+
} else seq.set(buf);
|
154
|
+
}
|
155
|
+
if (seq != null && name != null) {
|
156
|
+
seqlen.push([name, seq.length]);
|
157
|
+
h[name] = seq;
|
158
|
+
}
|
159
|
+
} else { // for k8-1.x
|
160
|
+
var seq = null, name = null;
|
161
|
+
while (file.readline(buf) >= 0) {
|
162
|
+
var line = buf.toString();
|
163
|
+
if (line[0] == ">") {
|
164
|
+
if (seq != null && name != null) {
|
165
|
+
seqlen.push([name, seq.length]);
|
166
|
+
h[name] = new Uint8Array(seq.buffer);
|
167
|
+
name = seq = null;
|
168
|
+
}
|
169
|
+
var m;
|
170
|
+
if ((m = /^>(\S+)/.exec(line)) != null) {
|
171
|
+
name = m[1];
|
172
|
+
seq = new Bytes();
|
173
|
+
}
|
174
|
+
} else seq.set(line);
|
175
|
+
}
|
176
|
+
if (seq != null && name != null) {
|
177
|
+
seqlen.push([name, seq.length]);
|
178
|
+
h[name] = new Uint8Array(seq.buffer);
|
179
|
+
}
|
156
180
|
}
|
157
181
|
buf.destroy();
|
158
182
|
file.close();
|
@@ -161,16 +185,27 @@ function fasta_read(fn)
|
|
161
185
|
|
162
186
|
function fasta_free(fa)
|
163
187
|
{
|
164
|
-
|
165
|
-
fa
|
188
|
+
if (typeof k8_version == "undefined")
|
189
|
+
for (var name in fa)
|
190
|
+
fa[name].destroy();
|
191
|
+
// FIXME: for k8-1.0, sequences are not freed. This is ok for now but not general.
|
166
192
|
}
|
167
193
|
|
168
194
|
Bytes.prototype.reverse = function()
|
169
195
|
{
|
170
|
-
|
171
|
-
var
|
172
|
-
|
173
|
-
|
196
|
+
if (typeof k8_version === "undefined") { // k8-0.x
|
197
|
+
for (var i = 0; i < this.length>>1; ++i) {
|
198
|
+
var tmp = this[i];
|
199
|
+
this[i] = this[this.length - i - 1];
|
200
|
+
this[this.length - i - 1] = tmp;
|
201
|
+
}
|
202
|
+
} else { // k8-1.x
|
203
|
+
var buf = new Uint8Array(this.buffer);
|
204
|
+
for (var i = 0; i < buf.length>>1; ++i) {
|
205
|
+
var tmp = buf[i];
|
206
|
+
buf[i] = buf[buf.length - i - 1];
|
207
|
+
buf[buf.length - i - 1] = tmp;
|
208
|
+
}
|
174
209
|
}
|
175
210
|
}
|
176
211
|
|
@@ -185,13 +220,24 @@ Bytes.prototype.revcomp = function()
|
|
185
220
|
for (var i = 0; i < s1.length; ++i)
|
186
221
|
Bytes.rctab[s1.charCodeAt(i)] = s2.charCodeAt(i);
|
187
222
|
}
|
188
|
-
|
189
|
-
var
|
190
|
-
|
191
|
-
|
223
|
+
if (typeof k8_version === "undefined") { // k8-0.x
|
224
|
+
for (var i = 0; i < this.length>>1; ++i) {
|
225
|
+
var tmp = this[this.length - i - 1];
|
226
|
+
this[this.length - i - 1] = Bytes.rctab[this[i]];
|
227
|
+
this[i] = Bytes.rctab[tmp];
|
228
|
+
}
|
229
|
+
if (this.length&1)
|
230
|
+
this[this.length>>1] = Bytes.rctab[this[this.length>>1]];
|
231
|
+
} else { // k8-1.x
|
232
|
+
var buf = new Uint8Array(this.buffer);
|
233
|
+
for (var i = 0; i < buf.length>>1; ++i) {
|
234
|
+
var tmp = buf[buf.length - i - 1];
|
235
|
+
buf[buf.length - i - 1] = Bytes.rctab[buf[i]];
|
236
|
+
buf[i] = Bytes.rctab[tmp];
|
237
|
+
}
|
238
|
+
if (buf.length&1)
|
239
|
+
buf[buf.length>>1] = Bytes.rctab[buf[buf.length>>1]];
|
192
240
|
}
|
193
|
-
if (this.length&1)
|
194
|
-
this[this.length>>1] = Bytes.rctab[this[this.length>>1]];
|
195
241
|
}
|
196
242
|
|
197
243
|
/********************
|
@@ -1694,15 +1740,17 @@ function paf_gff2bed(args)
|
|
1694
1740
|
|
1695
1741
|
function paf_sam2paf(args)
|
1696
1742
|
{
|
1697
|
-
var c, pri_only = false, long_cs = false;
|
1698
|
-
while ((c = getopt(args, "
|
1743
|
+
var c, pri_only = false, long_cs = false, pri_pri_only = false;
|
1744
|
+
while ((c = getopt(args, "pPL")) != null) {
|
1699
1745
|
if (c == 'p') pri_only = true;
|
1746
|
+
else if (c == 'P') pri_pri_only = pri_only = true;
|
1700
1747
|
else if (c == 'L') long_cs = true;
|
1701
1748
|
}
|
1702
1749
|
if (args.length == getopt.ind) {
|
1703
1750
|
print("Usage: paftools.js sam2paf [options] <in.sam>");
|
1704
1751
|
print("Options:");
|
1705
1752
|
print(" -p convert primary or supplementary alignments only");
|
1753
|
+
print(" -P convert primary alignments only");
|
1706
1754
|
print(" -L output the cs tag in the long form");
|
1707
1755
|
exit(1);
|
1708
1756
|
}
|
@@ -1729,6 +1777,7 @@ function paf_sam2paf(args)
|
|
1729
1777
|
throw Error("at line " + lineno + ": inconsistent SEQ and QUAL lengths - " + t[9].length + " != " + t[10].length);
|
1730
1778
|
if (t[2] == '*' || (flag&4) || t[5] == '*') continue;
|
1731
1779
|
if (pri_only && (flag&0x100)) continue;
|
1780
|
+
if (pri_pri_only && (flag&0x900)) continue;
|
1732
1781
|
var tlen = ctg_len[t[2]];
|
1733
1782
|
if (tlen == null) throw Error("at line " + lineno + ": can't find the length of contig " + t[2]);
|
1734
1783
|
// find tags
|
@@ -1841,7 +1890,10 @@ function paf_sam2paf(args)
|
|
1841
1890
|
// optional tags
|
1842
1891
|
var type = flag&0x100? 'S' : 'P';
|
1843
1892
|
var tags = ["tp:A:" + type];
|
1844
|
-
if (NM != null)
|
1893
|
+
if (NM != null) {
|
1894
|
+
tags.push("NM:i:"+NM);
|
1895
|
+
tags.push("mm:i:"+mm);
|
1896
|
+
}
|
1845
1897
|
tags.push("gn:i:"+(I[1]+D[1]), "go:i:"+(I[0]+D[0]), "cg:Z:" + t[5].replace(/\d+[SH]/g, ''));
|
1846
1898
|
if (cs_str != null) tags.push("cs:Z:" + cs_str);
|
1847
1899
|
else if (cs.length > 0) tags.push("cs:Z:" + cs.join(""));
|
@@ -2051,7 +2103,7 @@ function paf_mapeval(args)
|
|
2051
2103
|
warn("Usage: paftools.js mapeval [options] <in.paf>|<in.sam>");
|
2052
2104
|
warn("Options:");
|
2053
2105
|
warn(" -r FLOAT mapping correct if overlap_length/union_length>FLOAT [" + ovlp_ratio + "]");
|
2054
|
-
warn(" -Q INT print wrong mappings with mapQ
|
2106
|
+
warn(" -Q INT print wrong mappings with mapQ>=INT [don't print]");
|
2055
2107
|
warn(" -m INT 0: eval the longest aln only; 1: first aln only; 2: all primary aln [0]");
|
2056
2108
|
exit(1);
|
2057
2109
|
}
|
data/ext/minimap2/mmpriv.h
CHANGED
@@ -14,6 +14,7 @@
|
|
14
14
|
#define MM_DBG_PRINT_SEED 0x4
|
15
15
|
#define MM_DBG_PRINT_ALN_SEQ 0x8
|
16
16
|
#define MM_DBG_PRINT_CHAIN 0x10
|
17
|
+
#define MM_DBG_SEED_FREQ 0x20
|
17
18
|
|
18
19
|
#define MM_SEED_LONG_JOIN (1ULL<<40)
|
19
20
|
#define MM_SEED_IGNORE (1ULL<<41)
|
@@ -79,8 +80,6 @@ int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, ui
|
|
79
80
|
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
|
80
81
|
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand);
|
81
82
|
|
82
|
-
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale,
|
83
|
-
int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
|
84
83
|
mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
|
85
84
|
int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
|
86
85
|
mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
|
data/ext/minimap2/options.c
CHANGED
@@ -45,6 +45,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
|
45
45
|
opt->alt_drop = 0.15f;
|
46
46
|
|
47
47
|
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
|
48
|
+
opt->transition = 0;
|
48
49
|
opt->sc_ambi = 1;
|
49
50
|
opt->zdrop = 400, opt->zdrop_inv = 200;
|
50
51
|
opt->end_bonus = -1;
|
@@ -54,7 +55,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
|
54
55
|
opt->max_clip_ratio = 1.0f;
|
55
56
|
opt->mini_batch_size = 500000000;
|
56
57
|
opt->max_sw_mat = 100000000;
|
57
|
-
opt->cap_kalloc =
|
58
|
+
opt->cap_kalloc = 500000000;
|
58
59
|
|
59
60
|
opt->rank_min_len = 500;
|
60
61
|
opt->rank_frac = 0.9f;
|
@@ -90,7 +91,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|
90
91
|
if (preset == 0) {
|
91
92
|
mm_idxopt_init(io);
|
92
93
|
mm_mapopt_init(mo);
|
93
|
-
} else if (strcmp(preset, "map-ont") == 0) { // this is the same as the default
|
94
|
+
} else if (strcmp(preset, "lr") == 0 || strcmp(preset, "map-ont") == 0) { // this is the same as the default
|
94
95
|
} else if (strcmp(preset, "ava-ont") == 0) {
|
95
96
|
io->flag = 0, io->k = 15, io->w = 5;
|
96
97
|
mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN;
|
@@ -105,13 +106,30 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|
105
106
|
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25;
|
106
107
|
mo->bw_long = mo->bw;
|
107
108
|
mo->occ_dist = 0;
|
108
|
-
} else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
|
109
|
+
} else if (strcmp(preset, "lr:hq") == 0 || strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
|
109
110
|
io->flag = 0, io->k = 19, io->w = 19;
|
110
111
|
mo->max_gap = 10000;
|
111
|
-
mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1;
|
112
|
-
mo->occ_dist = 500;
|
113
112
|
mo->min_mid_occ = 50, mo->max_mid_occ = 500;
|
114
|
-
|
113
|
+
if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
|
114
|
+
mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1;
|
115
|
+
mo->min_dp_max = 200;
|
116
|
+
}
|
117
|
+
} else if (strcmp(preset, "lr:hqae") == 0) { // high-quality assembly evaluation
|
118
|
+
io->flag = 0, io->k = 25, io->w = 51;
|
119
|
+
mo->flag |= MM_F_RMQ;
|
120
|
+
mo->min_mid_occ = 50, mo->max_mid_occ = 500;
|
121
|
+
mo->rmq_inner_dist = 5000;
|
122
|
+
mo->occ_dist = 200;
|
123
|
+
mo->best_n = 100;
|
124
|
+
mo->chain_gap_scale = 5.0f;
|
125
|
+
} else if (strcmp(preset, "map-iclr-prerender") == 0) {
|
126
|
+
io->flag = 0, io->k = 15;
|
127
|
+
mo->b = 6, mo->transition = 1;
|
128
|
+
mo->q = 10, mo->q2 = 50;
|
129
|
+
} else if (strcmp(preset, "map-iclr") == 0) {
|
130
|
+
io->flag = 0, io->k = 19;
|
131
|
+
mo->b = 6, mo->transition = 4;
|
132
|
+
mo->q = 10, mo->q2 = 50;
|
115
133
|
} else if (strncmp(preset, "asm", 3) == 0) {
|
116
134
|
io->flag = 0, io->k = 19, io->w = 19;
|
117
135
|
mo->bw = 1000, mo->bw_long = 100000;
|
@@ -156,7 +174,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|
156
174
|
mo->junc_bonus = 9;
|
157
175
|
mo->zdrop = 200, mo->zdrop_inv = 100; // because mo->a is halved
|
158
176
|
if (strcmp(preset, "splice:hq") == 0)
|
159
|
-
mo->
|
177
|
+
mo->noncan = 5, mo->b = 4, mo->q = 6, mo->q2 = 24;
|
160
178
|
} else return -1;
|
161
179
|
return 0;
|
162
180
|
}
|
@@ -77,7 +77,9 @@ This constructor accepts the following arguments:
|
|
77
77
|
|
78
78
|
* **min_chain_score**: minimum chaining score
|
79
79
|
|
80
|
-
* **bw**: chaining and alignment band width
|
80
|
+
* **bw**: chaining and alignment band width (initial chaining and extension)
|
81
|
+
|
82
|
+
* **bw_long**: chaining and alignment band width (RMQ-based rechaining and closing gaps)
|
81
83
|
|
82
84
|
* **best_n**: max number of alignments to return
|
83
85
|
|
@@ -3,7 +3,7 @@ from libc.stdlib cimport free
|
|
3
3
|
cimport cmappy
|
4
4
|
import sys
|
5
5
|
|
6
|
-
__version__ = '2.
|
6
|
+
__version__ = '2.28'
|
7
7
|
|
8
8
|
cmappy.mm_reset_timer()
|
9
9
|
|
@@ -96,6 +96,7 @@ cdef class Alignment:
|
|
96
96
|
a = [str(self._q_st), str(self._q_en), strand, self._ctg, str(self._ctg_len), str(self._r_st), str(self._r_en),
|
97
97
|
str(self._mlen), str(self._blen), str(self._mapq), tp, ts, "cg:Z:" + self.cigar_str]
|
98
98
|
if self._cs != "": a.append("cs:Z:" + self._cs)
|
99
|
+
if self._MD != "": a.append("MD:Z:" + self._MD)
|
99
100
|
return "\t".join(a)
|
100
101
|
|
101
102
|
cdef class ThreadBuffer:
|
@@ -112,7 +113,7 @@ cdef class Aligner:
|
|
112
113
|
cdef cmappy.mm_idxopt_t idx_opt
|
113
114
|
cdef cmappy.mm_mapopt_t map_opt
|
114
115
|
|
115
|
-
def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None):
|
116
|
+
def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, bw_long=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None):
|
116
117
|
self._idx = NULL
|
117
118
|
cmappy.mm_set_opt(NULL, &self.idx_opt, &self.map_opt) # set the default options
|
118
119
|
if preset is not None:
|
@@ -125,6 +126,7 @@ cdef class Aligner:
|
|
125
126
|
if min_chain_score is not None: self.map_opt.min_chain_score = min_chain_score
|
126
127
|
if min_dp_score is not None: self.map_opt.min_dp_max = min_dp_score
|
127
128
|
if bw is not None: self.map_opt.bw = bw
|
129
|
+
if bw_long is not None: self.map_opt.bw_long = bw_long
|
128
130
|
if best_n is not None: self.map_opt.best_n = best_n
|
129
131
|
if max_frag_len is not None: self.map_opt.max_frag_len = max_frag_len
|
130
132
|
if extra_flags is not None: self.map_opt.flag |= extra_flags
|
@@ -5,7 +5,7 @@ import getopt
|
|
5
5
|
import mappy as mp
|
6
6
|
|
7
7
|
def main(argv):
|
8
|
-
opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:
|
8
|
+
opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:cM")
|
9
9
|
if len(args) < 2:
|
10
10
|
print("Usage: minimap2.py [options] <ref.fa>|<ref.mmi> <query.fq>")
|
11
11
|
print("Options:")
|
@@ -16,10 +16,11 @@ def main(argv):
|
|
16
16
|
print(" -w INT minimizer window length")
|
17
17
|
print(" -r INT band width")
|
18
18
|
print(" -c output the cs tag")
|
19
|
+
print(" -M output the MD tag")
|
19
20
|
sys.exit(1)
|
20
21
|
|
21
22
|
preset = min_cnt = min_sc = k = w = bw = None
|
22
|
-
out_cs = False
|
23
|
+
out_cs = out_MD = False
|
23
24
|
for opt, arg in opts:
|
24
25
|
if opt == '-x': preset = arg
|
25
26
|
elif opt == '-n': min_cnt = int(arg)
|
@@ -28,11 +29,12 @@ def main(argv):
|
|
28
29
|
elif opt == '-k': k = int(arg)
|
29
30
|
elif opt == '-w': w = int(arg)
|
30
31
|
elif opt == '-c': out_cs = True
|
32
|
+
elif opt == '-M': out_MD = True
|
31
33
|
|
32
34
|
a = mp.Aligner(args[0], preset=preset, min_cnt=min_cnt, min_chain_score=min_sc, k=k, w=w, bw=bw)
|
33
35
|
if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0]))
|
34
36
|
for name, seq, qual in mp.fastx_read(args[1]): # read one sequence
|
35
|
-
for h in a.map(seq, cs=out_cs): # traverse hits
|
37
|
+
for h in a.map(seq, cs=out_cs, MD=out_MD): # traverse hits
|
36
38
|
print('{}\t{}\t{}'.format(name, len(seq), h))
|
37
39
|
|
38
40
|
if __name__ == "__main__":
|
data/ext/minimap2/seed.c
CHANGED
@@ -112,7 +112,8 @@ mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int ma
|
|
112
112
|
}
|
113
113
|
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i) {
|
114
114
|
mm_seed_t *q = &m[i];
|
115
|
-
|
115
|
+
if (mm_dbg_flag & MM_DBG_SEED_FREQ)
|
116
|
+
fprintf(stderr, "SF\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt);
|
116
117
|
if (q->flt) {
|
117
118
|
int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
|
118
119
|
if (st > rep_en) {
|
data/ext/minimap2/setup.py
CHANGED
data/lib/minimap2/aligner.rb
CHANGED
@@ -21,10 +21,11 @@ module Minimap2
|
|
21
21
|
# * ava-ont : Nanopore read overlap
|
22
22
|
# @param k [Integer] k-mer length, no larger than 28.
|
23
23
|
# @param w [Integer] minimizer window size, no larger than 255.
|
24
|
-
# @param min_cnt [Integer]
|
25
|
-
# @param min_chain_score [Integer] minimum
|
24
|
+
# @param min_cnt [Integer] minimum number of minimizers on a chain.
|
25
|
+
# @param min_chain_score [Integer] minimum chain score.
|
26
26
|
# @param min_dp_score
|
27
|
-
# @param bw [Integer] chaining and alignment band width.
|
27
|
+
# @param bw [Integer] chaining and alignment band width (initial chaining and extension).
|
28
|
+
# @param bw_long [Integer] chaining and alignment band width (RMQ-based rechaining and closing gaps)
|
28
29
|
# @param best_n [Integer] max number of alignments to return.
|
29
30
|
# @param n_threads [Integer] number of indexing threads.
|
30
31
|
# @param fn_idx_out [String] name of file to which the index is written.
|
@@ -47,6 +48,7 @@ module Minimap2
|
|
47
48
|
min_chain_score: nil,
|
48
49
|
min_dp_score: nil,
|
49
50
|
bw: nil,
|
51
|
+
bw_long: nil,
|
50
52
|
best_n: nil,
|
51
53
|
n_threads: 3,
|
52
54
|
fn_idx_out: nil,
|
@@ -72,6 +74,7 @@ module Minimap2
|
|
72
74
|
map_opt[:min_chain_score] = min_chain_score if min_chain_score
|
73
75
|
map_opt[:min_dp_max] = min_dp_score if min_dp_score
|
74
76
|
map_opt[:bw] = bw if bw
|
77
|
+
map_opt[:bw_long] = bw_long if bw_long
|
75
78
|
map_opt[:best_n] = best_n if best_n
|
76
79
|
map_opt[:max_frag_len] = max_frag_len if max_frag_len
|
77
80
|
map_opt[:flag] |= extra_flags if extra_flags
|
data/lib/minimap2/alignment.rb
CHANGED
@@ -23,7 +23,7 @@ module Minimap2
|
|
23
23
|
# @return [Integer] length of the matching bases in the alignment,
|
24
24
|
# excluding ambiguous base matches.
|
25
25
|
# @!attribute nm
|
26
|
-
# @return [Integer] number of mismatches, gaps and ambiguous
|
26
|
+
# @return [Integer] number of mismatches, gaps and ambiguous positions in the alignment.
|
27
27
|
# @!attribute primary
|
28
28
|
# @return [Integer] if the alignment is primary (typically the best and the first to generate)
|
29
29
|
# @!attribute q_st
|
@@ -107,6 +107,7 @@ module Minimap2
|
|
107
107
|
a = [@q_st, @q_en, strand, @ctg, @ctg_len, @r_st, @r_en,
|
108
108
|
@mlen, @blen, @mapq, tp, ts, "cg:Z:#{@cigar_str}"]
|
109
109
|
a << "cs:Z:#{@cs}" if @cs
|
110
|
+
a << "MD:Z:#{@md}" if @md
|
110
111
|
a.join("\t")
|
111
112
|
end
|
112
113
|
end
|
@@ -40,6 +40,7 @@ module Minimap2
|
|
40
40
|
NO_HASH_NAME = 0x400000000
|
41
41
|
SPLICE_OLD = 0x800000000
|
42
42
|
SECONDARY_SEQ = 0x1000000000 # output SEQ field for secondary alignments using hard clipping
|
43
|
+
OUT_DS = 0x2000000000
|
43
44
|
|
44
45
|
HPC = 0x1
|
45
46
|
NO_SEQ = 0x2
|
@@ -109,8 +110,10 @@ module Minimap2
|
|
109
110
|
:dp_score, :int32, # DP score
|
110
111
|
:dp_max, :int32, # score of the max-scoring segment
|
111
112
|
:dp_max2, :int32, # score of the best alternate mappings
|
113
|
+
:dp_max0, :int32, # DP score before mm_update_dp_max() adjustment
|
112
114
|
:n_ambi_trans_strand, :uint32,
|
113
115
|
:n_cigar, :uint32
|
116
|
+
# :cigar, :pointer # variable length array (see cigar method below)
|
114
117
|
|
115
118
|
bit_field :n_ambi_trans_strand,
|
116
119
|
:n_ambi, 30, # number of ambiguous bases
|
@@ -204,6 +207,7 @@ module Minimap2
|
|
204
207
|
:e, :int, # gap-ext
|
205
208
|
:q2, :int, # gap-open
|
206
209
|
:e2, :int, # gap-ext
|
210
|
+
:transition, :int, # transition mismatch score (A:G, C:T)
|
207
211
|
:sc_ambi, :int, # score when one or both bases are "N"
|
208
212
|
:noncan, :int, # cost of non-canonical splicing sites
|
209
213
|
:junc_bonus, :int,
|
@@ -223,7 +227,7 @@ module Minimap2
|
|
223
227
|
:q_occ_frac, :float,
|
224
228
|
:min_mid_occ, :int32,
|
225
229
|
:max_mid_occ, :int32,
|
226
|
-
:mid_occ, :int32,
|
230
|
+
:mid_occ, :int32, # ignore seeds with occurrences above this threshold
|
227
231
|
:max_occ, :int32,
|
228
232
|
:max_max_occ, :int32,
|
229
233
|
:occ_dist, :int32,
|
@@ -15,10 +15,11 @@ module Minimap2
|
|
15
15
|
private_class_method :mm_set_opt_raw
|
16
16
|
|
17
17
|
def self.mm_set_opt(preset, io, mo)
|
18
|
-
ptr =
|
19
|
-
|
20
|
-
else
|
18
|
+
ptr = case preset
|
19
|
+
when 0, nil
|
21
20
|
::FFI::Pointer.new(:int, 0)
|
21
|
+
else
|
22
|
+
::FFI::MemoryPointer.from_string(preset.to_s)
|
22
23
|
end
|
23
24
|
mm_set_opt_raw(ptr, io, mo)
|
24
25
|
end
|
@@ -77,5 +78,17 @@ module Minimap2
|
|
77
78
|
:mm_gen_md, :mm_gen_MD, # Avoid uppercase letters in method names.
|
78
79
|
[:pointer, :pointer, :pointer, Idx.by_ref, Reg1.by_ref, :string],
|
79
80
|
:int
|
81
|
+
|
82
|
+
attach_function \
|
83
|
+
:mm_mapopt_init,
|
84
|
+
[MapOpt.by_ref],
|
85
|
+
:void
|
86
|
+
|
87
|
+
# mmpriv.h
|
88
|
+
|
89
|
+
attach_function \
|
90
|
+
:mm_idxopt_init,
|
91
|
+
[IdxOpt.by_ref],
|
92
|
+
:void
|
80
93
|
end
|
81
94
|
end
|