minimap2 0.2.26.1 → 0.2.28.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +16 -16
- data/ext/Rakefile +1 -3
- data/ext/minimap2/NEWS.md +68 -1
- data/ext/minimap2/README.md +8 -5
- data/ext/minimap2/align.c +19 -5
- data/ext/minimap2/cookbook.md +2 -2
- data/ext/minimap2/format.c +83 -17
- data/ext/minimap2/index.c +1 -0
- data/ext/minimap2/lchain.c +5 -5
- data/ext/minimap2/main.c +14 -6
- data/ext/minimap2/minimap.h +4 -1
- data/ext/minimap2/minimap2.1 +60 -11
- data/ext/minimap2/misc/paftools.js +88 -36
- data/ext/minimap2/mmpriv.h +1 -2
- data/ext/minimap2/options.c +25 -7
- data/ext/minimap2/python/README.rst +3 -1
- data/ext/minimap2/python/cmappy.pxd +1 -0
- data/ext/minimap2/python/mappy.pyx +4 -2
- data/ext/minimap2/python/minimap2.py +5 -3
- data/ext/minimap2/seed.c +2 -1
- data/ext/minimap2/setup.py +1 -1
- data/lib/minimap2/aligner.rb +6 -3
- data/lib/minimap2/alignment.rb +2 -1
- data/lib/minimap2/ffi/constants.rb +5 -1
- data/lib/minimap2/ffi/functions.rb +16 -3
- data/lib/minimap2/ffi.rb +1 -0
- data/lib/minimap2/version.rb +1 -1
- data/lib/minimap2.rb +2 -2
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f23caa7c5707d41b96b6df1748d257e11bed0554cbc2123d5dca31ef22b9bb05
|
4
|
+
data.tar.gz: 6ab72bef6ad874385871460d55503696969c8dc381896873d64c821538bbfd57
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4327be493c432ba562780e79aceff0f53d409a1ff2c5630cde48ad28b4c338fdc9bab2a333087f1e2aa8cd83e2374ce3a7feec9e5133aaeb4c01c3011b0414db
|
7
|
+
data.tar.gz: adc86c65a0dbeb775b89385790894cf3ad2fa1c24cdb0b0a9d94302134910ee651afc832dc0b8864c3a24b6b86577ccd694e46effb1a95ecf6bea5e968d189ad
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# ruby-minimap2
|
2
2
|
|
3
3
|
[![Gem Version](https://img.shields.io/gem/v/minimap2?color=brightgreen)](https://rubygems.org/gems/minimap2)
|
4
|
-
[![
|
4
|
+
[![test](https://github.com/kojix2/ruby-minimap2/actions/workflows/ci.yml/badge.svg)](https://github.com/kojix2/ruby-minimap2/actions/workflows/ci.yml)
|
5
5
|
[![Docs Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://rubydoc.info/gems/minimap2)
|
6
6
|
[![Docs Latest](https://img.shields.io/badge/docs-latest-blue.svg)](https://kojix2.github.io/ruby-minimap2/)
|
7
7
|
[![The MIT License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE.txt)
|
@@ -23,7 +23,7 @@ gem install minimap2
|
|
23
23
|
bundle install
|
24
24
|
bundle exec rake minimap2:build
|
25
25
|
bundle exec rake install
|
26
|
-
|
26
|
+
|
27
27
|
</details>
|
28
28
|
|
29
29
|
## Quick Start
|
@@ -36,6 +36,7 @@ seq = aligner.seq("MT_human", 100, 200)
|
|
36
36
|
hits = aligner.align(seq)
|
37
37
|
pp hits
|
38
38
|
```
|
39
|
+
|
39
40
|
```
|
40
41
|
[#<Minimap2::Alignment:0x000055bbfde2d128
|
41
42
|
@blen=100,
|
@@ -57,8 +58,6 @@ pp hits
|
|
57
58
|
@strand=1,
|
58
59
|
@trans_strand=0>]
|
59
60
|
```
|
60
|
-
|
61
|
-
</details>
|
62
61
|
|
63
62
|
## APIs Overview
|
64
63
|
|
@@ -87,7 +86,7 @@ pp hits
|
|
87
86
|
- trans_strand Returns transcript strand. +1 if on the forward strand; -1 if on the reverse strand; 0 if unknown.
|
88
87
|
- blen Returns length of the alignment, including both alignment matches and gaps but excluding ambiguous bases.
|
89
88
|
- mlen Returns length of the matching bases in the alignment, excluding ambiguous base matches.
|
90
|
-
- nm Returns number of mismatches, gaps and ambiguous
|
89
|
+
- nm Returns number of mismatches, gaps and ambiguous positions in the alignment.
|
91
90
|
- primary Returns if the alignment is primary (typically the best and the first to generate).
|
92
91
|
- q_st Returns start positions on the query.
|
93
92
|
- q_en Returns end positions on the query.
|
@@ -106,19 +105,20 @@ pp hits
|
|
106
105
|
* MapOpt class Mapping options.
|
107
106
|
```
|
108
107
|
|
109
|
-
|
110
|
-
|
111
|
-
|
108
|
+
- API is based on [Mappy](https://github.com/lh3/minimap2/tree/master/python), the official Python binding for Minimap2.
|
109
|
+
- `Aligner#map` has been changed to `align`, because `map` means iterator in Ruby.
|
110
|
+
- See [documentation](https://kojix2.github.io/ruby-minimap2/) for details.
|
112
111
|
|
113
112
|
<details>
|
114
113
|
<summary><b>C Structures and Functions</b></summary>
|
115
114
|
|
116
115
|
### FFI
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
116
|
+
|
117
|
+
- Ruby-Minimap2 is built on top of [Ruby-FFI](https://github.com/ffi/ffi).
|
118
|
+
- Native C functions can be called from the `Minimap2::FFI` module.
|
119
|
+
- Native C structure members can be accessed.
|
120
|
+
- Bitfields are supported by the [ffi-bitfield](https://github.com/kojix2/ffi-bitfield) gem.
|
121
|
+
|
122
122
|
```ruby
|
123
123
|
aligner.idx_opt.members
|
124
124
|
# => [:k, :w, :flag, :bucket_bits, :mini_batch_size, :batch_size]
|
@@ -130,7 +130,7 @@ aligner.idx_opt[:k] = 14
|
|
130
130
|
aligner.idx_opt[:k]
|
131
131
|
# => 14
|
132
132
|
```
|
133
|
-
|
133
|
+
|
134
134
|
</details>
|
135
135
|
|
136
136
|
## Contributing
|
@@ -138,7 +138,7 @@ aligner.idx_opt[:k]
|
|
138
138
|
<details>
|
139
139
|
<summary><b>Development</b></summary>
|
140
140
|
|
141
|
-
|
141
|
+
Fork your repository.
|
142
142
|
then clone.
|
143
143
|
|
144
144
|
```sh
|
@@ -184,7 +184,7 @@ ruby-minimap2 is a library under development and there are many points to be imp
|
|
184
184
|
|
185
185
|
Please feel free to report [bugs](https://github.com/kojix2/ruby-minimap2/issues) and send [pull requests](https://github.com/kojix2/ruby-minimap2/pulls)!
|
186
186
|
|
187
|
-
Many OSS projects become abandoned because only the founder has commit rights to the original repository.
|
187
|
+
Many OSS projects become abandoned because only the founder has commit rights to the original repository.
|
188
188
|
If you need commit rights to ruby-minimap2 repository or want to get admin rights and take over the project, please feel free to contact me @kojix2.
|
189
189
|
|
190
190
|
## License
|
data/ext/Rakefile
CHANGED
@@ -51,10 +51,8 @@ namespace :minimap2 do
|
|
51
51
|
end
|
52
52
|
end
|
53
53
|
|
54
|
-
task cleanall: [:clean]
|
55
|
-
|
56
54
|
desc "`make clean` and remove shared lib"
|
57
|
-
task :
|
55
|
+
task cleanall: [:clean] do
|
58
56
|
Dir.chdir(minimap2_dir) do
|
59
57
|
sh "rm #{target_path}" if File.exist?(target_path)
|
60
58
|
end
|
data/ext/minimap2/NEWS.md
CHANGED
@@ -1,9 +1,76 @@
|
|
1
|
+
Release 2.28-r1209 (27 March 2024)
|
2
|
+
----------------------------------
|
3
|
+
|
4
|
+
Notable changes to minimap2:
|
5
|
+
|
6
|
+
* Bugfix: `--MD` was not working properly due to the addition of `--ds` in the
|
7
|
+
last release (#1181 and #1182).
|
8
|
+
|
9
|
+
* New feature: added an experimental preset `lq:hqae` for aligning accurate
|
10
|
+
long reads back to their assembly. It has been observed that `map-hifi` and
|
11
|
+
`lr:hq` may produce many wrong alignments around centromeres when accurate
|
12
|
+
long reads (PacBio HiFi or Nanopore duplex/Q20+) are mapped to a diploid
|
13
|
+
assembly constructed from them. This new preset produces much more accurate
|
14
|
+
alignment. It is still experimental and may be subject to changes in
|
15
|
+
future.
|
16
|
+
|
17
|
+
* Change: reduced the default `--cap-kalloc` to 500m to lower the peak
|
18
|
+
memory consumption (#855).
|
19
|
+
|
20
|
+
Notable changes to mappy:
|
21
|
+
|
22
|
+
* Bugfix: mappy option struct was out of sync with minimap2 (#1177).
|
23
|
+
|
24
|
+
Minimap2 should output identical alignments to v2.27.
|
25
|
+
|
26
|
+
(2.28: 27 March 2024, r1209)
|
27
|
+
|
28
|
+
|
29
|
+
|
30
|
+
Release 2.27-r1193 (12 March 2024)
|
31
|
+
----------------------------------
|
32
|
+
|
33
|
+
Notable changes to minimap2:
|
34
|
+
|
35
|
+
* New feature: added the `lr:hq` preset for accurate long reads at ~1% error
|
36
|
+
rate. This was suggested by Oxford Nanopore developers (#1127). It is not
|
37
|
+
clear if this preset also works well for PacBio HiFi reads.
|
38
|
+
|
39
|
+
* New feature: added the `map-iclr` preset for Illumina Complete Long Reads
|
40
|
+
(#1069), provided by Illumina developers.
|
41
|
+
|
42
|
+
* New feature: added option `-b` to specify mismatch penalty for base
|
43
|
+
transitions (i.e. A-to-G or C-to-T changes).
|
44
|
+
|
45
|
+
* New feature: added option `--ds` to generate a new `ds:Z` tag that
|
46
|
+
indicates uncertainty in INDEL positions. It is an extension to `cs`. The
|
47
|
+
`mgutils-es6.js` script in minigraph parses `ds`.
|
48
|
+
|
49
|
+
* Bugfix: avoided a NULL pointer dereference (#1154). This would not have an
|
50
|
+
effect on most systems but would still be good to fix.
|
51
|
+
|
52
|
+
* Bugfix: reverted the value of `ms:i` to pre-2.22 versions (#1146). This was
|
53
|
+
an oversight. See fcd4df2 for details.
|
54
|
+
|
55
|
+
Notable changes to paftools.js and mappy:
|
56
|
+
|
57
|
+
* New feature: expose `bw_long` to mappy's Aligner class (#1124).
|
58
|
+
|
59
|
+
* Bugfix: fixed several compatibility issues with k8 v1.0 (#1161 and #1166).
|
60
|
+
Subcommands "call", "pbsim2fq" and "mason2fq" were not working with v1.0.
|
61
|
+
|
62
|
+
Minimap2 should output identical alignments to v2.26, except the ms tag.
|
63
|
+
|
64
|
+
(2.27: 12 March 2024, r1193)
|
65
|
+
|
66
|
+
|
67
|
+
|
1
68
|
Release 2.26-r1175 (29 April 2023)
|
2
69
|
----------------------------------
|
3
70
|
|
4
71
|
Fixed the broken Python package. This is the only change.
|
5
72
|
|
6
|
-
(2.
|
73
|
+
(2.26: 25 April 2023, r1173)
|
7
74
|
|
8
75
|
|
9
76
|
|
data/ext/minimap2/README.md
CHANGED
@@ -15,7 +15,7 @@ cd minimap2 && make
|
|
15
15
|
./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads
|
16
16
|
./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads
|
17
17
|
./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or later)
|
18
|
-
./minimap2 -ax
|
18
|
+
./minimap2 -ax lr:hq ref.fa ont-Q20.fq.gz > aln.sam # Nanopore Q20 genomic reads (v2.27 or later)
|
19
19
|
./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads
|
20
20
|
./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown)
|
21
21
|
./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq
|
@@ -74,8 +74,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the
|
|
74
74
|
Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from
|
75
75
|
the [release page][release] with:
|
76
76
|
```sh
|
77
|
-
curl -L https://github.com/lh3/minimap2/releases/download/v2.
|
78
|
-
./minimap2-2.
|
77
|
+
curl -L https://github.com/lh3/minimap2/releases/download/v2.28/minimap2-2.28_x64-linux.tar.bz2 | tar -jxvf -
|
78
|
+
./minimap2-2.28_x64-linux/minimap2
|
79
79
|
```
|
80
80
|
If you want to compile from the source, you need to have a C compiler, GNU make
|
81
81
|
and zlib development files installed. Then type `make` in the source code
|
@@ -139,12 +139,15 @@ parameters at the same time. The default setting is the same as `map-ont`.
|
|
139
139
|
```sh
|
140
140
|
minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads
|
141
141
|
minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads
|
142
|
+
minimap2 -ax map-iclr ref.fa iclr-reads.fq > aln.sam # for Illumina Complete Long Reads
|
142
143
|
```
|
143
144
|
The difference between `map-pb` and `map-ont` is that `map-pb` uses
|
144
145
|
homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary
|
145
|
-
minimizers as seeds.
|
146
|
+
minimizers as seeds. Empirical evaluation suggests HPC minimizers improve
|
146
147
|
performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning
|
147
|
-
Nanopore reads.
|
148
|
+
Nanopore reads. `map-iclr` uses an adjusted alignment scoring matrix that
|
149
|
+
accounts for the low overall error rate in the reads, with transversion errors
|
150
|
+
being less frequent than transitions.
|
148
151
|
|
149
152
|
#### <a name="map-long-splice"></a>Map long mRNA/cDNA reads
|
150
153
|
|
data/ext/minimap2/align.c
CHANGED
@@ -21,6 +21,18 @@ static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc
|
|
21
21
|
mat[(m - 1) * m + j] = sc_ambi;
|
22
22
|
}
|
23
23
|
|
24
|
+
static void ksw_gen_ts_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t transition, int8_t sc_ambi)
|
25
|
+
{
|
26
|
+
assert(m == 5);
|
27
|
+
ksw_gen_simple_mat(m, mat, a, b, sc_ambi);
|
28
|
+
if (transition == 0 || transition == b) return;
|
29
|
+
transition = transition > 0? -transition : transition;
|
30
|
+
mat[0 * m + 2] = transition; // A->G
|
31
|
+
mat[1 * m + 3] = transition; // C->T
|
32
|
+
mat[2 * m + 0] = transition; // G->A
|
33
|
+
mat[3 * m + 1] = transition; // T->C
|
34
|
+
}
|
35
|
+
|
24
36
|
static inline void mm_seq_rev(uint32_t len, uint8_t *seq)
|
25
37
|
{
|
26
38
|
uint32_t i;
|
@@ -283,7 +295,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
|
|
283
295
|
toff += len;
|
284
296
|
}
|
285
297
|
}
|
286
|
-
p->dp_max = (int32_t)(max + .499);
|
298
|
+
p->dp_max = p->dp_max0 = (int32_t)(max + .499);
|
287
299
|
assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
|
288
300
|
if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned
|
289
301
|
}
|
@@ -323,6 +335,8 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
|
|
323
335
|
for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr);
|
324
336
|
fputc('\n', stderr);
|
325
337
|
}
|
338
|
+
if (opt->transition != 0 && opt->b != opt->transition)
|
339
|
+
flag |= KSW_EZ_GENERIC_SC;
|
326
340
|
if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
|
327
341
|
ksw_reset_extz(ez);
|
328
342
|
ez->zdropped = 1;
|
@@ -586,7 +600,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|
586
600
|
|
587
601
|
r2->cnt = 0;
|
588
602
|
if (r->cnt == 0) return;
|
589
|
-
|
603
|
+
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
|
590
604
|
bw = (int)(opt->bw * 1.5 + 1.);
|
591
605
|
bw_long = (int)(opt->bw_long * 1.5 + 1.);
|
592
606
|
if (bw_long < bw) bw_long = bw;
|
@@ -844,7 +858,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
|
|
844
858
|
if (ql < opt->min_chain_score || ql > opt->max_gap) return 0;
|
845
859
|
if (tl < opt->min_chain_score || tl > opt->max_gap) return 0;
|
846
860
|
|
847
|
-
|
861
|
+
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
|
848
862
|
tseq = (uint8_t*)kmalloc(km, tl);
|
849
863
|
mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq);
|
850
864
|
qseq = r1->rev? &qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs];
|
@@ -919,14 +933,14 @@ double mm_event_identity(const mm_reg1_t *r)
|
|
919
933
|
static int32_t mm_recal_max_dp(const mm_reg1_t *r, double b2, int32_t match_sc)
|
920
934
|
{
|
921
935
|
uint32_t i;
|
922
|
-
int32_t n_gap = 0,
|
936
|
+
int32_t n_gap = 0, n_mis;
|
923
937
|
double gap_cost = 0.0;
|
924
938
|
if (r->p == 0) return -1;
|
925
939
|
for (i = 0; i < r->p->n_cigar; ++i) {
|
926
940
|
int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
|
927
941
|
if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
|
928
942
|
gap_cost += b2 + (double)mg_log2(1.0 + len);
|
929
|
-
|
943
|
+
n_gap += len;
|
930
944
|
}
|
931
945
|
}
|
932
946
|
n_mis = r->blen + r->p->n_ambi - r->mlen - n_gap;
|
data/ext/minimap2/cookbook.md
CHANGED
@@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools,
|
|
31
31
|
please follow the command lines below:
|
32
32
|
```sh
|
33
33
|
# install minimap2 executables
|
34
|
-
curl -L https://github.com/lh3/minimap2/releases/download/v2.
|
35
|
-
cp minimap2-2.
|
34
|
+
curl -L https://github.com/lh3/minimap2/releases/download/v2.28/minimap2-2.28_x64-linux.tar.bz2 | tar jxf -
|
35
|
+
cp minimap2-2.28_x64-linux/{minimap2,k8,paftools.js} . # copy executables
|
36
36
|
export PATH="$PATH:"`pwd` # put the current directory on PATH
|
37
37
|
# download example datasets
|
38
38
|
curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf -
|
data/ext/minimap2/format.c
CHANGED
@@ -139,10 +139,48 @@ int mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int a
|
|
139
139
|
return ret;
|
140
140
|
}
|
141
141
|
|
142
|
-
static void
|
142
|
+
static void write_indel_ds(kstring_t *str, int64_t len, const uint8_t *seq, int64_t ll, int64_t lr) // write an indel to ds; adapted from minigraph
|
143
143
|
{
|
144
|
-
|
145
|
-
if (
|
144
|
+
int64_t i;
|
145
|
+
if (ll + lr >= len) {
|
146
|
+
mm_sprintf_lite(str, "[");
|
147
|
+
for (i = 0; i < len; ++i)
|
148
|
+
mm_sprintf_lite(str, "%c", "acgtn"[seq[i]]);
|
149
|
+
mm_sprintf_lite(str, "]");
|
150
|
+
} else {
|
151
|
+
int64_t k = 0;
|
152
|
+
if (ll > 0) {
|
153
|
+
mm_sprintf_lite(str, "[");
|
154
|
+
for (i = 0; i < ll; ++i)
|
155
|
+
mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
|
156
|
+
mm_sprintf_lite(str, "]");
|
157
|
+
k += ll;
|
158
|
+
}
|
159
|
+
for (i = 0; i < len - lr - ll; ++i)
|
160
|
+
mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
|
161
|
+
k += len - lr - ll;
|
162
|
+
if (lr > 0) {
|
163
|
+
mm_sprintf_lite(str, "[");
|
164
|
+
for (i = 0; i < lr; ++i)
|
165
|
+
mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
|
166
|
+
mm_sprintf_lite(str, "]");
|
167
|
+
}
|
168
|
+
}
|
169
|
+
}
|
170
|
+
|
171
|
+
static void write_cs_ds_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int is_ds, int write_tag)
|
172
|
+
{
|
173
|
+
int i, q_off, t_off, q_len = 0, t_len = 0;
|
174
|
+
if (write_tag) mm_sprintf_lite(s, "\t%cs:Z:", is_ds? 'd' : 'c');
|
175
|
+
for (i = 0; i < (int)r->p->n_cigar; ++i) {
|
176
|
+
int op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
177
|
+
if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH)
|
178
|
+
q_len += len, t_len += len;
|
179
|
+
else if (op == MM_CIGAR_INS)
|
180
|
+
q_len += len;
|
181
|
+
else if (op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP)
|
182
|
+
t_len += len;
|
183
|
+
}
|
146
184
|
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
|
147
185
|
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
148
186
|
assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
|
@@ -168,14 +206,42 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
|
|
168
206
|
}
|
169
207
|
q_off += len, t_off += len;
|
170
208
|
} else if (op == MM_CIGAR_INS) {
|
171
|
-
|
172
|
-
|
173
|
-
|
209
|
+
if (is_ds) {
|
210
|
+
int z, ll, lr, y = q_off;
|
211
|
+
for (z = 1; z <= len; ++z)
|
212
|
+
if (y - z < 0 || qseq[y + len - z] != qseq[y - z])
|
213
|
+
break;
|
214
|
+
lr = z - 1;
|
215
|
+
for (z = 0; z < len; ++z)
|
216
|
+
if (y + len + z >= q_len || qseq[y + len + z] != qseq[y + z])
|
217
|
+
break;
|
218
|
+
ll = z;
|
219
|
+
mm_sprintf_lite(s, "+");
|
220
|
+
write_indel_ds(s, len, &qseq[y], ll, lr);
|
221
|
+
} else {
|
222
|
+
for (j = 0, tmp[len] = 0; j < len; ++j)
|
223
|
+
tmp[j] = "acgtn"[qseq[q_off + j]];
|
224
|
+
mm_sprintf_lite(s, "+%s", tmp);
|
225
|
+
}
|
174
226
|
q_off += len;
|
175
227
|
} else if (op == MM_CIGAR_DEL) {
|
176
|
-
|
177
|
-
|
178
|
-
|
228
|
+
if (is_ds) {
|
229
|
+
int z, ll, lr, x = t_off;
|
230
|
+
for (z = 1; z <= len; ++z)
|
231
|
+
if (x - z < 0 || tseq[x + len - z] != tseq[x - z])
|
232
|
+
break;
|
233
|
+
lr = z - 1;
|
234
|
+
for (z = 0; z < len; ++z)
|
235
|
+
if (x + len + z >= t_len || tseq[x + z] != tseq[x + len + z])
|
236
|
+
break;
|
237
|
+
ll = z;
|
238
|
+
mm_sprintf_lite(s, "-");
|
239
|
+
write_indel_ds(s, len, &tseq[x], ll, lr);
|
240
|
+
} else {
|
241
|
+
for (j = 0, tmp[len] = 0; j < len; ++j)
|
242
|
+
tmp[j] = "acgtn"[tseq[t_off + j]];
|
243
|
+
mm_sprintf_lite(s, "-%s", tmp);
|
244
|
+
}
|
179
245
|
t_off += len;
|
180
246
|
} else { // intron
|
181
247
|
assert(len >= 2);
|
@@ -218,7 +284,7 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
|
|
218
284
|
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
|
219
285
|
}
|
220
286
|
|
221
|
-
static void
|
287
|
+
static void write_cs_ds_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int is_ds, int write_tag, int is_qstrand)
|
222
288
|
{
|
223
289
|
extern unsigned char seq_nt4_table[256];
|
224
290
|
int i;
|
@@ -245,7 +311,7 @@ static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_
|
|
245
311
|
}
|
246
312
|
}
|
247
313
|
if (is_MD) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
|
248
|
-
else
|
314
|
+
else write_cs_ds_core(s, tseq, qseq, r, tmp, no_iden, is_ds, write_tag);
|
249
315
|
kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
|
250
316
|
}
|
251
317
|
|
@@ -256,7 +322,7 @@ int mm_gen_cs_or_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, cons
|
|
256
322
|
str.s = *buf, str.l = 0, str.m = *max_len;
|
257
323
|
t.l_seq = strlen(seq);
|
258
324
|
t.seq = (char*)seq;
|
259
|
-
|
325
|
+
write_cs_ds_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0, 0, is_qstrand);
|
260
326
|
*max_len = str.m;
|
261
327
|
*buf = str.s;
|
262
328
|
return str.l;
|
@@ -278,7 +344,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
|
|
278
344
|
if (r->id == r->parent) type = r->inv? 'I' : 'P';
|
279
345
|
else type = r->inv? 'i' : 'S';
|
280
346
|
if (r->p) {
|
281
|
-
mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->
|
347
|
+
mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->dp_max0, r->p->dp_score, r->p->n_ambi);
|
282
348
|
if (r->p->trans_strand == 1 || r->p->trans_strand == 2)
|
283
349
|
mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]);
|
284
350
|
}
|
@@ -326,8 +392,8 @@ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const
|
|
326
392
|
for (k = 0; k < r->p->n_cigar; ++k)
|
327
393
|
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
|
328
394
|
}
|
329
|
-
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
|
330
|
-
|
395
|
+
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD)))
|
396
|
+
write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), !!(opt_flag&MM_F_OUT_MD), !!(opt_flag&MM_F_OUT_DS), 1, !!(opt_flag&MM_F_QSTRAND));
|
331
397
|
if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
|
332
398
|
mm_sprintf_lite(s, "\t%s", t->comment);
|
333
399
|
}
|
@@ -535,8 +601,8 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
|
|
535
601
|
}
|
536
602
|
}
|
537
603
|
}
|
538
|
-
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
|
539
|
-
|
604
|
+
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD)))
|
605
|
+
write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, !!(opt_flag&MM_F_OUT_DS), 1, 0);
|
540
606
|
if (cigar_in_tag)
|
541
607
|
write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag);
|
542
608
|
}
|
data/ext/minimap2/index.c
CHANGED
@@ -192,6 +192,7 @@ int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
|
|
192
192
|
if (f <= 0.) return INT32_MAX;
|
193
193
|
for (i = 0; i < 1<<mi->b; ++i)
|
194
194
|
if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
|
195
|
+
if (n == 0) return INT32_MAX;
|
195
196
|
a = (uint32_t*)malloc(n * 4);
|
196
197
|
for (i = n = 0; i < 1<<mi->b; ++i) {
|
197
198
|
idxhash_t *h = (idxhash_t*)mi->B[i].h;
|
data/ext/minimap2/lchain.c
CHANGED
@@ -149,7 +149,7 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int
|
|
149
149
|
int is_cdna, int n_seg, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
|
150
150
|
{ // TODO: make sure this works when n has more than 32 bits
|
151
151
|
int32_t *f, *t, *v, n_u, n_v, mmax_f = 0, max_drop = bw;
|
152
|
-
int64_t *p, i, j, max_ii, st = 0
|
152
|
+
int64_t *p, i, j, max_ii, st = 0;
|
153
153
|
uint64_t *u;
|
154
154
|
|
155
155
|
if (_u) *_u = 0, *n_u_ = 0;
|
@@ -174,7 +174,6 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int
|
|
174
174
|
for (j = i - 1; j >= st; --j) {
|
175
175
|
int32_t sc;
|
176
176
|
sc = comput_sc(&a[i], &a[j], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg);
|
177
|
-
++n_iter;
|
178
177
|
if (sc == INT32_MIN) continue;
|
179
178
|
sc += f[j];
|
180
179
|
if (sc > max_f) {
|
@@ -204,6 +203,7 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int
|
|
204
203
|
if (max_ii < 0 || (a[i].x - a[max_ii].x <= (int64_t)max_dist_x && f[max_ii] < f[i]))
|
205
204
|
max_ii = i;
|
206
205
|
if (mmax_f < max_f) mmax_f = max_f;
|
206
|
+
//fprintf(stderr, "X1\t%ld\t%ld:%d\t%ld\t%ld:%d\t%ld\t%ld\n", (long)i, (long)(a[i].x>>32), (int32_t)a[i].x, (long)max_j, max_j<0?-1L:(long)(a[max_j].x>>32), max_j<0?-1:(int32_t)a[max_j].x, (long)max_f, (long)v[i]);
|
207
207
|
}
|
208
208
|
|
209
209
|
u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, max_drop, &n_u, &n_v);
|
@@ -263,7 +263,8 @@ mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_ski
|
|
263
263
|
return 0;
|
264
264
|
}
|
265
265
|
if (max_dist < bw) max_dist = bw;
|
266
|
-
if (max_dist_inner
|
266
|
+
if (max_dist_inner < 0) max_dist_inner = 0;
|
267
|
+
if (max_dist_inner > max_dist) max_dist_inner = max_dist;
|
267
268
|
p = Kmalloc(km, int64_t, n);
|
268
269
|
f = Kmalloc(km, int32_t, n);
|
269
270
|
t = Kcalloc(km, int32_t, n);
|
@@ -325,12 +326,11 @@ mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_ski
|
|
325
326
|
krmq_interval(lc_elem, root_inner, &s, &lo, &hi);
|
326
327
|
if (lo) {
|
327
328
|
const lc_elem_t *q;
|
328
|
-
int32_t width
|
329
|
+
int32_t width;
|
329
330
|
krmq_itr_t(lc_elem) itr;
|
330
331
|
krmq_itr_find(lc_elem, root_inner, lo, &itr);
|
331
332
|
while ((q = krmq_at(&itr)) != 0) {
|
332
333
|
if (q->y < (int32_t)a[i].y - max_dist_inner) break;
|
333
|
-
++n_rmq_iter;
|
334
334
|
j = q->i;
|
335
335
|
sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, 0, &width);
|
336
336
|
if (width <= bw) {
|
data/ext/minimap2/main.c
CHANGED
@@ -77,6 +77,9 @@ static ko_longopt_t long_options[] = {
|
|
77
77
|
{ "print-chains", ko_no_argument, 352 },
|
78
78
|
{ "no-hash-name", ko_no_argument, 353 },
|
79
79
|
{ "secondary-seq", ko_no_argument, 354 },
|
80
|
+
{ "ds", ko_no_argument, 355 },
|
81
|
+
{ "rmq-inner", ko_required_argument, 356 },
|
82
|
+
{ "dbg-seed-occ", ko_no_argument, 501 },
|
80
83
|
{ "help", ko_no_argument, 'h' },
|
81
84
|
{ "max-intron-len", ko_required_argument, 'G' },
|
82
85
|
{ "version", ko_no_argument, 'V' },
|
@@ -120,7 +123,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const
|
|
120
123
|
|
121
124
|
int main(int argc, char *argv[])
|
122
125
|
{
|
123
|
-
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
|
126
|
+
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:b:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
|
124
127
|
ketopt_t o = KETOPT_INIT;
|
125
128
|
mm_mapopt_t opt;
|
126
129
|
mm_idxopt_t ipt;
|
@@ -178,6 +181,7 @@ int main(int argc, char *argv[])
|
|
178
181
|
else if (c == 'm') opt.min_chain_score = atoi(o.arg);
|
179
182
|
else if (c == 'A') opt.a = atoi(o.arg);
|
180
183
|
else if (c == 'B') opt.b = atoi(o.arg);
|
184
|
+
else if (c == 'b') opt.transition = atoi(o.arg);
|
181
185
|
else if (c == 's') opt.min_dp_max = atoi(o.arg);
|
182
186
|
else if (c == 'C') opt.noncan = atoi(o.arg);
|
183
187
|
else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
|
@@ -242,6 +246,9 @@ int main(int argc, char *argv[])
|
|
242
246
|
else if (c == 352) mm_dbg_flag |= MM_DBG_PRINT_CHAIN; // --print-chains
|
243
247
|
else if (c == 353) opt.flag |= MM_F_NO_HASH_NAME; // --no-hash-name
|
244
248
|
else if (c == 354) opt.flag |= MM_F_SECONDARY_SEQ; // --secondary-seq
|
249
|
+
else if (c == 355) opt.flag |= MM_F_OUT_DS; // --ds
|
250
|
+
else if (c == 356) opt.rmq_inner_dist = mm_parse_num(o.arg); // --rmq-inner
|
251
|
+
else if (c == 501) mm_dbg_flag |= MM_DBG_SEED_FREQ; // --dbg-seed-occ
|
245
252
|
else if (c == 330) {
|
246
253
|
fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
|
247
254
|
} else if (c == 314) { // --frag
|
@@ -358,6 +365,7 @@ int main(int argc, char *argv[])
|
|
358
365
|
fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n");
|
359
366
|
fprintf(fp_help, " -c output CIGAR in PAF\n");
|
360
367
|
fprintf(fp_help, " --cs[=STR] output the cs tag; STR is 'short' (if absent) or 'long' [none]\n");
|
368
|
+
fprintf(fp_help, " --ds output the ds tag, which is an extension to cs\n");
|
361
369
|
fprintf(fp_help, " --MD output the MD tag\n");
|
362
370
|
fprintf(fp_help, " --eqx write =/X CIGAR operators\n");
|
363
371
|
fprintf(fp_help, " -Y use soft clipping for supplementary alignments\n");
|
@@ -367,12 +375,12 @@ int main(int argc, char *argv[])
|
|
367
375
|
fprintf(fp_help, " --version show version number\n");
|
368
376
|
fprintf(fp_help, " Preset:\n");
|
369
377
|
fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n");
|
370
|
-
fprintf(fp_help, " -
|
371
|
-
fprintf(fp_help, " -
|
372
|
-
fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n");
|
378
|
+
fprintf(fp_help, " - lr:hq - accurate long reads (error rate <1%%) against a reference genome\n");
|
379
|
+
fprintf(fp_help, " - splice/splice:hq - spliced alignment for long reads/accurate long reads\n");
|
373
380
|
fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");
|
374
|
-
fprintf(fp_help, " -
|
375
|
-
fprintf(fp_help, " -
|
381
|
+
fprintf(fp_help, " - sr - short reads against a reference\n");
|
382
|
+
fprintf(fp_help, " - map-pb/map-hifi/map-ont/map-iclr - CLR/HiFi/Nanopore/ICLR vs reference mapping\n");
|
383
|
+
fprintf(fp_help, " - ava-pb/ava-ont - PacBio CLR/Nanopore read overlap\n");
|
376
384
|
fprintf(fp_help, "\nSee `man ./minimap2.1' for detailed description of these and other advanced command-line options.\n");
|
377
385
|
return fp_help == stdout? 0 : 1;
|
378
386
|
}
|
data/ext/minimap2/minimap.h
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
#include <stdio.h>
|
6
6
|
#include <sys/types.h>
|
7
7
|
|
8
|
-
#define MM_VERSION "2.
|
8
|
+
#define MM_VERSION "2.28-r1209"
|
9
9
|
|
10
10
|
#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
|
11
11
|
#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
|
@@ -44,6 +44,7 @@
|
|
44
44
|
#define MM_F_NO_HASH_NAME (0x400000000LL)
|
45
45
|
#define MM_F_SPLICE_OLD (0x800000000LL)
|
46
46
|
#define MM_F_SECONDARY_SEQ (0x1000000000LL) //output SEQ field for secondary alignments using hard clipping
|
47
|
+
#define MM_F_OUT_DS (0x2000000000LL)
|
47
48
|
|
48
49
|
#define MM_I_HPC 0x1
|
49
50
|
#define MM_I_NO_SEQ 0x2
|
@@ -97,6 +98,7 @@ typedef struct {
|
|
97
98
|
typedef struct {
|
98
99
|
uint32_t capacity; // the capacity of cigar[]
|
99
100
|
int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings
|
101
|
+
int32_t dp_max0; // DP score before mm_update_dp_max() adjustment
|
100
102
|
uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
|
101
103
|
uint32_t n_cigar; // number of cigar operations in cigar[]
|
102
104
|
uint32_t cigar[];
|
@@ -153,6 +155,7 @@ typedef struct {
|
|
153
155
|
float alt_drop;
|
154
156
|
|
155
157
|
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
|
158
|
+
int transition; // transition mismatch score (A:G, C:T)
|
156
159
|
int sc_ambi; // score when one or both bases are "N"
|
157
160
|
int noncan; // cost of non-canonical splicing sites
|
158
161
|
int junc_bonus;
|