minimap2 0.2.26.1 → 0.2.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +16 -16
- data/ext/Rakefile +1 -3
- data/ext/minimap2/NEWS.md +68 -1
- data/ext/minimap2/README.md +8 -5
- data/ext/minimap2/align.c +19 -5
- data/ext/minimap2/cookbook.md +2 -2
- data/ext/minimap2/format.c +83 -17
- data/ext/minimap2/index.c +1 -0
- data/ext/minimap2/lchain.c +5 -5
- data/ext/minimap2/main.c +14 -6
- data/ext/minimap2/minimap.h +4 -1
- data/ext/minimap2/minimap2.1 +60 -11
- data/ext/minimap2/misc/paftools.js +88 -36
- data/ext/minimap2/mmpriv.h +1 -2
- data/ext/minimap2/options.c +25 -7
- data/ext/minimap2/python/README.rst +3 -1
- data/ext/minimap2/python/cmappy.pxd +1 -0
- data/ext/minimap2/python/mappy.pyx +4 -2
- data/ext/minimap2/python/minimap2.py +5 -3
- data/ext/minimap2/seed.c +2 -1
- data/ext/minimap2/setup.py +1 -1
- data/lib/minimap2/aligner.rb +6 -3
- data/lib/minimap2/alignment.rb +2 -1
- data/lib/minimap2/ffi/constants.rb +5 -1
- data/lib/minimap2/ffi/functions.rb +16 -3
- data/lib/minimap2/ffi.rb +1 -0
- data/lib/minimap2/version.rb +1 -1
- data/lib/minimap2.rb +2 -2
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f23caa7c5707d41b96b6df1748d257e11bed0554cbc2123d5dca31ef22b9bb05
|
4
|
+
data.tar.gz: 6ab72bef6ad874385871460d55503696969c8dc381896873d64c821538bbfd57
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4327be493c432ba562780e79aceff0f53d409a1ff2c5630cde48ad28b4c338fdc9bab2a333087f1e2aa8cd83e2374ce3a7feec9e5133aaeb4c01c3011b0414db
|
7
|
+
data.tar.gz: adc86c65a0dbeb775b89385790894cf3ad2fa1c24cdb0b0a9d94302134910ee651afc832dc0b8864c3a24b6b86577ccd694e46effb1a95ecf6bea5e968d189ad
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# ruby-minimap2
|
2
2
|
|
3
3
|
[](https://rubygems.org/gems/minimap2)
|
4
|
-
[](https://github.com/kojix2/ruby-minimap2/actions/workflows/ci.yml)
|
5
5
|
[](https://rubydoc.info/gems/minimap2)
|
6
6
|
[](https://kojix2.github.io/ruby-minimap2/)
|
7
7
|
[](LICENSE.txt)
|
@@ -23,7 +23,7 @@ gem install minimap2
|
|
23
23
|
bundle install
|
24
24
|
bundle exec rake minimap2:build
|
25
25
|
bundle exec rake install
|
26
|
-
|
26
|
+
|
27
27
|
</details>
|
28
28
|
|
29
29
|
## Quick Start
|
@@ -36,6 +36,7 @@ seq = aligner.seq("MT_human", 100, 200)
|
|
36
36
|
hits = aligner.align(seq)
|
37
37
|
pp hits
|
38
38
|
```
|
39
|
+
|
39
40
|
```
|
40
41
|
[#<Minimap2::Alignment:0x000055bbfde2d128
|
41
42
|
@blen=100,
|
@@ -57,8 +58,6 @@ pp hits
|
|
57
58
|
@strand=1,
|
58
59
|
@trans_strand=0>]
|
59
60
|
```
|
60
|
-
|
61
|
-
</details>
|
62
61
|
|
63
62
|
## APIs Overview
|
64
63
|
|
@@ -87,7 +86,7 @@ pp hits
|
|
87
86
|
- trans_strand Returns transcript strand. +1 if on the forward strand; -1 if on the reverse strand; 0 if unknown.
|
88
87
|
- blen Returns length of the alignment, including both alignment matches and gaps but excluding ambiguous bases.
|
89
88
|
- mlen Returns length of the matching bases in the alignment, excluding ambiguous base matches.
|
90
|
-
- nm Returns number of mismatches, gaps and ambiguous
|
89
|
+
- nm Returns number of mismatches, gaps and ambiguous positions in the alignment.
|
91
90
|
- primary Returns if the alignment is primary (typically the best and the first to generate).
|
92
91
|
- q_st Returns start positions on the query.
|
93
92
|
- q_en Returns end positions on the query.
|
@@ -106,19 +105,20 @@ pp hits
|
|
106
105
|
* MapOpt class Mapping options.
|
107
106
|
```
|
108
107
|
|
109
|
-
|
110
|
-
|
111
|
-
|
108
|
+
- API is based on [Mappy](https://github.com/lh3/minimap2/tree/master/python), the official Python binding for Minimap2.
|
109
|
+
- `Aligner#map` has been changed to `align`, because `map` means iterator in Ruby.
|
110
|
+
- See [documentation](https://kojix2.github.io/ruby-minimap2/) for details.
|
112
111
|
|
113
112
|
<details>
|
114
113
|
<summary><b>C Structures and Functions</b></summary>
|
115
114
|
|
116
115
|
### FFI
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
116
|
+
|
117
|
+
- Ruby-Minimap2 is built on top of [Ruby-FFI](https://github.com/ffi/ffi).
|
118
|
+
- Native C functions can be called from the `Minimap2::FFI` module.
|
119
|
+
- Native C structure members can be accessed.
|
120
|
+
- Bitfields are supported by the [ffi-bitfield](https://github.com/kojix2/ffi-bitfield) gem.
|
121
|
+
|
122
122
|
```ruby
|
123
123
|
aligner.idx_opt.members
|
124
124
|
# => [:k, :w, :flag, :bucket_bits, :mini_batch_size, :batch_size]
|
@@ -130,7 +130,7 @@ aligner.idx_opt[:k] = 14
|
|
130
130
|
aligner.idx_opt[:k]
|
131
131
|
# => 14
|
132
132
|
```
|
133
|
-
|
133
|
+
|
134
134
|
</details>
|
135
135
|
|
136
136
|
## Contributing
|
@@ -138,7 +138,7 @@ aligner.idx_opt[:k]
|
|
138
138
|
<details>
|
139
139
|
<summary><b>Development</b></summary>
|
140
140
|
|
141
|
-
|
141
|
+
Fork the repository.
|
142
142
|
then clone.
|
143
143
|
|
144
144
|
```sh
|
@@ -184,7 +184,7 @@ ruby-minimap2 is a library under development and there are many points to be imp
|
|
184
184
|
|
185
185
|
Please feel free to report [bugs](https://github.com/kojix2/ruby-minimap2/issues) and send [pull requests](https://github.com/kojix2/ruby-minimap2/pulls)!
|
186
186
|
|
187
|
-
Many OSS projects become abandoned because only the founder has commit rights to the original repository.
|
187
|
+
Many OSS projects become abandoned because only the founder has commit rights to the original repository.
|
188
188
|
If you need commit rights to ruby-minimap2 repository or want to get admin rights and take over the project, please feel free to contact me @kojix2.
|
189
189
|
|
190
190
|
## License
|
data/ext/Rakefile
CHANGED
@@ -51,10 +51,8 @@ namespace :minimap2 do
|
|
51
51
|
end
|
52
52
|
end
|
53
53
|
|
54
|
-
task cleanall: [:clean]
|
55
|
-
|
56
54
|
desc "`make clean` and remove shared lib"
|
57
|
-
task :
|
55
|
+
task cleanall: [:clean] do
|
58
56
|
Dir.chdir(minimap2_dir) do
|
59
57
|
sh "rm #{target_path}" if File.exist?(target_path)
|
60
58
|
end
|
data/ext/minimap2/NEWS.md
CHANGED
@@ -1,9 +1,76 @@
|
|
1
|
+
Release 2.28-r1209 (27 March 2024)
|
2
|
+
----------------------------------
|
3
|
+
|
4
|
+
Notable changes to minimap2:
|
5
|
+
|
6
|
+
* Bugfix: `--MD` was not working properly due to the addition of `--ds` in the
|
7
|
+
last release (#1181 and #1182).
|
8
|
+
|
9
|
+
* New feature: added an experimental preset `lr:hqae` for aligning accurate
|
10
|
+
long reads back to their assembly. It has been observed that `map-hifi` and
|
11
|
+
`lr:hq` may produce many wrong alignments around centromeres when accurate
|
12
|
+
long reads (PacBio HiFi or Nanopore duplex/Q20+) are mapped to a diploid
|
13
|
+
assembly constructed from them. This new preset produces much more accurate
|
14
|
+
alignment. It is still experimental and may be subject to changes in
|
15
|
+
future.
|
16
|
+
|
17
|
+
* Change: reduced the default `--cap-kalloc` to 500m to lower the peak
|
18
|
+
memory consumption (#855).
|
19
|
+
|
20
|
+
Notable changes to mappy:
|
21
|
+
|
22
|
+
* Bugfix: mappy option struct was out of sync with minimap2 (#1177).
|
23
|
+
|
24
|
+
Minimap2 should output identical alignments to v2.27.
|
25
|
+
|
26
|
+
(2.28: 27 March 2024, r1209)
|
27
|
+
|
28
|
+
|
29
|
+
|
30
|
+
Release 2.27-r1193 (12 March 2024)
|
31
|
+
----------------------------------
|
32
|
+
|
33
|
+
Notable changes to minimap2:
|
34
|
+
|
35
|
+
* New feature: added the `lr:hq` preset for accurate long reads at ~1% error
|
36
|
+
rate. This was suggested by Oxford Nanopore developers (#1127). It is not
|
37
|
+
clear if this preset also works well for PacBio HiFi reads.
|
38
|
+
|
39
|
+
* New feature: added the `map-iclr` preset for Illumina Complete Long Reads
|
40
|
+
(#1069), provided by Illumina developers.
|
41
|
+
|
42
|
+
* New feature: added option `-b` to specify mismatch penalty for base
|
43
|
+
transitions (i.e. A-to-G or C-to-T changes).
|
44
|
+
|
45
|
+
* New feature: added option `--ds` to generate a new `ds:Z` tag that
|
46
|
+
indicates uncertainty in INDEL positions. It is an extension to `cs`. The
|
47
|
+
`mgutils-es6.js` script in minigraph parses `ds`.
|
48
|
+
|
49
|
+
* Bugfix: avoided a NULL pointer dereference (#1154). This would not have an
|
50
|
+
effect on most systems but would still be good to fix.
|
51
|
+
|
52
|
+
* Bugfix: reverted the value of `ms:i` to pre-2.22 versions (#1146). This was
|
53
|
+
an oversight. See fcd4df2 for details.
|
54
|
+
|
55
|
+
Notable changes to paftools.js and mappy:
|
56
|
+
|
57
|
+
* New feature: expose `bw_long` to mappy's Aligner class (#1124).
|
58
|
+
|
59
|
+
* Bugfix: fixed several compatibility issues with k8 v1.0 (#1161 and #1166).
|
60
|
+
Subcommands "call", "pbsim2fq" and "mason2fq" were not working with v1.0.
|
61
|
+
|
62
|
+
Minimap2 should output identical alignments to v2.26, except the ms tag.
|
63
|
+
|
64
|
+
(2.27: 12 March 2024, r1193)
|
65
|
+
|
66
|
+
|
67
|
+
|
1
68
|
Release 2.26-r1175 (29 April 2023)
|
2
69
|
----------------------------------
|
3
70
|
|
4
71
|
Fixed the broken Python package. This is the only change.
|
5
72
|
|
6
|
-
(2.
|
73
|
+
(2.26: 25 April 2023, r1173)
|
7
74
|
|
8
75
|
|
9
76
|
|
data/ext/minimap2/README.md
CHANGED
@@ -15,7 +15,7 @@ cd minimap2 && make
|
|
15
15
|
./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads
|
16
16
|
./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads
|
17
17
|
./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or later)
|
18
|
-
./minimap2 -ax
|
18
|
+
./minimap2 -ax lr:hq ref.fa ont-Q20.fq.gz > aln.sam # Nanopore Q20 genomic reads (v2.27 or later)
|
19
19
|
./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads
|
20
20
|
./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown)
|
21
21
|
./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq
|
@@ -74,8 +74,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the
|
|
74
74
|
Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from
|
75
75
|
the [release page][release] with:
|
76
76
|
```sh
|
77
|
-
curl -L https://github.com/lh3/minimap2/releases/download/v2.
|
78
|
-
./minimap2-2.
|
77
|
+
curl -L https://github.com/lh3/minimap2/releases/download/v2.28/minimap2-2.28_x64-linux.tar.bz2 | tar -jxvf -
|
78
|
+
./minimap2-2.28_x64-linux/minimap2
|
79
79
|
```
|
80
80
|
If you want to compile from the source, you need to have a C compiler, GNU make
|
81
81
|
and zlib development files installed. Then type `make` in the source code
|
@@ -139,12 +139,15 @@ parameters at the same time. The default setting is the same as `map-ont`.
|
|
139
139
|
```sh
|
140
140
|
minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads
|
141
141
|
minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads
|
142
|
+
minimap2 -ax map-iclr ref.fa iclr-reads.fq > aln.sam # for Illumina Complete Long Reads
|
142
143
|
```
|
143
144
|
The difference between `map-pb` and `map-ont` is that `map-pb` uses
|
144
145
|
homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary
|
145
|
-
minimizers as seeds.
|
146
|
+
minimizers as seeds. Empirical evaluation suggests HPC minimizers improve
|
146
147
|
performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning
|
147
|
-
Nanopore reads.
|
148
|
+
Nanopore reads. `map-iclr` uses an adjusted alignment scoring matrix that
|
149
|
+
accounts for the low overall error rate in the reads, with transversion errors
|
150
|
+
being less frequent than transitions.
|
148
151
|
|
149
152
|
#### <a name="map-long-splice"></a>Map long mRNA/cDNA reads
|
150
153
|
|
data/ext/minimap2/align.c
CHANGED
@@ -21,6 +21,18 @@ static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc
|
|
21
21
|
mat[(m - 1) * m + j] = sc_ambi;
|
22
22
|
}
|
23
23
|
|
24
|
+
static void ksw_gen_ts_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t transition, int8_t sc_ambi)
|
25
|
+
{
|
26
|
+
assert(m == 5);
|
27
|
+
ksw_gen_simple_mat(m, mat, a, b, sc_ambi);
|
28
|
+
if (transition == 0 || transition == b) return;
|
29
|
+
transition = transition > 0? -transition : transition;
|
30
|
+
mat[0 * m + 2] = transition; // A->G
|
31
|
+
mat[1 * m + 3] = transition; // C->T
|
32
|
+
mat[2 * m + 0] = transition; // G->A
|
33
|
+
mat[3 * m + 1] = transition; // T->C
|
34
|
+
}
|
35
|
+
|
24
36
|
static inline void mm_seq_rev(uint32_t len, uint8_t *seq)
|
25
37
|
{
|
26
38
|
uint32_t i;
|
@@ -283,7 +295,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
|
|
283
295
|
toff += len;
|
284
296
|
}
|
285
297
|
}
|
286
|
-
p->dp_max = (int32_t)(max + .499);
|
298
|
+
p->dp_max = p->dp_max0 = (int32_t)(max + .499);
|
287
299
|
assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
|
288
300
|
if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned
|
289
301
|
}
|
@@ -323,6 +335,8 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
|
|
323
335
|
for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr);
|
324
336
|
fputc('\n', stderr);
|
325
337
|
}
|
338
|
+
if (opt->transition != 0 && opt->b != opt->transition)
|
339
|
+
flag |= KSW_EZ_GENERIC_SC;
|
326
340
|
if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
|
327
341
|
ksw_reset_extz(ez);
|
328
342
|
ez->zdropped = 1;
|
@@ -586,7 +600,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|
586
600
|
|
587
601
|
r2->cnt = 0;
|
588
602
|
if (r->cnt == 0) return;
|
589
|
-
|
603
|
+
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
|
590
604
|
bw = (int)(opt->bw * 1.5 + 1.);
|
591
605
|
bw_long = (int)(opt->bw_long * 1.5 + 1.);
|
592
606
|
if (bw_long < bw) bw_long = bw;
|
@@ -844,7 +858,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
|
|
844
858
|
if (ql < opt->min_chain_score || ql > opt->max_gap) return 0;
|
845
859
|
if (tl < opt->min_chain_score || tl > opt->max_gap) return 0;
|
846
860
|
|
847
|
-
|
861
|
+
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
|
848
862
|
tseq = (uint8_t*)kmalloc(km, tl);
|
849
863
|
mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq);
|
850
864
|
qseq = r1->rev? &qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs];
|
@@ -919,14 +933,14 @@ double mm_event_identity(const mm_reg1_t *r)
|
|
919
933
|
static int32_t mm_recal_max_dp(const mm_reg1_t *r, double b2, int32_t match_sc)
|
920
934
|
{
|
921
935
|
uint32_t i;
|
922
|
-
int32_t n_gap = 0,
|
936
|
+
int32_t n_gap = 0, n_mis;
|
923
937
|
double gap_cost = 0.0;
|
924
938
|
if (r->p == 0) return -1;
|
925
939
|
for (i = 0; i < r->p->n_cigar; ++i) {
|
926
940
|
int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
|
927
941
|
if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
|
928
942
|
gap_cost += b2 + (double)mg_log2(1.0 + len);
|
929
|
-
|
943
|
+
n_gap += len;
|
930
944
|
}
|
931
945
|
}
|
932
946
|
n_mis = r->blen + r->p->n_ambi - r->mlen - n_gap;
|
data/ext/minimap2/cookbook.md
CHANGED
@@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools,
|
|
31
31
|
please follow the command lines below:
|
32
32
|
```sh
|
33
33
|
# install minimap2 executables
|
34
|
-
curl -L https://github.com/lh3/minimap2/releases/download/v2.
|
35
|
-
cp minimap2-2.
|
34
|
+
curl -L https://github.com/lh3/minimap2/releases/download/v2.28/minimap2-2.28_x64-linux.tar.bz2 | tar jxf -
|
35
|
+
cp minimap2-2.28_x64-linux/{minimap2,k8,paftools.js} . # copy executables
|
36
36
|
export PATH="$PATH:"`pwd` # put the current directory on PATH
|
37
37
|
# download example datasets
|
38
38
|
curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf -
|
data/ext/minimap2/format.c
CHANGED
@@ -139,10 +139,48 @@ int mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int a
|
|
139
139
|
return ret;
|
140
140
|
}
|
141
141
|
|
142
|
-
static void
|
142
|
+
static void write_indel_ds(kstring_t *str, int64_t len, const uint8_t *seq, int64_t ll, int64_t lr) // write an indel to ds; adapted from minigraph
|
143
143
|
{
|
144
|
-
|
145
|
-
if (
|
144
|
+
int64_t i;
|
145
|
+
if (ll + lr >= len) {
|
146
|
+
mm_sprintf_lite(str, "[");
|
147
|
+
for (i = 0; i < len; ++i)
|
148
|
+
mm_sprintf_lite(str, "%c", "acgtn"[seq[i]]);
|
149
|
+
mm_sprintf_lite(str, "]");
|
150
|
+
} else {
|
151
|
+
int64_t k = 0;
|
152
|
+
if (ll > 0) {
|
153
|
+
mm_sprintf_lite(str, "[");
|
154
|
+
for (i = 0; i < ll; ++i)
|
155
|
+
mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
|
156
|
+
mm_sprintf_lite(str, "]");
|
157
|
+
k += ll;
|
158
|
+
}
|
159
|
+
for (i = 0; i < len - lr - ll; ++i)
|
160
|
+
mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
|
161
|
+
k += len - lr - ll;
|
162
|
+
if (lr > 0) {
|
163
|
+
mm_sprintf_lite(str, "[");
|
164
|
+
for (i = 0; i < lr; ++i)
|
165
|
+
mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
|
166
|
+
mm_sprintf_lite(str, "]");
|
167
|
+
}
|
168
|
+
}
|
169
|
+
}
|
170
|
+
|
171
|
+
static void write_cs_ds_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int is_ds, int write_tag)
|
172
|
+
{
|
173
|
+
int i, q_off, t_off, q_len = 0, t_len = 0;
|
174
|
+
if (write_tag) mm_sprintf_lite(s, "\t%cs:Z:", is_ds? 'd' : 'c');
|
175
|
+
for (i = 0; i < (int)r->p->n_cigar; ++i) {
|
176
|
+
int op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
177
|
+
if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH)
|
178
|
+
q_len += len, t_len += len;
|
179
|
+
else if (op == MM_CIGAR_INS)
|
180
|
+
q_len += len;
|
181
|
+
else if (op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP)
|
182
|
+
t_len += len;
|
183
|
+
}
|
146
184
|
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
|
147
185
|
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
148
186
|
assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
|
@@ -168,14 +206,42 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
|
|
168
206
|
}
|
169
207
|
q_off += len, t_off += len;
|
170
208
|
} else if (op == MM_CIGAR_INS) {
|
171
|
-
|
172
|
-
|
173
|
-
|
209
|
+
if (is_ds) {
|
210
|
+
int z, ll, lr, y = q_off;
|
211
|
+
for (z = 1; z <= len; ++z)
|
212
|
+
if (y - z < 0 || qseq[y + len - z] != qseq[y - z])
|
213
|
+
break;
|
214
|
+
lr = z - 1;
|
215
|
+
for (z = 0; z < len; ++z)
|
216
|
+
if (y + len + z >= q_len || qseq[y + len + z] != qseq[y + z])
|
217
|
+
break;
|
218
|
+
ll = z;
|
219
|
+
mm_sprintf_lite(s, "+");
|
220
|
+
write_indel_ds(s, len, &qseq[y], ll, lr);
|
221
|
+
} else {
|
222
|
+
for (j = 0, tmp[len] = 0; j < len; ++j)
|
223
|
+
tmp[j] = "acgtn"[qseq[q_off + j]];
|
224
|
+
mm_sprintf_lite(s, "+%s", tmp);
|
225
|
+
}
|
174
226
|
q_off += len;
|
175
227
|
} else if (op == MM_CIGAR_DEL) {
|
176
|
-
|
177
|
-
|
178
|
-
|
228
|
+
if (is_ds) {
|
229
|
+
int z, ll, lr, x = t_off;
|
230
|
+
for (z = 1; z <= len; ++z)
|
231
|
+
if (x - z < 0 || tseq[x + len - z] != tseq[x - z])
|
232
|
+
break;
|
233
|
+
lr = z - 1;
|
234
|
+
for (z = 0; z < len; ++z)
|
235
|
+
if (x + len + z >= t_len || tseq[x + z] != tseq[x + len + z])
|
236
|
+
break;
|
237
|
+
ll = z;
|
238
|
+
mm_sprintf_lite(s, "-");
|
239
|
+
write_indel_ds(s, len, &tseq[x], ll, lr);
|
240
|
+
} else {
|
241
|
+
for (j = 0, tmp[len] = 0; j < len; ++j)
|
242
|
+
tmp[j] = "acgtn"[tseq[t_off + j]];
|
243
|
+
mm_sprintf_lite(s, "-%s", tmp);
|
244
|
+
}
|
179
245
|
t_off += len;
|
180
246
|
} else { // intron
|
181
247
|
assert(len >= 2);
|
@@ -218,7 +284,7 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
|
|
218
284
|
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
|
219
285
|
}
|
220
286
|
|
221
|
-
static void
|
287
|
+
static void write_cs_ds_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int is_ds, int write_tag, int is_qstrand)
|
222
288
|
{
|
223
289
|
extern unsigned char seq_nt4_table[256];
|
224
290
|
int i;
|
@@ -245,7 +311,7 @@ static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_
|
|
245
311
|
}
|
246
312
|
}
|
247
313
|
if (is_MD) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
|
248
|
-
else
|
314
|
+
else write_cs_ds_core(s, tseq, qseq, r, tmp, no_iden, is_ds, write_tag);
|
249
315
|
kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
|
250
316
|
}
|
251
317
|
|
@@ -256,7 +322,7 @@ int mm_gen_cs_or_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, cons
|
|
256
322
|
str.s = *buf, str.l = 0, str.m = *max_len;
|
257
323
|
t.l_seq = strlen(seq);
|
258
324
|
t.seq = (char*)seq;
|
259
|
-
|
325
|
+
write_cs_ds_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0, 0, is_qstrand);
|
260
326
|
*max_len = str.m;
|
261
327
|
*buf = str.s;
|
262
328
|
return str.l;
|
@@ -278,7 +344,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
|
|
278
344
|
if (r->id == r->parent) type = r->inv? 'I' : 'P';
|
279
345
|
else type = r->inv? 'i' : 'S';
|
280
346
|
if (r->p) {
|
281
|
-
mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->
|
347
|
+
mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->dp_max0, r->p->dp_score, r->p->n_ambi);
|
282
348
|
if (r->p->trans_strand == 1 || r->p->trans_strand == 2)
|
283
349
|
mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]);
|
284
350
|
}
|
@@ -326,8 +392,8 @@ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const
|
|
326
392
|
for (k = 0; k < r->p->n_cigar; ++k)
|
327
393
|
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
|
328
394
|
}
|
329
|
-
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
|
330
|
-
|
395
|
+
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD)))
|
396
|
+
write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), !!(opt_flag&MM_F_OUT_MD), !!(opt_flag&MM_F_OUT_DS), 1, !!(opt_flag&MM_F_QSTRAND));
|
331
397
|
if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
|
332
398
|
mm_sprintf_lite(s, "\t%s", t->comment);
|
333
399
|
}
|
@@ -535,8 +601,8 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
|
|
535
601
|
}
|
536
602
|
}
|
537
603
|
}
|
538
|
-
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
|
539
|
-
|
604
|
+
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD)))
|
605
|
+
write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, !!(opt_flag&MM_F_OUT_DS), 1, 0);
|
540
606
|
if (cigar_in_tag)
|
541
607
|
write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag);
|
542
608
|
}
|
data/ext/minimap2/index.c
CHANGED
@@ -192,6 +192,7 @@ int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
|
|
192
192
|
if (f <= 0.) return INT32_MAX;
|
193
193
|
for (i = 0; i < 1<<mi->b; ++i)
|
194
194
|
if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
|
195
|
+
if (n == 0) return INT32_MAX;
|
195
196
|
a = (uint32_t*)malloc(n * 4);
|
196
197
|
for (i = n = 0; i < 1<<mi->b; ++i) {
|
197
198
|
idxhash_t *h = (idxhash_t*)mi->B[i].h;
|
data/ext/minimap2/lchain.c
CHANGED
@@ -149,7 +149,7 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int
|
|
149
149
|
int is_cdna, int n_seg, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
|
150
150
|
{ // TODO: make sure this works when n has more than 32 bits
|
151
151
|
int32_t *f, *t, *v, n_u, n_v, mmax_f = 0, max_drop = bw;
|
152
|
-
int64_t *p, i, j, max_ii, st = 0
|
152
|
+
int64_t *p, i, j, max_ii, st = 0;
|
153
153
|
uint64_t *u;
|
154
154
|
|
155
155
|
if (_u) *_u = 0, *n_u_ = 0;
|
@@ -174,7 +174,6 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int
|
|
174
174
|
for (j = i - 1; j >= st; --j) {
|
175
175
|
int32_t sc;
|
176
176
|
sc = comput_sc(&a[i], &a[j], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg);
|
177
|
-
++n_iter;
|
178
177
|
if (sc == INT32_MIN) continue;
|
179
178
|
sc += f[j];
|
180
179
|
if (sc > max_f) {
|
@@ -204,6 +203,7 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int
|
|
204
203
|
if (max_ii < 0 || (a[i].x - a[max_ii].x <= (int64_t)max_dist_x && f[max_ii] < f[i]))
|
205
204
|
max_ii = i;
|
206
205
|
if (mmax_f < max_f) mmax_f = max_f;
|
206
|
+
//fprintf(stderr, "X1\t%ld\t%ld:%d\t%ld\t%ld:%d\t%ld\t%ld\n", (long)i, (long)(a[i].x>>32), (int32_t)a[i].x, (long)max_j, max_j<0?-1L:(long)(a[max_j].x>>32), max_j<0?-1:(int32_t)a[max_j].x, (long)max_f, (long)v[i]);
|
207
207
|
}
|
208
208
|
|
209
209
|
u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, max_drop, &n_u, &n_v);
|
@@ -263,7 +263,8 @@ mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_ski
|
|
263
263
|
return 0;
|
264
264
|
}
|
265
265
|
if (max_dist < bw) max_dist = bw;
|
266
|
-
if (max_dist_inner
|
266
|
+
if (max_dist_inner < 0) max_dist_inner = 0;
|
267
|
+
if (max_dist_inner > max_dist) max_dist_inner = max_dist;
|
267
268
|
p = Kmalloc(km, int64_t, n);
|
268
269
|
f = Kmalloc(km, int32_t, n);
|
269
270
|
t = Kcalloc(km, int32_t, n);
|
@@ -325,12 +326,11 @@ mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_ski
|
|
325
326
|
krmq_interval(lc_elem, root_inner, &s, &lo, &hi);
|
326
327
|
if (lo) {
|
327
328
|
const lc_elem_t *q;
|
328
|
-
int32_t width
|
329
|
+
int32_t width;
|
329
330
|
krmq_itr_t(lc_elem) itr;
|
330
331
|
krmq_itr_find(lc_elem, root_inner, lo, &itr);
|
331
332
|
while ((q = krmq_at(&itr)) != 0) {
|
332
333
|
if (q->y < (int32_t)a[i].y - max_dist_inner) break;
|
333
|
-
++n_rmq_iter;
|
334
334
|
j = q->i;
|
335
335
|
sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, 0, &width);
|
336
336
|
if (width <= bw) {
|
data/ext/minimap2/main.c
CHANGED
@@ -77,6 +77,9 @@ static ko_longopt_t long_options[] = {
|
|
77
77
|
{ "print-chains", ko_no_argument, 352 },
|
78
78
|
{ "no-hash-name", ko_no_argument, 353 },
|
79
79
|
{ "secondary-seq", ko_no_argument, 354 },
|
80
|
+
{ "ds", ko_no_argument, 355 },
|
81
|
+
{ "rmq-inner", ko_required_argument, 356 },
|
82
|
+
{ "dbg-seed-occ", ko_no_argument, 501 },
|
80
83
|
{ "help", ko_no_argument, 'h' },
|
81
84
|
{ "max-intron-len", ko_required_argument, 'G' },
|
82
85
|
{ "version", ko_no_argument, 'V' },
|
@@ -120,7 +123,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const
|
|
120
123
|
|
121
124
|
int main(int argc, char *argv[])
|
122
125
|
{
|
123
|
-
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
|
126
|
+
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:b:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
|
124
127
|
ketopt_t o = KETOPT_INIT;
|
125
128
|
mm_mapopt_t opt;
|
126
129
|
mm_idxopt_t ipt;
|
@@ -178,6 +181,7 @@ int main(int argc, char *argv[])
|
|
178
181
|
else if (c == 'm') opt.min_chain_score = atoi(o.arg);
|
179
182
|
else if (c == 'A') opt.a = atoi(o.arg);
|
180
183
|
else if (c == 'B') opt.b = atoi(o.arg);
|
184
|
+
else if (c == 'b') opt.transition = atoi(o.arg);
|
181
185
|
else if (c == 's') opt.min_dp_max = atoi(o.arg);
|
182
186
|
else if (c == 'C') opt.noncan = atoi(o.arg);
|
183
187
|
else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
|
@@ -242,6 +246,9 @@ int main(int argc, char *argv[])
|
|
242
246
|
else if (c == 352) mm_dbg_flag |= MM_DBG_PRINT_CHAIN; // --print-chains
|
243
247
|
else if (c == 353) opt.flag |= MM_F_NO_HASH_NAME; // --no-hash-name
|
244
248
|
else if (c == 354) opt.flag |= MM_F_SECONDARY_SEQ; // --secondary-seq
|
249
|
+
else if (c == 355) opt.flag |= MM_F_OUT_DS; // --ds
|
250
|
+
else if (c == 356) opt.rmq_inner_dist = mm_parse_num(o.arg); // --rmq-inner
|
251
|
+
else if (c == 501) mm_dbg_flag |= MM_DBG_SEED_FREQ; // --dbg-seed-occ
|
245
252
|
else if (c == 330) {
|
246
253
|
fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
|
247
254
|
} else if (c == 314) { // --frag
|
@@ -358,6 +365,7 @@ int main(int argc, char *argv[])
|
|
358
365
|
fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n");
|
359
366
|
fprintf(fp_help, " -c output CIGAR in PAF\n");
|
360
367
|
fprintf(fp_help, " --cs[=STR] output the cs tag; STR is 'short' (if absent) or 'long' [none]\n");
|
368
|
+
fprintf(fp_help, " --ds output the ds tag, which is an extension to cs\n");
|
361
369
|
fprintf(fp_help, " --MD output the MD tag\n");
|
362
370
|
fprintf(fp_help, " --eqx write =/X CIGAR operators\n");
|
363
371
|
fprintf(fp_help, " -Y use soft clipping for supplementary alignments\n");
|
@@ -367,12 +375,12 @@ int main(int argc, char *argv[])
|
|
367
375
|
fprintf(fp_help, " --version show version number\n");
|
368
376
|
fprintf(fp_help, " Preset:\n");
|
369
377
|
fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n");
|
370
|
-
fprintf(fp_help, " -
|
371
|
-
fprintf(fp_help, " -
|
372
|
-
fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n");
|
378
|
+
fprintf(fp_help, " - lr:hq - accurate long reads (error rate <1%%) against a reference genome\n");
|
379
|
+
fprintf(fp_help, " - splice/splice:hq - spliced alignment for long reads/accurate long reads\n");
|
373
380
|
fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");
|
374
|
-
fprintf(fp_help, " -
|
375
|
-
fprintf(fp_help, " -
|
381
|
+
fprintf(fp_help, " - sr - short reads against a reference\n");
|
382
|
+
fprintf(fp_help, " - map-pb/map-hifi/map-ont/map-iclr - CLR/HiFi/Nanopore/ICLR vs reference mapping\n");
|
383
|
+
fprintf(fp_help, " - ava-pb/ava-ont - PacBio CLR/Nanopore read overlap\n");
|
376
384
|
fprintf(fp_help, "\nSee `man ./minimap2.1' for detailed description of these and other advanced command-line options.\n");
|
377
385
|
return fp_help == stdout? 0 : 1;
|
378
386
|
}
|
data/ext/minimap2/minimap.h
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
#include <stdio.h>
|
6
6
|
#include <sys/types.h>
|
7
7
|
|
8
|
-
#define MM_VERSION "2.
|
8
|
+
#define MM_VERSION "2.28-r1209"
|
9
9
|
|
10
10
|
#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
|
11
11
|
#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
|
@@ -44,6 +44,7 @@
|
|
44
44
|
#define MM_F_NO_HASH_NAME (0x400000000LL)
|
45
45
|
#define MM_F_SPLICE_OLD (0x800000000LL)
|
46
46
|
#define MM_F_SECONDARY_SEQ (0x1000000000LL) //output SEQ field for secondary alignments using hard clipping
|
47
|
+
#define MM_F_OUT_DS (0x2000000000LL)
|
47
48
|
|
48
49
|
#define MM_I_HPC 0x1
|
49
50
|
#define MM_I_NO_SEQ 0x2
|
@@ -97,6 +98,7 @@ typedef struct {
|
|
97
98
|
typedef struct {
|
98
99
|
uint32_t capacity; // the capacity of cigar[]
|
99
100
|
int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings
|
101
|
+
int32_t dp_max0; // DP score before mm_update_dp_max() adjustment
|
100
102
|
uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
|
101
103
|
uint32_t n_cigar; // number of cigar operations in cigar[]
|
102
104
|
uint32_t cigar[];
|
@@ -153,6 +155,7 @@ typedef struct {
|
|
153
155
|
float alt_drop;
|
154
156
|
|
155
157
|
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
|
158
|
+
int transition; // transition mismatch score (A:G, C:T)
|
156
159
|
int sc_ambi; // score when one or both bases are "N"
|
157
160
|
int noncan; // cost of non-canonical splicing sites
|
158
161
|
int junc_bonus;
|