minimap2 0.2.26.0 → 0.2.27.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +16 -16
- data/ext/Rakefile +9 -4
- data/ext/minimap2/NEWS.md +39 -1
- data/ext/minimap2/README.md +8 -5
- data/ext/minimap2/align.c +17 -3
- data/ext/minimap2/cookbook.md +2 -2
- data/ext/minimap2/format.c +84 -18
- data/ext/minimap2/index.c +1 -0
- data/ext/minimap2/main.c +10 -6
- data/ext/minimap2/minimap.h +4 -1
- data/ext/minimap2/minimap2.1 +54 -10
- data/ext/minimap2/misc/paftools.js +79 -33
- data/ext/minimap2/options.c +16 -6
- data/ext/minimap2/python/README.rst +3 -1
- data/ext/minimap2/python/mappy.pyx +3 -2
- data/ext/minimap2/setup.py +1 -1
- data/lib/minimap2/aligner.rb +6 -3
- data/lib/minimap2/alignment.rb +1 -1
- data/lib/minimap2/ffi/constants.rb +5 -1
- data/lib/minimap2/ffi/functions.rb +16 -3
- data/lib/minimap2/ffi.rb +1 -0
- data/lib/minimap2/version.rb +1 -2
- data/lib/minimap2.rb +2 -2
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f618028eabf476209264541d9037f68961548eb02dad4a22338bacdfe383fce7
|
4
|
+
data.tar.gz: f97eb69e9b1e78357cd738ba2a63ce36034e0fbd7c253c5a89a14b23ade19b01
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1ab742822a921c06f31671b0a555a0220cc9b2d0dc2e9d6ef7b72ff90fbc33de4d2f9285819b5064a28e3789065290e21f7e4dedb162c9927aefd8c860ceea35
|
7
|
+
data.tar.gz: 8d0b005004a1ac625a61d8a68073b31c08b4d156308ebfa01e18508e2ed520d948fa5d2a2e4804978a0846abc90bdc142183ae2d7d2c466cb2b00f87afff4d71
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# ruby-minimap2
|
2
2
|
|
3
3
|
[![Gem Version](https://img.shields.io/gem/v/minimap2?color=brightgreen)](https://rubygems.org/gems/minimap2)
|
4
|
-
[![
|
4
|
+
[![test](https://github.com/kojix2/ruby-minimap2/actions/workflows/ci.yml/badge.svg)](https://github.com/kojix2/ruby-minimap2/actions/workflows/ci.yml)
|
5
5
|
[![Docs Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://rubydoc.info/gems/minimap2)
|
6
6
|
[![Docs Latest](https://img.shields.io/badge/docs-latest-blue.svg)](https://kojix2.github.io/ruby-minimap2/)
|
7
7
|
[![The MIT License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE.txt)
|
@@ -23,7 +23,7 @@ gem install minimap2
|
|
23
23
|
bundle install
|
24
24
|
bundle exec rake minimap2:build
|
25
25
|
bundle exec rake install
|
26
|
-
|
26
|
+
|
27
27
|
</details>
|
28
28
|
|
29
29
|
## Quick Start
|
@@ -36,6 +36,7 @@ seq = aligner.seq("MT_human", 100, 200)
|
|
36
36
|
hits = aligner.align(seq)
|
37
37
|
pp hits
|
38
38
|
```
|
39
|
+
|
39
40
|
```
|
40
41
|
[#<Minimap2::Alignment:0x000055bbfde2d128
|
41
42
|
@blen=100,
|
@@ -57,8 +58,6 @@ pp hits
|
|
57
58
|
@strand=1,
|
58
59
|
@trans_strand=0>]
|
59
60
|
```
|
60
|
-
|
61
|
-
</details>
|
62
61
|
|
63
62
|
## APIs Overview
|
64
63
|
|
@@ -87,7 +86,7 @@ pp hits
|
|
87
86
|
- trans_strand Returns transcript strand. +1 if on the forward strand; -1 if on the reverse strand; 0 if unknown.
|
88
87
|
- blen Returns length of the alignment, including both alignment matches and gaps but excluding ambiguous bases.
|
89
88
|
- mlen Returns length of the matching bases in the alignment, excluding ambiguous base matches.
|
90
|
-
- nm Returns number of mismatches, gaps and ambiguous
|
89
|
+
- nm Returns number of mismatches, gaps and ambiguous positions in the alignment.
|
91
90
|
- primary Returns if the alignment is primary (typically the best and the first to generate).
|
92
91
|
- q_st Returns start positions on the query.
|
93
92
|
- q_en Returns end positions on the query.
|
@@ -106,19 +105,20 @@ pp hits
|
|
106
105
|
* MapOpt class Mapping options.
|
107
106
|
```
|
108
107
|
|
109
|
-
|
110
|
-
|
111
|
-
|
108
|
+
- API is based on [Mappy](https://github.com/lh3/minimap2/tree/master/python), the official Python binding for Minimap2.
|
109
|
+
- `Aligner#map` has been changed to `align`, because `map` means iterator in Ruby.
|
110
|
+
- See [documentation](https://kojix2.github.io/ruby-minimap2/) for details.
|
112
111
|
|
113
112
|
<details>
|
114
113
|
<summary><b>C Structures and Functions</b></summary>
|
115
114
|
|
116
115
|
### FFI
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
116
|
+
|
117
|
+
- Ruby-Minimap2 is built on top of [Ruby-FFI](https://github.com/ffi/ffi).
|
118
|
+
- Native C functions can be called from the `Minimap2::FFI` module.
|
119
|
+
- Native C structure members can be accessed.
|
120
|
+
- Bitfields are supported by the [ffi-bitfield](https://github.com/kojix2/ffi-bitfield) gem.
|
121
|
+
|
122
122
|
```ruby
|
123
123
|
aligner.idx_opt.members
|
124
124
|
# => [:k, :w, :flag, :bucket_bits, :mini_batch_size, :batch_size]
|
@@ -130,7 +130,7 @@ aligner.idx_opt[:k] = 14
|
|
130
130
|
aligner.idx_opt[:k]
|
131
131
|
# => 14
|
132
132
|
```
|
133
|
-
|
133
|
+
|
134
134
|
</details>
|
135
135
|
|
136
136
|
## Contributing
|
@@ -138,7 +138,7 @@ aligner.idx_opt[:k]
|
|
138
138
|
<details>
|
139
139
|
<summary><b>Development</b></summary>
|
140
140
|
|
141
|
-
|
141
|
+
Fork this repository.
|
142
142
|
then clone.
|
143
143
|
|
144
144
|
```sh
|
@@ -184,7 +184,7 @@ ruby-minimap2 is a library under development and there are many points to be imp
|
|
184
184
|
|
185
185
|
Please feel free to report [bugs](https://github.com/kojix2/ruby-minimap2/issues) and submit [pull requests](https://github.com/kojix2/ruby-minimap2/pulls)!
|
186
186
|
|
187
|
-
Many OSS projects become abandoned because only the founder has commit rights to the original repository.
|
187
|
+
Many OSS projects become abandoned because only the founder has commit rights to the original repository.
|
188
188
|
If you need commit rights to ruby-minimap2 repository or want to get admin rights and take over the project, please feel free to contact me @kojix2.
|
189
189
|
|
190
190
|
## License
|
data/ext/Rakefile
CHANGED
@@ -18,7 +18,14 @@ namespace :minimap2 do
|
|
18
18
|
# Add -fPIC option to Makefile
|
19
19
|
sh "git apply ../minimap2.patch"
|
20
20
|
sh "cp ../cmappy/cmappy.h ../cmappy/cmappy.c ."
|
21
|
-
|
21
|
+
case RbConfig::CONFIG["host_cpu"]
|
22
|
+
when /arm64/
|
23
|
+
sh "make arm_neon=1 aarch64=1"
|
24
|
+
when /arm/
|
25
|
+
sh "make arm_neon=1"
|
26
|
+
else
|
27
|
+
sh "make"
|
28
|
+
end
|
22
29
|
case RbConfig::CONFIG["host_os"]
|
23
30
|
when /mswin|msys|mingw|cygwin|bccwin|wince|emc/
|
24
31
|
sh "cc *.o -shared -o #{target_fname} -lm -lz -lpthread"
|
@@ -44,10 +51,8 @@ namespace :minimap2 do
|
|
44
51
|
end
|
45
52
|
end
|
46
53
|
|
47
|
-
task cleanall: [:clean]
|
48
|
-
|
49
54
|
desc "`make clean` and remove shared lib"
|
50
|
-
task :
|
55
|
+
task cleanall: [:clean] do
|
51
56
|
Dir.chdir(minimap2_dir) do
|
52
57
|
sh "rm #{target_path}" if File.exist?(target_path)
|
53
58
|
end
|
data/ext/minimap2/NEWS.md
CHANGED
@@ -1,9 +1,47 @@
|
|
1
|
+
Release 2.27-r1193 (12 March 2024)
|
2
|
+
----------------------------------
|
3
|
+
|
4
|
+
Notable changes to minimap2:
|
5
|
+
|
6
|
+
* New feature: added the `lr:hq` preset for accurate long reads at ~1% error
|
7
|
+
rate. This was suggested by Oxford Nanopore developers (#1127). It is not
|
8
|
+
clear if this preset also works well for PacBio HiFi reads.
|
9
|
+
|
10
|
+
* New feature: added the `map-iclr` preset for Illumina Complete Long Reads
|
11
|
+
(#1069), provided by Illumina developers.
|
12
|
+
|
13
|
+
* New feature: added option `-b` to specify mismatch penalty for base
|
14
|
+
transitions (i.e. A-to-G or C-to-T changes).
|
15
|
+
|
16
|
+
* New feature: added option `--ds` to generate a new `ds:Z` tag that
|
17
|
+
indicates uncertainty in INDEL positions. It is an extension to `cs`. The
|
18
|
+
`mgutils-es6.js` script in minigraph parses `ds`.
|
19
|
+
|
20
|
+
* Bugfix: avoided a NULL pointer dereference (#1154). This would not have an
|
21
|
+
effect on most systems but would still be good to fix.
|
22
|
+
|
23
|
+
* Bugfix: reverted the value of `ms:i` to pre-2.22 versions (#1146). This was
|
24
|
+
an oversight. See fcd4df2 for details.
|
25
|
+
|
26
|
+
Notable changes to paftools.js and mappy:
|
27
|
+
|
28
|
+
* New feature: expose `bw_long` to mappy's Aligner class (#1124).
|
29
|
+
|
30
|
+
* Bugfix: fixed several compatibility issues with k8 v1.0 (#1161 and #1166).
|
31
|
+
Subcommands "call", "pbsim2fq" and "mason2fq" were not working with v1.0.
|
32
|
+
|
33
|
+
Minimap2 should output identical alignments to v2.26, except the ms tag.
|
34
|
+
|
35
|
+
(2.27: 12 March 2024, r1193)
|
36
|
+
|
37
|
+
|
38
|
+
|
1
39
|
Release 2.26-r1175 (29 April 2023)
|
2
40
|
----------------------------------
|
3
41
|
|
4
42
|
Fixed the broken Python package. This is the only change.
|
5
43
|
|
6
|
-
(2.
|
44
|
+
(2.26: 25 April 2023, r1173)
|
7
45
|
|
8
46
|
|
9
47
|
|
data/ext/minimap2/README.md
CHANGED
@@ -15,7 +15,7 @@ cd minimap2 && make
|
|
15
15
|
./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads
|
16
16
|
./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads
|
17
17
|
./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or later)
|
18
|
-
./minimap2 -ax
|
18
|
+
./minimap2 -ax lr:hq ref.fa ont-Q20.fq.gz > aln.sam # Nanopore Q20 genomic reads (v2.27 or later)
|
19
19
|
./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads
|
20
20
|
./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown)
|
21
21
|
./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq
|
@@ -74,8 +74,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the
|
|
74
74
|
Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from
|
75
75
|
the [release page][release] with:
|
76
76
|
```sh
|
77
|
-
curl -L https://github.com/lh3/minimap2/releases/download/v2.
|
78
|
-
./minimap2-2.
|
77
|
+
curl -L https://github.com/lh3/minimap2/releases/download/v2.27/minimap2-2.27_x64-linux.tar.bz2 | tar -jxvf -
|
78
|
+
./minimap2-2.27_x64-linux/minimap2
|
79
79
|
```
|
80
80
|
If you want to compile from the source, you need to have a C compiler, GNU make
|
81
81
|
and zlib development files installed. Then type `make` in the source code
|
@@ -139,12 +139,15 @@ parameters at the same time. The default setting is the same as `map-ont`.
|
|
139
139
|
```sh
|
140
140
|
minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads
|
141
141
|
minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads
|
142
|
+
minimap2 -ax map-iclr ref.fa iclr-reads.fq > aln.sam # for Illumina Complete Long Reads
|
142
143
|
```
|
143
144
|
The difference between `map-pb` and `map-ont` is that `map-pb` uses
|
144
145
|
homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary
|
145
|
-
minimizers as seeds.
|
146
|
+
minimizers as seeds. Empirical evaluation suggests HPC minimizers improve
|
146
147
|
performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning
|
147
|
-
Nanopore reads.
|
148
|
+
Nanopore reads. `map-iclr` uses an adjusted alignment scoring matrix that
|
149
|
+
accounts for the low overall error rate in the reads, with transversion errors
|
150
|
+
being less frequent than transitions.
|
148
151
|
|
149
152
|
#### <a name="map-long-splice"></a>Map long mRNA/cDNA reads
|
150
153
|
|
data/ext/minimap2/align.c
CHANGED
@@ -21,6 +21,18 @@ static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc
|
|
21
21
|
mat[(m - 1) * m + j] = sc_ambi;
|
22
22
|
}
|
23
23
|
|
24
|
+
static void ksw_gen_ts_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t transition, int8_t sc_ambi)
|
25
|
+
{
|
26
|
+
assert(m == 5);
|
27
|
+
ksw_gen_simple_mat(m, mat, a, b, sc_ambi);
|
28
|
+
if (transition == 0 || transition == b) return;
|
29
|
+
transition = transition > 0? -transition : transition;
|
30
|
+
mat[0 * m + 2] = transition; // A->G
|
31
|
+
mat[1 * m + 3] = transition; // C->T
|
32
|
+
mat[2 * m + 0] = transition; // G->A
|
33
|
+
mat[3 * m + 1] = transition; // T->C
|
34
|
+
}
|
35
|
+
|
24
36
|
static inline void mm_seq_rev(uint32_t len, uint8_t *seq)
|
25
37
|
{
|
26
38
|
uint32_t i;
|
@@ -283,7 +295,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
|
|
283
295
|
toff += len;
|
284
296
|
}
|
285
297
|
}
|
286
|
-
p->dp_max = (int32_t)(max + .499);
|
298
|
+
p->dp_max = p->dp_max0 = (int32_t)(max + .499);
|
287
299
|
assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
|
288
300
|
if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned
|
289
301
|
}
|
@@ -323,6 +335,8 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
|
|
323
335
|
for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr);
|
324
336
|
fputc('\n', stderr);
|
325
337
|
}
|
338
|
+
if (opt->transition != 0 && opt->b != opt->transition)
|
339
|
+
flag |= KSW_EZ_GENERIC_SC;
|
326
340
|
if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
|
327
341
|
ksw_reset_extz(ez);
|
328
342
|
ez->zdropped = 1;
|
@@ -586,7 +600,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|
586
600
|
|
587
601
|
r2->cnt = 0;
|
588
602
|
if (r->cnt == 0) return;
|
589
|
-
|
603
|
+
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
|
590
604
|
bw = (int)(opt->bw * 1.5 + 1.);
|
591
605
|
bw_long = (int)(opt->bw_long * 1.5 + 1.);
|
592
606
|
if (bw_long < bw) bw_long = bw;
|
@@ -844,7 +858,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
|
|
844
858
|
if (ql < opt->min_chain_score || ql > opt->max_gap) return 0;
|
845
859
|
if (tl < opt->min_chain_score || tl > opt->max_gap) return 0;
|
846
860
|
|
847
|
-
|
861
|
+
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
|
848
862
|
tseq = (uint8_t*)kmalloc(km, tl);
|
849
863
|
mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq);
|
850
864
|
qseq = r1->rev? &qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs];
|
data/ext/minimap2/cookbook.md
CHANGED
@@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools,
|
|
31
31
|
please follow the command lines below:
|
32
32
|
```sh
|
33
33
|
# install minimap2 executables
|
34
|
-
curl -L https://github.com/lh3/minimap2/releases/download/v2.
|
35
|
-
cp minimap2-2.
|
34
|
+
curl -L https://github.com/lh3/minimap2/releases/download/v2.27/minimap2-2.27_x64-linux.tar.bz2 | tar jxf -
|
35
|
+
cp minimap2-2.27_x64-linux/{minimap2,k8,paftools.js} . # copy executables
|
36
36
|
export PATH="$PATH:"`pwd` # put the current directory on PATH
|
37
37
|
# download example datasets
|
38
38
|
curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf -
|
data/ext/minimap2/format.c
CHANGED
@@ -139,10 +139,48 @@ int mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int a
|
|
139
139
|
return ret;
|
140
140
|
}
|
141
141
|
|
142
|
-
static void
|
142
|
+
static void write_indel_ds(kstring_t *str, int64_t len, const uint8_t *seq, int64_t ll, int64_t lr) // write an indel to ds; adapted from minigraph
|
143
143
|
{
|
144
|
-
|
145
|
-
if (
|
144
|
+
int64_t i;
|
145
|
+
if (ll + lr >= len) {
|
146
|
+
mm_sprintf_lite(str, "[");
|
147
|
+
for (i = 0; i < len; ++i)
|
148
|
+
mm_sprintf_lite(str, "%c", "acgtn"[seq[i]]);
|
149
|
+
mm_sprintf_lite(str, "]");
|
150
|
+
} else {
|
151
|
+
int64_t k = 0;
|
152
|
+
if (ll > 0) {
|
153
|
+
mm_sprintf_lite(str, "[");
|
154
|
+
for (i = 0; i < ll; ++i)
|
155
|
+
mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
|
156
|
+
mm_sprintf_lite(str, "]");
|
157
|
+
k += ll;
|
158
|
+
}
|
159
|
+
for (i = 0; i < len - lr - ll; ++i)
|
160
|
+
mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
|
161
|
+
k += len - lr - ll;
|
162
|
+
if (lr > 0) {
|
163
|
+
mm_sprintf_lite(str, "[");
|
164
|
+
for (i = 0; i < lr; ++i)
|
165
|
+
mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
|
166
|
+
mm_sprintf_lite(str, "]");
|
167
|
+
}
|
168
|
+
}
|
169
|
+
}
|
170
|
+
|
171
|
+
static void write_cs_ds_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int is_ds, int write_tag)
|
172
|
+
{
|
173
|
+
int i, q_off, t_off, q_len = 0, t_len = 0;
|
174
|
+
if (write_tag) mm_sprintf_lite(s, "\t%cs:Z:", is_ds? 'd' : 'c');
|
175
|
+
for (i = 0; i < (int)r->p->n_cigar; ++i) {
|
176
|
+
int op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
177
|
+
if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH)
|
178
|
+
q_len += len, t_len += len;
|
179
|
+
else if (op == MM_CIGAR_INS)
|
180
|
+
q_len += len;
|
181
|
+
else if (op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP)
|
182
|
+
t_len += len;
|
183
|
+
}
|
146
184
|
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
|
147
185
|
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
148
186
|
assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
|
@@ -168,14 +206,42 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
|
|
168
206
|
}
|
169
207
|
q_off += len, t_off += len;
|
170
208
|
} else if (op == MM_CIGAR_INS) {
|
171
|
-
|
172
|
-
|
173
|
-
|
209
|
+
if (is_ds) {
|
210
|
+
int z, ll, lr, y = q_off;
|
211
|
+
for (z = 1; z <= len; ++z)
|
212
|
+
if (y - z < 0 || qseq[y + len - z] != qseq[y - z])
|
213
|
+
break;
|
214
|
+
lr = z - 1;
|
215
|
+
for (z = 0; z < len; ++z)
|
216
|
+
if (y + len + z >= q_len || qseq[y + len + z] != qseq[y + z])
|
217
|
+
break;
|
218
|
+
ll = z;
|
219
|
+
mm_sprintf_lite(s, "+");
|
220
|
+
write_indel_ds(s, len, &qseq[y], ll, lr);
|
221
|
+
} else {
|
222
|
+
for (j = 0, tmp[len] = 0; j < len; ++j)
|
223
|
+
tmp[j] = "acgtn"[qseq[q_off + j]];
|
224
|
+
mm_sprintf_lite(s, "+%s", tmp);
|
225
|
+
}
|
174
226
|
q_off += len;
|
175
227
|
} else if (op == MM_CIGAR_DEL) {
|
176
|
-
|
177
|
-
|
178
|
-
|
228
|
+
if (is_ds) {
|
229
|
+
int z, ll, lr, x = t_off;
|
230
|
+
for (z = 1; z <= len; ++z)
|
231
|
+
if (x - z < 0 || tseq[x + len - z] != tseq[x - z])
|
232
|
+
break;
|
233
|
+
lr = z - 1;
|
234
|
+
for (z = 0; z < len; ++z)
|
235
|
+
if (x + len + z >= t_len || tseq[x + z] != tseq[x + len + z])
|
236
|
+
break;
|
237
|
+
ll = z;
|
238
|
+
mm_sprintf_lite(s, "-");
|
239
|
+
write_indel_ds(s, len, &tseq[x], ll, lr);
|
240
|
+
} else {
|
241
|
+
for (j = 0, tmp[len] = 0; j < len; ++j)
|
242
|
+
tmp[j] = "acgtn"[tseq[t_off + j]];
|
243
|
+
mm_sprintf_lite(s, "-%s", tmp);
|
244
|
+
}
|
179
245
|
t_off += len;
|
180
246
|
} else { // intron
|
181
247
|
assert(len >= 2);
|
@@ -218,7 +284,7 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
|
|
218
284
|
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
|
219
285
|
}
|
220
286
|
|
221
|
-
static void
|
287
|
+
static void write_cs_ds_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int is_ds, int write_tag, int is_qstrand)
|
222
288
|
{
|
223
289
|
extern unsigned char seq_nt4_table[256];
|
224
290
|
int i;
|
@@ -244,8 +310,8 @@ static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_
|
|
244
310
|
}
|
245
311
|
}
|
246
312
|
}
|
247
|
-
if (is_MD) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
|
248
|
-
else
|
313
|
+
if (is_MD == 1) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
|
314
|
+
else write_cs_ds_core(s, tseq, qseq, r, tmp, no_iden, is_ds, write_tag);
|
249
315
|
kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
|
250
316
|
}
|
251
317
|
|
@@ -256,7 +322,7 @@ int mm_gen_cs_or_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, cons
|
|
256
322
|
str.s = *buf, str.l = 0, str.m = *max_len;
|
257
323
|
t.l_seq = strlen(seq);
|
258
324
|
t.seq = (char*)seq;
|
259
|
-
|
325
|
+
write_cs_ds_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0, 0, is_qstrand);
|
260
326
|
*max_len = str.m;
|
261
327
|
*buf = str.s;
|
262
328
|
return str.l;
|
@@ -278,7 +344,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
|
|
278
344
|
if (r->id == r->parent) type = r->inv? 'I' : 'P';
|
279
345
|
else type = r->inv? 'i' : 'S';
|
280
346
|
if (r->p) {
|
281
|
-
mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->
|
347
|
+
mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->dp_max0, r->p->dp_score, r->p->n_ambi);
|
282
348
|
if (r->p->trans_strand == 1 || r->p->trans_strand == 2)
|
283
349
|
mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]);
|
284
350
|
}
|
@@ -326,8 +392,8 @@ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const
|
|
326
392
|
for (k = 0; k < r->p->n_cigar; ++k)
|
327
393
|
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
|
328
394
|
}
|
329
|
-
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
|
330
|
-
|
395
|
+
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD)))
|
396
|
+
write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, !!(opt_flag&MM_F_OUT_DS), 1, !!(opt_flag&MM_F_QSTRAND));
|
331
397
|
if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
|
332
398
|
mm_sprintf_lite(s, "\t%s", t->comment);
|
333
399
|
}
|
@@ -535,8 +601,8 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
|
|
535
601
|
}
|
536
602
|
}
|
537
603
|
}
|
538
|
-
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
|
539
|
-
|
604
|
+
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD)))
|
605
|
+
write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, !!(opt_flag&MM_F_OUT_DS), 1, 0);
|
540
606
|
if (cigar_in_tag)
|
541
607
|
write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag);
|
542
608
|
}
|
data/ext/minimap2/index.c
CHANGED
@@ -192,6 +192,7 @@ int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
|
|
192
192
|
if (f <= 0.) return INT32_MAX;
|
193
193
|
for (i = 0; i < 1<<mi->b; ++i)
|
194
194
|
if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
|
195
|
+
if (n == 0) return INT32_MAX;
|
195
196
|
a = (uint32_t*)malloc(n * 4);
|
196
197
|
for (i = n = 0; i < 1<<mi->b; ++i) {
|
197
198
|
idxhash_t *h = (idxhash_t*)mi->B[i].h;
|
data/ext/minimap2/main.c
CHANGED
@@ -77,6 +77,7 @@ static ko_longopt_t long_options[] = {
|
|
77
77
|
{ "print-chains", ko_no_argument, 352 },
|
78
78
|
{ "no-hash-name", ko_no_argument, 353 },
|
79
79
|
{ "secondary-seq", ko_no_argument, 354 },
|
80
|
+
{ "ds", ko_no_argument, 355 },
|
80
81
|
{ "help", ko_no_argument, 'h' },
|
81
82
|
{ "max-intron-len", ko_required_argument, 'G' },
|
82
83
|
{ "version", ko_no_argument, 'V' },
|
@@ -120,7 +121,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const
|
|
120
121
|
|
121
122
|
int main(int argc, char *argv[])
|
122
123
|
{
|
123
|
-
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
|
124
|
+
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:b:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
|
124
125
|
ketopt_t o = KETOPT_INIT;
|
125
126
|
mm_mapopt_t opt;
|
126
127
|
mm_idxopt_t ipt;
|
@@ -178,6 +179,7 @@ int main(int argc, char *argv[])
|
|
178
179
|
else if (c == 'm') opt.min_chain_score = atoi(o.arg);
|
179
180
|
else if (c == 'A') opt.a = atoi(o.arg);
|
180
181
|
else if (c == 'B') opt.b = atoi(o.arg);
|
182
|
+
else if (c == 'b') opt.transition = atoi(o.arg);
|
181
183
|
else if (c == 's') opt.min_dp_max = atoi(o.arg);
|
182
184
|
else if (c == 'C') opt.noncan = atoi(o.arg);
|
183
185
|
else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
|
@@ -242,6 +244,7 @@ int main(int argc, char *argv[])
|
|
242
244
|
else if (c == 352) mm_dbg_flag |= MM_DBG_PRINT_CHAIN; // --print-chains
|
243
245
|
else if (c == 353) opt.flag |= MM_F_NO_HASH_NAME; // --no-hash-name
|
244
246
|
else if (c == 354) opt.flag |= MM_F_SECONDARY_SEQ; // --secondary-seq
|
247
|
+
else if (c == 355) opt.flag |= MM_F_OUT_DS; // --ds
|
245
248
|
else if (c == 330) {
|
246
249
|
fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
|
247
250
|
} else if (c == 314) { // --frag
|
@@ -358,6 +361,7 @@ int main(int argc, char *argv[])
|
|
358
361
|
fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n");
|
359
362
|
fprintf(fp_help, " -c output CIGAR in PAF\n");
|
360
363
|
fprintf(fp_help, " --cs[=STR] output the cs tag; STR is 'short' (if absent) or 'long' [none]\n");
|
364
|
+
fprintf(fp_help, " --ds output the ds tag, which is an extension to cs\n");
|
361
365
|
fprintf(fp_help, " --MD output the MD tag\n");
|
362
366
|
fprintf(fp_help, " --eqx write =/X CIGAR operators\n");
|
363
367
|
fprintf(fp_help, " -Y use soft clipping for supplementary alignments\n");
|
@@ -367,12 +371,12 @@ int main(int argc, char *argv[])
|
|
367
371
|
fprintf(fp_help, " --version show version number\n");
|
368
372
|
fprintf(fp_help, " Preset:\n");
|
369
373
|
fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n");
|
370
|
-
fprintf(fp_help, " -
|
371
|
-
fprintf(fp_help, " -
|
372
|
-
fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n");
|
374
|
+
fprintf(fp_help, " - lr:hq - accurate long reads (error rate <1%%) against a reference genome\n");
|
375
|
+
fprintf(fp_help, " - splice/splice:hq - spliced alignment for long reads/accurate long reads\n");
|
373
376
|
fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");
|
374
|
-
fprintf(fp_help, " -
|
375
|
-
fprintf(fp_help, " -
|
377
|
+
fprintf(fp_help, " - sr - short reads against a reference\n");
|
378
|
+
fprintf(fp_help, " - map-pb/map-hifi/map-ont/map-iclr - CLR/HiFi/Nanopore/ICLR vs reference mapping\n");
|
379
|
+
fprintf(fp_help, " - ava-pb/ava-ont - PacBio CLR/Nanopore read overlap\n");
|
376
380
|
fprintf(fp_help, "\nSee `man ./minimap2.1' for detailed description of these and other advanced command-line options.\n");
|
377
381
|
return fp_help == stdout? 0 : 1;
|
378
382
|
}
|
data/ext/minimap2/minimap.h
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
#include <stdio.h>
|
6
6
|
#include <sys/types.h>
|
7
7
|
|
8
|
-
#define MM_VERSION "2.
|
8
|
+
#define MM_VERSION "2.27-r1193"
|
9
9
|
|
10
10
|
#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
|
11
11
|
#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
|
@@ -44,6 +44,7 @@
|
|
44
44
|
#define MM_F_NO_HASH_NAME (0x400000000LL)
|
45
45
|
#define MM_F_SPLICE_OLD (0x800000000LL)
|
46
46
|
#define MM_F_SECONDARY_SEQ (0x1000000000LL) //output SEQ field for secondary alignments using hard clipping
|
47
|
+
#define MM_F_OUT_DS (0x2000000000LL)
|
47
48
|
|
48
49
|
#define MM_I_HPC 0x1
|
49
50
|
#define MM_I_NO_SEQ 0x2
|
@@ -97,6 +98,7 @@ typedef struct {
|
|
97
98
|
typedef struct {
|
98
99
|
uint32_t capacity; // the capacity of cigar[]
|
99
100
|
int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings
|
101
|
+
int32_t dp_max0; // DP score before mm_update_dp_max() adjustment
|
100
102
|
uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
|
101
103
|
uint32_t n_cigar; // number of cigar operations in cigar[]
|
102
104
|
uint32_t cigar[];
|
@@ -153,6 +155,7 @@ typedef struct {
|
|
153
155
|
float alt_drop;
|
154
156
|
|
155
157
|
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
|
158
|
+
int transition; // transition mismatch score (A:G, C:T)
|
156
159
|
int sc_ambi; // score when one or both bases are "N"
|
157
160
|
int noncan; // cost of non-canonical splicing sites
|
158
161
|
int junc_bonus;
|
data/ext/minimap2/minimap2.1
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
.TH minimap2 1 "
|
1
|
+
.TH minimap2 1 "12 March 2024" "minimap2-2.27 (r1193)" "Bioinformatics tools"
|
2
2
|
.SH NAME
|
3
3
|
.PP
|
4
4
|
minimap2 - mapping and alignment between collections of DNA sequences
|
@@ -343,6 +343,10 @@ Matching score [2]
|
|
343
343
|
.BI -B \ INT
|
344
344
|
Mismatching penalty [4]
|
345
345
|
.TP
|
346
|
+
.BI -b \ INT
|
347
|
+
Mismatching penalty for transitions [same as
|
348
|
+
.BR -B ].
|
349
|
+
.TP
|
346
350
|
.BI -O \ INT1[,INT2]
|
347
351
|
Gap open penalty [4,24]. If
|
348
352
|
.I INT2
|
@@ -356,10 +360,19 @@ costs
|
|
356
360
|
.RI min{ O1 + k * E1 , O2 + k * E2 }.
|
357
361
|
In the splice mode, the second gap penalties are not used.
|
358
362
|
.TP
|
363
|
+
.BI -J \ INT
|
364
|
+
Splice model [1]. 0 for the original minimap2 splice model that always penalizes non-GT-AG splicing;
|
365
|
+
1 for the miniprot model that considers non-GT-AG. Option
|
366
|
+
.B -C
|
367
|
+
has no effect with the default
|
368
|
+
.BR -J1 .
|
369
|
+
.RB "It is only effective with " -J0 .
|
370
|
+
.TP
|
359
371
|
.BI -C \ INT
|
360
372
|
Cost for a non-canonical GT-AG splicing (effective with
|
361
|
-
.
|
362
|
-
|
373
|
+
.B --splice
|
374
|
+
.BR -J0 )
|
375
|
+
[0].
|
363
376
|
.TP
|
364
377
|
.BI -z \ INT1[,INT2]
|
365
378
|
Truncate an alignment if the running alignment score drops too quickly along
|
@@ -506,6 +519,9 @@ Output =/X CIGAR operators for sequence match/mismatch.
|
|
506
519
|
.B -Y
|
507
520
|
In SAM output, use soft clipping for supplementary alignments.
|
508
521
|
.TP
|
522
|
+
.B --secondary-seq
|
523
|
+
In SAM output, show query sequences for secondary alignments.
|
524
|
+
.TP
|
509
525
|
.BI --seed \ INT
|
510
526
|
Integer seed for randomizing equally best hits. Minimap2 hashes
|
511
527
|
.I INT
|
@@ -566,15 +582,43 @@ are:
|
|
566
582
|
Align noisy long reads of ~10% error rate to a reference genome. This is the
|
567
583
|
default mode.
|
568
584
|
.TP
|
585
|
+
.B lr:hq
|
586
|
+
Align accurate long reads (error rate <1%) to a reference genome
|
587
|
+
.RB ( -k19
|
588
|
+
.B -w19 -U50,500
|
589
|
+
.BR -g10k ).
|
590
|
+
This was recommended by ONT developers for recent Nanopore reads
|
591
|
+
produced with chemistry v14 that can reach ~99% in accuracy.
|
592
|
+
It was shown to work better for accurate Nanopore reads
|
593
|
+
than
|
594
|
+
.BR map-hifi .
|
595
|
+
.TP
|
569
596
|
.B map-hifi
|
570
597
|
Align PacBio high-fidelity (HiFi) reads to a reference genome
|
571
|
-
.RB ( -
|
572
|
-
.B -
|
598
|
+
.RB ( -xlr:hq
|
599
|
+
.B -A1 -B4 -O6,26 -E2,1
|
573
600
|
.BR -s200 ).
|
601
|
+
It differs from
|
602
|
+
.B lr:hq
|
603
|
+
only in scoring. It has not been tested whether
|
604
|
+
.B lr:hq
|
605
|
+
would work better for PacBio HiFi reads.
|
574
606
|
.TP
|
575
607
|
.B map-pb
|
576
608
|
Align older PacBio continuous long (CLR) reads to a reference genome
|
577
609
|
.RB ( -Hk19 ).
|
610
|
+
Note that this data type is effectively deprecated by HiFi.
|
611
|
+
Unless you work on very old data, you probably want to use
|
612
|
+
.B map-hifi
|
613
|
+
or
|
614
|
+
.BR lr:hq .
|
615
|
+
.TP
|
616
|
+
.B map-iclr
|
617
|
+
Align Illumina Complete Long Reads (ICLR) to a reference genome
|
618
|
+
.RB ( -k19
|
619
|
+
.B -B6 -b4
|
620
|
+
.BR -O10,50 ).
|
621
|
+
This was recommended by Illumina developers.
|
578
622
|
.TP
|
579
623
|
.B asm5
|
580
624
|
Long assembly to reference mapping
|
@@ -582,21 +626,21 @@ Long assembly to reference mapping
|
|
582
626
|
.B -w19 -U50,500 --rmq -r1k,100k -g10k -A1 -B19 -O39,81 -E3,1 -s200 -z200
|
583
627
|
.BR -N50 ).
|
584
628
|
Typically, the alignment will not extend to regions with 5% or higher sequence
|
585
|
-
divergence.
|
629
|
+
divergence. Use this preset if the average divergence is not much higher than 0.1%.
|
586
630
|
.TP
|
587
631
|
.B asm10
|
588
632
|
Long assembly to reference mapping
|
589
633
|
.RB ( -k19
|
590
634
|
.B -w19 -U50,500 --rmq -r1k,100k -g10k -A1 -B9 -O16,41 -E2,1 -s200 -z200
|
591
635
|
.BR -N50 ).
|
592
|
-
|
636
|
+
Use this if the average divergence is around 1%.
|
593
637
|
.TP
|
594
638
|
.B asm20
|
595
639
|
Long assembly to reference mapping
|
596
640
|
.RB ( -k19
|
597
641
|
.B -w10 -U50,500 --rmq -r1k,100k -g10k -A1 -B4 -O6,26 -E2,1 -s200 -z200
|
598
642
|
.BR -N50 ).
|
599
|
-
|
643
|
+
Use this if the average divergence is around several percent.
|
600
644
|
.TP
|
601
645
|
.B splice
|
602
646
|
Long-read spliced alignment
|
@@ -612,13 +656,13 @@ costs are different during chaining; 4) the computation of the
|
|
612
656
|
tag ignores introns to demote hits to pseudogenes.
|
613
657
|
.TP
|
614
658
|
.B splice:hq
|
615
|
-
|
659
|
+
Spliced alignment for accurate long RNA-seq reads such as PacBio iso-seq
|
616
660
|
.RB ( -xsplice
|
617
661
|
.B -C5 -O6,24
|
618
662
|
.BR -B4 ).
|
619
663
|
.TP
|
620
664
|
.B sr
|
621
|
-
Short
|
665
|
+
Short-read alignment without splicing
|
622
666
|
.RB ( -k21
|
623
667
|
.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m25
|
624
668
|
.B -s40 -g100 -2K50m --heap-sort=yes
|
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env k8
|
2
2
|
|
3
|
-
var paftools_version = '2.
|
3
|
+
var paftools_version = '2.27-r1193';
|
4
4
|
|
5
5
|
/*****************************
|
6
6
|
***** Library functions *****
|
@@ -133,26 +133,50 @@ Interval.find_ovlp = function(a, st, en)
|
|
133
133
|
|
134
134
|
function fasta_read(fn)
|
135
135
|
{
|
136
|
-
var h = {},
|
136
|
+
var h = {}, seqlen = [];
|
137
|
+
var buf = new Bytes();
|
137
138
|
var file = fn == '-'? new File() : new File(fn);
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
if (
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
139
|
+
if (typeof k8_version == "undefined") { // for k8-0.x
|
140
|
+
var seq = null, name = null, gt = '>'.charCodeAt(0);
|
141
|
+
while (file.readline(buf) >= 0) {
|
142
|
+
if (buf[0] == gt) {
|
143
|
+
if (seq != null && name != null) {
|
144
|
+
seqlen.push([name, seq.length]);
|
145
|
+
h[name] = seq;
|
146
|
+
name = seq = null;
|
147
|
+
}
|
148
|
+
var m, line = buf.toString();
|
149
|
+
if ((m = /^>(\S+)/.exec(line)) != null) {
|
150
|
+
name = m[1];
|
151
|
+
seq = new Bytes();
|
152
|
+
}
|
153
|
+
} else seq.set(buf);
|
154
|
+
}
|
155
|
+
if (seq != null && name != null) {
|
156
|
+
seqlen.push([name, seq.length]);
|
157
|
+
h[name] = seq;
|
158
|
+
}
|
159
|
+
} else { // for k8-1.x
|
160
|
+
var seq = null, name = null;
|
161
|
+
while (file.readline(buf) >= 0) {
|
162
|
+
var line = buf.toString();
|
163
|
+
if (line[0] == ">") {
|
164
|
+
if (seq != null && name != null) {
|
165
|
+
seqlen.push([name, seq.length]);
|
166
|
+
h[name] = new Uint8Array(seq.buffer);
|
167
|
+
name = seq = null;
|
168
|
+
}
|
169
|
+
var m;
|
170
|
+
if ((m = /^>(\S+)/.exec(line)) != null) {
|
171
|
+
name = m[1];
|
172
|
+
seq = new Bytes();
|
173
|
+
}
|
174
|
+
} else seq.set(line);
|
175
|
+
}
|
176
|
+
if (seq != null && name != null) {
|
177
|
+
seqlen.push([name, seq.length]);
|
178
|
+
h[name] = new Uint8Array(seq.buffer);
|
179
|
+
}
|
156
180
|
}
|
157
181
|
buf.destroy();
|
158
182
|
file.close();
|
@@ -161,16 +185,27 @@ function fasta_read(fn)
|
|
161
185
|
|
162
186
|
function fasta_free(fa)
|
163
187
|
{
|
164
|
-
|
165
|
-
fa
|
188
|
+
if (typeof k8_version == "undefined")
|
189
|
+
for (var name in fa)
|
190
|
+
fa[name].destroy();
|
191
|
+
// FIXME: for k8-1.0, sequences are not freed. This is ok for now but not general.
|
166
192
|
}
|
167
193
|
|
168
194
|
Bytes.prototype.reverse = function()
|
169
195
|
{
|
170
|
-
|
171
|
-
var
|
172
|
-
|
173
|
-
|
196
|
+
if (typeof k8_version === "undefined") { // k8-0.x
|
197
|
+
for (var i = 0; i < this.length>>1; ++i) {
|
198
|
+
var tmp = this[i];
|
199
|
+
this[i] = this[this.length - i - 1];
|
200
|
+
this[this.length - i - 1] = tmp;
|
201
|
+
}
|
202
|
+
} else { // k8-1.x
|
203
|
+
var buf = new Uint8Array(this.buffer);
|
204
|
+
for (var i = 0; i < buf.length>>1; ++i) {
|
205
|
+
var tmp = buf[i];
|
206
|
+
buf[i] = buf[buf.length - i - 1];
|
207
|
+
buf[buf.length - i - 1] = tmp;
|
208
|
+
}
|
174
209
|
}
|
175
210
|
}
|
176
211
|
|
@@ -185,13 +220,24 @@ Bytes.prototype.revcomp = function()
|
|
185
220
|
for (var i = 0; i < s1.length; ++i)
|
186
221
|
Bytes.rctab[s1.charCodeAt(i)] = s2.charCodeAt(i);
|
187
222
|
}
|
188
|
-
|
189
|
-
var
|
190
|
-
|
191
|
-
|
223
|
+
if (typeof k8_version === "undefined") { // k8-0.x
|
224
|
+
for (var i = 0; i < this.length>>1; ++i) {
|
225
|
+
var tmp = this[this.length - i - 1];
|
226
|
+
this[this.length - i - 1] = Bytes.rctab[this[i]];
|
227
|
+
this[i] = Bytes.rctab[tmp];
|
228
|
+
}
|
229
|
+
if (this.length&1)
|
230
|
+
this[this.length>>1] = Bytes.rctab[this[this.length>>1]];
|
231
|
+
} else { // k8-1.x
|
232
|
+
var buf = new Uint8Array(this.buffer);
|
233
|
+
for (var i = 0; i < buf.length>>1; ++i) {
|
234
|
+
var tmp = buf[buf.length - i - 1];
|
235
|
+
buf[buf.length - i - 1] = Bytes.rctab[buf[i]];
|
236
|
+
buf[i] = Bytes.rctab[tmp];
|
237
|
+
}
|
238
|
+
if (buf.length&1)
|
239
|
+
buf[buf.length>>1] = Bytes.rctab[buf[buf.length>>1]];
|
192
240
|
}
|
193
|
-
if (this.length&1)
|
194
|
-
this[this.length>>1] = Bytes.rctab[this[this.length>>1]];
|
195
241
|
}
|
196
242
|
|
197
243
|
/********************
|
@@ -2051,7 +2097,7 @@ function paf_mapeval(args)
|
|
2051
2097
|
warn("Usage: paftools.js mapeval [options] <in.paf>|<in.sam>");
|
2052
2098
|
warn("Options:");
|
2053
2099
|
warn(" -r FLOAT mapping correct if overlap_length/union_length>FLOAT [" + ovlp_ratio + "]");
|
2054
|
-
warn(" -Q INT print wrong mappings with mapQ
|
2100
|
+
warn(" -Q INT print wrong mappings with mapQ>=INT [don't print]");
|
2055
2101
|
warn(" -m INT 0: eval the longest aln only; 1: first aln only; 2: all primary aln [0]");
|
2056
2102
|
exit(1);
|
2057
2103
|
}
|
data/ext/minimap2/options.c
CHANGED
@@ -45,6 +45,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
|
45
45
|
opt->alt_drop = 0.15f;
|
46
46
|
|
47
47
|
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
|
48
|
+
opt->transition = 0;
|
48
49
|
opt->sc_ambi = 1;
|
49
50
|
opt->zdrop = 400, opt->zdrop_inv = 200;
|
50
51
|
opt->end_bonus = -1;
|
@@ -90,7 +91,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|
90
91
|
if (preset == 0) {
|
91
92
|
mm_idxopt_init(io);
|
92
93
|
mm_mapopt_init(mo);
|
93
|
-
} else if (strcmp(preset, "map-ont") == 0) { // this is the same as the default
|
94
|
+
} else if (strcmp(preset, "lr") == 0 || strcmp(preset, "map-ont") == 0) { // this is the same as the default
|
94
95
|
} else if (strcmp(preset, "ava-ont") == 0) {
|
95
96
|
io->flag = 0, io->k = 15, io->w = 5;
|
96
97
|
mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN;
|
@@ -105,13 +106,22 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|
105
106
|
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25;
|
106
107
|
mo->bw_long = mo->bw;
|
107
108
|
mo->occ_dist = 0;
|
108
|
-
} else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
|
109
|
+
} else if (strcmp(preset, "lr:hq") == 0 || strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
|
109
110
|
io->flag = 0, io->k = 19, io->w = 19;
|
110
111
|
mo->max_gap = 10000;
|
111
|
-
mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1;
|
112
|
-
mo->occ_dist = 500;
|
113
112
|
mo->min_mid_occ = 50, mo->max_mid_occ = 500;
|
114
|
-
|
113
|
+
if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
|
114
|
+
mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1;
|
115
|
+
mo->min_dp_max = 200;
|
116
|
+
}
|
117
|
+
} else if (strcmp(preset, "map-iclr-prerender") == 0) {
|
118
|
+
io->flag = 0, io->k = 15;
|
119
|
+
mo->b = 6, mo->transition = 1;
|
120
|
+
mo->q = 10, mo->q2 = 50;
|
121
|
+
} else if (strcmp(preset, "map-iclr") == 0) {
|
122
|
+
io->flag = 0, io->k = 19;
|
123
|
+
mo->b = 6, mo->transition = 4;
|
124
|
+
mo->q = 10, mo->q2 = 50;
|
115
125
|
} else if (strncmp(preset, "asm", 3) == 0) {
|
116
126
|
io->flag = 0, io->k = 19, io->w = 19;
|
117
127
|
mo->bw = 1000, mo->bw_long = 100000;
|
@@ -156,7 +166,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|
156
166
|
mo->junc_bonus = 9;
|
157
167
|
mo->zdrop = 200, mo->zdrop_inv = 100; // because mo->a is halved
|
158
168
|
if (strcmp(preset, "splice:hq") == 0)
|
159
|
-
mo->
|
169
|
+
mo->noncan = 5, mo->b = 4, mo->q = 6, mo->q2 = 24;
|
160
170
|
} else return -1;
|
161
171
|
return 0;
|
162
172
|
}
|
@@ -77,7 +77,9 @@ This constructor accepts the following arguments:
|
|
77
77
|
|
78
78
|
* **min_chain_score**: minimum chain score
|
79
79
|
|
80
|
-
* **bw**: chaining and alignment band width
|
80
|
+
* **bw**: chaining and alignment band width (initial chaining and extension)
|
81
|
+
|
82
|
+
* **bw_long**: chaining and alignment band width (RMQ-based rechaining and closing gaps)
|
81
83
|
|
82
84
|
* **best_n**: max number of alignments to return
|
83
85
|
|
@@ -3,7 +3,7 @@ from libc.stdlib cimport free
|
|
3
3
|
cimport cmappy
|
4
4
|
import sys
|
5
5
|
|
6
|
-
__version__ = '2.
|
6
|
+
__version__ = '2.27'
|
7
7
|
|
8
8
|
cmappy.mm_reset_timer()
|
9
9
|
|
@@ -112,7 +112,7 @@ cdef class Aligner:
|
|
112
112
|
cdef cmappy.mm_idxopt_t idx_opt
|
113
113
|
cdef cmappy.mm_mapopt_t map_opt
|
114
114
|
|
115
|
-
def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None):
|
115
|
+
def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, bw_long=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None):
|
116
116
|
self._idx = NULL
|
117
117
|
cmappy.mm_set_opt(NULL, &self.idx_opt, &self.map_opt) # set the default options
|
118
118
|
if preset is not None:
|
@@ -125,6 +125,7 @@ cdef class Aligner:
|
|
125
125
|
if min_chain_score is not None: self.map_opt.min_chain_score = min_chain_score
|
126
126
|
if min_dp_score is not None: self.map_opt.min_dp_max = min_dp_score
|
127
127
|
if bw is not None: self.map_opt.bw = bw
|
128
|
+
if bw_long is not None: self.map_opt.bw_long = bw_long
|
128
129
|
if best_n is not None: self.map_opt.best_n = best_n
|
129
130
|
if max_frag_len is not None: self.map_opt.max_frag_len = max_frag_len
|
130
131
|
if extra_flags is not None: self.map_opt.flag |= extra_flags
|
data/ext/minimap2/setup.py
CHANGED
data/lib/minimap2/aligner.rb
CHANGED
@@ -21,10 +21,11 @@ module Minimap2
|
|
21
21
|
# * ava-ont : Nanopore read overlap
|
22
22
|
# @param k [Integer] k-mer length, no larger than 28.
|
23
23
|
# @param w [Integer] minimizer window size, no larger than 255.
|
24
|
-
# @param min_cnt [Integer]
|
25
|
-
# @param min_chain_score [Integer] minimum
|
24
|
+
# @param min_cnt [Integer] minimum number of minimizers on a chain.
|
25
|
+
# @param min_chain_score [Integer] minimum chain score.
|
26
26
|
# @param min_dp_score
|
27
|
-
# @param bw [Integer] chaining and alignment band width.
|
27
|
+
# @param bw [Integer] chaining and alignment band width (initial chaining and extension)
|
28
|
+
# @param bw_long [Integer] chaining and alignment band width (RMQ-based rechaining and closing gaps)
|
28
29
|
# @param best_n [Integer] max number of alignments to return.
|
29
30
|
# @param n_threads [Integer] number of indexing threads.
|
30
31
|
# @param fn_idx_out [String] name of file to which the index is written.
|
@@ -47,6 +48,7 @@ module Minimap2
|
|
47
48
|
min_chain_score: nil,
|
48
49
|
min_dp_score: nil,
|
49
50
|
bw: nil,
|
51
|
+
bw_long: nil,
|
50
52
|
best_n: nil,
|
51
53
|
n_threads: 3,
|
52
54
|
fn_idx_out: nil,
|
@@ -72,6 +74,7 @@ module Minimap2
|
|
72
74
|
map_opt[:min_chain_score] = min_chain_score if min_chain_score
|
73
75
|
map_opt[:min_dp_max] = min_dp_score if min_dp_score
|
74
76
|
map_opt[:bw] = bw if bw
|
77
|
+
map_opt[:bw_long] = bw_long if bw_long
|
75
78
|
map_opt[:best_n] = best_n if best_n
|
76
79
|
map_opt[:max_frag_len] = max_frag_len if max_frag_len
|
77
80
|
map_opt[:flag] |= extra_flags if extra_flags
|
data/lib/minimap2/alignment.rb
CHANGED
@@ -23,7 +23,7 @@ module Minimap2
|
|
23
23
|
# @return [Integer] length of the matching bases in the alignment,
|
24
24
|
# excluding ambiguous base matches.
|
25
25
|
# @!attribute nm
|
26
|
-
# @return [Integer] number of mismatches, gaps and ambiguous
|
26
|
+
# @return [Integer] number of mismatches, gaps and ambiguous positions in the alignment.
|
27
27
|
# @!attribute primary
|
28
28
|
# @return [Integer] if the alignment is primary (typically the best and the first to generate)
|
29
29
|
# @!attribute q_st
|
@@ -40,6 +40,7 @@ module Minimap2
|
|
40
40
|
NO_HASH_NAME = 0x400000000
|
41
41
|
SPLICE_OLD = 0x800000000
|
42
42
|
SECONDARY_SEQ = 0x1000000000 # output SEQ field for secondary alignments using hard clipping
|
43
|
+
OUT_DS = 0x2000000000
|
43
44
|
|
44
45
|
HPC = 0x1
|
45
46
|
NO_SEQ = 0x2
|
@@ -109,8 +110,10 @@ module Minimap2
|
|
109
110
|
:dp_score, :int32, # DP score
|
110
111
|
:dp_max, :int32, # score of the max-scoring segment
|
111
112
|
:dp_max2, :int32, # score of the best alternate mappings
|
113
|
+
:dp_max0, :int32, # DP score before mm_update_dp_max() adjustment
|
112
114
|
:n_ambi_trans_strand, :uint32,
|
113
115
|
:n_cigar, :uint32
|
116
|
+
# :cigar, :pointer # variable length array (see cigar method below)
|
114
117
|
|
115
118
|
bit_field :n_ambi_trans_strand,
|
116
119
|
:n_ambi, 30, # number of ambiguous bases
|
@@ -204,6 +207,7 @@ module Minimap2
|
|
204
207
|
:e, :int, # gap-ext
|
205
208
|
:q2, :int, # gap-open
|
206
209
|
:e2, :int, # gap-ext
|
210
|
+
:transition, :int, # transition mismatch score (A:G, C:T)
|
207
211
|
:sc_ambi, :int, # score when one or both bases are "N"
|
208
212
|
:noncan, :int, # cost of non-canonical splicing sites
|
209
213
|
:junc_bonus, :int,
|
@@ -223,7 +227,7 @@ module Minimap2
|
|
223
227
|
:q_occ_frac, :float,
|
224
228
|
:min_mid_occ, :int32,
|
225
229
|
:max_mid_occ, :int32,
|
226
|
-
:mid_occ, :int32,
|
230
|
+
:mid_occ, :int32, # ignore seeds with occurrences above this threshold
|
227
231
|
:max_occ, :int32,
|
228
232
|
:max_max_occ, :int32,
|
229
233
|
:occ_dist, :int32,
|
@@ -15,10 +15,11 @@ module Minimap2
|
|
15
15
|
private_class_method :mm_set_opt_raw
|
16
16
|
|
17
17
|
def self.mm_set_opt(preset, io, mo)
|
18
|
-
ptr =
|
19
|
-
|
20
|
-
else
|
18
|
+
ptr = case preset
|
19
|
+
when 0, nil
|
21
20
|
::FFI::Pointer.new(:int, 0)
|
21
|
+
else
|
22
|
+
::FFI::MemoryPointer.from_string(preset.to_s)
|
22
23
|
end
|
23
24
|
mm_set_opt_raw(ptr, io, mo)
|
24
25
|
end
|
@@ -77,5 +78,17 @@ module Minimap2
|
|
77
78
|
:mm_gen_md, :mm_gen_MD, # Avoid uppercase letters in method names.
|
78
79
|
[:pointer, :pointer, :pointer, Idx.by_ref, Reg1.by_ref, :string],
|
79
80
|
:int
|
81
|
+
|
82
|
+
attach_function \
|
83
|
+
:mm_mapopt_init,
|
84
|
+
[MapOpt.by_ref],
|
85
|
+
:void
|
86
|
+
|
87
|
+
# mmpriv.h
|
88
|
+
|
89
|
+
attach_function \
|
90
|
+
:mm_idxopt_init,
|
91
|
+
[IdxOpt.by_ref],
|
92
|
+
:void
|
80
93
|
end
|
81
94
|
end
|
data/lib/minimap2/ffi.rb
CHANGED
data/lib/minimap2/version.rb
CHANGED
data/lib/minimap2.rb
CHANGED
@@ -37,7 +37,7 @@ module Minimap2
|
|
37
37
|
# @example Get minimap2 version
|
38
38
|
# Minimap2.execute('--version')
|
39
39
|
|
40
|
-
def
|
40
|
+
def execute(*rb_argv)
|
41
41
|
str_ptrs = []
|
42
42
|
# First argument is the program name.
|
43
43
|
str_ptrs << ::FFI::MemoryPointer.from_string("minimap2")
|
@@ -76,7 +76,7 @@ module Minimap2
|
|
76
76
|
# @param [String] file_path
|
77
77
|
# @param [Boolean] comment If True, the comment will be read.
|
78
78
|
# @yield [name, seq, qual, comment]
|
79
|
-
# @return [Enumerator] enum
|
79
|
+
# @return [Enumerator] enum Returns an Enumerator if no block is given.
|
80
80
|
# Note: You can use BioRuby instead of this method.
|
81
81
|
|
82
82
|
def fastx_read(file_path, comment: false, &block)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: minimap2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.27.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- kojix2
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-03-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ffi
|
@@ -150,7 +150,7 @@ homepage: https://github.com/kojix2/ruby-minimap2
|
|
150
150
|
licenses:
|
151
151
|
- MIT
|
152
152
|
metadata: {}
|
153
|
-
post_install_message:
|
153
|
+
post_install_message:
|
154
154
|
rdoc_options: []
|
155
155
|
require_paths:
|
156
156
|
- lib
|
@@ -165,8 +165,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
165
165
|
- !ruby/object:Gem::Version
|
166
166
|
version: '0'
|
167
167
|
requirements: []
|
168
|
-
rubygems_version: 3.4
|
169
|
-
signing_key:
|
168
|
+
rubygems_version: 3.5.4
|
169
|
+
signing_key:
|
170
170
|
specification_version: 4
|
171
171
|
summary: minimap2
|
172
172
|
test_files: []
|