minimap2 0.2.26.0 → 0.2.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +16 -16
- data/ext/Rakefile +9 -4
- data/ext/minimap2/NEWS.md +39 -1
- data/ext/minimap2/README.md +8 -5
- data/ext/minimap2/align.c +17 -3
- data/ext/minimap2/cookbook.md +2 -2
- data/ext/minimap2/format.c +84 -18
- data/ext/minimap2/index.c +1 -0
- data/ext/minimap2/main.c +10 -6
- data/ext/minimap2/minimap.h +4 -1
- data/ext/minimap2/minimap2.1 +54 -10
- data/ext/minimap2/misc/paftools.js +79 -33
- data/ext/minimap2/options.c +16 -6
- data/ext/minimap2/python/README.rst +3 -1
- data/ext/minimap2/python/mappy.pyx +3 -2
- data/ext/minimap2/setup.py +1 -1
- data/lib/minimap2/aligner.rb +6 -3
- data/lib/minimap2/alignment.rb +1 -1
- data/lib/minimap2/ffi/constants.rb +5 -1
- data/lib/minimap2/ffi/functions.rb +16 -3
- data/lib/minimap2/ffi.rb +1 -0
- data/lib/minimap2/version.rb +1 -2
- data/lib/minimap2.rb +2 -2
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f618028eabf476209264541d9037f68961548eb02dad4a22338bacdfe383fce7
|
4
|
+
data.tar.gz: f97eb69e9b1e78357cd738ba2a63ce36034e0fbd7c253c5a89a14b23ade19b01
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1ab742822a921c06f31671b0a555a0220cc9b2d0dc2e9d6ef7b72ff90fbc33de4d2f9285819b5064a28e3789065290e21f7e4dedb162c9927aefd8c860ceea35
|
7
|
+
data.tar.gz: 8d0b005004a1ac625a61d8a68073b31c08b4d156308ebfa01e18508e2ed520d948fa5d2a2e4804978a0846abc90bdc142183ae2d7d2c466cb2b00f87afff4d71
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# ruby-minimap2
|
2
2
|
|
3
3
|
[](https://rubygems.org/gems/minimap2)
|
4
|
-
[](https://github.com/kojix2/ruby-minimap2/actions/workflows/ci.yml)
|
5
5
|
[](https://rubydoc.info/gems/minimap2)
|
6
6
|
[](https://kojix2.github.io/ruby-minimap2/)
|
7
7
|
[](LICENSE.txt)
|
@@ -23,7 +23,7 @@ gem install minimap2
|
|
23
23
|
bundle install
|
24
24
|
bundle exec rake minimap2:build
|
25
25
|
bundle exec rake install
|
26
|
-
|
26
|
+
|
27
27
|
</details>
|
28
28
|
|
29
29
|
## Quick Start
|
@@ -36,6 +36,7 @@ seq = aligner.seq("MT_human", 100, 200)
|
|
36
36
|
hits = aligner.align(seq)
|
37
37
|
pp hits
|
38
38
|
```
|
39
|
+
|
39
40
|
```
|
40
41
|
[#<Minimap2::Alignment:0x000055bbfde2d128
|
41
42
|
@blen=100,
|
@@ -57,8 +58,6 @@ pp hits
|
|
57
58
|
@strand=1,
|
58
59
|
@trans_strand=0>]
|
59
60
|
```
|
60
|
-
|
61
|
-
</details>
|
62
61
|
|
63
62
|
## APIs Overview
|
64
63
|
|
@@ -87,7 +86,7 @@ pp hits
|
|
87
86
|
- trans_strand Returns transcript strand. +1 if on the forward strand; -1 if on the reverse strand; 0 if unknown.
|
88
87
|
- blen Returns length of the alignment, including both alignment matches and gaps but excluding ambiguous bases.
|
89
88
|
- mlen Returns length of the matching bases in the alignment, excluding ambiguous base matches.
|
90
|
-
- nm Returns number of mismatches, gaps and ambiguous
|
89
|
+
- nm Returns number of mismatches, gaps and ambiguous positions in the alignment.
|
91
90
|
- primary Returns whether the alignment is primary (typically the best and the first to be generated).
|
92
91
|
- q_st Returns start positions on the query.
|
93
92
|
- q_en Returns end positions on the query.
|
@@ -106,19 +105,20 @@ pp hits
|
|
106
105
|
* MapOpt class Mapping options.
|
107
106
|
```
|
108
107
|
|
109
|
-
|
110
|
-
|
111
|
-
|
108
|
+
- API is based on [Mappy](https://github.com/lh3/minimap2/tree/master/python), the official Python binding for Minimap2.
|
109
|
+
- `Aligner#map` has been changed to `align`, because `map` means iterator in Ruby.
|
110
|
+
- See [documentation](https://kojix2.github.io/ruby-minimap2/) for details.
|
112
111
|
|
113
112
|
<details>
|
114
113
|
<summary><b>C Structures and Functions</b></summary>
|
115
114
|
|
116
115
|
### FFI
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
116
|
+
|
117
|
+
- Ruby-Minimap2 is built on top of [Ruby-FFI](https://github.com/ffi/ffi).
|
118
|
+
- Native C functions can be called from the `Minimap2::FFI` module.
|
119
|
+
- Native C structure members can be accessed.
|
120
|
+
- Bitfields are supported by the [ffi-bitfield](https://github.com/kojix2/ffi-bitfield) gem.
|
121
|
+
|
122
122
|
```ruby
|
123
123
|
aligner.idx_opt.members
|
124
124
|
# => [:k, :w, :flag, :bucket_bits, :mini_batch_size, :batch_size]
|
@@ -130,7 +130,7 @@ aligner.idx_opt[:k] = 14
|
|
130
130
|
aligner.idx_opt[:k]
|
131
131
|
# => 14
|
132
132
|
```
|
133
|
-
|
133
|
+
|
134
134
|
</details>
|
135
135
|
|
136
136
|
## Contributing
|
@@ -138,7 +138,7 @@ aligner.idx_opt[:k]
|
|
138
138
|
<details>
|
139
139
|
<summary><b>Development</b></summary>
|
140
140
|
|
141
|
-
|
141
|
+
Fork this repository,
|
142
142
|
then clone.
|
143
143
|
|
144
144
|
```sh
|
@@ -184,7 +184,7 @@ ruby-minimap2 is a library under development and there are many points to be imp
|
|
184
184
|
|
185
185
|
Please feel free to report [bugs](https://github.com/kojix2/ruby-minimap2/issues) and send [pull requests](https://github.com/kojix2/ruby-minimap2/pulls)!
|
186
186
|
|
187
|
-
Many OSS projects become abandoned because only the founder has commit rights to the original repository.
|
187
|
+
Many OSS projects become abandoned because only the founder has commit rights to the original repository.
|
188
188
|
If you need commit rights to the ruby-minimap2 repository or want to get admin rights and take over the project, please feel free to contact me @kojix2.
|
189
189
|
|
190
190
|
## License
|
data/ext/Rakefile
CHANGED
@@ -18,7 +18,14 @@ namespace :minimap2 do
|
|
18
18
|
# Add -fPIC option to Makefile
|
19
19
|
sh "git apply ../minimap2.patch"
|
20
20
|
sh "cp ../cmappy/cmappy.h ../cmappy/cmappy.c ."
|
21
|
-
|
21
|
+
case RbConfig::CONFIG["host_cpu"]
|
22
|
+
when /arm64/
|
23
|
+
sh "make arm_neon=1 aarch64=1"
|
24
|
+
when /arm/
|
25
|
+
sh "make arm_neon=1"
|
26
|
+
else
|
27
|
+
sh "make"
|
28
|
+
end
|
22
29
|
case RbConfig::CONFIG["host_os"]
|
23
30
|
when /mswin|msys|mingw|cygwin|bccwin|wince|emc/
|
24
31
|
sh "cc *.o -shared -o #{target_fname} -lm -lz -lpthread"
|
@@ -44,10 +51,8 @@ namespace :minimap2 do
|
|
44
51
|
end
|
45
52
|
end
|
46
53
|
|
47
|
-
task cleanall: [:clean]
|
48
|
-
|
49
54
|
desc "`make clean` and remove shared lib"
|
50
|
-
task :
|
55
|
+
task cleanall: [:clean] do
|
51
56
|
Dir.chdir(minimap2_dir) do
|
52
57
|
sh "rm #{target_path}" if File.exist?(target_path)
|
53
58
|
end
|
data/ext/minimap2/NEWS.md
CHANGED
@@ -1,9 +1,47 @@
|
|
1
|
+
Release 2.27-r1193 (12 March 2024)
|
2
|
+
----------------------------------
|
3
|
+
|
4
|
+
Notable changes to minimap2:
|
5
|
+
|
6
|
+
* New feature: added the `lr:hq` preset for accurate long reads at ~1% error
|
7
|
+
rate. This was suggested by Oxford Nanopore developers (#1127). It is not
|
8
|
+
clear if this preset also works well for PacBio HiFi reads.
|
9
|
+
|
10
|
+
* New feature: added the `map-iclr` preset for Illumina Complete Long Reads
|
11
|
+
(#1069), provided by Illumina developers.
|
12
|
+
|
13
|
+
* New feature: added option `-b` to specify mismatch penalty for base
|
14
|
+
transitions (i.e. A-to-G or C-to-T changes).
|
15
|
+
|
16
|
+
* New feature: added option `--ds` to generate a new `ds:Z` tag that
|
17
|
+
indicates uncertainty in INDEL positions. It is an extension to `cs`. The
|
18
|
+
`mgutils-es6.js` script in minigraph parses `ds`.
|
19
|
+
|
20
|
+
* Bugfix: avoided a NULL pointer dereference (#1154). This would not have an
|
21
|
+
effect on most systems but would still be good to fix.
|
22
|
+
|
23
|
+
* Bugfix: reverted the value of `ms:i` to pre-2.22 versions (#1146). This was
|
24
|
+
an oversight. See fcd4df2 for details.
|
25
|
+
|
26
|
+
Notable changes to paftools.js and mappy:
|
27
|
+
|
28
|
+
* New feature: expose `bw_long` to mappy's Aligner class (#1124).
|
29
|
+
|
30
|
+
* Bugfix: fixed several compatibility issues with k8 v1.0 (#1161 and #1166).
|
31
|
+
Subcommands "call", "pbsim2fq" and "mason2fq" were not working with v1.0.
|
32
|
+
|
33
|
+
Minimap2 should output identical alignments to v2.26, except the ms tag.
|
34
|
+
|
35
|
+
(2.27: 12 March 2024, r1193)
|
36
|
+
|
37
|
+
|
38
|
+
|
1
39
|
Release 2.26-r1175 (29 April 2023)
|
2
40
|
----------------------------------
|
3
41
|
|
4
42
|
Fixed the broken Python package. This is the only change.
|
5
43
|
|
6
|
-
(2.
|
44
|
+
(2.26: 25 April 2023, r1173)
|
7
45
|
|
8
46
|
|
9
47
|
|
data/ext/minimap2/README.md
CHANGED
@@ -15,7 +15,7 @@ cd minimap2 && make
|
|
15
15
|
./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads
|
16
16
|
./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads
|
17
17
|
./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or later)
|
18
|
-
./minimap2 -ax
|
18
|
+
./minimap2 -ax lr:hq ref.fa ont-Q20.fq.gz > aln.sam # Nanopore Q20 genomic reads (v2.27 or later)
|
19
19
|
./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads
|
20
20
|
./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown)
|
21
21
|
./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq
|
@@ -74,8 +74,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the
|
|
74
74
|
Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from
|
75
75
|
the [release page][release] with:
|
76
76
|
```sh
|
77
|
-
curl -L https://github.com/lh3/minimap2/releases/download/v2.
|
78
|
-
./minimap2-2.
|
77
|
+
curl -L https://github.com/lh3/minimap2/releases/download/v2.27/minimap2-2.27_x64-linux.tar.bz2 | tar -jxvf -
|
78
|
+
./minimap2-2.27_x64-linux/minimap2
|
79
79
|
```
|
80
80
|
If you want to compile from the source, you need to have a C compiler, GNU make
|
81
81
|
and zlib development files installed. Then type `make` in the source code
|
@@ -139,12 +139,15 @@ parameters at the same time. The default setting is the same as `map-ont`.
|
|
139
139
|
```sh
|
140
140
|
minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads
|
141
141
|
minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads
|
142
|
+
minimap2 -ax map-iclr ref.fa iclr-reads.fq > aln.sam # for Illumina Complete Long Reads
|
142
143
|
```
|
143
144
|
The difference between `map-pb` and `map-ont` is that `map-pb` uses
|
144
145
|
homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary
|
145
|
-
minimizers as seeds.
|
146
|
+
minimizers as seeds. Empirical evaluation suggests HPC minimizers improve
|
146
147
|
performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning
|
147
|
-
Nanopore reads.
|
148
|
+
Nanopore reads. `map-iclr` uses an adjusted alignment scoring matrix that
|
149
|
+
accounts for the low overall error rate in the reads, with transversion errors
|
150
|
+
being less frequent than transitions.
|
148
151
|
|
149
152
|
#### <a name="map-long-splice"></a>Map long mRNA/cDNA reads
|
150
153
|
|
data/ext/minimap2/align.c
CHANGED
@@ -21,6 +21,18 @@ static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc
|
|
21
21
|
mat[(m - 1) * m + j] = sc_ambi;
|
22
22
|
}
|
23
23
|
|
24
|
+
static void ksw_gen_ts_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t transition, int8_t sc_ambi)
|
25
|
+
{
|
26
|
+
assert(m == 5);
|
27
|
+
ksw_gen_simple_mat(m, mat, a, b, sc_ambi);
|
28
|
+
if (transition == 0 || transition == b) return;
|
29
|
+
transition = transition > 0? -transition : transition;
|
30
|
+
mat[0 * m + 2] = transition; // A->G
|
31
|
+
mat[1 * m + 3] = transition; // C->T
|
32
|
+
mat[2 * m + 0] = transition; // G->A
|
33
|
+
mat[3 * m + 1] = transition; // T->C
|
34
|
+
}
|
35
|
+
|
24
36
|
static inline void mm_seq_rev(uint32_t len, uint8_t *seq)
|
25
37
|
{
|
26
38
|
uint32_t i;
|
@@ -283,7 +295,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
|
|
283
295
|
toff += len;
|
284
296
|
}
|
285
297
|
}
|
286
|
-
p->dp_max = (int32_t)(max + .499);
|
298
|
+
p->dp_max = p->dp_max0 = (int32_t)(max + .499);
|
287
299
|
assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
|
288
300
|
if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned
|
289
301
|
}
|
@@ -323,6 +335,8 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
|
|
323
335
|
for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr);
|
324
336
|
fputc('\n', stderr);
|
325
337
|
}
|
338
|
+
if (opt->transition != 0 && opt->b != opt->transition)
|
339
|
+
flag |= KSW_EZ_GENERIC_SC;
|
326
340
|
if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
|
327
341
|
ksw_reset_extz(ez);
|
328
342
|
ez->zdropped = 1;
|
@@ -586,7 +600,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|
586
600
|
|
587
601
|
r2->cnt = 0;
|
588
602
|
if (r->cnt == 0) return;
|
589
|
-
|
603
|
+
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
|
590
604
|
bw = (int)(opt->bw * 1.5 + 1.);
|
591
605
|
bw_long = (int)(opt->bw_long * 1.5 + 1.);
|
592
606
|
if (bw_long < bw) bw_long = bw;
|
@@ -844,7 +858,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
|
|
844
858
|
if (ql < opt->min_chain_score || ql > opt->max_gap) return 0;
|
845
859
|
if (tl < opt->min_chain_score || tl > opt->max_gap) return 0;
|
846
860
|
|
847
|
-
|
861
|
+
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
|
848
862
|
tseq = (uint8_t*)kmalloc(km, tl);
|
849
863
|
mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq);
|
850
864
|
qseq = r1->rev? &qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs];
|
data/ext/minimap2/cookbook.md
CHANGED
@@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools,
|
|
31
31
|
please follow the command lines below:
|
32
32
|
```sh
|
33
33
|
# install minimap2 executables
|
34
|
-
curl -L https://github.com/lh3/minimap2/releases/download/v2.
|
35
|
-
cp minimap2-2.
|
34
|
+
curl -L https://github.com/lh3/minimap2/releases/download/v2.27/minimap2-2.27_x64-linux.tar.bz2 | tar jxf -
|
35
|
+
cp minimap2-2.27_x64-linux/{minimap2,k8,paftools.js} . # copy executables
|
36
36
|
export PATH="$PATH:"`pwd` # put the current directory on PATH
|
37
37
|
# download example datasets
|
38
38
|
curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf -
|
data/ext/minimap2/format.c
CHANGED
@@ -139,10 +139,48 @@ int mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int a
|
|
139
139
|
return ret;
|
140
140
|
}
|
141
141
|
|
142
|
-
static void
|
142
|
+
static void write_indel_ds(kstring_t *str, int64_t len, const uint8_t *seq, int64_t ll, int64_t lr) // write an indel to ds; adapted from minigraph
|
143
143
|
{
|
144
|
-
|
145
|
-
if (
|
144
|
+
int64_t i;
|
145
|
+
if (ll + lr >= len) {
|
146
|
+
mm_sprintf_lite(str, "[");
|
147
|
+
for (i = 0; i < len; ++i)
|
148
|
+
mm_sprintf_lite(str, "%c", "acgtn"[seq[i]]);
|
149
|
+
mm_sprintf_lite(str, "]");
|
150
|
+
} else {
|
151
|
+
int64_t k = 0;
|
152
|
+
if (ll > 0) {
|
153
|
+
mm_sprintf_lite(str, "[");
|
154
|
+
for (i = 0; i < ll; ++i)
|
155
|
+
mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
|
156
|
+
mm_sprintf_lite(str, "]");
|
157
|
+
k += ll;
|
158
|
+
}
|
159
|
+
for (i = 0; i < len - lr - ll; ++i)
|
160
|
+
mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
|
161
|
+
k += len - lr - ll;
|
162
|
+
if (lr > 0) {
|
163
|
+
mm_sprintf_lite(str, "[");
|
164
|
+
for (i = 0; i < lr; ++i)
|
165
|
+
mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
|
166
|
+
mm_sprintf_lite(str, "]");
|
167
|
+
}
|
168
|
+
}
|
169
|
+
}
|
170
|
+
|
171
|
+
static void write_cs_ds_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int is_ds, int write_tag)
|
172
|
+
{
|
173
|
+
int i, q_off, t_off, q_len = 0, t_len = 0;
|
174
|
+
if (write_tag) mm_sprintf_lite(s, "\t%cs:Z:", is_ds? 'd' : 'c');
|
175
|
+
for (i = 0; i < (int)r->p->n_cigar; ++i) {
|
176
|
+
int op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
177
|
+
if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH)
|
178
|
+
q_len += len, t_len += len;
|
179
|
+
else if (op == MM_CIGAR_INS)
|
180
|
+
q_len += len;
|
181
|
+
else if (op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP)
|
182
|
+
t_len += len;
|
183
|
+
}
|
146
184
|
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
|
147
185
|
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
148
186
|
assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
|
@@ -168,14 +206,42 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
|
|
168
206
|
}
|
169
207
|
q_off += len, t_off += len;
|
170
208
|
} else if (op == MM_CIGAR_INS) {
|
171
|
-
|
172
|
-
|
173
|
-
|
209
|
+
if (is_ds) {
|
210
|
+
int z, ll, lr, y = q_off;
|
211
|
+
for (z = 1; z <= len; ++z)
|
212
|
+
if (y - z < 0 || qseq[y + len - z] != qseq[y - z])
|
213
|
+
break;
|
214
|
+
lr = z - 1;
|
215
|
+
for (z = 0; z < len; ++z)
|
216
|
+
if (y + len + z >= q_len || qseq[y + len + z] != qseq[y + z])
|
217
|
+
break;
|
218
|
+
ll = z;
|
219
|
+
mm_sprintf_lite(s, "+");
|
220
|
+
write_indel_ds(s, len, &qseq[y], ll, lr);
|
221
|
+
} else {
|
222
|
+
for (j = 0, tmp[len] = 0; j < len; ++j)
|
223
|
+
tmp[j] = "acgtn"[qseq[q_off + j]];
|
224
|
+
mm_sprintf_lite(s, "+%s", tmp);
|
225
|
+
}
|
174
226
|
q_off += len;
|
175
227
|
} else if (op == MM_CIGAR_DEL) {
|
176
|
-
|
177
|
-
|
178
|
-
|
228
|
+
if (is_ds) {
|
229
|
+
int z, ll, lr, x = t_off;
|
230
|
+
for (z = 1; z <= len; ++z)
|
231
|
+
if (x - z < 0 || tseq[x + len - z] != tseq[x - z])
|
232
|
+
break;
|
233
|
+
lr = z - 1;
|
234
|
+
for (z = 0; z < len; ++z)
|
235
|
+
if (x + len + z >= t_len || tseq[x + z] != tseq[x + len + z])
|
236
|
+
break;
|
237
|
+
ll = z;
|
238
|
+
mm_sprintf_lite(s, "-");
|
239
|
+
write_indel_ds(s, len, &tseq[x], ll, lr);
|
240
|
+
} else {
|
241
|
+
for (j = 0, tmp[len] = 0; j < len; ++j)
|
242
|
+
tmp[j] = "acgtn"[tseq[t_off + j]];
|
243
|
+
mm_sprintf_lite(s, "-%s", tmp);
|
244
|
+
}
|
179
245
|
t_off += len;
|
180
246
|
} else { // intron
|
181
247
|
assert(len >= 2);
|
@@ -218,7 +284,7 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
|
|
218
284
|
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
|
219
285
|
}
|
220
286
|
|
221
|
-
static void
|
287
|
+
static void write_cs_ds_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int is_ds, int write_tag, int is_qstrand)
|
222
288
|
{
|
223
289
|
extern unsigned char seq_nt4_table[256];
|
224
290
|
int i;
|
@@ -244,8 +310,8 @@ static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_
|
|
244
310
|
}
|
245
311
|
}
|
246
312
|
}
|
247
|
-
if (is_MD) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
|
248
|
-
else
|
313
|
+
if (is_MD == 1) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
|
314
|
+
else write_cs_ds_core(s, tseq, qseq, r, tmp, no_iden, is_ds, write_tag);
|
249
315
|
kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
|
250
316
|
}
|
251
317
|
|
@@ -256,7 +322,7 @@ int mm_gen_cs_or_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, cons
|
|
256
322
|
str.s = *buf, str.l = 0, str.m = *max_len;
|
257
323
|
t.l_seq = strlen(seq);
|
258
324
|
t.seq = (char*)seq;
|
259
|
-
|
325
|
+
write_cs_ds_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0, 0, is_qstrand);
|
260
326
|
*max_len = str.m;
|
261
327
|
*buf = str.s;
|
262
328
|
return str.l;
|
@@ -278,7 +344,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
|
|
278
344
|
if (r->id == r->parent) type = r->inv? 'I' : 'P';
|
279
345
|
else type = r->inv? 'i' : 'S';
|
280
346
|
if (r->p) {
|
281
|
-
mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->
|
347
|
+
mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->dp_max0, r->p->dp_score, r->p->n_ambi);
|
282
348
|
if (r->p->trans_strand == 1 || r->p->trans_strand == 2)
|
283
349
|
mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]);
|
284
350
|
}
|
@@ -326,8 +392,8 @@ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const
|
|
326
392
|
for (k = 0; k < r->p->n_cigar; ++k)
|
327
393
|
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
|
328
394
|
}
|
329
|
-
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
|
330
|
-
|
395
|
+
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD)))
|
396
|
+
write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, !!(opt_flag&MM_F_OUT_DS), 1, !!(opt_flag&MM_F_QSTRAND));
|
331
397
|
if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
|
332
398
|
mm_sprintf_lite(s, "\t%s", t->comment);
|
333
399
|
}
|
@@ -535,8 +601,8 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
|
|
535
601
|
}
|
536
602
|
}
|
537
603
|
}
|
538
|
-
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
|
539
|
-
|
604
|
+
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD)))
|
605
|
+
write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, !!(opt_flag&MM_F_OUT_DS), 1, 0);
|
540
606
|
if (cigar_in_tag)
|
541
607
|
write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag);
|
542
608
|
}
|
data/ext/minimap2/index.c
CHANGED
@@ -192,6 +192,7 @@ int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
|
|
192
192
|
if (f <= 0.) return INT32_MAX;
|
193
193
|
for (i = 0; i < 1<<mi->b; ++i)
|
194
194
|
if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
|
195
|
+
if (n == 0) return INT32_MAX;
|
195
196
|
a = (uint32_t*)malloc(n * 4);
|
196
197
|
for (i = n = 0; i < 1<<mi->b; ++i) {
|
197
198
|
idxhash_t *h = (idxhash_t*)mi->B[i].h;
|
data/ext/minimap2/main.c
CHANGED
@@ -77,6 +77,7 @@ static ko_longopt_t long_options[] = {
|
|
77
77
|
{ "print-chains", ko_no_argument, 352 },
|
78
78
|
{ "no-hash-name", ko_no_argument, 353 },
|
79
79
|
{ "secondary-seq", ko_no_argument, 354 },
|
80
|
+
{ "ds", ko_no_argument, 355 },
|
80
81
|
{ "help", ko_no_argument, 'h' },
|
81
82
|
{ "max-intron-len", ko_required_argument, 'G' },
|
82
83
|
{ "version", ko_no_argument, 'V' },
|
@@ -120,7 +121,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const
|
|
120
121
|
|
121
122
|
int main(int argc, char *argv[])
|
122
123
|
{
|
123
|
-
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
|
124
|
+
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:b:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
|
124
125
|
ketopt_t o = KETOPT_INIT;
|
125
126
|
mm_mapopt_t opt;
|
126
127
|
mm_idxopt_t ipt;
|
@@ -178,6 +179,7 @@ int main(int argc, char *argv[])
|
|
178
179
|
else if (c == 'm') opt.min_chain_score = atoi(o.arg);
|
179
180
|
else if (c == 'A') opt.a = atoi(o.arg);
|
180
181
|
else if (c == 'B') opt.b = atoi(o.arg);
|
182
|
+
else if (c == 'b') opt.transition = atoi(o.arg);
|
181
183
|
else if (c == 's') opt.min_dp_max = atoi(o.arg);
|
182
184
|
else if (c == 'C') opt.noncan = atoi(o.arg);
|
183
185
|
else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
|
@@ -242,6 +244,7 @@ int main(int argc, char *argv[])
|
|
242
244
|
else if (c == 352) mm_dbg_flag |= MM_DBG_PRINT_CHAIN; // --print-chains
|
243
245
|
else if (c == 353) opt.flag |= MM_F_NO_HASH_NAME; // --no-hash-name
|
244
246
|
else if (c == 354) opt.flag |= MM_F_SECONDARY_SEQ; // --secondary-seq
|
247
|
+
else if (c == 355) opt.flag |= MM_F_OUT_DS; // --ds
|
245
248
|
else if (c == 330) {
|
246
249
|
fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
|
247
250
|
} else if (c == 314) { // --frag
|
@@ -358,6 +361,7 @@ int main(int argc, char *argv[])
|
|
358
361
|
fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n");
|
359
362
|
fprintf(fp_help, " -c output CIGAR in PAF\n");
|
360
363
|
fprintf(fp_help, " --cs[=STR] output the cs tag; STR is 'short' (if absent) or 'long' [none]\n");
|
364
|
+
fprintf(fp_help, " --ds output the ds tag, which is an extension to cs\n");
|
361
365
|
fprintf(fp_help, " --MD output the MD tag\n");
|
362
366
|
fprintf(fp_help, " --eqx write =/X CIGAR operators\n");
|
363
367
|
fprintf(fp_help, " -Y use soft clipping for supplementary alignments\n");
|
@@ -367,12 +371,12 @@ int main(int argc, char *argv[])
|
|
367
371
|
fprintf(fp_help, " --version show version number\n");
|
368
372
|
fprintf(fp_help, " Preset:\n");
|
369
373
|
fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n");
|
370
|
-
fprintf(fp_help, " -
|
371
|
-
fprintf(fp_help, " -
|
372
|
-
fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n");
|
374
|
+
fprintf(fp_help, " - lr:hq - accurate long reads (error rate <1%%) against a reference genome\n");
|
375
|
+
fprintf(fp_help, " - splice/splice:hq - spliced alignment for long reads/accurate long reads\n");
|
373
376
|
fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");
|
374
|
-
fprintf(fp_help, " -
|
375
|
-
fprintf(fp_help, " -
|
377
|
+
fprintf(fp_help, " - sr - short reads against a reference\n");
|
378
|
+
fprintf(fp_help, " - map-pb/map-hifi/map-ont/map-iclr - CLR/HiFi/Nanopore/ICLR vs reference mapping\n");
|
379
|
+
fprintf(fp_help, " - ava-pb/ava-ont - PacBio CLR/Nanopore read overlap\n");
|
376
380
|
fprintf(fp_help, "\nSee `man ./minimap2.1' for detailed description of these and other advanced command-line options.\n");
|
377
381
|
return fp_help == stdout? 0 : 1;
|
378
382
|
}
|
data/ext/minimap2/minimap.h
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
#include <stdio.h>
|
6
6
|
#include <sys/types.h>
|
7
7
|
|
8
|
-
#define MM_VERSION "2.
|
8
|
+
#define MM_VERSION "2.27-r1193"
|
9
9
|
|
10
10
|
#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
|
11
11
|
#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
|
@@ -44,6 +44,7 @@
|
|
44
44
|
#define MM_F_NO_HASH_NAME (0x400000000LL)
|
45
45
|
#define MM_F_SPLICE_OLD (0x800000000LL)
|
46
46
|
#define MM_F_SECONDARY_SEQ (0x1000000000LL) //output SEQ field for secondary alignments using hard clipping
|
47
|
+
#define MM_F_OUT_DS (0x2000000000LL)
|
47
48
|
|
48
49
|
#define MM_I_HPC 0x1
|
49
50
|
#define MM_I_NO_SEQ 0x2
|
@@ -97,6 +98,7 @@ typedef struct {
|
|
97
98
|
typedef struct {
|
98
99
|
uint32_t capacity; // the capacity of cigar[]
|
99
100
|
int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings
|
101
|
+
int32_t dp_max0; // DP score before mm_update_dp_max() adjustment
|
100
102
|
uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
|
101
103
|
uint32_t n_cigar; // number of cigar operations in cigar[]
|
102
104
|
uint32_t cigar[];
|
@@ -153,6 +155,7 @@ typedef struct {
|
|
153
155
|
float alt_drop;
|
154
156
|
|
155
157
|
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
|
158
|
+
int transition; // transition mismatch score (A:G, C:T)
|
156
159
|
int sc_ambi; // score when one or both bases are "N"
|
157
160
|
int noncan; // cost of non-canonical splicing sites
|
158
161
|
int junc_bonus;
|
data/ext/minimap2/minimap2.1
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
.TH minimap2 1 "
|
1
|
+
.TH minimap2 1 "12 March 2024" "minimap2-2.27 (r1193)" "Bioinformatics tools"
|
2
2
|
.SH NAME
|
3
3
|
.PP
|
4
4
|
minimap2 - mapping and alignment between collections of DNA sequences
|
@@ -343,6 +343,10 @@ Matching score [2]
|
|
343
343
|
.BI -B \ INT
|
344
344
|
Mismatching penalty [4]
|
345
345
|
.TP
|
346
|
+
.BI -b \ INT
|
347
|
+
Mismatching penalty for transitions [same as
|
348
|
+
.BR -B ].
|
349
|
+
.TP
|
346
350
|
.BI -O \ INT1[,INT2]
|
347
351
|
Gap open penalty [4,24]. If
|
348
352
|
.I INT2
|
@@ -356,10 +360,19 @@ costs
|
|
356
360
|
.RI min{ O1 + k * E1 , O2 + k * E2 }.
|
357
361
|
In the splice mode, the second gap penalties are not used.
|
358
362
|
.TP
|
363
|
+
.BI -J \ INT
|
364
|
+
Splice model [1]. 0 for the original minimap2 splice model that always penalizes non-GT-AG splicing;
|
365
|
+
1 for the miniprot model that considers non-GT-AG. Option
|
366
|
+
.B -C
|
367
|
+
has no effect with the default
|
368
|
+
.BR -J1 .
|
369
|
+
.BR -J0 .
|
370
|
+
.TP
|
359
371
|
.BI -C \ INT
|
360
372
|
Cost for a non-canonical GT-AG splicing (effective with
|
361
|
-
.
|
362
|
-
|
373
|
+
.B --splice
|
374
|
+
.BR -J0 )
|
375
|
+
[0].
|
363
376
|
.TP
|
364
377
|
.BI -z \ INT1[,INT2]
|
365
378
|
Truncate an alignment if the running alignment score drops too quickly along
|
@@ -506,6 +519,9 @@ Output =/X CIGAR operators for sequence match/mismatch.
|
|
506
519
|
.B -Y
|
507
520
|
In SAM output, use soft clipping for supplementary alignments.
|
508
521
|
.TP
|
522
|
+
.B --secondary-seq
|
523
|
+
In SAM output, show query sequences for secondary alignments.
|
524
|
+
.TP
|
509
525
|
.BI --seed \ INT
|
510
526
|
Integer seed for randomizing equally best hits. Minimap2 hashes
|
511
527
|
.I INT
|
@@ -566,15 +582,43 @@ are:
|
|
566
582
|
Align noisy long reads of ~10% error rate to a reference genome. This is the
|
567
583
|
default mode.
|
568
584
|
.TP
|
585
|
+
.B lr:hq
|
586
|
+
Align accurate long reads (error rate <1%) to a reference genome
|
587
|
+
.RB ( -k19
|
588
|
+
.B -w19 -U50,500
|
589
|
+
.BR -g10k ).
|
590
|
+
This was recommended by ONT developers for recent Nanopore reads
|
591
|
+
produced with chemistry v14 that can reach ~99% in accuracy.
|
592
|
+
It was shown to work better for accurate Nanopore reads
|
593
|
+
than
|
594
|
+
.BR map-hifi .
|
595
|
+
.TP
|
569
596
|
.B map-hifi
|
570
597
|
Align PacBio high-fidelity (HiFi) reads to a reference genome
|
571
|
-
.RB ( -
|
572
|
-
.B -
|
598
|
+
.RB ( -xlr:hq
|
599
|
+
.B -A1 -B4 -O6,26 -E2,1
|
573
600
|
.BR -s200 ).
|
601
|
+
It differs from
|
602
|
+
.B lr:hq
|
603
|
+
only in scoring. It has not been tested whether
|
604
|
+
.B lr:hq
|
605
|
+
would work better for PacBio HiFi reads.
|
574
606
|
.TP
|
575
607
|
.B map-pb
|
576
608
|
Align older PacBio continuous long (CLR) reads to a reference genome
|
577
609
|
.RB ( -Hk19 ).
|
610
|
+
Note that this data type is effectively deprecated by HiFi.
|
611
|
+
Unless you work on very old data, you probably want to use
|
612
|
+
.B map-hifi
|
613
|
+
or
|
614
|
+
.BR lr:hq .
|
615
|
+
.TP
|
616
|
+
.B map-iclr
|
617
|
+
Align Illumina Complete Long Reads (ICLR) to a reference genome
|
618
|
+
.RB ( -k19
|
619
|
+
.B -B6 -b4
|
620
|
+
.BR -O10,50 ).
|
621
|
+
This was recommended by Illumina developers.
|
578
622
|
.TP
|
579
623
|
.B asm5
|
580
624
|
Long assembly to reference mapping
|
@@ -582,21 +626,21 @@ Long assembly to reference mapping
|
|
582
626
|
.B -w19 -U50,500 --rmq -r1k,100k -g10k -A1 -B19 -O39,81 -E3,1 -s200 -z200
|
583
627
|
.BR -N50 ).
|
584
628
|
Typically, the alignment will not extend to regions with 5% or higher sequence
|
585
|
-
divergence.
|
629
|
+
divergence. Use this preset if the average divergence is not much higher than 0.1%.
|
586
630
|
.TP
|
587
631
|
.B asm10
|
588
632
|
Long assembly to reference mapping
|
589
633
|
.RB ( -k19
|
590
634
|
.B -w19 -U50,500 --rmq -r1k,100k -g10k -A1 -B9 -O16,41 -E2,1 -s200 -z200
|
591
635
|
.BR -N50 ).
|
592
|
-
|
636
|
+
Use this if the average divergence is around 1%.
|
593
637
|
.TP
|
594
638
|
.B asm20
|
595
639
|
Long assembly to reference mapping
|
596
640
|
.RB ( -k19
|
597
641
|
.B -w10 -U50,500 --rmq -r1k,100k -g10k -A1 -B4 -O6,26 -E2,1 -s200 -z200
|
598
642
|
.BR -N50 ).
|
599
|
-
|
643
|
+
Use this if the average divergence is around several percent.
|
600
644
|
.TP
|
601
645
|
.B splice
|
602
646
|
Long-read spliced alignment
|
@@ -612,13 +656,13 @@ costs are different during chaining; 4) the computation of the
|
|
612
656
|
tag ignores introns to demote hits to pseudogenes.
|
613
657
|
.TP
|
614
658
|
.B splice:hq
|
615
|
-
|
659
|
+
Spliced alignment for accurate long RNA-seq reads such as PacBio iso-seq
|
616
660
|
.RB ( -xsplice
|
617
661
|
.B -C5 -O6,24
|
618
662
|
.BR -B4 ).
|
619
663
|
.TP
|
620
664
|
.B sr
|
621
|
-
Short
|
665
|
+
Short-read alignment without splicing
|
622
666
|
.RB ( -k21
|
623
667
|
.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m25
|
624
668
|
.B -s40 -g100 -2K50m --heap-sort=yes
|
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env k8
|
2
2
|
|
3
|
-
var paftools_version = '2.
|
3
|
+
var paftools_version = '2.27-r1193';
|
4
4
|
|
5
5
|
/*****************************
|
6
6
|
***** Library functions *****
|
@@ -133,26 +133,50 @@ Interval.find_ovlp = function(a, st, en)
|
|
133
133
|
|
134
134
|
function fasta_read(fn)
|
135
135
|
{
|
136
|
-
var h = {},
|
136
|
+
var h = {}, seqlen = [];
|
137
|
+
var buf = new Bytes();
|
137
138
|
var file = fn == '-'? new File() : new File(fn);
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
if (
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
139
|
+
if (typeof k8_version == "undefined") { // for k8-0.x
|
140
|
+
var seq = null, name = null, gt = '>'.charCodeAt(0);
|
141
|
+
while (file.readline(buf) >= 0) {
|
142
|
+
if (buf[0] == gt) {
|
143
|
+
if (seq != null && name != null) {
|
144
|
+
seqlen.push([name, seq.length]);
|
145
|
+
h[name] = seq;
|
146
|
+
name = seq = null;
|
147
|
+
}
|
148
|
+
var m, line = buf.toString();
|
149
|
+
if ((m = /^>(\S+)/.exec(line)) != null) {
|
150
|
+
name = m[1];
|
151
|
+
seq = new Bytes();
|
152
|
+
}
|
153
|
+
} else seq.set(buf);
|
154
|
+
}
|
155
|
+
if (seq != null && name != null) {
|
156
|
+
seqlen.push([name, seq.length]);
|
157
|
+
h[name] = seq;
|
158
|
+
}
|
159
|
+
} else { // for k8-1.x
|
160
|
+
var seq = null, name = null;
|
161
|
+
while (file.readline(buf) >= 0) {
|
162
|
+
var line = buf.toString();
|
163
|
+
if (line[0] == ">") {
|
164
|
+
if (seq != null && name != null) {
|
165
|
+
seqlen.push([name, seq.length]);
|
166
|
+
h[name] = new Uint8Array(seq.buffer);
|
167
|
+
name = seq = null;
|
168
|
+
}
|
169
|
+
var m;
|
170
|
+
if ((m = /^>(\S+)/.exec(line)) != null) {
|
171
|
+
name = m[1];
|
172
|
+
seq = new Bytes();
|
173
|
+
}
|
174
|
+
} else seq.set(line);
|
175
|
+
}
|
176
|
+
if (seq != null && name != null) {
|
177
|
+
seqlen.push([name, seq.length]);
|
178
|
+
h[name] = new Uint8Array(seq.buffer);
|
179
|
+
}
|
156
180
|
}
|
157
181
|
buf.destroy();
|
158
182
|
file.close();
|
@@ -161,16 +185,27 @@ function fasta_read(fn)
|
|
161
185
|
|
162
186
|
function fasta_free(fa)
|
163
187
|
{
|
164
|
-
|
165
|
-
fa
|
188
|
+
if (typeof k8_version == "undefined")
|
189
|
+
for (var name in fa)
|
190
|
+
fa[name].destroy();
|
191
|
+
// FIXME: for k8-1.0, sequences are not freed. This is ok for now but not general.
|
166
192
|
}
|
167
193
|
|
168
194
|
Bytes.prototype.reverse = function()
|
169
195
|
{
|
170
|
-
|
171
|
-
var
|
172
|
-
|
173
|
-
|
196
|
+
if (typeof k8_version === "undefined") { // k8-0.x
|
197
|
+
for (var i = 0; i < this.length>>1; ++i) {
|
198
|
+
var tmp = this[i];
|
199
|
+
this[i] = this[this.length - i - 1];
|
200
|
+
this[this.length - i - 1] = tmp;
|
201
|
+
}
|
202
|
+
} else { // k8-1.x
|
203
|
+
var buf = new Uint8Array(this.buffer);
|
204
|
+
for (var i = 0; i < buf.length>>1; ++i) {
|
205
|
+
var tmp = buf[i];
|
206
|
+
buf[i] = buf[buf.length - i - 1];
|
207
|
+
buf[buf.length - i - 1] = tmp;
|
208
|
+
}
|
174
209
|
}
|
175
210
|
}
|
176
211
|
|
@@ -185,13 +220,24 @@ Bytes.prototype.revcomp = function()
|
|
185
220
|
for (var i = 0; i < s1.length; ++i)
|
186
221
|
Bytes.rctab[s1.charCodeAt(i)] = s2.charCodeAt(i);
|
187
222
|
}
|
188
|
-
|
189
|
-
var
|
190
|
-
|
191
|
-
|
223
|
+
if (typeof k8_version === "undefined") { // k8-0.x
|
224
|
+
for (var i = 0; i < this.length>>1; ++i) {
|
225
|
+
var tmp = this[this.length - i - 1];
|
226
|
+
this[this.length - i - 1] = Bytes.rctab[this[i]];
|
227
|
+
this[i] = Bytes.rctab[tmp];
|
228
|
+
}
|
229
|
+
if (this.length&1)
|
230
|
+
this[this.length>>1] = Bytes.rctab[this[this.length>>1]];
|
231
|
+
} else { // k8-1.x
|
232
|
+
var buf = new Uint8Array(this.buffer);
|
233
|
+
for (var i = 0; i < buf.length>>1; ++i) {
|
234
|
+
var tmp = buf[buf.length - i - 1];
|
235
|
+
buf[buf.length - i - 1] = Bytes.rctab[buf[i]];
|
236
|
+
buf[i] = Bytes.rctab[tmp];
|
237
|
+
}
|
238
|
+
if (buf.length&1)
|
239
|
+
buf[buf.length>>1] = Bytes.rctab[buf[buf.length>>1]];
|
192
240
|
}
|
193
|
-
if (this.length&1)
|
194
|
-
this[this.length>>1] = Bytes.rctab[this[this.length>>1]];
|
195
241
|
}
|
196
242
|
|
197
243
|
/********************
|
@@ -2051,7 +2097,7 @@ function paf_mapeval(args)
|
|
2051
2097
|
warn("Usage: paftools.js mapeval [options] <in.paf>|<in.sam>");
|
2052
2098
|
warn("Options:");
|
2053
2099
|
warn(" -r FLOAT mapping correct if overlap_length/union_length>FLOAT [" + ovlp_ratio + "]");
|
2054
|
-
warn(" -Q INT print wrong mappings with mapQ
|
2100
|
+
warn(" -Q INT print wrong mappings with mapQ>=INT [don't print]");
|
2055
2101
|
warn(" -m INT 0: eval the longest aln only; 1: first aln only; 2: all primary aln [0]");
|
2056
2102
|
exit(1);
|
2057
2103
|
}
|
data/ext/minimap2/options.c
CHANGED
@@ -45,6 +45,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
|
45
45
|
opt->alt_drop = 0.15f;
|
46
46
|
|
47
47
|
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
|
48
|
+
opt->transition = 0;
|
48
49
|
opt->sc_ambi = 1;
|
49
50
|
opt->zdrop = 400, opt->zdrop_inv = 200;
|
50
51
|
opt->end_bonus = -1;
|
@@ -90,7 +91,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|
90
91
|
if (preset == 0) {
|
91
92
|
mm_idxopt_init(io);
|
92
93
|
mm_mapopt_init(mo);
|
93
|
-
} else if (strcmp(preset, "map-ont") == 0) { // this is the same as the default
|
94
|
+
} else if (strcmp(preset, "lr") == 0 || strcmp(preset, "map-ont") == 0) { // this is the same as the default
|
94
95
|
} else if (strcmp(preset, "ava-ont") == 0) {
|
95
96
|
io->flag = 0, io->k = 15, io->w = 5;
|
96
97
|
mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN;
|
@@ -105,13 +106,22 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|
105
106
|
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25;
|
106
107
|
mo->bw_long = mo->bw;
|
107
108
|
mo->occ_dist = 0;
|
108
|
-
} else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
|
109
|
+
} else if (strcmp(preset, "lr:hq") == 0 || strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
|
109
110
|
io->flag = 0, io->k = 19, io->w = 19;
|
110
111
|
mo->max_gap = 10000;
|
111
|
-
mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1;
|
112
|
-
mo->occ_dist = 500;
|
113
112
|
mo->min_mid_occ = 50, mo->max_mid_occ = 500;
|
114
|
-
|
113
|
+
if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
|
114
|
+
mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1;
|
115
|
+
mo->min_dp_max = 200;
|
116
|
+
}
|
117
|
+
} else if (strcmp(preset, "map-iclr-prerender") == 0) {
|
118
|
+
io->flag = 0, io->k = 15;
|
119
|
+
mo->b = 6, mo->transition = 1;
|
120
|
+
mo->q = 10, mo->q2 = 50;
|
121
|
+
} else if (strcmp(preset, "map-iclr") == 0) {
|
122
|
+
io->flag = 0, io->k = 19;
|
123
|
+
mo->b = 6, mo->transition = 4;
|
124
|
+
mo->q = 10, mo->q2 = 50;
|
115
125
|
} else if (strncmp(preset, "asm", 3) == 0) {
|
116
126
|
io->flag = 0, io->k = 19, io->w = 19;
|
117
127
|
mo->bw = 1000, mo->bw_long = 100000;
|
@@ -156,7 +166,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|
156
166
|
mo->junc_bonus = 9;
|
157
167
|
mo->zdrop = 200, mo->zdrop_inv = 100; // because mo->a is halved
|
158
168
|
if (strcmp(preset, "splice:hq") == 0)
|
159
|
-
mo->
|
169
|
+
mo->noncan = 5, mo->b = 4, mo->q = 6, mo->q2 = 24;
|
160
170
|
} else return -1;
|
161
171
|
return 0;
|
162
172
|
}
|
@@ -77,7 +77,9 @@ This constructor accepts the following arguments:
|
|
77
77
|
|
78
78
|
* **min_chain_score**: minimum chaining score
|
79
79
|
|
80
|
-
* **bw**: chaining and alignment band width
|
80
|
+
* **bw**: chaining and alignment band width (initial chaining and extension)
|
81
|
+
|
82
|
+
* **bw_long**: chaining and alignment band width (RMQ-based rechaining and closing gaps)
|
81
83
|
|
82
84
|
* **best_n**: max number of alignments to return
|
83
85
|
|
@@ -3,7 +3,7 @@ from libc.stdlib cimport free
|
|
3
3
|
cimport cmappy
|
4
4
|
import sys
|
5
5
|
|
6
|
-
__version__ = '2.
|
6
|
+
__version__ = '2.27'
|
7
7
|
|
8
8
|
cmappy.mm_reset_timer()
|
9
9
|
|
@@ -112,7 +112,7 @@ cdef class Aligner:
|
|
112
112
|
cdef cmappy.mm_idxopt_t idx_opt
|
113
113
|
cdef cmappy.mm_mapopt_t map_opt
|
114
114
|
|
115
|
-
def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None):
|
115
|
+
def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, bw_long=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None):
|
116
116
|
self._idx = NULL
|
117
117
|
cmappy.mm_set_opt(NULL, &self.idx_opt, &self.map_opt) # set the default options
|
118
118
|
if preset is not None:
|
@@ -125,6 +125,7 @@ cdef class Aligner:
|
|
125
125
|
if min_chain_score is not None: self.map_opt.min_chain_score = min_chain_score
|
126
126
|
if min_dp_score is not None: self.map_opt.min_dp_max = min_dp_score
|
127
127
|
if bw is not None: self.map_opt.bw = bw
|
128
|
+
if bw_long is not None: self.map_opt.bw_long = bw_long
|
128
129
|
if best_n is not None: self.map_opt.best_n = best_n
|
129
130
|
if max_frag_len is not None: self.map_opt.max_frag_len = max_frag_len
|
130
131
|
if extra_flags is not None: self.map_opt.flag |= extra_flags
|
data/ext/minimap2/setup.py
CHANGED
data/lib/minimap2/aligner.rb
CHANGED
@@ -21,10 +21,11 @@ module Minimap2
|
|
21
21
|
# * ava-ont : Nanopore read overlap
|
22
22
|
# @param k [Integer] k-mer length, no larger than 28.
|
23
23
|
# @param w [Integer] minimizer window size, no larger than 255.
|
24
|
-
# @param min_cnt [Integer]
|
25
|
-
# @param min_chain_score [Integer] minimum
|
24
|
+
# @param min_cnt [Integer] minimum number of minimizers on a chain.
|
25
|
+
# @param min_chain_score [Integer] minimum chain score.
|
26
26
|
# @param min_dp_score
|
27
|
-
# @param bw [Integer] chaining and alignment band width.
|
27
|
+
# @param bw [Integer] chaining and alignment band width (initial chaining and extension).
|
28
|
+
# @param bw_long [Integer] chaining and alignment band width (RMQ-based rechaining and closing gaps)
|
28
29
|
# @param best_n [Integer] max number of alignments to return.
|
29
30
|
# @param n_threads [Integer] number of indexing threads.
|
30
31
|
# @param fn_idx_out [String] name of file to which the index is written.
|
@@ -47,6 +48,7 @@ module Minimap2
|
|
47
48
|
min_chain_score: nil,
|
48
49
|
min_dp_score: nil,
|
49
50
|
bw: nil,
|
51
|
+
bw_long: nil,
|
50
52
|
best_n: nil,
|
51
53
|
n_threads: 3,
|
52
54
|
fn_idx_out: nil,
|
@@ -72,6 +74,7 @@ module Minimap2
|
|
72
74
|
map_opt[:min_chain_score] = min_chain_score if min_chain_score
|
73
75
|
map_opt[:min_dp_max] = min_dp_score if min_dp_score
|
74
76
|
map_opt[:bw] = bw if bw
|
77
|
+
map_opt[:bw_long] = bw_long if bw_long
|
75
78
|
map_opt[:best_n] = best_n if best_n
|
76
79
|
map_opt[:max_frag_len] = max_frag_len if max_frag_len
|
77
80
|
map_opt[:flag] |= extra_flags if extra_flags
|
data/lib/minimap2/alignment.rb
CHANGED
@@ -23,7 +23,7 @@ module Minimap2
|
|
23
23
|
# @return [Integer] length of the matching bases in the alignment,
|
24
24
|
# excluding ambiguous base matches.
|
25
25
|
# @!attribute nm
|
26
|
-
# @return [Integer] number of mismatches, gaps and ambiguous
|
26
|
+
# @return [Integer] number of mismatches, gaps and ambiguous positions in the alignment.
|
27
27
|
# @!attribute primary
|
28
28
|
# @return [Integer] if the alignment is primary (typically the best and the first to generate)
|
29
29
|
# @!attribute q_st
|
@@ -40,6 +40,7 @@ module Minimap2
|
|
40
40
|
NO_HASH_NAME = 0x400000000
|
41
41
|
SPLICE_OLD = 0x800000000
|
42
42
|
SECONDARY_SEQ = 0x1000000000 # output SEQ field for secondary alignments using hard clipping
|
43
|
+
OUT_DS = 0x2000000000
|
43
44
|
|
44
45
|
HPC = 0x1
|
45
46
|
NO_SEQ = 0x2
|
@@ -109,8 +110,10 @@ module Minimap2
|
|
109
110
|
:dp_score, :int32, # DP score
|
110
111
|
:dp_max, :int32, # score of the max-scoring segment
|
111
112
|
:dp_max2, :int32, # score of the best alternate mappings
|
113
|
+
:dp_max0, :int32, # DP score before mm_update_dp_max() adjustment
|
112
114
|
:n_ambi_trans_strand, :uint32,
|
113
115
|
:n_cigar, :uint32
|
116
|
+
# :cigar, :pointer # variable length array (see cigar method below)
|
114
117
|
|
115
118
|
bit_field :n_ambi_trans_strand,
|
116
119
|
:n_ambi, 30, # number of ambiguous bases
|
@@ -204,6 +207,7 @@ module Minimap2
|
|
204
207
|
:e, :int, # gap-ext
|
205
208
|
:q2, :int, # gap-open
|
206
209
|
:e2, :int, # gap-ext
|
210
|
+
:transition, :int, # transition mismatch score (A:G, C:T)
|
207
211
|
:sc_ambi, :int, # score when one or both bases are "N"
|
208
212
|
:noncan, :int, # cost of non-canonical splicing sites
|
209
213
|
:junc_bonus, :int,
|
@@ -223,7 +227,7 @@ module Minimap2
|
|
223
227
|
:q_occ_frac, :float,
|
224
228
|
:min_mid_occ, :int32,
|
225
229
|
:max_mid_occ, :int32,
|
226
|
-
:mid_occ, :int32,
|
230
|
+
:mid_occ, :int32, # ignore seeds with occurrences above this threshold
|
227
231
|
:max_occ, :int32,
|
228
232
|
:max_max_occ, :int32,
|
229
233
|
:occ_dist, :int32,
|
@@ -15,10 +15,11 @@ module Minimap2
|
|
15
15
|
private_class_method :mm_set_opt_raw
|
16
16
|
|
17
17
|
def self.mm_set_opt(preset, io, mo)
|
18
|
-
ptr =
|
19
|
-
|
20
|
-
else
|
18
|
+
ptr = case preset
|
19
|
+
when 0, nil
|
21
20
|
::FFI::Pointer.new(:int, 0)
|
21
|
+
else
|
22
|
+
::FFI::MemoryPointer.from_string(preset.to_s)
|
22
23
|
end
|
23
24
|
mm_set_opt_raw(ptr, io, mo)
|
24
25
|
end
|
@@ -77,5 +78,17 @@ module Minimap2
|
|
77
78
|
:mm_gen_md, :mm_gen_MD, # Avoid uppercase letters in method names.
|
78
79
|
[:pointer, :pointer, :pointer, Idx.by_ref, Reg1.by_ref, :string],
|
79
80
|
:int
|
81
|
+
|
82
|
+
attach_function \
|
83
|
+
:mm_mapopt_init,
|
84
|
+
[MapOpt.by_ref],
|
85
|
+
:void
|
86
|
+
|
87
|
+
# mmpriv.h
|
88
|
+
|
89
|
+
attach_function \
|
90
|
+
:mm_idxopt_init,
|
91
|
+
[IdxOpt.by_ref],
|
92
|
+
:void
|
80
93
|
end
|
81
94
|
end
|
data/lib/minimap2/ffi.rb
CHANGED
data/lib/minimap2/version.rb
CHANGED
data/lib/minimap2.rb
CHANGED
@@ -37,7 +37,7 @@ module Minimap2
|
|
37
37
|
# @example Get minimap2 version
|
38
38
|
# Minimap2.execute('--version')
|
39
39
|
|
40
|
-
def
|
40
|
+
def execute(*rb_argv)
|
41
41
|
str_ptrs = []
|
42
42
|
# First argument is the program name.
|
43
43
|
str_ptrs << ::FFI::MemoryPointer.from_string("minimap2")
|
@@ -76,7 +76,7 @@ module Minimap2
|
|
76
76
|
# @param [String] file_path
|
77
77
|
# @param [Boolean] comment If True, the comment will be read.
|
78
78
|
# @yield [name, seq, qual, comment]
|
79
|
-
# @return [Enumerator] enum
|
79
|
+
# @return [Enumerator] enum Returns an Enumerator if no block is given.
|
80
80
|
# Note: You can use BioRuby instead of this method.
|
81
81
|
|
82
82
|
def fastx_read(file_path, comment: false, &block)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: minimap2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.27.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- kojix2
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-03-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ffi
|
@@ -150,7 +150,7 @@ homepage: https://github.com/kojix2/ruby-minimap2
|
|
150
150
|
licenses:
|
151
151
|
- MIT
|
152
152
|
metadata: {}
|
153
|
-
post_install_message:
|
153
|
+
post_install_message:
|
154
154
|
rdoc_options: []
|
155
155
|
require_paths:
|
156
156
|
- lib
|
@@ -165,8 +165,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
165
165
|
- !ruby/object:Gem::Version
|
166
166
|
version: '0'
|
167
167
|
requirements: []
|
168
|
-
rubygems_version: 3.4
|
169
|
-
signing_key:
|
168
|
+
rubygems_version: 3.5.4
|
169
|
+
signing_key:
|
170
170
|
specification_version: 4
|
171
171
|
summary: minimap2
|
172
172
|
test_files: []
|