minimap2 0.2.26.1 → 0.2.28.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 49b59da1af652a8a5ffbde0f5d9e98b32d94a192039d1e948aced9bb926e75d1
4
- data.tar.gz: 309fe28a4a58ccc2e5bd818c751fa33a40065756c0961fc96c35af7dbc24abea
3
+ metadata.gz: f23caa7c5707d41b96b6df1748d257e11bed0554cbc2123d5dca31ef22b9bb05
4
+ data.tar.gz: 6ab72bef6ad874385871460d55503696969c8dc381896873d64c821538bbfd57
5
5
  SHA512:
6
- metadata.gz: b9faab29797eecd594b2f724e3c88d38457d2c60a24f01b9dd6ab0aa17a680e12db68755cf3f47d9873427f1577d81770e798f1395db17b39bffdbaf7230c330
7
- data.tar.gz: cf3a3ea389fb1cbfabfd08fc17e073643db3c7ae59e74dcc11a75649e90ca2629a5c69fcc1548fab8755a8edb99c89a49e4353077aa3ef4b60bc9c129ec88a55
6
+ metadata.gz: 4327be493c432ba562780e79aceff0f53d409a1ff2c5630cde48ad28b4c338fdc9bab2a333087f1e2aa8cd83e2374ce3a7feec9e5133aaeb4c01c3011b0414db
7
+ data.tar.gz: adc86c65a0dbeb775b89385790894cf3ad2fa1c24cdb0b0a9d94302134910ee651afc832dc0b8864c3a24b6b86577ccd694e46effb1a95ecf6bea5e968d189ad
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # ruby-minimap2
2
2
 
3
3
  [![Gem Version](https://img.shields.io/gem/v/minimap2?color=brightgreen)](https://rubygems.org/gems/minimap2)
4
- [![CI](https://github.com/kojix2/ruby-minimap2/workflows/CI/badge.svg)](https://github.com/kojix2/ruby-minimap2/actions)
4
+ [![test](https://github.com/kojix2/ruby-minimap2/actions/workflows/ci.yml/badge.svg)](https://github.com/kojix2/ruby-minimap2/actions/workflows/ci.yml)
5
5
  [![Docs Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://rubydoc.info/gems/minimap2)
6
6
  [![Docs Latest](https://img.shields.io/badge/docs-latest-blue.svg)](https://kojix2.github.io/ruby-minimap2/)
7
7
  [![The MIT License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE.txt)
@@ -23,7 +23,7 @@ gem install minimap2
23
23
  bundle install
24
24
  bundle exec rake minimap2:build
25
25
  bundle exec rake install
26
-
26
+
27
27
  </details>
28
28
 
29
29
  ## Quick Start
@@ -36,6 +36,7 @@ seq = aligner.seq("MT_human", 100, 200)
36
36
  hits = aligner.align(seq)
37
37
  pp hits
38
38
  ```
39
+
39
40
  ```
40
41
  [#<Minimap2::Alignment:0x000055bbfde2d128
41
42
  @blen=100,
@@ -57,8 +58,6 @@ pp hits
57
58
  @strand=1,
58
59
  @trans_strand=0>]
59
60
  ```
60
-
61
- </details>
62
61
 
63
62
  ## APIs Overview
64
63
 
@@ -87,7 +86,7 @@ pp hits
87
86
  - trans_strand Returns transcript strand. +1 if on the forward strand; -1 if on the reverse strand; 0 if unknown.
88
87
  - blen Returns length of the alignment, including both alignment matches and gaps but excluding ambiguous bases.
89
88
  - mlen Returns length of the matching bases in the alignment, excluding ambiguous base matches.
90
- - nm Returns number of mismatches, gaps and ambiguous poistions in the alignment.
89
+ - nm Returns number of mismatches, gaps and ambiguous positions in the alignment.
91
90
  - primary Returns if the alignment is primary (typically the best and the first to generate).
92
91
  - q_st Returns start positions on the query.
93
92
  - q_en Returns end positions on the query.
@@ -106,19 +105,20 @@ pp hits
106
105
  * MapOpt class Mapping options.
107
106
  ```
108
107
 
109
- * API is based on [Mappy](https://github.com/lh3/minimap2/tree/master/python), the official Python binding for Minimap2.
110
- * `Aligner#map` has been changed to `align`, because `map` means iterator in Ruby.
111
- * See [documentation](https://kojix2.github.io/ruby-minimap2/) for details.
108
+ - API is based on [Mappy](https://github.com/lh3/minimap2/tree/master/python), the official Python binding for Minimap2.
109
+ - `Aligner#map` has been changed to `align`, because `map` means iterator in Ruby.
110
+ - See [documentation](https://kojix2.github.io/ruby-minimap2/) for details.
112
111
 
113
112
  <details>
114
113
  <summary><b>C Structures and Functions</b></summary>
115
114
 
116
115
  ### FFI
117
- * Ruby-Minimap2 is built on top of [Ruby-FFI](https://github.com/ffi/ffi).
118
- * Native C functions can be called from the `Minimap2::FFI` module.
119
- * Native C structure members can be accessed.
120
- * Bitfields are supported by [ffi-bitfield](https://github.com/kojix2/ffi-bitfield) gems.
121
-
116
+
117
+ - Ruby-Minimap2 is built on top of [Ruby-FFI](https://github.com/ffi/ffi).
118
+ - Native C functions can be called from the `Minimap2::FFI` module.
119
+ - Native C structure members can be accessed.
120
+ - Bitfields are supported by [ffi-bitfield](https://github.com/kojix2/ffi-bitfield) gems.
121
+
122
122
  ```ruby
123
123
  aligner.idx_opt.members
124
124
  # => [:k, :w, :flag, :bucket_bits, :mini_batch_size, :batch_size]
@@ -130,7 +130,7 @@ aligner.idx_opt[:k] = 14
130
130
  aligner.idx_opt[:k]
131
131
  # => 14
132
132
  ```
133
-
133
+
134
134
  </details>
135
135
 
136
136
  ## Contributing
@@ -138,7 +138,7 @@ aligner.idx_opt[:k]
138
138
  <details>
139
139
  <summary><b>Development</b></summary>
140
140
 
141
- Fork your repository.
141
+ Fork your repository.
142
142
  then clone.
143
143
 
144
144
  ```sh
@@ -184,7 +184,7 @@ ruby-minimap2 is a library under development and there are many points to be imp
184
184
 
185
185
  Please feel free to report [bugs](https://github.com/kojix2/ruby-minimap2/issues) and [pull requests](https://github.com/kojix2/ruby-minimap2/pulls)!
186
186
 
187
- Many OSS projects become abandoned because only the founder has commit rights to the original repository.
187
+ Many OSS projects become abandoned because only the founder has commit rights to the original repository.
188
188
  If you need commit rights to ruby-minimap2 repository or want to get admin rights and take over the project, please feel free to contact me @kojix2.
189
189
 
190
190
  ## License
data/ext/Rakefile CHANGED
@@ -51,10 +51,8 @@ namespace :minimap2 do
51
51
  end
52
52
  end
53
53
 
54
- task cleanall: [:clean]
55
-
56
54
  desc "`make clean` and remove shared lib"
57
- task :cleanall do
55
+ task cleanall: [:clean] do
58
56
  Dir.chdir(minimap2_dir) do
59
57
  sh "rm #{target_path}" if File.exist?(target_path)
60
58
  end
data/ext/minimap2/NEWS.md CHANGED
@@ -1,9 +1,76 @@
1
+ Release 2.28-r1209 (27 March 2024)
2
+ ----------------------------------
3
+
4
+ Notable changes to minimap2:
5
+
6
+ * Bugfix: `--MD` was not working properly due to the addition of `--ds` in the
7
+ last release (#1181 and #1182).
8
+
9
+ * New feature: added an experimental preset `lr:hqae` for aligning accurate
10
+ long reads back to their assembly. It has been observed that `map-hifi` and
11
+ `lr:hq` may produce many wrong alignments around centromeres when accurate
12
+ long reads (PacBio HiFi or Nanopore duplex/Q20+) are mapped to a diploid
13
+ assembly constructed from them. This new preset produces much more accurate
14
+ alignment. It is still experimental and may be subject to change in the
15
+ future.
16
+
17
+ * Change: reduced the default `--cap-kalloc` to 500m to lower the peak
18
+ memory consumption (#855).
19
+
20
+ Notable changes to mappy:
21
+
22
+ * Bugfix: mappy option struct was out of sync with minimap2 (#1177).
23
+
24
+ Minimap2 should output identical alignments to v2.27.
25
+
26
+ (2.28: 27 March 2024, r1209)
27
+
28
+
29
+
30
+ Release 2.27-r1193 (12 March 2024)
31
+ ----------------------------------
32
+
33
+ Notable changes to minimap2:
34
+
35
+ * New feature: added the `lr:hq` preset for accurate long reads at ~1% error
36
+ rate. This was suggested by Oxford Nanopore developers (#1127). It is not
37
+ clear if this preset also works well for PacBio HiFi reads.
38
+
39
+ * New feature: added the `map-iclr` preset for Illumina Complete Long Reads
40
+ (#1069), provided by Illumina developers.
41
+
42
+ * New feature: added option `-b` to specify mismatch penalty for base
43
+ transitions (i.e. A-to-G or C-to-T changes).
44
+
45
+ * New feature: added option `--ds` to generate a new `ds:Z` tag that
46
+ indicates uncertainty in INDEL positions. It is an extension to `cs`. The
47
+ `mgutils-es6.js` script in minigraph parses `ds`.
48
+
49
+ * Bugfix: avoided a NULL pointer dereference (#1154). This would not have an
50
+ effect on most systems but would still be good to fix.
51
+
52
+ * Bugfix: reverted the value of `ms:i` to pre-2.22 versions (#1146). This was
53
+ an oversight. See fcd4df2 for details.
54
+
55
+ Notable changes to paftools.js and mappy:
56
+
57
+ * New feature: expose `bw_long` to mappy's Aligner class (#1124).
58
+
59
+ * Bugfix: fixed several compatibility issues with k8 v1.0 (#1161 and #1166).
60
+ Subcommands "call", "pbsim2fq" and "mason2fq" were not working with v1.0.
61
+
62
+ Minimap2 should output identical alignments to v2.26, except the ms tag.
63
+
64
+ (2.27: 12 March 2024, r1193)
65
+
66
+
67
+
1
68
  Release 2.26-r1175 (29 April 2023)
2
69
  ----------------------------------
3
70
 
4
71
  Fixed the broken Python package. This is the only change.
5
72
 
6
- (2.25: 25 April 2023, r1173)
73
+ (2.26: 25 April 2023, r1173)
7
74
 
8
75
 
9
76
 
@@ -15,7 +15,7 @@ cd minimap2 && make
15
15
  ./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads
16
16
  ./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads
17
17
  ./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or later)
18
- ./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.18 or earlier)
18
+ ./minimap2 -ax lr:hq ref.fa ont-Q20.fq.gz > aln.sam # Nanopore Q20 genomic reads (v2.27 or later)
19
19
  ./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads
20
20
  ./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown)
21
21
  ./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq
@@ -74,8 +74,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the
74
74
  Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from
75
75
  the [release page][release] with:
76
76
  ```sh
77
- curl -L https://github.com/lh3/minimap2/releases/download/v2.26/minimap2-2.26_x64-linux.tar.bz2 | tar -jxvf -
78
- ./minimap2-2.26_x64-linux/minimap2
77
+ curl -L https://github.com/lh3/minimap2/releases/download/v2.28/minimap2-2.28_x64-linux.tar.bz2 | tar -jxvf -
78
+ ./minimap2-2.28_x64-linux/minimap2
79
79
  ```
80
80
  If you want to compile from the source, you need to have a C compiler, GNU make
81
81
  and zlib development files installed. Then type `make` in the source code
@@ -139,12 +139,15 @@ parameters at the same time. The default setting is the same as `map-ont`.
139
139
  ```sh
140
140
  minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads
141
141
  minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads
142
+ minimap2 -ax map-iclr ref.fa iclr-reads.fq > aln.sam # for Illumina Complete Long Reads
142
143
  ```
143
144
  The difference between `map-pb` and `map-ont` is that `map-pb` uses
144
145
  homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary
145
- minimizers as seeds. Emperical evaluation suggests HPC minimizers improve
146
+ minimizers as seeds. Empirical evaluation suggests HPC minimizers improve
146
147
  performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning
147
- Nanopore reads.
148
+ Nanopore reads. `map-iclr` uses an adjusted alignment scoring matrix that
149
+ accounts for the low overall error rate in the reads, with transversion errors
150
+ being less frequent than transitions.
148
151
 
149
152
  #### <a name="map-long-splice"></a>Map long mRNA/cDNA reads
150
153
 
data/ext/minimap2/align.c CHANGED
@@ -21,6 +21,18 @@ static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc
21
21
  mat[(m - 1) * m + j] = sc_ambi;
22
22
  }
23
23
 
24
+ static void ksw_gen_ts_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t transition, int8_t sc_ambi)
25
+ {
26
+ assert(m == 5);
27
+ ksw_gen_simple_mat(m, mat, a, b, sc_ambi);
28
+ if (transition == 0 || transition == b) return;
29
+ transition = transition > 0? -transition : transition;
30
+ mat[0 * m + 2] = transition; // A->G
31
+ mat[1 * m + 3] = transition; // C->T
32
+ mat[2 * m + 0] = transition; // G->A
33
+ mat[3 * m + 1] = transition; // T->C
34
+ }
35
+
24
36
  static inline void mm_seq_rev(uint32_t len, uint8_t *seq)
25
37
  {
26
38
  uint32_t i;
@@ -283,7 +295,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
283
295
  toff += len;
284
296
  }
285
297
  }
286
- p->dp_max = (int32_t)(max + .499);
298
+ p->dp_max = p->dp_max0 = (int32_t)(max + .499);
287
299
  assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
288
300
  if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned
289
301
  }
@@ -323,6 +335,8 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
323
335
  for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr);
324
336
  fputc('\n', stderr);
325
337
  }
338
+ if (opt->transition != 0 && opt->b != opt->transition)
339
+ flag |= KSW_EZ_GENERIC_SC;
326
340
  if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
327
341
  ksw_reset_extz(ez);
328
342
  ez->zdropped = 1;
@@ -586,7 +600,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
586
600
 
587
601
  r2->cnt = 0;
588
602
  if (r->cnt == 0) return;
589
- ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
603
+ ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
590
604
  bw = (int)(opt->bw * 1.5 + 1.);
591
605
  bw_long = (int)(opt->bw_long * 1.5 + 1.);
592
606
  if (bw_long < bw) bw_long = bw;
@@ -844,7 +858,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
844
858
  if (ql < opt->min_chain_score || ql > opt->max_gap) return 0;
845
859
  if (tl < opt->min_chain_score || tl > opt->max_gap) return 0;
846
860
 
847
- ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
861
+ ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
848
862
  tseq = (uint8_t*)kmalloc(km, tl);
849
863
  mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq);
850
864
  qseq = r1->rev? &qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs];
@@ -919,14 +933,14 @@ double mm_event_identity(const mm_reg1_t *r)
919
933
  static int32_t mm_recal_max_dp(const mm_reg1_t *r, double b2, int32_t match_sc)
920
934
  {
921
935
  uint32_t i;
922
- int32_t n_gap = 0, n_gapo = 0, n_mis;
936
+ int32_t n_gap = 0, n_mis;
923
937
  double gap_cost = 0.0;
924
938
  if (r->p == 0) return -1;
925
939
  for (i = 0; i < r->p->n_cigar; ++i) {
926
940
  int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
927
941
  if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
928
942
  gap_cost += b2 + (double)mg_log2(1.0 + len);
929
- ++n_gapo, n_gap += len;
943
+ n_gap += len;
930
944
  }
931
945
  }
932
946
  n_mis = r->blen + r->p->n_ambi - r->mlen - n_gap;
@@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools,
31
31
  please follow the command lines below:
32
32
  ```sh
33
33
  # install minimap2 executables
34
- curl -L https://github.com/lh3/minimap2/releases/download/v2.26/minimap2-2.26_x64-linux.tar.bz2 | tar jxf -
35
- cp minimap2-2.26_x64-linux/{minimap2,k8,paftools.js} . # copy executables
34
+ curl -L https://github.com/lh3/minimap2/releases/download/v2.28/minimap2-2.28_x64-linux.tar.bz2 | tar jxf -
35
+ cp minimap2-2.28_x64-linux/{minimap2,k8,paftools.js} . # copy executables
36
36
  export PATH="$PATH:"`pwd` # put the current directory on PATH
37
37
  # download example datasets
38
38
  curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf -
@@ -139,10 +139,48 @@ int mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int a
139
139
  return ret;
140
140
  }
141
141
 
142
- static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int write_tag)
142
+ static void write_indel_ds(kstring_t *str, int64_t len, const uint8_t *seq, int64_t ll, int64_t lr) // write an indel to ds; adapted from minigraph
143
143
  {
144
- int i, q_off, t_off;
145
- if (write_tag) mm_sprintf_lite(s, "\tcs:Z:");
144
+ int64_t i;
145
+ if (ll + lr >= len) {
146
+ mm_sprintf_lite(str, "[");
147
+ for (i = 0; i < len; ++i)
148
+ mm_sprintf_lite(str, "%c", "acgtn"[seq[i]]);
149
+ mm_sprintf_lite(str, "]");
150
+ } else {
151
+ int64_t k = 0;
152
+ if (ll > 0) {
153
+ mm_sprintf_lite(str, "[");
154
+ for (i = 0; i < ll; ++i)
155
+ mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
156
+ mm_sprintf_lite(str, "]");
157
+ k += ll;
158
+ }
159
+ for (i = 0; i < len - lr - ll; ++i)
160
+ mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
161
+ k += len - lr - ll;
162
+ if (lr > 0) {
163
+ mm_sprintf_lite(str, "[");
164
+ for (i = 0; i < lr; ++i)
165
+ mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
166
+ mm_sprintf_lite(str, "]");
167
+ }
168
+ }
169
+ }
170
+
171
+ static void write_cs_ds_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int is_ds, int write_tag)
172
+ {
173
+ int i, q_off, t_off, q_len = 0, t_len = 0;
174
+ if (write_tag) mm_sprintf_lite(s, "\t%cs:Z:", is_ds? 'd' : 'c');
175
+ for (i = 0; i < (int)r->p->n_cigar; ++i) {
176
+ int op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
177
+ if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH)
178
+ q_len += len, t_len += len;
179
+ else if (op == MM_CIGAR_INS)
180
+ q_len += len;
181
+ else if (op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP)
182
+ t_len += len;
183
+ }
146
184
  for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
147
185
  int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
148
186
  assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
@@ -168,14 +206,42 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
168
206
  }
169
207
  q_off += len, t_off += len;
170
208
  } else if (op == MM_CIGAR_INS) {
171
- for (j = 0, tmp[len] = 0; j < len; ++j)
172
- tmp[j] = "acgtn"[qseq[q_off + j]];
173
- mm_sprintf_lite(s, "+%s", tmp);
209
+ if (is_ds) {
210
+ int z, ll, lr, y = q_off;
211
+ for (z = 1; z <= len; ++z)
212
+ if (y - z < 0 || qseq[y + len - z] != qseq[y - z])
213
+ break;
214
+ lr = z - 1;
215
+ for (z = 0; z < len; ++z)
216
+ if (y + len + z >= q_len || qseq[y + len + z] != qseq[y + z])
217
+ break;
218
+ ll = z;
219
+ mm_sprintf_lite(s, "+");
220
+ write_indel_ds(s, len, &qseq[y], ll, lr);
221
+ } else {
222
+ for (j = 0, tmp[len] = 0; j < len; ++j)
223
+ tmp[j] = "acgtn"[qseq[q_off + j]];
224
+ mm_sprintf_lite(s, "+%s", tmp);
225
+ }
174
226
  q_off += len;
175
227
  } else if (op == MM_CIGAR_DEL) {
176
- for (j = 0, tmp[len] = 0; j < len; ++j)
177
- tmp[j] = "acgtn"[tseq[t_off + j]];
178
- mm_sprintf_lite(s, "-%s", tmp);
228
+ if (is_ds) {
229
+ int z, ll, lr, x = t_off;
230
+ for (z = 1; z <= len; ++z)
231
+ if (x - z < 0 || tseq[x + len - z] != tseq[x - z])
232
+ break;
233
+ lr = z - 1;
234
+ for (z = 0; z < len; ++z)
235
+ if (x + len + z >= t_len || tseq[x + z] != tseq[x + len + z])
236
+ break;
237
+ ll = z;
238
+ mm_sprintf_lite(s, "-");
239
+ write_indel_ds(s, len, &tseq[x], ll, lr);
240
+ } else {
241
+ for (j = 0, tmp[len] = 0; j < len; ++j)
242
+ tmp[j] = "acgtn"[tseq[t_off + j]];
243
+ mm_sprintf_lite(s, "-%s", tmp);
244
+ }
179
245
  t_off += len;
180
246
  } else { // intron
181
247
  assert(len >= 2);
@@ -218,7 +284,7 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
218
284
  assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
219
285
  }
220
286
 
221
- static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int write_tag, int is_qstrand)
287
+ static void write_cs_ds_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int is_ds, int write_tag, int is_qstrand)
222
288
  {
223
289
  extern unsigned char seq_nt4_table[256];
224
290
  int i;
@@ -245,7 +311,7 @@ static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_
245
311
  }
246
312
  }
247
313
  if (is_MD) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
248
- else write_cs_core(s, tseq, qseq, r, tmp, no_iden, write_tag);
314
+ else write_cs_ds_core(s, tseq, qseq, r, tmp, no_iden, is_ds, write_tag);
249
315
  kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
250
316
  }
251
317
 
@@ -256,7 +322,7 @@ int mm_gen_cs_or_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, cons
256
322
  str.s = *buf, str.l = 0, str.m = *max_len;
257
323
  t.l_seq = strlen(seq);
258
324
  t.seq = (char*)seq;
259
- write_cs_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0, is_qstrand);
325
+ write_cs_ds_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0, 0, is_qstrand);
260
326
  *max_len = str.m;
261
327
  *buf = str.s;
262
328
  return str.l;
@@ -278,7 +344,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
278
344
  if (r->id == r->parent) type = r->inv? 'I' : 'P';
279
345
  else type = r->inv? 'i' : 'S';
280
346
  if (r->p) {
281
- mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->dp_max, r->p->dp_score, r->p->n_ambi);
347
+ mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->dp_max0, r->p->dp_score, r->p->n_ambi);
282
348
  if (r->p->trans_strand == 1 || r->p->trans_strand == 2)
283
349
  mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]);
284
350
  }
@@ -326,8 +392,8 @@ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const
326
392
  for (k = 0; k < r->p->n_cigar; ++k)
327
393
  mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
328
394
  }
329
- if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
330
- write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1, !!(opt_flag&MM_F_QSTRAND));
395
+ if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD)))
396
+ write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), !!(opt_flag&MM_F_OUT_MD), !!(opt_flag&MM_F_OUT_DS), 1, !!(opt_flag&MM_F_QSTRAND));
331
397
  if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
332
398
  mm_sprintf_lite(s, "\t%s", t->comment);
333
399
  }
@@ -535,8 +601,8 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
535
601
  }
536
602
  }
537
603
  }
538
- if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
539
- write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1, 0);
604
+ if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD)))
605
+ write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, !!(opt_flag&MM_F_OUT_DS), 1, 0);
540
606
  if (cigar_in_tag)
541
607
  write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag);
542
608
  }
data/ext/minimap2/index.c CHANGED
@@ -192,6 +192,7 @@ int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
192
192
  if (f <= 0.) return INT32_MAX;
193
193
  for (i = 0; i < 1<<mi->b; ++i)
194
194
  if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
195
+ if (n == 0) return INT32_MAX;
195
196
  a = (uint32_t*)malloc(n * 4);
196
197
  for (i = n = 0; i < 1<<mi->b; ++i) {
197
198
  idxhash_t *h = (idxhash_t*)mi->B[i].h;
@@ -149,7 +149,7 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int
149
149
  int is_cdna, int n_seg, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
150
150
  { // TODO: make sure this works when n has more than 32 bits
151
151
  int32_t *f, *t, *v, n_u, n_v, mmax_f = 0, max_drop = bw;
152
- int64_t *p, i, j, max_ii, st = 0, n_iter = 0;
152
+ int64_t *p, i, j, max_ii, st = 0;
153
153
  uint64_t *u;
154
154
 
155
155
  if (_u) *_u = 0, *n_u_ = 0;
@@ -174,7 +174,6 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int
174
174
  for (j = i - 1; j >= st; --j) {
175
175
  int32_t sc;
176
176
  sc = comput_sc(&a[i], &a[j], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg);
177
- ++n_iter;
178
177
  if (sc == INT32_MIN) continue;
179
178
  sc += f[j];
180
179
  if (sc > max_f) {
@@ -204,6 +203,7 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int
204
203
  if (max_ii < 0 || (a[i].x - a[max_ii].x <= (int64_t)max_dist_x && f[max_ii] < f[i]))
205
204
  max_ii = i;
206
205
  if (mmax_f < max_f) mmax_f = max_f;
206
+ //fprintf(stderr, "X1\t%ld\t%ld:%d\t%ld\t%ld:%d\t%ld\t%ld\n", (long)i, (long)(a[i].x>>32), (int32_t)a[i].x, (long)max_j, max_j<0?-1L:(long)(a[max_j].x>>32), max_j<0?-1:(int32_t)a[max_j].x, (long)max_f, (long)v[i]);
207
207
  }
208
208
 
209
209
  u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, max_drop, &n_u, &n_v);
@@ -263,7 +263,8 @@ mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_ski
263
263
  return 0;
264
264
  }
265
265
  if (max_dist < bw) max_dist = bw;
266
- if (max_dist_inner <= 0 || max_dist_inner >= max_dist) max_dist_inner = 0;
266
+ if (max_dist_inner < 0) max_dist_inner = 0;
267
+ if (max_dist_inner > max_dist) max_dist_inner = max_dist;
267
268
  p = Kmalloc(km, int64_t, n);
268
269
  f = Kmalloc(km, int32_t, n);
269
270
  t = Kcalloc(km, int32_t, n);
@@ -325,12 +326,11 @@ mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_ski
325
326
  krmq_interval(lc_elem, root_inner, &s, &lo, &hi);
326
327
  if (lo) {
327
328
  const lc_elem_t *q;
328
- int32_t width, n_rmq_iter = 0;
329
+ int32_t width;
329
330
  krmq_itr_t(lc_elem) itr;
330
331
  krmq_itr_find(lc_elem, root_inner, lo, &itr);
331
332
  while ((q = krmq_at(&itr)) != 0) {
332
333
  if (q->y < (int32_t)a[i].y - max_dist_inner) break;
333
- ++n_rmq_iter;
334
334
  j = q->i;
335
335
  sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, 0, &width);
336
336
  if (width <= bw) {
data/ext/minimap2/main.c CHANGED
@@ -77,6 +77,9 @@ static ko_longopt_t long_options[] = {
77
77
  { "print-chains", ko_no_argument, 352 },
78
78
  { "no-hash-name", ko_no_argument, 353 },
79
79
  { "secondary-seq", ko_no_argument, 354 },
80
+ { "ds", ko_no_argument, 355 },
81
+ { "rmq-inner", ko_required_argument, 356 },
82
+ { "dbg-seed-occ", ko_no_argument, 501 },
80
83
  { "help", ko_no_argument, 'h' },
81
84
  { "max-intron-len", ko_required_argument, 'G' },
82
85
  { "version", ko_no_argument, 'V' },
@@ -120,7 +123,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const
120
123
 
121
124
  int main(int argc, char *argv[])
122
125
  {
123
- const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
126
+ const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:b:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
124
127
  ketopt_t o = KETOPT_INIT;
125
128
  mm_mapopt_t opt;
126
129
  mm_idxopt_t ipt;
@@ -178,6 +181,7 @@ int main(int argc, char *argv[])
178
181
  else if (c == 'm') opt.min_chain_score = atoi(o.arg);
179
182
  else if (c == 'A') opt.a = atoi(o.arg);
180
183
  else if (c == 'B') opt.b = atoi(o.arg);
184
+ else if (c == 'b') opt.transition = atoi(o.arg);
181
185
  else if (c == 's') opt.min_dp_max = atoi(o.arg);
182
186
  else if (c == 'C') opt.noncan = atoi(o.arg);
183
187
  else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
@@ -242,6 +246,9 @@ int main(int argc, char *argv[])
242
246
  else if (c == 352) mm_dbg_flag |= MM_DBG_PRINT_CHAIN; // --print-chains
243
247
  else if (c == 353) opt.flag |= MM_F_NO_HASH_NAME; // --no-hash-name
244
248
  else if (c == 354) opt.flag |= MM_F_SECONDARY_SEQ; // --secondary-seq
249
+ else if (c == 355) opt.flag |= MM_F_OUT_DS; // --ds
250
+ else if (c == 356) opt.rmq_inner_dist = mm_parse_num(o.arg); // --rmq-inner
251
+ else if (c == 501) mm_dbg_flag |= MM_DBG_SEED_FREQ; // --dbg-seed-occ
245
252
  else if (c == 330) {
246
253
  fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
247
254
  } else if (c == 314) { // --frag
@@ -358,6 +365,7 @@ int main(int argc, char *argv[])
358
365
  fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n");
359
366
  fprintf(fp_help, " -c output CIGAR in PAF\n");
360
367
  fprintf(fp_help, " --cs[=STR] output the cs tag; STR is 'short' (if absent) or 'long' [none]\n");
368
+ fprintf(fp_help, " --ds output the ds tag, which is an extension to cs\n");
361
369
  fprintf(fp_help, " --MD output the MD tag\n");
362
370
  fprintf(fp_help, " --eqx write =/X CIGAR operators\n");
363
371
  fprintf(fp_help, " -Y use soft clipping for supplementary alignments\n");
@@ -367,12 +375,12 @@ int main(int argc, char *argv[])
367
375
  fprintf(fp_help, " --version show version number\n");
368
376
  fprintf(fp_help, " Preset:\n");
369
377
  fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n");
370
- fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n");
371
- fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n");
372
- fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n");
378
+ fprintf(fp_help, " - lr:hq - accurate long reads (error rate <1%%) against a reference genome\n");
379
+ fprintf(fp_help, " - splice/splice:hq - spliced alignment for long reads/accurate long reads\n");
373
380
  fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");
374
- fprintf(fp_help, " - splice/splice:hq - long-read/Pacbio-CCS spliced alignment\n");
375
- fprintf(fp_help, " - sr - genomic short-read mapping\n");
381
+ fprintf(fp_help, " - sr - short reads against a reference\n");
382
+ fprintf(fp_help, " - map-pb/map-hifi/map-ont/map-iclr - CLR/HiFi/Nanopore/ICLR vs reference mapping\n");
383
+ fprintf(fp_help, " - ava-pb/ava-ont - PacBio CLR/Nanopore read overlap\n");
376
384
  fprintf(fp_help, "\nSee `man ./minimap2.1' for detailed description of these and other advanced command-line options.\n");
377
385
  return fp_help == stdout? 0 : 1;
378
386
  }
@@ -5,7 +5,7 @@
5
5
  #include <stdio.h>
6
6
  #include <sys/types.h>
7
7
 
8
- #define MM_VERSION "2.26-r1175"
8
+ #define MM_VERSION "2.28-r1209"
9
9
 
10
10
  #define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
11
11
  #define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
@@ -44,6 +44,7 @@
44
44
  #define MM_F_NO_HASH_NAME (0x400000000LL)
45
45
  #define MM_F_SPLICE_OLD (0x800000000LL)
46
46
  #define MM_F_SECONDARY_SEQ (0x1000000000LL) // output SEQ field for secondary alignments using hard clipping
47
+ #define MM_F_OUT_DS (0x2000000000LL)
47
48
 
48
49
  #define MM_I_HPC 0x1
49
50
  #define MM_I_NO_SEQ 0x2
@@ -97,6 +98,7 @@ typedef struct {
97
98
  typedef struct {
98
99
  uint32_t capacity; // the capacity of cigar[]
99
100
  int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings
101
+ int32_t dp_max0; // DP score before mm_update_dp_max() adjustment
100
102
  uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
101
103
  uint32_t n_cigar; // number of cigar operations in cigar[]
102
104
  uint32_t cigar[];
@@ -153,6 +155,7 @@ typedef struct {
153
155
  float alt_drop;
154
156
 
155
157
  int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
158
+ int transition; // transition mismatch score (A:G, C:T)
156
159
  int sc_ambi; // score when one or both bases are "N"
157
160
  int noncan; // cost of non-canonical splicing sites
158
161
  int junc_bonus;