minimap2 0.2.26.1 → 0.2.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 49b59da1af652a8a5ffbde0f5d9e98b32d94a192039d1e948aced9bb926e75d1
4
- data.tar.gz: 309fe28a4a58ccc2e5bd818c751fa33a40065756c0961fc96c35af7dbc24abea
3
+ metadata.gz: f23caa7c5707d41b96b6df1748d257e11bed0554cbc2123d5dca31ef22b9bb05
4
+ data.tar.gz: 6ab72bef6ad874385871460d55503696969c8dc381896873d64c821538bbfd57
5
5
  SHA512:
6
- metadata.gz: b9faab29797eecd594b2f724e3c88d38457d2c60a24f01b9dd6ab0aa17a680e12db68755cf3f47d9873427f1577d81770e798f1395db17b39bffdbaf7230c330
7
- data.tar.gz: cf3a3ea389fb1cbfabfd08fc17e073643db3c7ae59e74dcc11a75649e90ca2629a5c69fcc1548fab8755a8edb99c89a49e4353077aa3ef4b60bc9c129ec88a55
6
+ metadata.gz: 4327be493c432ba562780e79aceff0f53d409a1ff2c5630cde48ad28b4c338fdc9bab2a333087f1e2aa8cd83e2374ce3a7feec9e5133aaeb4c01c3011b0414db
7
+ data.tar.gz: adc86c65a0dbeb775b89385790894cf3ad2fa1c24cdb0b0a9d94302134910ee651afc832dc0b8864c3a24b6b86577ccd694e46effb1a95ecf6bea5e968d189ad
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # ruby-minimap2
2
2
 
3
3
  [![Gem Version](https://img.shields.io/gem/v/minimap2?color=brightgreen)](https://rubygems.org/gems/minimap2)
4
- [![CI](https://github.com/kojix2/ruby-minimap2/workflows/CI/badge.svg)](https://github.com/kojix2/ruby-minimap2/actions)
4
+ [![test](https://github.com/kojix2/ruby-minimap2/actions/workflows/ci.yml/badge.svg)](https://github.com/kojix2/ruby-minimap2/actions/workflows/ci.yml)
5
5
  [![Docs Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://rubydoc.info/gems/minimap2)
6
6
  [![Docs Latest](https://img.shields.io/badge/docs-latest-blue.svg)](https://kojix2.github.io/ruby-minimap2/)
7
7
  [![The MIT License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE.txt)
@@ -23,7 +23,7 @@ gem install minimap2
23
23
  bundle install
24
24
  bundle exec rake minimap2:build
25
25
  bundle exec rake install
26
-
26
+
27
27
  </details>
28
28
 
29
29
  ## Quick Start
@@ -36,6 +36,7 @@ seq = aligner.seq("MT_human", 100, 200)
36
36
  hits = aligner.align(seq)
37
37
  pp hits
38
38
  ```
39
+
39
40
  ```
40
41
  [#<Minimap2::Alignment:0x000055bbfde2d128
41
42
  @blen=100,
@@ -57,8 +58,6 @@ pp hits
57
58
  @strand=1,
58
59
  @trans_strand=0>]
59
60
  ```
60
-
61
- </details>
62
61
 
63
62
  ## APIs Overview
64
63
 
@@ -87,7 +86,7 @@ pp hits
87
86
  - trans_strand Returns transcript strand. +1 if on the forward strand; -1 if on the reverse strand; 0 if unknown.
88
87
  - blen Returns length of the alignment, including both alignment matches and gaps but excluding ambiguous bases.
89
88
  - mlen Returns length of the matching bases in the alignment, excluding ambiguous base matches.
90
- - nm Returns number of mismatches, gaps and ambiguous poistions in the alignment.
89
+ - nm Returns number of mismatches, gaps and ambiguous positions in the alignment.
91
90
  - primary Returns if the alignment is primary (typically the best and the first to generate).
92
91
  - q_st Returns start positions on the query.
93
92
  - q_en Returns end positions on the query.
@@ -106,19 +105,20 @@ pp hits
106
105
  * MapOpt class Mapping options.
107
106
  ```
108
107
 
109
- * API is based on [Mappy](https://github.com/lh3/minimap2/tree/master/python), the official Python binding for Minimap2.
110
- * `Aligner#map` has been changed to `align`, because `map` means iterator in Ruby.
111
- * See [documentation](https://kojix2.github.io/ruby-minimap2/) for details.
108
+ - API is based on [Mappy](https://github.com/lh3/minimap2/tree/master/python), the official Python binding for Minimap2.
109
+ - `Aligner#map` has been changed to `align`, because `map` means iterator in Ruby.
110
+ - See [documentation](https://kojix2.github.io/ruby-minimap2/) for details.
112
111
 
113
112
  <details>
114
113
  <summary><b>C Structures and Functions</b></summary>
115
114
 
116
115
  ### FFI
117
- * Ruby-Minimap2 is built on top of [Ruby-FFI](https://github.com/ffi/ffi).
118
- * Native C functions can be called from the `Minimap2::FFI` module.
119
- * Native C structure members can be accessed.
120
- * Bitfields are supported by [ffi-bitfield](https://github.com/kojix2/ffi-bitfield) gems.
121
-
116
+
117
+ - Ruby-Minimap2 is built on top of [Ruby-FFI](https://github.com/ffi/ffi).
118
+ - Native C functions can be called from the `Minimap2::FFI` module.
119
+ - Native C structure members can be accessed.
120
+ - Bitfields are supported by [ffi-bitfield](https://github.com/kojix2/ffi-bitfield) gems.
121
+
122
122
  ```ruby
123
123
  aligner.idx_opt.members
124
124
  # => [:k, :w, :flag, :bucket_bits, :mini_batch_size, :batch_size]
@@ -130,7 +130,7 @@ aligner.idx_opt[:k] = 14
130
130
  aligner.idx_opt[:k]
131
131
  # => 14
132
132
  ```
133
-
133
+
134
134
  </details>
135
135
 
136
136
  ## Contributing
@@ -138,7 +138,7 @@ aligner.idx_opt[:k]
138
138
  <details>
139
139
  <summary><b>Development</b></summary>
140
140
 
141
- Fork your repository.
141
+ Fork your repository.
142
142
  then clone.
143
143
 
144
144
  ```sh
@@ -184,7 +184,7 @@ ruby-minimap2 is a library under development and there are many points to be imp
184
184
 
185
185
  Please feel free to report [bugs](https://github.com/kojix2/ruby-minimap2/issues) and open [pull requests](https://github.com/kojix2/ruby-minimap2/pulls)!
186
186
 
187
- Many OSS projects become abandoned because only the founder has commit rights to the original repository.
187
+ Many OSS projects become abandoned because only the founder has commit rights to the original repository.
188
188
  If you need commit rights to the ruby-minimap2 repository or want to get admin rights and take over the project, please feel free to contact me @kojix2.
189
189
 
190
190
  ## License
data/ext/Rakefile CHANGED
@@ -51,10 +51,8 @@ namespace :minimap2 do
51
51
  end
52
52
  end
53
53
 
54
- task cleanall: [:clean]
55
-
56
54
  desc "`make clean` and remove shared lib"
57
- task :cleanall do
55
+ task cleanall: [:clean] do
58
56
  Dir.chdir(minimap2_dir) do
59
57
  sh "rm #{target_path}" if File.exist?(target_path)
60
58
  end
data/ext/minimap2/NEWS.md CHANGED
@@ -1,9 +1,76 @@
1
+ Release 2.28-r1209 (27 March 2024)
2
+ ----------------------------------
3
+
4
+ Notable changes to minimap2:
5
+
6
+ * Bugfix: `--MD` was not working properly due to the addition of `--ds` in the
7
+ last release (#1181 and #1182).
8
+
9
+ * New feature: added an experimental preset `lr:hqae` for aligning accurate
10
+ long reads back to their assembly. It has been observed that `map-hifi` and
11
+ `lr:hq` may produce many wrong alignments around centromeres when accurate
12
+ long reads (PacBio HiFi or Nanopore duplex/Q20+) are mapped to a diploid
13
+ assembly constructed from them. This new preset produces much more accurate
14
+ alignment. It is still experimental and may be subject to change in
15
+ the future.
16
+
17
+ * Change: reduced the default `--cap-kalloc` to 500m to lower the peak
18
+ memory consumption (#855).
19
+
20
+ Notable changes to mappy:
21
+
22
+ * Bugfix: mappy option struct was out of sync with minimap2 (#1177).
23
+
24
+ Minimap2 should output identical alignments to v2.27.
25
+
26
+ (2.28: 27 March 2024, r1209)
27
+
28
+
29
+
30
+ Release 2.27-r1193 (12 March 2024)
31
+ ----------------------------------
32
+
33
+ Notable changes to minimap2:
34
+
35
+ * New feature: added the `lr:hq` preset for accurate long reads at ~1% error
36
+ rate. This was suggested by Oxford Nanopore developers (#1127). It is not
37
+ clear if this preset also works well for PacBio HiFi reads.
38
+
39
+ * New feature: added the `map-iclr` preset for Illumina Complete Long Reads
40
+ (#1069), provided by Illumina developers.
41
+
42
+ * New feature: added option `-b` to specify mismatch penalty for base
43
+ transitions (i.e. A-to-G or C-to-T changes).
44
+
45
+ * New feature: added option `--ds` to generate a new `ds:Z` tag that
46
+ indicates uncertainty in INDEL positions. It is an extension to `cs`. The
47
+ `mgutils-es6.js` script in minigraph parses `ds`.
48
+
49
+ * Bugfix: avoided a NULL pointer dereference (#1154). This would not have an
50
+ effect on most systems but would still be good to fix.
51
+
52
+ * Bugfix: reverted the value of `ms:i` to pre-2.22 versions (#1146). This was
53
+ an oversight. See fcd4df2 for details.
54
+
55
+ Notable changes to paftools.js and mappy:
56
+
57
+ * New feature: expose `bw_long` to mappy's Aligner class (#1124).
58
+
59
+ * Bugfix: fixed several compatibility issues with k8 v1.0 (#1161 and #1166).
60
+ Subcommands "call", "pbsim2fq" and "mason2fq" were not working with v1.0.
61
+
62
+ Minimap2 should output identical alignments to v2.26, except the ms tag.
63
+
64
+ (2.27: 12 March 2024, r1193)
65
+
66
+
67
+
1
68
  Release 2.26-r1175 (29 April 2023)
2
69
  ----------------------------------
3
70
 
4
71
  Fixed the broken Python package. This is the only change.
5
72
 
6
- (2.25: 25 April 2023, r1173)
73
+ (2.26: 29 April 2023, r1175)
7
74
 
8
75
 
9
76
 
@@ -15,7 +15,7 @@ cd minimap2 && make
15
15
  ./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads
16
16
  ./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads
17
17
  ./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or later)
18
- ./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.18 or earlier)
18
+ ./minimap2 -ax lr:hq ref.fa ont-Q20.fq.gz > aln.sam # Nanopore Q20 genomic reads (v2.27 or later)
19
19
  ./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads
20
20
  ./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown)
21
21
  ./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq
@@ -74,8 +74,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the
74
74
  Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from
75
75
  the [release page][release] with:
76
76
  ```sh
77
- curl -L https://github.com/lh3/minimap2/releases/download/v2.26/minimap2-2.26_x64-linux.tar.bz2 | tar -jxvf -
78
- ./minimap2-2.26_x64-linux/minimap2
77
+ curl -L https://github.com/lh3/minimap2/releases/download/v2.28/minimap2-2.28_x64-linux.tar.bz2 | tar -jxvf -
78
+ ./minimap2-2.28_x64-linux/minimap2
79
79
  ```
80
80
  If you want to compile from the source, you need to have a C compiler, GNU make
81
81
  and zlib development files installed. Then type `make` in the source code
@@ -139,12 +139,15 @@ parameters at the same time. The default setting is the same as `map-ont`.
139
139
  ```sh
140
140
  minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads
141
141
  minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads
142
+ minimap2 -ax map-iclr ref.fa iclr-reads.fq > aln.sam # for Illumina Complete Long Reads
142
143
  ```
143
144
  The difference between `map-pb` and `map-ont` is that `map-pb` uses
144
145
  homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary
145
- minimizers as seeds. Emperical evaluation suggests HPC minimizers improve
146
+ minimizers as seeds. Empirical evaluation suggests HPC minimizers improve
146
147
  performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning
147
- Nanopore reads.
148
+ Nanopore reads. `map-iclr` uses an adjusted alignment scoring matrix that
149
+ accounts for the low overall error rate in the reads, with transversion errors
150
+ being less frequent than transitions.
148
151
 
149
152
  #### <a name="map-long-splice"></a>Map long mRNA/cDNA reads
150
153
 
data/ext/minimap2/align.c CHANGED
@@ -21,6 +21,18 @@ static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc
21
21
  mat[(m - 1) * m + j] = sc_ambi;
22
22
  }
23
23
 
24
+ static void ksw_gen_ts_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t transition, int8_t sc_ambi)
25
+ {
26
+ assert(m == 5);
27
+ ksw_gen_simple_mat(m, mat, a, b, sc_ambi);
28
+ if (transition == 0 || transition == b) return;
29
+ transition = transition > 0? -transition : transition;
30
+ mat[0 * m + 2] = transition; // A->G
31
+ mat[1 * m + 3] = transition; // C->T
32
+ mat[2 * m + 0] = transition; // G->A
33
+ mat[3 * m + 1] = transition; // T->C
34
+ }
35
+
24
36
  static inline void mm_seq_rev(uint32_t len, uint8_t *seq)
25
37
  {
26
38
  uint32_t i;
@@ -283,7 +295,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
283
295
  toff += len;
284
296
  }
285
297
  }
286
- p->dp_max = (int32_t)(max + .499);
298
+ p->dp_max = p->dp_max0 = (int32_t)(max + .499);
287
299
  assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
288
300
  if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned
289
301
  }
@@ -323,6 +335,8 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
323
335
  for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr);
324
336
  fputc('\n', stderr);
325
337
  }
338
+ if (opt->transition != 0 && opt->b != opt->transition)
339
+ flag |= KSW_EZ_GENERIC_SC;
326
340
  if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
327
341
  ksw_reset_extz(ez);
328
342
  ez->zdropped = 1;
@@ -586,7 +600,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
586
600
 
587
601
  r2->cnt = 0;
588
602
  if (r->cnt == 0) return;
589
- ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
603
+ ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
590
604
  bw = (int)(opt->bw * 1.5 + 1.);
591
605
  bw_long = (int)(opt->bw_long * 1.5 + 1.);
592
606
  if (bw_long < bw) bw_long = bw;
@@ -844,7 +858,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
844
858
  if (ql < opt->min_chain_score || ql > opt->max_gap) return 0;
845
859
  if (tl < opt->min_chain_score || tl > opt->max_gap) return 0;
846
860
 
847
- ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
861
+ ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
848
862
  tseq = (uint8_t*)kmalloc(km, tl);
849
863
  mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq);
850
864
  qseq = r1->rev? &qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs];
@@ -919,14 +933,14 @@ double mm_event_identity(const mm_reg1_t *r)
919
933
  static int32_t mm_recal_max_dp(const mm_reg1_t *r, double b2, int32_t match_sc)
920
934
  {
921
935
  uint32_t i;
922
- int32_t n_gap = 0, n_gapo = 0, n_mis;
936
+ int32_t n_gap = 0, n_mis;
923
937
  double gap_cost = 0.0;
924
938
  if (r->p == 0) return -1;
925
939
  for (i = 0; i < r->p->n_cigar; ++i) {
926
940
  int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
927
941
  if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
928
942
  gap_cost += b2 + (double)mg_log2(1.0 + len);
929
- ++n_gapo, n_gap += len;
943
+ n_gap += len;
930
944
  }
931
945
  }
932
946
  n_mis = r->blen + r->p->n_ambi - r->mlen - n_gap;
@@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools,
31
31
  please follow the command lines below:
32
32
  ```sh
33
33
  # install minimap2 executables
34
- curl -L https://github.com/lh3/minimap2/releases/download/v2.26/minimap2-2.26_x64-linux.tar.bz2 | tar jxf -
35
- cp minimap2-2.26_x64-linux/{minimap2,k8,paftools.js} . # copy executables
34
+ curl -L https://github.com/lh3/minimap2/releases/download/v2.28/minimap2-2.28_x64-linux.tar.bz2 | tar jxf -
35
+ cp minimap2-2.28_x64-linux/{minimap2,k8,paftools.js} . # copy executables
36
36
  export PATH="$PATH:"`pwd` # put the current directory on PATH
37
37
  # download example datasets
38
38
  curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf -
@@ -139,10 +139,48 @@ int mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int a
139
139
  return ret;
140
140
  }
141
141
 
142
- static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int write_tag)
142
+ static void write_indel_ds(kstring_t *str, int64_t len, const uint8_t *seq, int64_t ll, int64_t lr) // write an indel to ds; adapted from minigraph
143
143
  {
144
- int i, q_off, t_off;
145
- if (write_tag) mm_sprintf_lite(s, "\tcs:Z:");
144
+ int64_t i;
145
+ if (ll + lr >= len) {
146
+ mm_sprintf_lite(str, "[");
147
+ for (i = 0; i < len; ++i)
148
+ mm_sprintf_lite(str, "%c", "acgtn"[seq[i]]);
149
+ mm_sprintf_lite(str, "]");
150
+ } else {
151
+ int64_t k = 0;
152
+ if (ll > 0) {
153
+ mm_sprintf_lite(str, "[");
154
+ for (i = 0; i < ll; ++i)
155
+ mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
156
+ mm_sprintf_lite(str, "]");
157
+ k += ll;
158
+ }
159
+ for (i = 0; i < len - lr - ll; ++i)
160
+ mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
161
+ k += len - lr - ll;
162
+ if (lr > 0) {
163
+ mm_sprintf_lite(str, "[");
164
+ for (i = 0; i < lr; ++i)
165
+ mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
166
+ mm_sprintf_lite(str, "]");
167
+ }
168
+ }
169
+ }
170
+
171
+ static void write_cs_ds_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int is_ds, int write_tag)
172
+ {
173
+ int i, q_off, t_off, q_len = 0, t_len = 0;
174
+ if (write_tag) mm_sprintf_lite(s, "\t%cs:Z:", is_ds? 'd' : 'c');
175
+ for (i = 0; i < (int)r->p->n_cigar; ++i) {
176
+ int op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
177
+ if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH)
178
+ q_len += len, t_len += len;
179
+ else if (op == MM_CIGAR_INS)
180
+ q_len += len;
181
+ else if (op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP)
182
+ t_len += len;
183
+ }
146
184
  for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
147
185
  int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
148
186
  assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
@@ -168,14 +206,42 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
168
206
  }
169
207
  q_off += len, t_off += len;
170
208
  } else if (op == MM_CIGAR_INS) {
171
- for (j = 0, tmp[len] = 0; j < len; ++j)
172
- tmp[j] = "acgtn"[qseq[q_off + j]];
173
- mm_sprintf_lite(s, "+%s", tmp);
209
+ if (is_ds) {
210
+ int z, ll, lr, y = q_off;
211
+ for (z = 1; z <= len; ++z)
212
+ if (y - z < 0 || qseq[y + len - z] != qseq[y - z])
213
+ break;
214
+ lr = z - 1;
215
+ for (z = 0; z < len; ++z)
216
+ if (y + len + z >= q_len || qseq[y + len + z] != qseq[y + z])
217
+ break;
218
+ ll = z;
219
+ mm_sprintf_lite(s, "+");
220
+ write_indel_ds(s, len, &qseq[y], ll, lr);
221
+ } else {
222
+ for (j = 0, tmp[len] = 0; j < len; ++j)
223
+ tmp[j] = "acgtn"[qseq[q_off + j]];
224
+ mm_sprintf_lite(s, "+%s", tmp);
225
+ }
174
226
  q_off += len;
175
227
  } else if (op == MM_CIGAR_DEL) {
176
- for (j = 0, tmp[len] = 0; j < len; ++j)
177
- tmp[j] = "acgtn"[tseq[t_off + j]];
178
- mm_sprintf_lite(s, "-%s", tmp);
228
+ if (is_ds) {
229
+ int z, ll, lr, x = t_off;
230
+ for (z = 1; z <= len; ++z)
231
+ if (x - z < 0 || tseq[x + len - z] != tseq[x - z])
232
+ break;
233
+ lr = z - 1;
234
+ for (z = 0; z < len; ++z)
235
+ if (x + len + z >= t_len || tseq[x + z] != tseq[x + len + z])
236
+ break;
237
+ ll = z;
238
+ mm_sprintf_lite(s, "-");
239
+ write_indel_ds(s, len, &tseq[x], ll, lr);
240
+ } else {
241
+ for (j = 0, tmp[len] = 0; j < len; ++j)
242
+ tmp[j] = "acgtn"[tseq[t_off + j]];
243
+ mm_sprintf_lite(s, "-%s", tmp);
244
+ }
179
245
  t_off += len;
180
246
  } else { // intron
181
247
  assert(len >= 2);
@@ -218,7 +284,7 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
218
284
  assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
219
285
  }
220
286
 
221
- static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int write_tag, int is_qstrand)
287
+ static void write_cs_ds_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int is_ds, int write_tag, int is_qstrand)
222
288
  {
223
289
  extern unsigned char seq_nt4_table[256];
224
290
  int i;
@@ -245,7 +311,7 @@ static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_
245
311
  }
246
312
  }
247
313
  if (is_MD) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
248
- else write_cs_core(s, tseq, qseq, r, tmp, no_iden, write_tag);
314
+ else write_cs_ds_core(s, tseq, qseq, r, tmp, no_iden, is_ds, write_tag);
249
315
  kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
250
316
  }
251
317
 
@@ -256,7 +322,7 @@ int mm_gen_cs_or_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, cons
256
322
  str.s = *buf, str.l = 0, str.m = *max_len;
257
323
  t.l_seq = strlen(seq);
258
324
  t.seq = (char*)seq;
259
- write_cs_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0, is_qstrand);
325
+ write_cs_ds_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0, 0, is_qstrand);
260
326
  *max_len = str.m;
261
327
  *buf = str.s;
262
328
  return str.l;
@@ -278,7 +344,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
278
344
  if (r->id == r->parent) type = r->inv? 'I' : 'P';
279
345
  else type = r->inv? 'i' : 'S';
280
346
  if (r->p) {
281
- mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->dp_max, r->p->dp_score, r->p->n_ambi);
347
+ mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->dp_max0, r->p->dp_score, r->p->n_ambi);
282
348
  if (r->p->trans_strand == 1 || r->p->trans_strand == 2)
283
349
  mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]);
284
350
  }
@@ -326,8 +392,8 @@ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const
326
392
  for (k = 0; k < r->p->n_cigar; ++k)
327
393
  mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
328
394
  }
329
- if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
330
- write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1, !!(opt_flag&MM_F_QSTRAND));
395
+ if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD)))
396
+ write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), !!(opt_flag&MM_F_OUT_MD), !!(opt_flag&MM_F_OUT_DS), 1, !!(opt_flag&MM_F_QSTRAND));
331
397
  if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
332
398
  mm_sprintf_lite(s, "\t%s", t->comment);
333
399
  }
@@ -535,8 +601,8 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
535
601
  }
536
602
  }
537
603
  }
538
- if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
539
- write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1, 0);
604
+ if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD)))
605
+ write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, !!(opt_flag&MM_F_OUT_DS), 1, 0);
540
606
  if (cigar_in_tag)
541
607
  write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag);
542
608
  }
data/ext/minimap2/index.c CHANGED
@@ -192,6 +192,7 @@ int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
192
192
  if (f <= 0.) return INT32_MAX;
193
193
  for (i = 0; i < 1<<mi->b; ++i)
194
194
  if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
195
+ if (n == 0) return INT32_MAX;
195
196
  a = (uint32_t*)malloc(n * 4);
196
197
  for (i = n = 0; i < 1<<mi->b; ++i) {
197
198
  idxhash_t *h = (idxhash_t*)mi->B[i].h;
@@ -149,7 +149,7 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int
149
149
  int is_cdna, int n_seg, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
150
150
  { // TODO: make sure this works when n has more than 32 bits
151
151
  int32_t *f, *t, *v, n_u, n_v, mmax_f = 0, max_drop = bw;
152
- int64_t *p, i, j, max_ii, st = 0, n_iter = 0;
152
+ int64_t *p, i, j, max_ii, st = 0;
153
153
  uint64_t *u;
154
154
 
155
155
  if (_u) *_u = 0, *n_u_ = 0;
@@ -174,7 +174,6 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int
174
174
  for (j = i - 1; j >= st; --j) {
175
175
  int32_t sc;
176
176
  sc = comput_sc(&a[i], &a[j], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg);
177
- ++n_iter;
178
177
  if (sc == INT32_MIN) continue;
179
178
  sc += f[j];
180
179
  if (sc > max_f) {
@@ -204,6 +203,7 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int
204
203
  if (max_ii < 0 || (a[i].x - a[max_ii].x <= (int64_t)max_dist_x && f[max_ii] < f[i]))
205
204
  max_ii = i;
206
205
  if (mmax_f < max_f) mmax_f = max_f;
206
+ //fprintf(stderr, "X1\t%ld\t%ld:%d\t%ld\t%ld:%d\t%ld\t%ld\n", (long)i, (long)(a[i].x>>32), (int32_t)a[i].x, (long)max_j, max_j<0?-1L:(long)(a[max_j].x>>32), max_j<0?-1:(int32_t)a[max_j].x, (long)max_f, (long)v[i]);
207
207
  }
208
208
 
209
209
  u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, max_drop, &n_u, &n_v);
@@ -263,7 +263,8 @@ mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_ski
263
263
  return 0;
264
264
  }
265
265
  if (max_dist < bw) max_dist = bw;
266
- if (max_dist_inner <= 0 || max_dist_inner >= max_dist) max_dist_inner = 0;
266
+ if (max_dist_inner < 0) max_dist_inner = 0;
267
+ if (max_dist_inner > max_dist) max_dist_inner = max_dist;
267
268
  p = Kmalloc(km, int64_t, n);
268
269
  f = Kmalloc(km, int32_t, n);
269
270
  t = Kcalloc(km, int32_t, n);
@@ -325,12 +326,11 @@ mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_ski
325
326
  krmq_interval(lc_elem, root_inner, &s, &lo, &hi);
326
327
  if (lo) {
327
328
  const lc_elem_t *q;
328
- int32_t width, n_rmq_iter = 0;
329
+ int32_t width;
329
330
  krmq_itr_t(lc_elem) itr;
330
331
  krmq_itr_find(lc_elem, root_inner, lo, &itr);
331
332
  while ((q = krmq_at(&itr)) != 0) {
332
333
  if (q->y < (int32_t)a[i].y - max_dist_inner) break;
333
- ++n_rmq_iter;
334
334
  j = q->i;
335
335
  sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, 0, &width);
336
336
  if (width <= bw) {
data/ext/minimap2/main.c CHANGED
@@ -77,6 +77,9 @@ static ko_longopt_t long_options[] = {
77
77
  { "print-chains", ko_no_argument, 352 },
78
78
  { "no-hash-name", ko_no_argument, 353 },
79
79
  { "secondary-seq", ko_no_argument, 354 },
80
+ { "ds", ko_no_argument, 355 },
81
+ { "rmq-inner", ko_required_argument, 356 },
82
+ { "dbg-seed-occ", ko_no_argument, 501 },
80
83
  { "help", ko_no_argument, 'h' },
81
84
  { "max-intron-len", ko_required_argument, 'G' },
82
85
  { "version", ko_no_argument, 'V' },
@@ -120,7 +123,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const
120
123
 
121
124
  int main(int argc, char *argv[])
122
125
  {
123
- const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
126
+ const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:b:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
124
127
  ketopt_t o = KETOPT_INIT;
125
128
  mm_mapopt_t opt;
126
129
  mm_idxopt_t ipt;
@@ -178,6 +181,7 @@ int main(int argc, char *argv[])
178
181
  else if (c == 'm') opt.min_chain_score = atoi(o.arg);
179
182
  else if (c == 'A') opt.a = atoi(o.arg);
180
183
  else if (c == 'B') opt.b = atoi(o.arg);
184
+ else if (c == 'b') opt.transition = atoi(o.arg);
181
185
  else if (c == 's') opt.min_dp_max = atoi(o.arg);
182
186
  else if (c == 'C') opt.noncan = atoi(o.arg);
183
187
  else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
@@ -242,6 +246,9 @@ int main(int argc, char *argv[])
242
246
  else if (c == 352) mm_dbg_flag |= MM_DBG_PRINT_CHAIN; // --print-chains
243
247
  else if (c == 353) opt.flag |= MM_F_NO_HASH_NAME; // --no-hash-name
244
248
  else if (c == 354) opt.flag |= MM_F_SECONDARY_SEQ; // --secondary-seq
249
+ else if (c == 355) opt.flag |= MM_F_OUT_DS; // --ds
250
+ else if (c == 356) opt.rmq_inner_dist = mm_parse_num(o.arg); // --rmq-inner
251
+ else if (c == 501) mm_dbg_flag |= MM_DBG_SEED_FREQ; // --dbg-seed-occ
245
252
  else if (c == 330) {
246
253
  fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
247
254
  } else if (c == 314) { // --frag
@@ -358,6 +365,7 @@ int main(int argc, char *argv[])
358
365
  fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n");
359
366
  fprintf(fp_help, " -c output CIGAR in PAF\n");
360
367
  fprintf(fp_help, " --cs[=STR] output the cs tag; STR is 'short' (if absent) or 'long' [none]\n");
368
+ fprintf(fp_help, " --ds output the ds tag, which is an extension to cs\n");
361
369
  fprintf(fp_help, " --MD output the MD tag\n");
362
370
  fprintf(fp_help, " --eqx write =/X CIGAR operators\n");
363
371
  fprintf(fp_help, " -Y use soft clipping for supplementary alignments\n");
@@ -367,12 +375,12 @@ int main(int argc, char *argv[])
367
375
  fprintf(fp_help, " --version show version number\n");
368
376
  fprintf(fp_help, " Preset:\n");
369
377
  fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n");
370
- fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n");
371
- fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n");
372
- fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n");
378
+ fprintf(fp_help, " - lr:hq - accurate long reads (error rate <1%%) against a reference genome\n");
379
+ fprintf(fp_help, " - splice/splice:hq - spliced alignment for long reads/accurate long reads\n");
373
380
  fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");
374
- fprintf(fp_help, " - splice/splice:hq - long-read/Pacbio-CCS spliced alignment\n");
375
- fprintf(fp_help, " - sr - genomic short-read mapping\n");
381
+ fprintf(fp_help, " - sr - short reads against a reference\n");
382
+ fprintf(fp_help, " - map-pb/map-hifi/map-ont/map-iclr - CLR/HiFi/Nanopore/ICLR vs reference mapping\n");
383
+ fprintf(fp_help, " - ava-pb/ava-ont - PacBio CLR/Nanopore read overlap\n");
376
384
  fprintf(fp_help, "\nSee `man ./minimap2.1' for detailed description of these and other advanced command-line options.\n");
377
385
  return fp_help == stdout? 0 : 1;
378
386
  }
@@ -5,7 +5,7 @@
5
5
  #include <stdio.h>
6
6
  #include <sys/types.h>
7
7
 
8
- #define MM_VERSION "2.26-r1175"
8
+ #define MM_VERSION "2.28-r1209"
9
9
 
10
10
  #define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
11
11
  #define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
@@ -44,6 +44,7 @@
44
44
  #define MM_F_NO_HASH_NAME (0x400000000LL)
45
45
  #define MM_F_SPLICE_OLD (0x800000000LL)
46
46
  #define MM_F_SECONDARY_SEQ (0x1000000000LL) //output SEQ field for seqondary alignments using hard clipping
47
+ #define MM_F_OUT_DS (0x2000000000LL)
47
48
 
48
49
  #define MM_I_HPC 0x1
49
50
  #define MM_I_NO_SEQ 0x2
@@ -97,6 +98,7 @@ typedef struct {
97
98
  typedef struct {
98
99
  uint32_t capacity; // the capacity of cigar[]
99
100
  int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings
101
+ int32_t dp_max0; // DP score before mm_update_dp_max() adjustment
100
102
  uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
101
103
  uint32_t n_cigar; // number of cigar operations in cigar[]
102
104
  uint32_t cigar[];
@@ -153,6 +155,7 @@ typedef struct {
153
155
  float alt_drop;
154
156
 
155
157
  int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
158
+ int transition; // transition mismatch score (A:G, C:T)
156
159
  int sc_ambi; // score when one or both bases are "N"
157
160
  int noncan; // cost of non-canonical splicing sites
158
161
  int junc_bonus;