minimap2 0.2.22.0 → 0.2.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,169 @@
1
+ #include <zlib.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <assert.h>
5
+ #define __STDC_LIMIT_MACROS
6
+ #include "bseq.h"
7
+ #include "kvec.h"
8
+ #include "kseq.h"
9
+ KSEQ_INIT2(, gzFile, gzread)
10
+
11
+ unsigned char seq_comp_table[256] = { // byte -> complement lookup used by mm_revcomp_bseq(); identity for every byte that is not an ASCII letter
11
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
12
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
13
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
14
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
15
+ 64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O', // '@', then 'A'..'O': A->T, B->V, C->G, D->H, G->C, H->D, K->M, M->K; E, F, I, J, L, N, O map to themselves
16
+ 'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z', 91, 92, 93, 94, 95, // 'P'..'Z': R->Y, T->A, U->A, V->B, Y->R; P, Q, S, W, X, Z map to themselves; 91..95 unchanged
17
+ 96, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o', // lower-case row mirrors the 'A'..'O' row and keeps the case
18
+ 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127, // lower-case row mirrors the 'P'..'Z' row; both 't' and 'u' map to 'a'
19
+ 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
20
+ 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
21
+ 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
22
+ 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
23
+ 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
24
+ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
25
+ 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
26
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
27
+ };
29
+
30
+ #define CHECK_PAIR_THRES 1000000
31
+
32
+ struct mm_bseq_file_s { // state of one open sequence file (opaque to users of bseq.h)
33
+ gzFile fp; // zlib handle obtained from gzopen()/gzdopen() in mm_bseq_open()
34
+ kseq_t *ks; // kseq parser created over fp by kseq_init()
35
+ mm_bseq1_t s; // record read ahead by mm_bseq_read3() in frag mode and not yet returned; s.seq == 0 means empty
36
+ };
37
+
38
+ mm_bseq_file_t *mm_bseq_open(const char *fn)
39
+ {
40
+ mm_bseq_file_t *fp;
41
+ gzFile f;
42
+ f = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(0, "r");
43
+ if (f == 0) return 0;
44
+ fp = (mm_bseq_file_t*)calloc(1, sizeof(mm_bseq_file_t));
45
+ fp->fp = f;
46
+ fp->ks = kseq_init(fp->fp);
47
+ return fp;
48
+ }
49
+
50
+ void mm_bseq_close(mm_bseq_file_t *fp)
51
+ {
52
+ kseq_destroy(fp->ks);
53
+ gzclose(fp->fp);
54
+ free(fp);
55
+ }
56
+
57
+ static inline char *kstrdup(const kstring_t *s)
58
+ {
59
+ char *t;
60
+ t = (char*)malloc(s->l + 1);
61
+ memcpy(t, s->s, s->l + 1);
62
+ return t;
63
+ }
64
+
65
+ static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual, int with_comment)
66
+ {
67
+ int i;
68
+ if (ks->name.l == 0)
69
+ fprintf(stderr, "[WARNING]\033[1;31m empty sequence name in the input.\033[0m\n");
70
+ s->name = kstrdup(&ks->name);
71
+ s->seq = kstrdup(&ks->seq);
72
+ for (i = 0; i < (int)ks->seq.l; ++i) // convert U to T
73
+ if (s->seq[i] == 'u' || s->seq[i] == 'U')
74
+ --s->seq[i];
75
+ s->qual = with_qual && ks->qual.l? kstrdup(&ks->qual) : 0;
76
+ s->comment = with_comment && ks->comment.l? kstrdup(&ks->comment) : 0;
77
+ s->l_seq = ks->seq.l;
78
+ }
79
+
80
+ mm_bseq1_t *mm_bseq_read3(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int with_comment, int frag_mode, int *n_)
81
+ {
82
+ int64_t size = 0;
83
+ int ret;
84
+ kvec_t(mm_bseq1_t) a = {0,0,0};
85
+ kseq_t *ks = fp->ks;
86
+ *n_ = 0;
87
+ if (fp->s.seq) {
88
+ kv_resize(mm_bseq1_t, 0, a, 256);
89
+ kv_push(mm_bseq1_t, 0, a, fp->s);
90
+ size = fp->s.l_seq;
91
+ memset(&fp->s, 0, sizeof(mm_bseq1_t));
92
+ }
93
+ while ((ret = kseq_read(ks)) >= 0) {
94
+ mm_bseq1_t *s;
95
+ assert(ks->seq.l <= INT32_MAX);
96
+ if (a.m == 0) kv_resize(mm_bseq1_t, 0, a, 256);
97
+ kv_pushp(mm_bseq1_t, 0, a, &s);
98
+ kseq2bseq(ks, s, with_qual, with_comment);
99
+ size += s->l_seq;
100
+ if (size >= chunk_size) {
101
+ if (frag_mode && a.a[a.n-1].l_seq < CHECK_PAIR_THRES) {
102
+ while ((ret = kseq_read(ks)) >= 0) {
103
+ kseq2bseq(ks, &fp->s, with_qual, with_comment);
104
+ if (mm_qname_same(fp->s.name, a.a[a.n-1].name)) {
105
+ kv_push(mm_bseq1_t, 0, a, fp->s);
106
+ memset(&fp->s, 0, sizeof(mm_bseq1_t));
107
+ } else break;
108
+ }
109
+ }
110
+ break;
111
+ }
112
+ }
113
+ if (ret < -1) {
114
+ if (a.n) fprintf(stderr, "[WARNING]\033[1;31m failed to parse the FASTA/FASTQ record next to '%s'. Continue anyway.\033[0m\n", a.a[a.n-1].name);
115
+ else fprintf(stderr, "[WARNING]\033[1;31m failed to parse the first FASTA/FASTQ record. Continue anyway.\033[0m\n");
116
+ }
117
+ *n_ = a.n;
118
+ return a.a;
119
+ }
120
+
121
+ mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int frag_mode, int *n_)
122
+ {
123
+ return mm_bseq_read3(fp, chunk_size, with_qual, 0, frag_mode, n_);
124
+ }
125
+
126
+ mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int *n_)
127
+ {
128
+ return mm_bseq_read2(fp, chunk_size, with_qual, 0, n_);
129
+ }
130
+
131
+ mm_bseq1_t *mm_bseq_read_frag2(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int with_comment, int *n_)
132
+ {
133
+ int i;
134
+ int64_t size = 0;
135
+ kvec_t(mm_bseq1_t) a = {0,0,0};
136
+ *n_ = 0;
137
+ if (n_fp < 1) return 0;
138
+ while (1) {
139
+ int n_read = 0;
140
+ for (i = 0; i < n_fp; ++i)
141
+ if (kseq_read(fp[i]->ks) >= 0)
142
+ ++n_read;
143
+ if (n_read < n_fp) {
144
+ if (n_read > 0)
145
+ fprintf(stderr, "[W::%s]\033[1;31m query files have different number of records; extra records skipped.\033[0m\n", __func__);
146
+ break; // some file reaches the end
147
+ }
148
+ if (a.m == 0) kv_resize(mm_bseq1_t, 0, a, 256);
149
+ for (i = 0; i < n_fp; ++i) {
150
+ mm_bseq1_t *s;
151
+ kv_pushp(mm_bseq1_t, 0, a, &s);
152
+ kseq2bseq(fp[i]->ks, s, with_qual, with_comment);
153
+ size += s->l_seq;
154
+ }
155
+ if (size >= chunk_size) break;
156
+ }
157
+ *n_ = a.n;
158
+ return a.a;
159
+ }
160
+
161
+ mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int *n_)
162
+ {
163
+ return mm_bseq_read_frag2(n_fp, fp, chunk_size, with_qual, 0, n_);
164
+ }
165
+
166
+ int mm_bseq_eof(mm_bseq_file_t *fp)
167
+ {
168
+ return (ks_eof(fp->ks->f) && fp->s.seq == 0);
169
+ }
@@ -0,0 +1,64 @@
1
+ #ifndef MM_BSEQ_H
2
+ #define MM_BSEQ_H
3
+
4
+ #include <stdint.h>
5
+ #include <string.h>
6
+
7
+ #ifdef __cplusplus
8
+ extern "C" {
9
+ #endif
10
+
11
+ struct mm_bseq_file_s;
12
+ typedef struct mm_bseq_file_s mm_bseq_file_t;
13
+
14
+ typedef struct { // one sequence record as filled in by kseq2bseq() in bseq.c
15
+ int l_seq, rid; // l_seq: sequence length copied from the parser; rid: not assigned by the bseq.c readers
16
+ char *name, *seq, *qual, *comment; // heap copies; qual and comment are 0 when not requested or absent from the record
17
+ } mm_bseq1_t;
18
+
19
+ mm_bseq_file_t *mm_bseq_open(const char *fn);
20
+ void mm_bseq_close(mm_bseq_file_t *fp);
21
+ mm_bseq1_t *mm_bseq_read3(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int with_comment, int frag_mode, int *n_);
22
+ mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int frag_mode, int *n_);
23
+ mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int *n_);
24
+ mm_bseq1_t *mm_bseq_read_frag2(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int with_comment, int *n_);
25
+ mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int *n_);
26
+ int mm_bseq_eof(mm_bseq_file_t *fp);
27
+
28
+ extern unsigned char seq_nt4_table[256];
29
+ extern unsigned char seq_comp_table[256];
30
+
31
/*
 * Length of a query name with a trailing "/<digit>" suffix ignored.
 *
 * s: NUL-terminated name.
 *
 * Returns strlen(s) - 2 when the name is at least three characters long and
 * ends in '/' followed by a single decimal digit (e.g. "read/1" -> 4);
 * otherwise returns strlen(s).
 */
static inline int mm_qname_len(const char *s)
{
	int len = strlen(s);
	if (len < 3) return len;
	if (s[len - 2] != '/') return len;
	if (s[len - 1] < '0' || s[len - 1] > '9') return len;
	return len - 2;
}
37
+
38
/*
 * Compare two query names, ignoring a trailing "/<digit>" on either of them
 * (see mm_qname_len()).
 *
 * Returns 1 when the trimmed names have equal length and equal bytes,
 * 0 otherwise.
 */
static inline int mm_qname_same(const char *s1, const char *s2)
{
	const int n1 = mm_qname_len(s1);
	const int n2 = mm_qname_len(s2);
	if (n1 != n2) return 0;
	return strncmp(s1, s2, n1) == 0;
}
45
+
46
+ static inline void mm_revcomp_bseq(mm_bseq1_t *s)
47
+ {
48
+ int i, t, l = s->l_seq;
49
+ for (i = 0; i < l>>1; ++i) {
50
+ t = s->seq[l - i - 1];
51
+ s->seq[l - i - 1] = seq_comp_table[(uint8_t)s->seq[i]];
52
+ s->seq[i] = seq_comp_table[t];
53
+ }
54
+ if (l&1) s->seq[l>>1] = seq_comp_table[(uint8_t)s->seq[l>>1]];
55
+ if (s->qual)
56
+ for (i = 0; i < l>>1; ++i)
57
+ t = s->qual[l - i - 1], s->qual[l - i - 1] = s->qual[i], s->qual[i] = t;
58
+ }
59
+
60
+ #ifdef __cplusplus
61
+ }
62
+ #endif
63
+
64
+ #endif
@@ -0,0 +1,30 @@
1
+ ## Contributor Code of Conduct
2
+
3
+ As contributors and maintainers of this project, we pledge to respect all
4
+ people who contribute through reporting issues, posting feature requests,
5
+ updating documentation, submitting pull requests or patches, and other
6
+ activities.
7
+
8
+ We are committed to making participation in this project a harassment-free
9
+ experience for everyone, regardless of level of experience, gender, gender
10
+ identity and expression, sexual orientation, disability, personal appearance,
11
+ body size, race, age, or religion.
12
+
13
+ Examples of unacceptable behavior by participants include the use of sexual
14
+ language or imagery, derogatory comments or personal attacks, trolling, public
15
+ or private harassment, insults, or other unprofessional conduct.
16
+
17
+ Project maintainers have the right and responsibility to remove, edit, or
18
+ reject comments, commits, code, wiki edits, issues, and other contributions
19
+ that are not aligned to this Code of Conduct. Project maintainers or
20
+ contributors who do not follow the Code of Conduct may be removed from the
21
+ project team.
22
+
23
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
24
+ reported by opening an issue or contacting the maintainer via email.
25
+
26
+ This Code of Conduct is adapted from the [Contributor Covenant][cc], [version
27
+ 1.0.0][v1].
28
+
29
+ [cc]: http://contributor-covenant.org/
30
+ [v1]: http://contributor-covenant.org/version/1/0/0/
@@ -0,0 +1,243 @@
1
+ ## Table of Contents
2
+
3
+ - [Introduction & Installation](#intro)
4
+ - [Mapping Genomic Reads](#map-reads)
5
+ * [Mapping long reads](#map-pb)
6
+ * [Mapping Illumina paired-end reads](#map-sr)
7
+ * [Evaluating mapping accuracy with simulated reads (for developers)](#mapeval)
8
+ - [Mapping Long RNA-seq Reads](#map-rna)
9
+ * [Mapping Nanopore 2D cDNA reads](#map-ont-cdna-2d)
10
+ * [Mapping Nanopore direct-RNA reads](#map-direct-rna)
11
+ * [Mapping PacBio Iso-seq reads](#map-iso-seq)
12
+ - [Full-Genome Alignment](#genome-aln)
13
+ * [Intra-species assembly alignment](#asm-to-ref)
14
+ * [Cross-species full-genome alignment](#x-species)
15
+ * [Eyeballing alignment](#view-aln)
16
+ * [Calling variants from assembly-to-reference alignment](#asm-var)
17
+ * [Constructing self-homology map](#hom-map)
18
+ * [Lift Over (for developers)](#liftover)
19
+ - [Read Overlap](#read-overlap)
20
+ * [Long-read overlap](#long-read-overlap)
21
+ * [Evaluating overlap sensitivity (for developers)](#ov-eval)
22
+
23
+ ## <a name="intro"></a>Introduction & Installation
24
+
25
+ This cookbook walks you through a variety of applications of minimap2 and its
26
+ companion script `paftools.js`. All data here are freely available from the
27
+ minimap2 release page at version tag [v2.10][v2.10]. Some examples only work
28
+ with v2.10 or later.
29
+
30
+ To acquire the data used in this cookbook and to install minimap2 and paftools,
31
+ please follow the command lines below:
32
+ ```sh
33
+ # install minimap2 executables
34
+ curl -L https://github.com/lh3/minimap2/releases/download/v2.24/minimap2-2.24_x64-linux.tar.bz2 | tar jxf -
35
+ cp minimap2-2.24_x64-linux/{minimap2,k8,paftools.js} . # copy executables
36
+ export PATH="$PATH:"`pwd` # put the current directory on PATH
37
+ # download example datasets
38
+ curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf -
39
+ ```
40
+
41
+ ## <a name="map-reads"></a>Mapping Genomic Reads
42
+
43
+ ### <a name="map-pb"></a>Mapping long reads
44
+ ```sh
45
+ minimap2 -ax map-pb -t4 ecoli_ref.fa ecoli_p6_25x_canu.fa > mapped.sam
46
+ ```
47
+ Alternatively, you can create a minimap2 index first and then map:
48
+ ```sh
49
+ minimap2 -x map-pb -d ecoli-pb.mmi ecoli_ref.fa # create an index
50
+ minimap2 -ax map-pb ecoli-pb.mmi ecoli_p6_25x_canu.fa > mapped.sam
51
+ ```
52
+ This will save you a couple of minutes when you map against the human genome.
53
+ **HOWEVER**, key algorithm parameters such as the k-mer length and window
54
+ size can't be changed after indexing. Minimap2 will give you a warning if
55
+ parameters used in a pre-built index don't match parameters on the command
56
+ line. **Please always make sure you are using an intended pre-built index.**
57
+
58
+ ### <a name="map-sr"></a>Mapping Illumina paired-end reads
59
+ ```sh
60
+ minimap2 -ax sr -t4 ecoli_ref.fa ecoli_mason_1.fq ecoli_mason_2.fq > mapped-sr.sam
61
+ ```
62
+
63
+ ### <a name="mapeval"></a>Evaluating mapping accuracy with simulated reads (for developers)
64
+ ```sh
65
+ minimap2 -ax sr ecoli_ref.fa ecoli_mason_1.fq ecoli_mason_2.fq | paftools.js mapeval -
66
+ ```
67
+ The output is:
68
+ ```
69
+ Q 60 19712 0 0.000000000 19712
70
+ Q 0 282 219 0.010953286 19994
71
+ U 6
72
+ ```
73
+ where a `U`-line gives the number of unmapped reads (for SAM input only); a
74
+ `Q`-line gives:
75
+
76
+ 1. Mapping quality (mapQ) threshold
77
+ 2. Number of mapped reads between this threshold and the previous mapQ threshold.
78
+ 3. Number of wrong mappings in the same mapQ interval
79
+ 4. Accumulative mapping error rate
80
+ 5. Accumulative number of mappings
81
+
82
+ For `paftools.js mapeval` to work, you need to encode the true read positions
83
+ in read names in the right format. For [pbsim2][pbsim] and [mason2][mason2], we
84
+ provide scripts to generate the right format. Simulated reads in this cookbook
85
+ were created with the following command lines:
86
+ ```sh
87
+ # in the pbsim2 source code directory:
88
+ src/pbsim --depth 1 --length-min 5000 --length-mean 20000 --accuracy-mean 0.95 --hmm_model data/R94.model ../ecoli_ref.fa
89
+ paftools.js pbsim2fq ../ecoli_ref.fa.fai sd_0001.maf > ../ecoli_pbsim.fa
90
+
91
+ # mason2 simulation
92
+ mason_simulator --illumina-prob-mismatch-scale 2.5 -ir ecoli_ref.fa -n 10000 -o tmp-l.fq -or tmp-r.fq -oa tmp.sam
93
+ paftools.js mason2fq tmp.sam | seqtk seq -1 > ecoli_mason_1.fq
94
+ paftools.js mason2fq tmp.sam | seqtk seq -2 > ecoli_mason_2.fq
95
+ ```
96
+
97
+
98
+
99
+ ## <a name="map-rna"></a>Mapping Long RNA-seq Reads
100
+
101
+ ### <a name="map-ont-cdna-2d"></a>Mapping Nanopore 2D cDNA reads
102
+ ```sh
103
+ minimap2 -ax splice SIRV_E2.fa SIRV_ont-cdna.fa > aln.sam
104
+ ```
105
+ You can compare the alignment to the true annotations with:
106
+ ```sh
107
+ paftools.js junceval SIRV_E2C.gtf aln.sam
108
+ ```
109
+ It gives the percentage of introns found in the annotation. For SIRV data, it
110
+ is possible to achieve higher junction accuracy with
111
+ ```sh
112
+ minimap2 -ax splice --splice-flank=no SIRV_E2.fa SIRV_ont-cdna.fa | paftools.js junceval SIRV_E2C.gtf
113
+ ```
114
+ This is because minimap2 models one additional evolutionarily conserved base
115
+ around a canonical junction, but SIRV doesn't honor this signal. Option
116
+ `--splice-flank=no` asks minimap2 not to model this additional base.
117
+
118
+ In the output a tag `ts:A:+` indicates that the read strand is the same as the
119
+ transcript strand; `ts:A:-` indicates the read strand is opposite to the
120
+ transcript strand. This tag is inferred from the GT-AG signal and is thus only
121
+ available to spliced reads.
122
+
123
+ ### <a name="map-direct-rna"></a>Mapping Nanopore direct-RNA reads
124
+ ```sh
125
+ minimap2 -ax splice -k14 -uf SIRV_E2.fa SIRV_ont-drna.fa > aln.sam
126
+ ```
127
+ Direct-RNA reads are noisier, so we use a shorter k-mer for improved
128
+ sensitivity. Here, option `-uf` forces minimap2 to map reads to the forward
129
+ transcript strand only because direct-RNA reads are stranded. Again, applying
130
+ `--splice-flank=no` helps junction accuracy for SIRV data.
131
+
132
+ ### <a name="map-iso-seq"></a>Mapping PacBio Iso-seq reads
133
+ ```sh
134
+ minimap2 -ax splice -uf -C5 SIRV_E2.fa SIRV_iso-seq.fq > aln.sam
135
+ ```
136
+ Option `-C5` reduces the penalty on non-canonical splicing sites. It helps
137
+ to align such sites correctly for data with low error rate such as Iso-seq
138
+ reads and traditional cDNAs. On this example, minimap2 makes one junction
139
+ error. Applying `--splice-flank=no` fixes this alignment error.
140
+
141
+ Note that the command line above is optimized for the final Iso-seq reads.
142
+ PacBio's Iso-seq pipeline produces intermediate sequences at varying quality.
143
+ For example, some intermediate reads are not stranded. For these reads, option
144
+ `-uf` will lead to more errors. Please revise the minimap2 command line
145
+ accordingly.
146
+
147
+
148
+
149
+ ## <a name="genome-aln"></a>Full-Genome Alignment
150
+
151
+ ### <a name="asm-to-ref"></a>Intra-species assembly alignment
152
+ ```sh
153
+ # option "--cs" is recommended as paftools.js may need it
154
+ minimap2 -cx asm5 --cs ecoli_ref.fa ecoli_canu.fa > ecoli_canu.paf
155
+ ```
156
+ Here `ecoli_canu.fa` is the Canu assembly of `ecoli_p6_25x_canu.fa`. This
157
+ command line outputs alignments in the [PAF format][paf]. Use `-a` instead of
158
+ `-c` to get output in the SAM format.
159
+
160
+ ### <a name="x-species"></a>Cross-species full-genome alignment
161
+ ```sh
162
+ minimap2 -cx asm20 --cs ecoli_ref.fa ecoli_O104:H4.fa > ecoli_O104:H4.paf
163
+ sort -k6,6 -k8,8n ecoli_O104:H4.paf | paftools.js call -f ecoli_ref.fa -L10000 -l1000 - > out.vcf
164
+ ```
165
+ Minimap2 has three presets for full-genome alignment: "asm5" for sequence
166
+ divergence below 1%, "asm10" for divergence around a couple of percent and
167
+ "asm20" for divergence not more than 10%. In theory, with the right setting,
168
+ minimap2 should work for sequence pairs with sequence divergence up to ~15%,
169
+ but this has not been carefully evaluated.
170
+
171
+ ### <a name="view-aln"></a>Eyeballing alignment
172
+ ```sh
173
+ # option "--cs" required; minimap2-r741 or higher required for the "asm20" preset
174
+ minimap2 -cx asm20 --cs ecoli_ref.fa ecoli_O104:H4.fa | paftools.js view - | less -S
175
+ ```
176
+ This prints the alignment in a BLAST-like format.
177
+
178
+ ### <a name="asm-var"></a>Calling variants from assembly-to-reference alignment
179
+ ```sh
180
+ # don't forget the "--cs" option; otherwise it doesn't work
181
+ minimap2 -cx asm5 --cs ecoli_ref.fa ecoli_canu.fa \
182
+ | sort -k6,6 -k8,8n \
183
+ | paftools.js call -f ecoli_ref.fa - > out.vcf
184
+ ```
185
+ Without option `-f`, `paftools.js call` outputs in a custom format. In this
186
+ format, lines starting with `R` give the regions covered by one contig only.
187
+ This information is not available in the VCF output.
188
+
189
+ ### <a name="hom-map"></a>Constructing self-homology map
190
+ ```sh
191
+ minimap2 -DP -k19 -w19 -m200 ecoli_ref.fa ecoli_ref.fa > out.paf
192
+ ```
193
+ Option `-D` asks minimap2 to ignore anchors from perfect self match and `-P`
194
+ outputs all chains. For large genomes, we don't recommend performing base-level
195
+ alignment (with `-c`, `-a` or `--cs`) when `-P` is applied. This is because
196
+ base-alignment is slow and occasionally gives wrong alignments close to the
197
+ diagonal of a dotter plot. For E. coli, though, base-alignment is still fast.
198
+
199
+ ### <a name="liftover"></a>Lift over (for developers)
200
+ ```sh
201
+ minimap2 -cx asm5 --cs ecoli_ref.fa ecoli_canu.fa > ecoli_canu.paf
202
+ echo -e 'tig00000001\t200000\t300000' | paftools.js liftover ecoli_canu.paf -
203
+ ```
204
+ This lifts over a region on query sequences to one or multiple regions on
205
+ reference sequences. Note that this paftools.js command may not be efficient
206
+ enough to lift millions of regions.
207
+
208
+
209
+
210
+ ## <a name="read-overlap"></a>Read Overlap
211
+
212
+ ### <a name="long-read-overlap"></a>Long read overlap
213
+ ```sh
214
+ # For pacbio reads:
215
+ minimap2 -x ava-pb ecoli_p6_25x_canu.fa ecoli_p6_25x_canu.fa > overlap.paf
216
+ # For Nanopore reads (ava-ont also works with PacBio but not as well):
217
+ minimap2 -x ava-ont -r 10000 ecoli_p6_25x_canu.fa ecoli_p6_25x_canu.fa > overlap.paf
218
+ # If you have miniasm installed:
219
+ miniasm -f ecoli_p6_25x_canu.fa overlap.paf > asm.gfa
220
+ ```
221
+ Here we explicitly applied `-r 10000`. We are considering setting this as the
222
+ default for the `ava-ont` mode as this seems to improve the contiguity for
223
+ nanopore read assembly (Loman, personal communication).
224
+
225
+ *Minimap2 doesn't work well with short-read overlap.*
226
+
227
+ ### <a name="ov-eval"></a>Evaluating overlap sensitivity (for developers)
228
+
229
+ ```sh
230
+ # read to reference mapping
231
+ minimap2 -cx map-pb ecoli_ref.fa ecoli_p6_25x_canu.fa > to-ref.paf
232
+ # evaluate overlap sensitivity
233
+ sort -k6,6 -k8,8n to-ref.paf | paftools.js ov-eval - overlap.paf
234
+ ```
235
+ You can see that for PacBio reads, minimap2 achieves higher overlap sensitivity
236
+ with `-x ava-pb` (99% vs 93% with `-x ava-ont`).
237
+
238
+
239
+
240
+ [pbsim]: https://github.com/yukiteruono/pbsim2
241
+ [mason2]: https://github.com/seqan/seqan/tree/master/apps/mason2
242
+ [paf]: https://github.com/lh3/miniasm/blob/master/PAF.md
243
+ [v2.10]: https://github.com/lh3/minimap2/releases/tag/v2.10
@@ -0,0 +1,64 @@
1
+ #include <math.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <assert.h>
5
+ #include "mmpriv.h"
6
+
7
+ static inline int32_t get_for_qpos(int32_t qlen, const mm128_t *a)
8
+ {
9
+ int32_t x = (int32_t)a->y;
10
+ int32_t q_span = a->y>>32 & 0xff;
11
+ if (a->x>>63)
12
+ x = qlen - 1 - (x + 1 - q_span); // revert the position to the forward strand of query
13
+ return x;
14
+ }
15
+
16
+ static int get_mini_idx(int qlen, const mm128_t *a, int32_t n, const uint64_t *mini_pos)
17
+ {
18
+ int32_t x, L = 0, R = n - 1;
19
+ x = get_for_qpos(qlen, a);
20
+ while (L <= R) { // binary search
21
+ int32_t m = ((uint64_t)L + R) >> 1;
22
+ int32_t y = (int32_t)mini_pos[m];
23
+ if (y < x) L = m + 1;
24
+ else if (y > x) R = m - 1;
25
+ else return m;
26
+ }
27
+ return -1;
28
+ }
29
+
30
+ void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos)
31
+ {
32
+ int i;
33
+ uint64_t sum_k = 0;
34
+ float avg_k;
35
+
36
+ if (n == 0) return;
37
+ for (i = 0; i < n; ++i)
38
+ sum_k += mini_pos[i] >> 32 & 0xff;
39
+ avg_k = (float)sum_k / n;
40
+
41
+ for (i = 0; i < n_regs; ++i) {
42
+ mm_reg1_t *r = &regs[i];
43
+ int32_t st, en, j, k, n_match, n_tot, l_ref;
44
+ r->div = -1.0f;
45
+ if (r->cnt == 0) continue;
46
+ st = en = get_mini_idx(qlen, r->rev? &a[r->as + r->cnt - 1] : &a[r->as], n, mini_pos);
47
+ if (st < 0) {
48
+ if (mm_verbose >= 2)
49
+ fprintf(stderr, "[WARNING] logic inconsistency in mm_est_err(). Please contact the developer.\n");
50
+ continue;
51
+ }
52
+ l_ref = mi->seq[r->rid].len;
53
+ for (k = 1, j = st + 1, n_match = 1; j < n && k < r->cnt; ++j) {
54
+ int32_t x;
55
+ x = get_for_qpos(qlen, r->rev? &a[r->as + r->cnt - 1 - k] : &a[r->as + k]);
56
+ if (x == (int32_t)mini_pos[j])
57
+ ++k, en = j, ++n_match;
58
+ }
59
+ n_tot = en - st + 1;
60
+ if (r->qs > avg_k && r->rs > avg_k) ++n_tot;
61
+ if (qlen - r->qs > avg_k && l_ref - r->re > avg_k) ++n_tot;
62
+ r->div = n_match >= n_tot? 0.0f : (float)(1.0 - pow((double)n_match / n_tot, 1.0 / avg_k));
63
+ }
64
+ }
@@ -0,0 +1,63 @@
1
+ // To compile:
2
+ // gcc -g -O2 example.c libminimap2.a -lz
3
+
4
+ #include <stdlib.h>
5
+ #include <assert.h>
6
+ #include <stdio.h>
7
+ #include <zlib.h>
8
+ #include "minimap.h"
9
+ #include "kseq.h"
10
+ KSEQ_INIT(gzFile, gzread)
11
+
12
+ int main(int argc, char *argv[])
13
+ {
14
+ mm_idxopt_t iopt;
15
+ mm_mapopt_t mopt;
16
+ int n_threads = 3;
17
+
18
+ mm_verbose = 2; // disable message output to stderr
19
+ mm_set_opt(0, &iopt, &mopt);
20
+ mopt.flag |= MM_F_CIGAR; // perform alignment
21
+
22
+ if (argc < 3) {
23
+ fprintf(stderr, "Usage: minimap2-lite <target.fa> <query.fa>\n");
24
+ return 1;
25
+ }
26
+
27
+ // open query file for reading; you may use your favorite FASTA/Q parser
28
+ gzFile f = gzopen(argv[2], "r");
29
+ assert(f);
30
+ kseq_t *ks = kseq_init(f);
31
+
32
+ // open index reader
33
+ mm_idx_reader_t *r = mm_idx_reader_open(argv[1], &iopt, 0);
34
+ mm_idx_t *mi;
35
+ while ((mi = mm_idx_reader_read(r, n_threads)) != 0) { // traverse each part of the index
36
+ mm_mapopt_update(&mopt, mi); // this sets the maximum minimizer occurrence; TODO: set a better default in mm_mapopt_init()!
37
+ mm_tbuf_t *tbuf = mm_tbuf_init(); // thread buffer; for multi-threading, allocate one tbuf for each thread
38
+ gzrewind(f);
39
+ kseq_rewind(ks);
40
+ while (kseq_read(ks) >= 0) { // each kseq_read() call reads one query sequence
41
+ mm_reg1_t *reg;
42
+ int j, i, n_reg;
43
+ reg = mm_map(mi, ks->seq.l, ks->seq.s, &n_reg, tbuf, &mopt, 0); // get all hits for the query
44
+ for (j = 0; j < n_reg; ++j) { // traverse hits and print them out
45
+ mm_reg1_t *r = &reg[j];
46
+ assert(r->p); // with MM_F_CIGAR, this should not be NULL
47
+ printf("%s\t%d\t%d\t%d\t%c\t", ks->name.s, ks->seq.l, r->qs, r->qe, "+-"[r->rev]);
48
+ printf("%s\t%d\t%d\t%d\t%d\t%d\t%d\tcg:Z:", mi->seq[r->rid].name, mi->seq[r->rid].len, r->rs, r->re, r->mlen, r->blen, r->mapq);
49
+ for (i = 0; i < r->p->n_cigar; ++i) // IMPORTANT: this gives the CIGAR in the aligned regions. NO soft/hard clippings!
50
+ printf("%d%c", r->p->cigar[i]>>4, MM_CIGAR_STR[r->p->cigar[i]&0xf]);
51
+ putchar('\n');
52
+ free(r->p);
53
+ }
54
+ free(reg);
55
+ }
56
+ mm_tbuf_destroy(tbuf);
57
+ mm_idx_destroy(mi);
58
+ }
59
+ mm_idx_reader_close(r); // close the index reader
60
+ kseq_destroy(ks); // close the query file
61
+ gzclose(f);
62
+ return 0;
63
+ }