minimap2 0.0.4 → 0.2.23.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +113 -98
  3. data/ext/Rakefile +41 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +807 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +344 -0
  41. data/ext/minimap2/main.c +455 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +409 -0
  44. data/ext/minimap2/minimap2.1 +722 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +131 -0
  50. data/ext/minimap2/options.c +233 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/ext/vendor/libminimap2.so +0 -0
  93. data/lib/minimap2/aligner.rb +16 -5
  94. data/lib/minimap2/alignment.rb +6 -2
  95. data/lib/minimap2/ffi/constants.rb +74 -53
  96. data/lib/minimap2/ffi/functions.rb +5 -0
  97. data/lib/minimap2/ffi.rb +1 -2
  98. data/lib/minimap2/version.rb +2 -1
  99. data/lib/minimap2.rb +67 -22
  100. metadata +98 -64
  101. data/lib/minimap2/ffi_helper.rb +0 -53
@@ -0,0 +1,169 @@
1
+ #include <zlib.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <assert.h>
5
+ #define __STDC_LIMIT_MACROS
6
+ #include "bseq.h"
7
+ #include "kvec.h"
8
+ #include "kseq.h"
9
+ KSEQ_INIT2(, gzFile, gzread)
10
+
11
/*
 * Complement lookup indexed by byte value. Nucleotide letters map to their
 * complement in the same case (A<->T, C<->G, U->A, R<->Y, K<->M, B<->V, D<->H);
 * every other byte value maps to itself.
 */
+ unsigned char seq_comp_table[256] = {
12
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
13
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
14
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
15
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
16
+ 64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O',
17
+ 'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z', 91, 92, 93, 94, 95,
18
+ 96, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o',
19
+ 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127,
20
+ 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
21
+ 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
22
+ 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
23
+ 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
24
+ 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
25
+ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
26
+ 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
27
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
28
+ };
29
+
30
+ #define CHECK_PAIR_THRES 1000000
31
+
32
/* State of one sequence input opened by mm_bseq_open(). */
+ struct mm_bseq_file_s {
33
+ gzFile fp; // stream returned by gzopen()/gzdopen() in mm_bseq_open()
34
+ kseq_t *ks; // kseq parser created over fp
35
+ mm_bseq1_t s; // record read ahead by mm_bseq_read3() and not yet returned; s.seq == 0 means none pending
36
+ };
37
+
38
+ mm_bseq_file_t *mm_bseq_open(const char *fn)
39
+ {
40
+ mm_bseq_file_t *fp;
41
+ gzFile f;
42
+ f = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(0, "r");
43
+ if (f == 0) return 0;
44
+ fp = (mm_bseq_file_t*)calloc(1, sizeof(mm_bseq_file_t));
45
+ fp->fp = f;
46
+ fp->ks = kseq_init(fp->fp);
47
+ return fp;
48
+ }
49
+
50
+ void mm_bseq_close(mm_bseq_file_t *fp)
51
+ {
52
+ kseq_destroy(fp->ks);
53
+ gzclose(fp->fp);
54
+ free(fp);
55
+ }
56
+
57
+ static inline char *kstrdup(const kstring_t *s)
58
+ {
59
+ char *t;
60
+ t = (char*)malloc(s->l + 1);
61
+ memcpy(t, s->s, s->l + 1);
62
+ return t;
63
+ }
64
+
65
+ static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual, int with_comment)
66
+ {
67
+ int i;
68
+ if (ks->name.l == 0)
69
+ fprintf(stderr, "[WARNING]\033[1;31m empty sequence name in the input.\033[0m\n");
70
+ s->name = kstrdup(&ks->name);
71
+ s->seq = kstrdup(&ks->seq);
72
+ for (i = 0; i < (int)ks->seq.l; ++i) // convert U to T
73
+ if (s->seq[i] == 'u' || s->seq[i] == 'U')
74
+ --s->seq[i];
75
+ s->qual = with_qual && ks->qual.l? kstrdup(&ks->qual) : 0;
76
+ s->comment = with_comment && ks->comment.l? kstrdup(&ks->comment) : 0;
77
+ s->l_seq = ks->seq.l;
78
+ }
79
+
80
/*
 * Read the next chunk of records from fp.
 *
 * Records are appended until the running sum of l_seq reaches chunk_size or
 * kseq_read() stops returning >= 0. In frag_mode, once the chunk is full and
 * its last record is shorter than CHECK_PAIR_THRES, further records whose names
 * match the last one under mm_qname_same() are appended as well; the first
 * record that does not match is parked in fp->s and becomes the first record
 * of the next call. with_qual / with_comment are passed on to kseq2bseq().
 *
 * The number of records is stored in *n_ and the record array (a.a) is returned;
 * a.a is still 0 when no record was read.
 */
+ mm_bseq1_t *mm_bseq_read3(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int with_comment, int frag_mode, int *n_)
81
+ {
82
+ int64_t size = 0;
83
+ int ret;
84
+ kvec_t(mm_bseq1_t) a = {0,0,0};
85
+ kseq_t *ks = fp->ks;
86
+ *n_ = 0;
87
+ if (fp->s.seq) { // a record was read ahead by the previous call: emit it first
88
+ kv_resize(mm_bseq1_t, 0, a, 256);
89
+ kv_push(mm_bseq1_t, 0, a, fp->s);
90
+ size = fp->s.l_seq;
91
+ memset(&fp->s, 0, sizeof(mm_bseq1_t)); // the array now holds the string pointers; clear the slot
92
+ }
93
+ while ((ret = kseq_read(ks)) >= 0) {
94
+ mm_bseq1_t *s;
95
+ assert(ks->seq.l <= INT32_MAX); // l_seq is an int
96
+ if (a.m == 0) kv_resize(mm_bseq1_t, 0, a, 256);
97
+ kv_pushp(mm_bseq1_t, 0, a, &s); // s points at the newly appended slot
98
+ kseq2bseq(ks, s, with_qual, with_comment);
99
+ size += s->l_seq;
100
+ if (size >= chunk_size) { // chunk is full
101
+ if (frag_mode && a.a[a.n-1].l_seq < CHECK_PAIR_THRES) {
102
+ while ((ret = kseq_read(ks)) >= 0) { // keep same-named records together in this chunk
103
+ kseq2bseq(ks, &fp->s, with_qual, with_comment);
104
+ if (mm_qname_same(fp->s.name, a.a[a.n-1].name)) {
105
+ kv_push(mm_bseq1_t, 0, a, fp->s);
106
+ memset(&fp->s, 0, sizeof(mm_bseq1_t));
107
+ } else break; // different name: leave it in fp->s for the next call
108
+ }
109
+ }
110
+ break;
111
+ }
112
+ }
113
+ if (ret < -1) { // kseq_read() returned something below -1: reported as a parse failure
114
+ if (a.n) fprintf(stderr, "[WARNING]\033[1;31m failed to parse the FASTA/FASTQ record next to '%s'. Continue anyway.\033[0m\n", a.a[a.n-1].name);
115
+ else fprintf(stderr, "[WARNING]\033[1;31m failed to parse the first FASTA/FASTQ record. Continue anyway.\033[0m\n");
116
+ }
117
+ *n_ = a.n;
118
+ return a.a;
119
+ }
120
+
121
+ mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int frag_mode, int *n_)
122
+ {
123
+ return mm_bseq_read3(fp, chunk_size, with_qual, 0, frag_mode, n_);
124
+ }
125
+
126
+ mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int *n_)
127
+ {
128
+ return mm_bseq_read2(fp, chunk_size, with_qual, 0, n_);
129
+ }
130
+
131
+ mm_bseq1_t *mm_bseq_read_frag2(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int with_comment, int *n_)
132
+ {
133
+ int i;
134
+ int64_t size = 0;
135
+ kvec_t(mm_bseq1_t) a = {0,0,0};
136
+ *n_ = 0;
137
+ if (n_fp < 1) return 0;
138
+ while (1) {
139
+ int n_read = 0;
140
+ for (i = 0; i < n_fp; ++i)
141
+ if (kseq_read(fp[i]->ks) >= 0)
142
+ ++n_read;
143
+ if (n_read < n_fp) {
144
+ if (n_read > 0)
145
+ fprintf(stderr, "[W::%s]\033[1;31m query files have different number of records; extra records skipped.\033[0m\n", __func__);
146
+ break; // some file reaches the end
147
+ }
148
+ if (a.m == 0) kv_resize(mm_bseq1_t, 0, a, 256);
149
+ for (i = 0; i < n_fp; ++i) {
150
+ mm_bseq1_t *s;
151
+ kv_pushp(mm_bseq1_t, 0, a, &s);
152
+ kseq2bseq(fp[i]->ks, s, with_qual, with_comment);
153
+ size += s->l_seq;
154
+ }
155
+ if (size >= chunk_size) break;
156
+ }
157
+ *n_ = a.n;
158
+ return a.a;
159
+ }
160
+
161
+ mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int *n_)
162
+ {
163
+ return mm_bseq_read_frag2(n_fp, fp, chunk_size, with_qual, 0, n_);
164
+ }
165
+
166
+ int mm_bseq_eof(mm_bseq_file_t *fp)
167
+ {
168
+ return (ks_eof(fp->ks->f) && fp->s.seq == 0);
169
+ }
@@ -0,0 +1,64 @@
1
+ #ifndef MM_BSEQ_H
2
+ #define MM_BSEQ_H
3
+
4
+ #include <stdint.h>
5
+ #include <string.h>
6
+
7
+ #ifdef __cplusplus
8
+ extern "C" {
9
+ #endif
10
+
11
+ struct mm_bseq_file_s;
12
+ typedef struct mm_bseq_file_s mm_bseq_file_t;
13
+
14
/* One sequence record as returned by the mm_bseq_read*() functions. */
+ typedef struct {
15
+ int l_seq, rid; // l_seq: sequence length; rid: NOTE(review) not assigned by the readers in bseq.c -- presumably filled in by callers
16
+ char *name, *seq, *qual, *comment; // heap-allocated strings; qual/comment are 0 when absent or not requested
17
+ } mm_bseq1_t;
18
+
19
+ mm_bseq_file_t *mm_bseq_open(const char *fn);
20
+ void mm_bseq_close(mm_bseq_file_t *fp);
21
+ mm_bseq1_t *mm_bseq_read3(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int with_comment, int frag_mode, int *n_);
22
+ mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int frag_mode, int *n_);
23
+ mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int *n_);
24
+ mm_bseq1_t *mm_bseq_read_frag2(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int with_comment, int *n_);
25
+ mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int *n_);
26
+ int mm_bseq_eof(mm_bseq_file_t *fp);
27
+
28
+ extern unsigned char seq_nt4_table[256];
29
+ extern unsigned char seq_comp_table[256];
30
+
31
/*
 * Length of a read name with a trailing "/<digit>" suffix (such as "/1")
 * left out. Names shorter than three characters, or without that suffix,
 * are returned at their full length.
 */
static inline int mm_qname_len(const char *s)
{
	int len = strlen(s);
	if (len < 3) return len;
	if (s[len-2] != '/') return len;
	if (s[len-1] < '0' || s[len-1] > '9') return len;
	return len - 2;
}
37
+
38
/*
 * Return 1 when two read names are equal once a trailing "/<digit>" suffix
 * is disregarded on each (as computed by mm_qname_len()); 0 otherwise.
 */
static inline int mm_qname_same(const char *s1, const char *s2)
{
	const int n1 = mm_qname_len(s1);
	const int n2 = mm_qname_len(s2);
	if (n1 != n2) return 0;
	return strncmp(s1, s2, n1) == 0;
}
45
+
46
+ static inline void mm_revcomp_bseq(mm_bseq1_t *s)
47
+ {
48
+ int i, t, l = s->l_seq;
49
+ for (i = 0; i < l>>1; ++i) {
50
+ t = s->seq[l - i - 1];
51
+ s->seq[l - i - 1] = seq_comp_table[(uint8_t)s->seq[i]];
52
+ s->seq[i] = seq_comp_table[t];
53
+ }
54
+ if (l&1) s->seq[l>>1] = seq_comp_table[(uint8_t)s->seq[l>>1]];
55
+ if (s->qual)
56
+ for (i = 0; i < l>>1; ++i)
57
+ t = s->qual[l - i - 1], s->qual[l - i - 1] = s->qual[i], s->qual[i] = t;
58
+ }
59
+
60
+ #ifdef __cplusplus
61
+ }
62
+ #endif
63
+
64
+ #endif
@@ -0,0 +1,30 @@
1
+ ## Contributor Code of Conduct
2
+
3
+ As contributors and maintainers of this project, we pledge to respect all
4
+ people who contribute through reporting issues, posting feature requests,
5
+ updating documentation, submitting pull requests or patches, and other
6
+ activities.
7
+
8
+ We are committed to making participation in this project a harassment-free
9
+ experience for everyone, regardless of level of experience, gender, gender
10
+ identity and expression, sexual orientation, disability, personal appearance,
11
+ body size, race, age, or religion.
12
+
13
+ Examples of unacceptable behavior by participants include the use of sexual
14
+ language or imagery, derogatory comments or personal attacks, trolling, public
15
+ or private harassment, insults, or other unprofessional conduct.
16
+
17
+ Project maintainers have the right and responsibility to remove, edit, or
18
+ reject comments, commits, code, wiki edits, issues, and other contributions
19
+ that are not aligned to this Code of Conduct. Project maintainers or
20
+ contributors who do not follow the Code of Conduct may be removed from the
21
+ project team.
22
+
23
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
24
+ reported by opening an issue or contacting the maintainer via email.
25
+
26
+ This Code of Conduct is adapted from the [Contributor Covenant][cc], [version
27
+ 1.0.0][v1].
28
+
29
+ [cc]: http://contributor-covenant.org/
30
+ [v1]: http://contributor-covenant.org/version/1/0/0/
@@ -0,0 +1,243 @@
1
+ ## Table of Contents
2
+
3
+ - [Introduction & Installation](#intro)
4
+ - [Mapping Genomic Reads](#map-reads)
5
+ * [Mapping long reads](#map-pb)
6
+ * [Mapping Illumina paired-end reads](#map-sr)
7
+ * [Evaluating mapping accuracy with simulated reads (for developers)](#mapeval)
8
+ - [Mapping Long RNA-seq Reads](#map-rna)
9
+ * [Mapping Nanopore 2D cDNA reads](#map-ont-cdna-2d)
10
+ * [Mapping Nanopore direct-RNA reads](#map-direct-rna)
11
+ * [Mapping PacBio Iso-seq reads](#map-iso-seq)
12
+ - [Full-Genome Alignment](#genome-aln)
13
+ * [Intra-species assembly alignment](#asm-to-ref)
14
+ * [Cross-species full-genome alignment](#x-species)
15
+ * [Eyeballing alignment](#view-aln)
16
+ * [Calling variants from assembly-to-reference alignment](#asm-var)
17
+ * [Constructing self-homology map](#hom-map)
18
+ * [Lift Over (for developers)](#liftover)
19
+ - [Read Overlap](#read-overlap)
20
+ * [Long-read overlap](#long-read-overlap)
21
+ * [Evaluating overlap sensitivity (for developers)](#ov-eval)
22
+
23
+ ## <a name="intro"></a>Introduction & Installation
24
+
25
+ This cookbook walks you through a variety of applications of minimap2 and its
26
+ companion script `paftools.js`. All data here are freely available from the
27
+ minimap2 release page at version tag [v2.10][v2.10]. Some examples only work
28
+ with v2.10 or later.
29
+
30
+ To acquire the data used in this cookbook and to install minimap2 and paftools,
31
+ please follow the command lines below:
32
+ ```sh
33
+ # install minimap2 executables
34
+ curl -L https://github.com/lh3/minimap2/releases/download/v2.23/minimap2-2.23_x64-linux.tar.bz2 | tar jxf -
35
+ cp minimap2-2.23_x64-linux/{minimap2,k8,paftools.js} . # copy executables
36
+ export PATH="$PATH:"`pwd` # put the current directory on PATH
37
+ # download example datasets
38
+ curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf -
39
+ ```
40
+
41
+ ## <a name="map-reads"></a>Mapping Genomic Reads
42
+
43
+ ### <a name="map-pb"></a>Mapping long reads
44
+ ```sh
45
+ minimap2 -ax map-pb -t4 ecoli_ref.fa ecoli_p6_25x_canu.fa > mapped.sam
46
+ ```
47
+ Alternatively, you can create a minimap2 index first and then map:
48
+ ```sh
49
+ minimap2 -x map-pb -d ecoli-pb.mmi ecoli_ref.fa # create an index
50
+ minimap2 -ax map-pb ecoli-pb.mmi ecoli_p6_25x_canu.fa > mapped.sam
51
+ ```
52
+ This will save you a couple of minutes when you map against the human genome.
53
+ **HOWEVER**, key algorithm parameters such as the k-mer length and window
54
+ size can't be changed after indexing. Minimap2 will give you a warning if
55
+ parameters used in a pre-built index don't match parameters on the command
56
+ line. **Please always make sure you are using an intended pre-built index.**
57
+
58
+ ### <a name="map-sr"></a>Mapping Illumina paired-end reads:
59
+ ```sh
60
+ minimap2 -ax sr -t4 ecoli_ref.fa ecoli_mason_1.fq ecoli_mason_2.fq > mapped-sr.sam
61
+ ```
62
+
63
+ ### <a name="mapeval"></a>Evaluating mapping accuracy with simulated reads (for developers)
64
+ ```sh
65
+ minimap2 -ax sr ecoli_ref.fa ecoli_mason_1.fq ecoli_mason_2.fq | paftools.js mapeval -
66
+ ```
67
+ The output is:
68
+ ```
69
+ Q 60 19712 0 0.000000000 19712
70
+ Q 0 282 219 0.010953286 19994
71
+ U 6
72
+ ```
73
+ where a `U`-line gives the number of unmapped reads (for SAM input only); a
74
+ `Q`-line gives:
75
+
76
+ 1. Mapping quality (mapQ) threshold
77
+ 2. Number of mapped reads between this threshold and the previous mapQ threshold.
78
+ 3. Number of wrong mappings in the same mapQ interval
79
+ 4. Accumulative mapping error rate
80
+ 5. Accumulative number of mappings
81
+
82
+ For `paftools.js mapeval` to work, you need to encode the true read positions
83
+ in read names in the right format. For [pbsim2][pbsim] and [mason2][mason2], we
84
+ provide scripts to generate the right format. Simulated reads in this cookbook
85
+ were created with the following command lines:
86
+ ```sh
87
+ # in the pbsim2 source code directory:
88
+ src/pbsim --depth 1 --length-min 5000 --length-mean 20000 --accuracy-mean 0.95 --hmm_model data/R94.model ../ecoli_ref.fa
89
+ paftools.js pbsim2fq ../ecoli_ref.fa.fai sd_0001.maf > ../ecoli_pbsim.fa
90
+
91
+ # mason2 simulation
92
+ mason_simulator --illumina-prob-mismatch-scale 2.5 -ir ecoli_ref.fa -n 10000 -o tmp-l.fq -or tmp-r.fq -oa tmp.sam
93
+ paftools.js mason2fq tmp.sam | seqtk seq -1 > ecoli_mason_1.fq
94
+ paftools.js mason2fq tmp.sam | seqtk seq -2 > ecoli_mason_2.fq
95
+ ```
96
+
97
+
98
+
99
+ ## <a name="map-rna"></a>Mapping Long RNA-seq Reads
100
+
101
+ ### <a name="map-ont-cdna-2d"></a>Mapping Nanopore 2D cDNA reads
102
+ ```sh
103
+ minimap2 -ax splice SIRV_E2.fa SIRV_ont-cdna.fa > aln.sam
104
+ ```
105
+ You can compare the alignment to the true annotations with:
106
+ ```sh
107
+ paftools.js junceval SIRV_E2C.gtf aln.sam
108
+ ```
109
+ It gives the percentage of introns found in the annotation. For SIRV data, it
110
+ is possible to achieve higher junction accuracy with
111
+ ```sh
112
+ minimap2 -ax splice --splice-flank=no SIRV_E2.fa SIRV_ont-cdna.fa | paftools.js junceval SIRV_E2C.gtf
113
+ ```
114
+ This is because minimap2 models one additional evolutionarily conserved base
115
+ around a canonical junction, but SIRV doesn't honor this signal. Option
116
+ `--splice-flank=no` asks minimap2 not to model this additional base.
117
+
118
+ In the output a tag `ts:A:+` indicates that the read strand is the same as the
119
+ transcript strand; `ts:A:-` indicates the read strand is opposite to the
120
+ transcript strand. This tag is inferred from the GT-AG signal and is thus only
121
+ available to spliced reads.
122
+
123
+ ### <a name="map-direct-rna"></a>Mapping Nanopore direct-RNA reads
124
+ ```sh
125
+ minimap2 -ax splice -k14 -uf SIRV_E2.fa SIRV_ont-drna.fa > aln.sam
126
+ ```
127
+ Direct-RNA reads are noisier, so we use a shorter k-mer for improved
128
+ sensitivity. Here, option `-uf` forces minimap2 to map reads to the forward
129
+ transcript strand only because direct-RNA reads are stranded. Again, applying
130
+ `--splice-flank=no` helps junction accuracy for SIRV data.
131
+
132
+ ### <a name="map-iso-seq"></a>Mapping PacBio Iso-seq reads
133
+ ```sh
134
+ minimap2 -ax splice -uf -C5 SIRV_E2.fa SIRV_iso-seq.fq > aln.sam
135
+ ```
136
+ Option `-C5` reduces the penalty on non-canonical splicing sites. It helps
137
+ to align such sites correctly for data with low error rate such as Iso-seq
138
+ reads and traditional cDNAs. In this example, minimap2 makes one junction
139
+ error. Applying `--splice-flank=no` fixes this alignment error.
140
+
141
+ Note that the command line above is optimized for the final Iso-seq reads.
142
+ PacBio's Iso-seq pipeline produces intermediate sequences at varying quality.
143
+ For example, some intermediate reads are not stranded. For these reads, option
144
+ `-uf` will lead to more errors. Please revise the minimap2 command line
145
+ accordingly.
146
+
147
+
148
+
149
+ ## <a name="genome-aln"></a>Full-Genome Alignment
150
+
151
+ ### <a name="asm-to-ref"></a>Intra-species assembly alignment
152
+ ```sh
153
+ # option "--cs" is recommended as paftools.js may need it
154
+ minimap2 -cx asm5 --cs ecoli_ref.fa ecoli_canu.fa > ecoli_canu.paf
155
+ ```
156
+ Here `ecoli_canu.fa` is the Canu assembly of `ecoli_p6_25x_canu.fa`. This
157
+ command line outputs alignments in the [PAF format][paf]. Use `-a` instead of
158
+ `-c` to get output in the SAM format.
159
+
160
+ ### <a name="x-species"></a>Cross-species full-genome alignment
161
+ ```sh
162
+ minimap2 -cx asm20 --cs ecoli_ref.fa ecoli_O104:H4.fa > ecoli_O104:H4.paf
163
+ sort -k6,6 -k8,8n ecoli_O104:H4.paf | paftools.js call -f ecoli_ref.fa -L10000 -l1000 - > out.vcf
164
+ ```
165
+ Minimap2 has three presets for full-genome alignment: "asm5" for sequence
166
+ divergence below 1%, "asm10" for divergence around a couple of percent and
167
+ "asm20" for divergence not more than 10%. In theory, with the right setting,
168
+ minimap2 should work for sequence pairs with sequence divergence up to ~15%,
169
+ but this has not been carefully evaluated.
170
+
171
+ ### <a name="view-aln"></a>Eyeballing alignment
172
+ ```sh
173
+ # option "--cs" required; minimap2-r741 or higher required for the "asm20" preset
174
+ minimap2 -cx asm20 --cs ecoli_ref.fa ecoli_O104:H4.fa | paftools.js view - | less -S
175
+ ```
176
+ This prints the alignment in a BLAST-like format.
177
+
178
+ ### <a name="asm-var"></a>Calling variants from assembly-to-reference alignment
179
+ ```sh
180
+ # don't forget the "--cs" option; otherwise it doesn't work
181
+ minimap2 -cx asm5 --cs ecoli_ref.fa ecoli_canu.fa \
182
+ | sort -k6,6 -k8,8n \
183
+ | paftools.js call -f ecoli_ref.fa - > out.vcf
184
+ ```
185
+ Without option `-f`, `paftools.js call` outputs in a custom format. In this
186
+ format, lines starting with `R` give the regions covered by one contig only.
187
+ This information is not available in the VCF output.
188
+
189
+ ### <a name="hom-map"></a>Constructing self-homology map
190
+ ```sh
191
+ minimap2 -DP -k19 -w19 -m200 ecoli_ref.fa ecoli_ref.fa > out.paf
192
+ ```
193
+ Option `-D` asks minimap2 to ignore anchors from perfect self match and `-P`
194
+ outputs all chains. For large genomes, we don't recommend performing base-level
195
+ alignment (with `-c`, `-a` or `--cs`) when `-P` is applied. This is because
196
+ base-alignment is slow and occasionally gives wrong alignments close to the
197
+ diagonal of a dotter plot. For E. coli, though, base-alignment is still fast.
198
+
199
+ ### <a name="liftover"></a>Lift over (for developers)
200
+ ```sh
201
+ minimap2 -cx asm5 --cs ecoli_ref.fa ecoli_canu.fa > ecoli_canu.paf
202
+ echo -e 'tig00000001\t200000\t300000' | paftools.js liftover ecoli_canu.paf -
203
+ ```
204
+ This lifts over a region on query sequences to one or multiple regions on
205
+ reference sequences. Note that this paftools.js command may not be efficient
206
+ enough to lift millions of regions.
207
+
208
+
209
+
210
+ ## <a name="read-overlap"></a>Read Overlap
211
+
212
+ ### <a name="long-read-overlap"></a>Long-read overlap
213
+ ```sh
214
+ # For pacbio reads:
215
+ minimap2 -x ava-pb ecoli_p6_25x_canu.fa ecoli_p6_25x_canu.fa > overlap.paf
216
+ # For Nanopore reads (ava-ont also works with PacBio but not as well):
217
+ minimap2 -x ava-ont -r 10000 ecoli_p6_25x_canu.fa ecoli_p6_25x_canu.fa > overlap.paf
218
+ # If you have miniasm installed:
219
+ miniasm -f ecoli_p6_25x_canu.fa overlap.paf > asm.gfa
220
+ ```
221
+ Here we explicitly applied `-r 10000`. We are considering setting this as the
222
+ default for the `ava-ont` mode as this seems to improve the contiguity for
223
+ nanopore read assembly (Loman, personal communication).
224
+
225
+ *Minimap2 doesn't work well with short-read overlap.*
226
+
227
+ ### <a name="ov-eval"></a>Evaluating overlap sensitivity (for developers)
228
+
229
+ ```sh
230
+ # read to reference mapping
231
+ minimap2 -cx map-pb ecoli_ref.fa ecoli_p6_25x_canu.fa > to-ref.paf
232
+ # evaluate overlap sensitivity
233
+ sort -k6,6 -k8,8n to-ref.paf | paftools.js ov-eval - overlap.paf
234
+ ```
235
+ You can see that for PacBio reads, minimap2 achieves higher overlap sensitivity
236
+ with `-x ava-pb` (99% vs 93% with `-x ava-ont`).
237
+
238
+
239
+
240
+ [pbsim]: https://github.com/yukiteruono/pbsim2
241
+ [mason2]: https://github.com/seqan/seqan/tree/master/apps/mason2
242
+ [paf]: https://github.com/lh3/miniasm/blob/master/PAF.md
243
+ [v2.10]: https://github.com/lh3/minimap2/releases/tag/v2.10
@@ -0,0 +1,64 @@
1
+ #include <math.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <assert.h>
5
+ #include "mmpriv.h"
6
+
7
+ static inline int32_t get_for_qpos(int32_t qlen, const mm128_t *a)
8
+ {
9
+ int32_t x = (int32_t)a->y;
10
+ int32_t q_span = a->y>>32 & 0xff;
11
+ if (a->x>>63)
12
+ x = qlen - 1 - (x + 1 - q_span); // revert the position to the forward strand of query
13
+ return x;
14
+ }
15
+
16
+ static int get_mini_idx(int qlen, const mm128_t *a, int32_t n, const uint64_t *mini_pos)
17
+ {
18
+ int32_t x, L = 0, R = n - 1;
19
+ x = get_for_qpos(qlen, a);
20
+ while (L <= R) { // binary search
21
+ int32_t m = ((uint64_t)L + R) >> 1;
22
+ int32_t y = (int32_t)mini_pos[m];
23
+ if (y < x) L = m + 1;
24
+ else if (y > x) R = m - 1;
25
+ else return m;
26
+ }
27
+ return -1;
28
+ }
29
+
30
/*
 * Fill in r->div for each of the n_regs regions in regs.
 *
 * avg_k is the mean of bits 32..39 over the n entries of mini_pos. For every
 * region that has anchors, the region's first anchor (last one when r->rev is
 * set) is located in mini_pos, then mini_pos is walked forward counting how many
 * entries coincide with the region's successive anchors (n_match) out of the
 * entries spanned (n_tot, plus one for each end that has more than avg_k of
 * overhang on both query and reference). The result is
 *     div = 1 - (n_match / n_tot)^(1 / avg_k),   or 0 when n_match >= n_tot.
 * Regions with cnt == 0, or whose first anchor is not found, keep div = -1.
 * Nothing is written when n == 0.
 */
+ void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos)
31
+ {
32
+ int i;
33
+ uint64_t sum_k = 0;
34
+ float avg_k;
35
+
36
+ if (n == 0) return;
37
+ for (i = 0; i < n; ++i)
38
+ sum_k += mini_pos[i] >> 32 & 0xff; // bits 32..39 of each entry
39
+ avg_k = (float)sum_k / n;
40
+
41
+ for (i = 0; i < n_regs; ++i) {
42
+ mm_reg1_t *r = &regs[i];
43
+ int32_t st, en, j, k, n_match, n_tot, l_ref;
44
+ r->div = -1.0f; // value left in place when no estimate can be made
45
+ if (r->cnt == 0) continue;
46
+ st = en = get_mini_idx(qlen, r->rev? &a[r->as + r->cnt - 1] : &a[r->as], n, mini_pos);
47
+ if (st < 0) {
48
+ if (mm_verbose >= 2)
49
+ fprintf(stderr, "[WARNING] logic inconsistency in mm_est_err(). Please contact the developer.\n");
50
+ continue;
51
+ }
52
+ l_ref = mi->seq[r->rid].len;
53
+ for (k = 1, j = st + 1, n_match = 1; j < n && k < r->cnt; ++j) { // k: next anchor of the region; j: next entry of mini_pos
54
+ int32_t x;
55
+ x = get_for_qpos(qlen, r->rev? &a[r->as + r->cnt - 1 - k] : &a[r->as + k]);
56
+ if (x == (int32_t)mini_pos[j])
57
+ ++k, en = j, ++n_match;
58
+ }
59
+ n_tot = en - st + 1;
60
+ if (r->qs > avg_k && r->rs > avg_k) ++n_tot;
61
+ if (qlen - r->qs > avg_k && l_ref - r->re > avg_k) ++n_tot; // NOTE(review): query side uses r->qs while the reference side uses r->re -- confirm whether r->qe was intended
62
+ r->div = n_match >= n_tot? 0.0f : (float)(1.0 - pow((double)n_match / n_tot, 1.0 / avg_k));
63
+ }
64
+ }
@@ -0,0 +1,63 @@
1
+ // To compile:
2
+ // gcc -g -O2 example.c libminimap2.a -lz
3
+
4
+ #include <stdlib.h>
5
+ #include <assert.h>
6
+ #include <stdio.h>
7
+ #include <zlib.h>
8
+ #include "minimap.h"
9
+ #include "kseq.h"
10
+ KSEQ_INIT(gzFile, gzread)
11
+
12
+ int main(int argc, char *argv[])
13
+ {
14
+ mm_idxopt_t iopt;
15
+ mm_mapopt_t mopt;
16
+ int n_threads = 3;
17
+
18
+ mm_verbose = 2; // disable message output to stderr
19
+ mm_set_opt(0, &iopt, &mopt);
20
+ mopt.flag |= MM_F_CIGAR; // perform alignment
21
+
22
+ if (argc < 3) {
23
+ fprintf(stderr, "Usage: minimap2-lite <target.fa> <query.fa>\n");
24
+ return 1;
25
+ }
26
+
27
+ // open query file for reading; you may use your favorite FASTA/Q parser
28
+ gzFile f = gzopen(argv[2], "r");
29
+ assert(f);
30
+ kseq_t *ks = kseq_init(f);
31
+
32
+ // open index reader
33
+ mm_idx_reader_t *r = mm_idx_reader_open(argv[1], &iopt, 0);
34
+ mm_idx_t *mi;
35
+ while ((mi = mm_idx_reader_read(r, n_threads)) != 0) { // traverse each part of the index
36
+ mm_mapopt_update(&mopt, mi); // this sets the maximum minimizer occurrence; TODO: set a better default in mm_mapopt_init()!
37
+ mm_tbuf_t *tbuf = mm_tbuf_init(); // thread buffer; for multi-threading, allocate one tbuf for each thread
38
+ gzrewind(f);
39
+ kseq_rewind(ks);
40
+ while (kseq_read(ks) >= 0) { // each kseq_read() call reads one query sequence
41
+ mm_reg1_t *reg;
42
+ int j, i, n_reg;
43
+ reg = mm_map(mi, ks->seq.l, ks->seq.s, &n_reg, tbuf, &mopt, 0); // get all hits for the query
44
+ for (j = 0; j < n_reg; ++j) { // traverse hits and print them out
45
+ mm_reg1_t *r = &reg[j];
46
+ assert(r->p); // with MM_F_CIGAR, this should not be NULL
47
+ printf("%s\t%d\t%d\t%d\t%c\t", ks->name.s, ks->seq.l, r->qs, r->qe, "+-"[r->rev]);
48
+ printf("%s\t%d\t%d\t%d\t%d\t%d\t%d\tcg:Z:", mi->seq[r->rid].name, mi->seq[r->rid].len, r->rs, r->re, r->mlen, r->blen, r->mapq);
49
+ for (i = 0; i < r->p->n_cigar; ++i) // IMPORTANT: this gives the CIGAR in the aligned regions. NO soft/hard clippings!
50
+ printf("%d%c", r->p->cigar[i]>>4, MM_CIGAR_STR[r->p->cigar[i]&0xf]);
51
+ putchar('\n');
52
+ free(r->p);
53
+ }
54
+ free(reg);
55
+ }
56
+ mm_tbuf_destroy(tbuf);
57
+ mm_idx_destroy(mi);
58
+ }
59
+ mm_idx_reader_close(r); // close the index reader
60
+ kseq_destroy(ks); // close the query file
61
+ gzclose(f);
62
+ return 0;
63
+ }