minimap2 0.0.4 → 0.2.23.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +113 -98
  3. data/ext/Rakefile +41 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +807 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +344 -0
  41. data/ext/minimap2/main.c +455 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +409 -0
  44. data/ext/minimap2/minimap2.1 +722 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +131 -0
  50. data/ext/minimap2/options.c +233 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/ext/vendor/libminimap2.so +0 -0
  93. data/lib/minimap2/aligner.rb +16 -5
  94. data/lib/minimap2/alignment.rb +6 -2
  95. data/lib/minimap2/ffi/constants.rb +74 -53
  96. data/lib/minimap2/ffi/functions.rb +5 -0
  97. data/lib/minimap2/ffi.rb +1 -2
  98. data/lib/minimap2/version.rb +2 -1
  99. data/lib/minimap2.rb +67 -22
  100. metadata +98 -64
  101. data/lib/minimap2/ffi_helper.rb +0 -53
@@ -0,0 +1,162 @@
1
+ #include <stdlib.h>
2
+ #include "mmpriv.h"
3
+
4
/* Verbosity threshold consulted before messages are printed to stderr: the option
 * code prints errors at >= 1, warnings at >= 2 and progress lines at >= 3. */
int mm_verbose = 1;
/* Debug flag word, initially empty. Presumably a combination of the MM_DBG_* bits
 * from mmpriv.h -- confirm against the code that reads it. */
int mm_dbg_flag = 0;
/* Reference wall-clock value subtracted from realtime() when elapsed time is
 * reported; it is defined here but not assigned in this file. */
double mm_realtime0;
7
+
8
+ #if defined(WIN32) || defined(_WIN32)
9
+ #include <windows.h>
10
+
11
/* Windows-only stand-in for struct timezone so that the gettimeofday()
 * replacement in this file can keep the usual two-argument signature. */
struct timezone
{
	__int32 tz_minuteswest; /* minutes W of Greenwich */
	int tz_dsttime; /* type of dst correction */
};
16
+
17
+ /*
18
+ * gettimeofday.c
19
+ * Win32 gettimeofday() replacement
20
+ * taken from PostgreSQL, according to
21
+ * https://stackoverflow.com/questions/1676036/what-should-i-use-to-replace-gettimeofday-on-windows
22
+ *
23
+ * src/port/gettimeofday.c
24
+ *
25
+ * Copyright (c) 2003 SRA, Inc.
26
+ * Copyright (c) 2003 SKC, Inc.
27
+ *
28
+ * Permission to use, copy, modify, and distribute this software and
29
+ * its documentation for any purpose, without fee, and without a
30
+ * written agreement is hereby granted, provided that the above
31
+ * copyright notice and this paragraph and the following two
32
+ * paragraphs appear in all copies.
33
+ *
34
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT,
35
+ * INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
36
+ * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
37
+ * DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED
38
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
39
+ *
40
+ * THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
41
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
42
+ * A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS
43
+ * IS" BASIS, AND THE AUTHOR HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE,
44
+ * SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
45
+ */
46
+
47
+ /* FILETIME of Jan 1 1970 00:00:00. */
48
+ static const unsigned __int64 epoch = ((unsigned __int64) 116444736000000000ULL);
49
+
50
+ /*
51
+ * timezone information is stored outside the kernel so tzp isn't used anymore.
52
+ *
53
+ * Note: this function is not for Win32 high precision timing purpose. See
54
+ * elapsed_time().
55
+ */
56
+ int gettimeofday(struct timeval * tp, struct timezone *tzp)
57
+ {
58
+ FILETIME file_time;
59
+ SYSTEMTIME system_time;
60
+ ULARGE_INTEGER ularge;
61
+
62
+ GetSystemTime(&system_time);
63
+ SystemTimeToFileTime(&system_time, &file_time);
64
+ ularge.LowPart = file_time.dwLowDateTime;
65
+ ularge.HighPart = file_time.dwHighDateTime;
66
+
67
+ tp->tv_sec = (long) ((ularge.QuadPart - epoch) / 10000000L);
68
+ tp->tv_usec = (long) (system_time.wMilliseconds * 1000);
69
+
70
+ return 0;
71
+ }
72
+
73
+ // taken from https://stackoverflow.com/questions/5272470/c-get-cpu-usage-on-linux-and-windows
74
+ double cputime()
75
+ {
76
+ HANDLE hProcess = GetCurrentProcess();
77
+ FILETIME ftCreation, ftExit, ftKernel, ftUser;
78
+ SYSTEMTIME stKernel;
79
+ SYSTEMTIME stUser;
80
+
81
+ GetProcessTimes(hProcess, &ftCreation, &ftExit, &ftKernel, &ftUser);
82
+ FileTimeToSystemTime(&ftKernel, &stKernel);
83
+ FileTimeToSystemTime(&ftUser, &stUser);
84
+
85
+ double kernelModeTime = ((stKernel.wHour * 60.) + stKernel.wMinute * 60.) + stKernel.wSecond * 1. + stKernel.wMilliseconds / 1000.;
86
+ double userModeTime = ((stUser.wHour * 60.) + stUser.wMinute * 60.) + stUser.wSecond * 1. + stUser.wMilliseconds / 1000.;
87
+
88
+ return kernelModeTime + userModeTime;
89
+ }
90
+
91
/* Peak memory usage is not measured in the Windows branch; always reports 0. */
long peakrss(void)
{
	return 0;
}
92
+ #else
93
+ #include <sys/resource.h>
94
+ #include <sys/time.h>
95
+
96
/*
 * CPU time consumed by the calling process, in seconds: user time plus system
 * time as reported by getrusage(RUSAGE_SELF).
 */
double cputime(void)
{
	struct rusage usage;
	double whole_seconds, fractional_seconds;

	getrusage(RUSAGE_SELF, &usage);
	whole_seconds = usage.ru_utime.tv_sec + usage.ru_stime.tv_sec;
	fractional_seconds = 1e-6 * (usage.ru_utime.tv_usec + usage.ru_stime.tv_usec);
	return whole_seconds + fractional_seconds;
}
102
+
103
/*
 * Peak resident set size of the calling process, taken from the ru_maxrss
 * field of getrusage(RUSAGE_SELF). On Linux the value is multiplied by 1024
 * (presumably converting kilobytes to bytes -- see getrusage(2)); elsewhere
 * it is returned unchanged.
 */
long peakrss(void)
{
	struct rusage usage;
	long max_rss;

	getrusage(RUSAGE_SELF, &usage);
	max_rss = usage.ru_maxrss;
#ifdef __linux__
	max_rss = max_rss * 1024;
#endif
	return max_rss;
}
113
+
114
+ #endif /* WIN32 || _WIN32 */
115
+
116
/* Current wall-clock time from gettimeofday(), as seconds with a microsecond fraction. */
double realtime(void)
{
	struct timeval now;
	double micro_part;

	gettimeofday(&now, NULL);
	micro_part = now.tv_usec * 1e-6;
	return now.tv_sec + micro_part;
}
122
+
123
/*
 * Write str followed by a newline to stdout via puts(). If puts() reports
 * EOF, print a diagnostic with perror() and terminate the process with
 * EXIT_FAILURE.
 */
void mm_err_puts(const char *str)
{
	if (puts(str) != EOF)
		return;
	perror("[ERROR] failed to write the results");
	exit(EXIT_FAILURE);
}
132
+
133
/*
 * Write nitems objects of `size` bytes each from p to fp, aborting on failure.
 *
 * fwrite() reports failure by returning fewer items than requested; it never
 * returns EOF, so the previous `ret == EOF` test could not detect a short
 * write. A request with size == 0 or nitems == 0 writes nothing and is not an
 * error (fwrite() returns 0 in that case regardless of nitems).
 *
 * On a short write, prints a diagnostic with perror() and exits with
 * EXIT_FAILURE.
 */
void mm_err_fwrite(const void *p, size_t size, size_t nitems, FILE *fp)
{
	size_t n_written;
	if (size == 0 || nitems == 0)
		return;
	n_written = fwrite(p, size, nitems, fp);
	if (n_written != nitems) {
		perror("[ERROR] failed to write data");
		exit(EXIT_FAILURE);
	}
}
142
+
143
/*
 * Read up to nitems objects of `size` bytes each from fp into p, aborting on
 * an I/O error.
 *
 * fread() never returns EOF, so the previous `ret == EOF` test could not
 * fire. A read error is now detected as a short count together with
 * ferror(fp); in that case a diagnostic is printed with perror() and the
 * process exits with EXIT_FAILURE.
 *
 * A short count caused purely by end-of-file is still returned silently, as
 * before; callers that require exactly nitems items must ensure the stream
 * holds them.
 */
void mm_err_fread(void *p, size_t size, size_t nitems, FILE *fp)
{
	size_t n_read;
	n_read = fread(p, size, nitems, fp);
	if (n_read != nitems && ferror(fp)) {
		perror("[ERROR] failed to read data");
		exit(EXIT_FAILURE);
	}
}
152
+
153
+ #include "ksort.h"
154
+
155
/* Radix sort over mm128_t elements keyed on their .x member. Presumably this
 * instantiation supplies radix_sort_128x() declared in mmpriv.h; the final
 * argument (8) looks like the key size in bytes -- confirm in ksort.h. */
#define sort_key_128x(a) ((a).x)
KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8)

/* Radix sort over plain uint64_t values, where the element is its own key
 * (radix_sort_64() is declared in mmpriv.h). */
#define sort_key_64(x) (x)
KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8)

/* Generic ksort.h instantiations for 32- and 64-bit unsigned integers;
 * mmpriv.h declares ks_ksmall_uint32_t(), presumably generated here. */
KSORT_INIT_GENERIC(uint32_t)
KSORT_INIT_GENERIC(uint64_t)
@@ -0,0 +1,131 @@
1
+ #ifndef MMPRIV2_H
2
+ #define MMPRIV2_H
3
+
4
+ #include <assert.h>
5
+ #include "minimap.h"
6
+ #include "bseq.h"
7
+ #include "kseq.h"
8
+
9
/* Negative sentinel values for a "parent" index (a regular parent is a
 * non-negative array index). */
#define MM_PARENT_UNSET (-1)
#define MM_PARENT_TMP_PRI (-2)

/* Single-bit debug flags (0x1 .. 0x8); presumably OR-ed into mm_dbg_flag -- confirm. */
#define MM_DBG_NO_KALLOC 0x1
#define MM_DBG_PRINT_QNAME 0x2
#define MM_DBG_PRINT_SEED 0x4
#define MM_DBG_PRINT_ALN_SEQ 0x8

/* Single-bit markers occupying bits 40-43 of a 64-bit word. */
#define MM_SEED_LONG_JOIN (1ULL<<40)
#define MM_SEED_IGNORE (1ULL<<41)
#define MM_SEED_TANDEM (1ULL<<42)
#define MM_SEED_SELF (1ULL<<43)

/* An 8-bit field stored at bits 48-55 of the same kind of 64-bit word. */
#define MM_SEED_SEG_SHIFT 48
#define MM_SEED_SEG_MASK (0xffULL<<(MM_SEED_SEG_SHIFT))

/* Round x up in place to the next power of two by smearing the highest set
 * bit of (x - 1) downwards and adding one. x is modified and evaluated
 * several times, so it must be a side-effect-free lvalue; shifts go up to 16,
 * i.e. the macro targets 32-bit values. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

/* 4-bit-per-element packing, eight elements per array word: element i lives
 * in word i>>3 at bit offset (i&7)*4. mm_seq4_set ORs the value in without
 * clearing the slot first, so the slot must be zero beforehand. */
#define mm_seq4_set(s, i, c) ((s)[(i)>>3] |= (uint32_t)(c) << (((i)&7)<<2))
#define mm_seq4_get(s, i) ((s)[(i)>>3] >> (((i)&7)<<2) & 0xf)

/* Typed wrappers around malloc()/calloc() for an array of `len` elements.
 * MALLOC multiplies len by sizeof(type) without an overflow check. */
#define MALLOC(type, len) ((type*)malloc((len) * sizeof(type)))
#define CALLOC(type, len) ((type*)calloc((len), sizeof(type)))
34
+
35
+ #ifdef __cplusplus
36
+ extern "C" {
37
+ #endif
38
+
39
/* One seed record; mm_collect_matches() returns an array of these.
 * Field meanings below are inferred from names only -- TODO confirm in seed.c. */
typedef struct {
	uint32_t n; // presumably the number of entries reachable through cr
	uint32_t q_pos; // presumably a position on the query
	uint32_t q_span:31, flt:1; // 31-bit span packed with a 1-bit flag
	uint32_t seg_id:31, is_tandem:1; // 31-bit segment id packed with a 1-bit flag
	const uint64_t *cr; // read-only pointer to 64-bit entries (not owned, judging by const)
} mm_seed_t;
46
+
47
/* Per-segment pair of arrays with their element counts; produced by
 * mm_seg_gen() and released by mm_seg_free() (see the prototypes in this header). */
typedef struct {
	int n_u, n_a; // presumably the lengths of u[] and a[] respectively -- confirm
	uint64_t *u;
	mm128_t *a;
} mm_seg_t;
52
+
53
+ double cputime(void);
54
+ double realtime(void);
55
+ long peakrss(void);
56
+
57
+ void radix_sort_128x(mm128_t *beg, mm128_t *end);
58
+ void radix_sort_64(uint64_t *beg, uint64_t *end);
59
+ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
60
+
61
+ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
62
+
63
+ mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos);
64
+ void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac);
65
+
66
+ double mm_event_identity(const mm_reg1_t *r);
67
+ int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
68
+ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag);
69
+ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len);
70
+ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
71
+ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regs, const mm_reg1_t *const* regs, void *km, int64_t opt_flag);
72
+ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regss, const mm_reg1_t *const* regss, void *km, int64_t opt_flag, int rep_len);
73
+
74
+ void mm_idxopt_init(mm_idxopt_t *opt);
75
+ const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
76
+ int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
77
+ int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
78
+ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
79
+ mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand);
80
+
81
+ mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale,
82
+ int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
83
+ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
84
+ int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
85
+ mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
86
+ int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
87
+
88
+ void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r);
89
+ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a, int is_qstrand);
90
+ void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs);
91
+ int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a);
92
+ int mm_set_sam_pri(int n, mm_reg1_t *r);
93
+ void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac);
94
+ void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int check_strand, int min_strand_sc, int *n_, mm_reg1_t *r);
95
+ void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r);
96
+ int mm_filter_strand_retained(int n_regs, mm_reg1_t *r);
97
+ void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs);
98
+ void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac);
99
+ void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr);
100
+ void mm_update_dp_max(int qlen, int n_regs, mm_reg1_t *regs, float frac, int a, int b);
101
+
102
+ void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos);
103
+
104
+ mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a);
105
+ void mm_seg_free(void *km, int n_segs, mm_seg_t *segs);
106
+ void mm_pair(void *km, int max_gap_ref, int dp_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs);
107
+
108
+ FILE *mm_split_init(const char *prefix, const mm_idx_t *mi);
109
+ mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part);
110
+ int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_split_idx);
111
+ void mm_split_rm_tmp(const char *prefix, int n_splits);
112
+
113
+ void mm_err_puts(const char *str);
114
+ void mm_err_fwrite(const void *p, size_t size, size_t nitems, FILE *fp);
115
+ void mm_err_fread(void *p, size_t size, size_t nitems, FILE *fp);
116
+
117
/*
 * Fast approximation of log2(x) computed from the IEEE-754 bit pattern of x:
 * the biased exponent supplies the integer part, and a quadratic polynomial
 * in the mantissa (rescaled into [1,2)) supplies the rest.
 *
 * NB: this doesn't work when x<2 (the exponent term is computed in unsigned
 * arithmetic and wraps around for small exponents).
 */
static inline float mg_log2(float x)
{
	union { float f; uint32_t i; } bits = { x };
	float approx, mant;

	/* biased exponent minus 128; unsigned on purpose, as in the reference form */
	approx = ((bits.i >> 23) & 255) - 128;

	/* replace the exponent field with 127 so that bits.f becomes the mantissa in [1,2) */
	bits.i &= ~(255 << 23);
	bits.i += 127 << 23;
	mant = bits.f;

	approx += (-0.34484843f * mant + 2.02466578f) * mant - 0.67487759f;
	return approx;
}
126
+
127
+ #ifdef __cplusplus
128
+ }
129
+ #endif
130
+
131
+ #endif
@@ -0,0 +1,233 @@
1
+ #include <stdio.h>
2
+ #include <limits.h>
3
+ #include "mmpriv.h"
4
+
5
+ void mm_idxopt_init(mm_idxopt_t *opt)
6
+ {
7
+ memset(opt, 0, sizeof(mm_idxopt_t));
8
+ opt->k = 15, opt->w = 10, opt->flag = 0;
9
+ opt->bucket_bits = 14;
10
+ opt->mini_batch_size = 50000000;
11
+ opt->batch_size = 4000000000ULL;
12
+ }
13
+
14
+ void mm_mapopt_init(mm_mapopt_t *opt)
15
+ {
16
+ memset(opt, 0, sizeof(mm_mapopt_t));
17
+ opt->seed = 11;
18
+ opt->mid_occ_frac = 2e-4f;
19
+ opt->min_mid_occ = 10;
20
+ opt->max_mid_occ = 1000000;
21
+ opt->sdust_thres = 0; // no SDUST masking
22
+ opt->q_occ_frac = 0.01f;
23
+
24
+ opt->min_cnt = 3;
25
+ opt->min_chain_score = 40;
26
+ opt->bw = 500, opt->bw_long = 20000;
27
+ opt->max_gap = 5000;
28
+ opt->max_gap_ref = -1;
29
+ opt->max_chain_skip = 25;
30
+ opt->max_chain_iter = 5000;
31
+ opt->rmq_inner_dist = 1000;
32
+ opt->rmq_size_cap = 100000;
33
+ opt->rmq_rescue_size = 1000;
34
+ opt->rmq_rescue_ratio = 0.1f;
35
+ opt->chain_gap_scale = 0.8f;
36
+ opt->chain_skip_scale = 0.0f;
37
+ opt->max_max_occ = 4095;
38
+ opt->occ_dist = 500;
39
+
40
+ opt->mask_level = 0.5f;
41
+ opt->mask_len = INT_MAX;
42
+ opt->pri_ratio = 0.8f;
43
+ opt->best_n = 5;
44
+
45
+ opt->alt_drop = 0.15f;
46
+
47
+ opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
48
+ opt->sc_ambi = 1;
49
+ opt->zdrop = 400, opt->zdrop_inv = 200;
50
+ opt->end_bonus = -1;
51
+ opt->min_dp_max = opt->min_chain_score * opt->a;
52
+ opt->min_ksw_len = 200;
53
+ opt->anchor_ext_len = 20, opt->anchor_ext_shift = 6;
54
+ opt->max_clip_ratio = 1.0f;
55
+ opt->mini_batch_size = 500000000;
56
+ opt->max_sw_mat = 100000000;
57
+ opt->cap_kalloc = 1000000000;
58
+
59
+ opt->rank_min_len = 500;
60
+ opt->rank_frac = 0.9f;
61
+
62
+ opt->pe_ori = 0; // FF
63
+ opt->pe_bonus = 33;
64
+ }
65
+
66
+ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
67
+ {
68
+ if ((opt->flag & MM_F_SPLICE_FOR) || (opt->flag & MM_F_SPLICE_REV))
69
+ opt->flag |= MM_F_SPLICE;
70
+ if (opt->mid_occ <= 0) {
71
+ opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac);
72
+ if (opt->mid_occ < opt->min_mid_occ)
73
+ opt->mid_occ = opt->min_mid_occ;
74
+ if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ)
75
+ opt->mid_occ = opt->max_mid_occ;
76
+ }
77
+ if (mm_verbose >= 3)
78
+ fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ);
79
+ }
80
+
81
+ void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len)
82
+ {
83
+ if ((opt->flag & MM_F_SPLICE) && max_intron_len > 0)
84
+ opt->max_gap_ref = opt->bw = opt->bw_long = max_intron_len;
85
+ }
86
+
87
/*
 * Apply a named preset to the indexing options (io) and mapping options (mo).
 *
 * preset == 0 (NULL) resets both structures with mm_idxopt_init() and
 * mm_mapopt_init(). Every other preset only overrides selected fields on top
 * of whatever io/mo already contain, so callers are expected to have
 * initialized them first (e.g. with a preceding NULL-preset call).
 *
 * Returns 0 on success and -1 for an unrecognized preset name. Note that an
 * unknown "asm*" name returns -1 only after the fields shared by all asm
 * presets have already been modified.
 */
int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
{
	if (preset == 0) {
		mm_idxopt_init(io);
		mm_mapopt_init(mo);
	} else if (strcmp(preset, "map-ont") == 0) { // this is the same as the default
	} else if (strcmp(preset, "ava-ont") == 0) {
		io->flag = 0, io->k = 15, io->w = 5;
		mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN;
		mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25;
		mo->bw = mo->bw_long = 2000;
		mo->occ_dist = 0;
	} else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) {
		io->flag |= MM_I_HPC, io->k = 19;
	} else if (strcmp(preset, "ava-pb") == 0) {
		io->flag |= MM_I_HPC, io->k = 19, io->w = 5;
		mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN;
		mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25;
		mo->bw_long = mo->bw;
		mo->occ_dist = 0;
	} else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
		io->flag = 0, io->k = 19, io->w = 19;
		mo->max_gap = 10000;
		mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1;
		mo->occ_dist = 500;
		mo->min_mid_occ = 50, mo->max_mid_occ = 500;
		mo->min_dp_max = 200;
	} else if (strncmp(preset, "asm", 3) == 0) { // prefix match: settings shared by asm5/asm10/asm20 come first
		io->flag = 0, io->k = 19, io->w = 19;
		mo->bw = mo->bw_long = 100000;
		mo->max_gap = 10000;
		mo->flag |= MM_F_RMQ;
		mo->min_mid_occ = 50, mo->max_mid_occ = 500;
		mo->min_dp_max = 200;
		mo->best_n = 50;
		if (strcmp(preset, "asm5") == 0) {
			mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
		} else if (strcmp(preset, "asm10") == 0) {
			mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
		} else if (strcmp(preset, "asm20") == 0) {
			mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
			io->w = 10;
		} else return -1; // "asm" prefix with an unknown suffix
	} else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) {
		io->flag = 0, io->k = 21, io->w = 11;
		mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT;
		mo->pe_ori = 0<<1|1; // FR
		mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1;
		mo->zdrop = mo->zdrop_inv = 100;
		mo->end_bonus = 10;
		mo->max_frag_len = 800;
		mo->max_gap = 100;
		mo->bw = mo->bw_long = 100;
		mo->pri_ratio = 0.5f;
		mo->min_cnt = 2;
		mo->min_chain_score = 25;
		mo->min_dp_max = 40;
		mo->best_n = 20;
		mo->mid_occ = 1000;
		mo->max_occ = 5000;
		mo->mini_batch_size = 50000000;
	} else if (strncmp(preset, "splice", 6) == 0 || strcmp(preset, "cdna") == 0) { // any name starting with "splice" is accepted
		io->flag = 0, io->k = 15, io->w = 5;
		mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK;
		mo->max_sw_mat = 0;
		mo->max_gap = 2000, mo->max_gap_ref = mo->bw = mo->bw_long = 200000;
		mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0;
		mo->noncan = 9;
		mo->junc_bonus = 9;
		mo->zdrop = 200, mo->zdrop_inv = 100; // because mo->a is halved
		if (strcmp(preset, "splice:hq") == 0) // overrides on top of the generic splice settings
			mo->junc_bonus = 5, mo->b = 4, mo->q = 6, mo->q2 = 24;
	} else return -1; // unknown preset
	return 0;
}
162
+
163
/*
 * Validate a combination of indexing (io) and mapping (mo) options.
 *
 * Checks run in the order written and the first violation wins: its message
 * is printed to stderr (when mm_verbose >= 1) and a negative code is
 * returned. Returns 0 when every check passes. The codes are not unique per
 * check: -1, -4 and -5 are each returned by more than one test, so callers
 * can rely only on "negative means invalid".
 *
 * '-N 0' is accepted but triggers a warning at mm_verbose >= 2.
 */
int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo)
{
	// bandwidth: bw must not exceed bw_long
	if (mo->bw > mo->bw_long) {
		if (mm_verbose >= 1)
			fprintf(stderr, "[ERROR]\033[1;31m with '-rNUM1,NUM2', NUM1 (%d) can't be larger than NUM2 (%d)\033[0m\n", mo->bw, mo->bw_long);
		return -8;
	}
	// RMQ chaining is incompatible with short-read and splice modes
	if ((mo->flag & MM_F_RMQ) && (mo->flag & (MM_F_SR|MM_F_SPLICE))) {
		if (mm_verbose >= 1)
			fprintf(stderr, "[ERROR]\033[1;31m --rmq doesn't work with --sr or --splice\033[0m\n");
		return -7;
	}
	// cs/MD output cannot be combined with a split prefix
	if (mo->split_prefix && (mo->flag & (MM_F_OUT_CS|MM_F_OUT_MD))) {
		if (mm_verbose >= 1)
			fprintf(stderr, "[ERROR]\033[1;31m --cs or --MD doesn't work with --split-prefix\033[0m\n");
		return -6;
	}
	if (io->k <= 0 || io->w <= 0) {
		if (mm_verbose >= 1)
			fprintf(stderr, "[ERROR]\033[1;31m -k and -w must be positive\033[0m\n");
		return -5;
	}
	if (mo->best_n < 0) {
		if (mm_verbose >= 1)
			fprintf(stderr, "[ERROR]\033[1;31m -N must be no less than 0\033[0m\n");
		return -4;
	}
	// warning only: execution continues with the remaining checks
	if (mo->best_n == 0 && mm_verbose >= 2)
		fprintf(stderr, "[WARNING]\033[1;31m '-N 0' reduces mapping accuracy. Please use '--secondary=no' instead.\033[0m\n");
	if (mo->pri_ratio < 0.0f || mo->pri_ratio > 1.0f) {
		if (mm_verbose >= 1)
			fprintf(stderr, "[ERROR]\033[1;31m -p must be within 0 and 1 (including 0 and 1)\033[0m\n");
		return -4;
	}
	if ((mo->flag & MM_F_FOR_ONLY) && (mo->flag & MM_F_REV_ONLY)) {
		if (mm_verbose >= 1)
			fprintf(stderr, "[ERROR]\033[1;31m --for-only and --rev-only can't be applied at the same time\033[0m\n");
		return -3;
	}
	// gap open (q) and gap extension (e) penalties must both be positive
	if (mo->e <= 0 || mo->q <= 0) {
		if (mm_verbose >= 1)
			fprintf(stderr, "[ERROR]\033[1;31m -O and -E must be positive\033[0m\n");
		return -1;
	}
	// when the two gap cost pairs differ, require e > e2 and q + e < q2 + e2
	if ((mo->q != mo->q2 || mo->e != mo->e2) && !(mo->e > mo->e2 && mo->q + mo->e < mo->q2 + mo->e2)) {
		if (mm_verbose >= 1)
			fprintf(stderr, "[ERROR]\033[1;31m dual gap penalties violating E1>E2 and O1+E1<O2+E2\033[0m\n");
		return -2;
	}
	// the combined gap penalties must not exceed 127
	if ((mo->q + mo->e) + (mo->q2 + mo->e2) > 127) {
		if (mm_verbose >= 1)
			fprintf(stderr, "[ERROR]\033[1;31m scoring system violating ({-O}+{-E})+({-O2}+{-E2}) <= 127\033[0m\n");
		return -1;
	}
	if (mo->zdrop < mo->zdrop_inv) {
		if (mm_verbose >= 1)
			fprintf(stderr, "[ERROR]\033[1;31m Z-drop should not be less than inversion-Z-drop\033[0m\n");
		return -5;
	}
	if ((mo->flag & MM_F_NO_PRINT_2ND) && (mo->flag & MM_F_ALL_CHAINS)) {
		if (mm_verbose >= 1)
			fprintf(stderr, "[ERROR]\033[1;31m -X/-P and --secondary=no can't be applied at the same time\033[0m\n");
		return -5;
	}
	// query-strand mode excludes SAM output, splice mode, fragment mode and HPC indexing
	if ((mo->flag & MM_F_QSTRAND) && ((mo->flag & (MM_F_OUT_SAM|MM_F_SPLICE|MM_F_FRAG_MODE)) || (io->flag & MM_I_HPC))) {
		if (mm_verbose >= 1)
			fprintf(stderr, "[ERROR]\033[1;31m --qstrand doesn't work with -a, -H, --frag or --splice\033[0m\n");
		return -5;
	}
	return 0;
}
data/ext/minimap2/pe.c ADDED
@@ -0,0 +1,177 @@
1
+ #include <stdlib.h>
2
+ #include <math.h>
3
+ #include "mmpriv.h"
4
+ #include "kvec.h"
5
+
6
+ void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r)
7
+ {
8
+ if (pri_ratio > 0.0f && *n_ > 0) {
9
+ int i, k, n = *n_, n_2nd = 0;
10
+ int max_dist = n_segs == 2? qlens[0] + qlens[1] + max_gap_ref : 0;
11
+ for (i = k = 0; i < n; ++i) {
12
+ int to_keep = 0;
13
+ if (r[i].parent == i) { // primary
14
+ to_keep = 1;
15
+ } else if (r[i].score + min_diff >= r[r[i].parent].score) {
16
+ to_keep = 1;
17
+ } else {
18
+ mm_reg1_t *p = &r[r[i].parent], *q = &r[i];
19
+ if (p->rev == q->rev && p->rid == q->rid && q->re - p->rs < max_dist && p->re - q->rs < max_dist) { // child and parent are close on the ref
20
+ if (q->score >= p->score * pri1)
21
+ to_keep = 1;
22
+ } else {
23
+ int is_par_both = (n_segs == 2 && p->qs < qlens[0] && p->qe > qlens[0]);
24
+ int is_chi_both = (n_segs == 2 && q->qs < qlens[0] && q->qe > qlens[0]);
25
+ if (is_chi_both || is_chi_both == is_par_both) {
26
+ if (q->score >= p->score * pri_ratio)
27
+ to_keep = 1;
28
+ } else { // the remaining case: is_chi_both == 0 && is_par_both == 1
29
+ if (q->score >= p->score * pri2)
30
+ to_keep = 1;
31
+ }
32
+ }
33
+ }
34
+ if (to_keep && r[i].parent != i) {
35
+ if (n_2nd++ >= best_n) to_keep = 0; // don't keep if there are too many secondary hits
36
+ }
37
+ if (to_keep) r[k++] = r[i];
38
+ else if (r[i].p) free(r[i].p);
39
+ }
40
+ if (k != n) mm_sync_regs(km, k, r); // removing hits requires sync()
41
+ *n_ = k;
42
+ }
43
+ }
44
+
45
+ void mm_set_pe_thru(const int *qlens, int *n_regs, mm_reg1_t **regs)
46
+ {
47
+ int s, i, n_pri[2], pri[2];
48
+ n_pri[0] = n_pri[1] = 0;
49
+ pri[0] = pri[1] = -1;
50
+ for (s = 0; s < 2; ++s)
51
+ for (i = 0; i < n_regs[s]; ++i)
52
+ if (regs[s][i].id == regs[s][i].parent)
53
+ ++n_pri[s], pri[s] = i;
54
+ if (n_pri[0] == 1 && n_pri[1] == 1) {
55
+ mm_reg1_t *p = &regs[0][pri[0]];
56
+ mm_reg1_t *q = &regs[1][pri[1]];
57
+ if (p->rid == q->rid && p->rev == q->rev && abs(p->rs - q->rs) < 3 && abs(p->re - q->re) < 3
58
+ && ((p->qs == 0 && qlens[1] - q->qe == 0) || (q->qs == 0 && qlens[0] - p->qe == 0)))
59
+ {
60
+ p->pe_thru = q->pe_thru = 1;
61
+ }
62
+ }
63
+ }
64
+
65
+ #include "ksort.h"
66
+
67
/* One entry per hit of either segment, used by mm_pair() to sort hits by
 * reference coordinate. */
typedef struct {
	int s, rev; // segment index (0 or 1) and the hit's strand flag
	uint64_t key; // sort key: rid in the upper 32 bits, then rs<<1, then (s ^ rev) in bit 0
	mm_reg1_t *r; // the hit itself (points into the caller's regs array; not owned)
} pair_arr_t;

/* Instantiates radix_sort_pair(), which sorts pair_arr_t arrays by .key. */
#define sort_key_pair(a) ((a).key)
KRADIX_SORT_INIT(pair, pair_arr_t, sort_key_pair, 8)
75
+
76
+ void mm_pair(void *km, int max_gap_ref, int pe_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs)
77
+ {
78
+ int i, j, s, n, last[2], dp_thres, segs = 0, max_idx[2];
79
+ int64_t max;
80
+ pair_arr_t *a;
81
+ kvec_t(uint64_t) sc = {0,0,0};
82
+
83
+ a = (pair_arr_t*)kmalloc(km, (n_regs[0] + n_regs[1]) * sizeof(pair_arr_t));
84
+ for (s = n = 0, dp_thres = 0; s < 2; ++s) {
85
+ int max = 0;
86
+ for (i = 0; i < n_regs[s]; ++i) {
87
+ a[n].s = s;
88
+ a[n].r = &regs[s][i];
89
+ a[n].rev = a[n].r->rev;
90
+ a[n].key = (uint64_t)a[n].r->rid << 32 | a[n].r->rs<<1 | (s^a[n].rev);
91
+ max = max > a[n].r->p->dp_max? max : a[n].r->p->dp_max;
92
+ ++n;
93
+ segs |= 1<<s;
94
+ }
95
+ dp_thres += max;
96
+ }
97
+ if (segs != 3) {
98
+ kfree(km, a); // only one end is mapped
99
+ return;
100
+ }
101
+ dp_thres -= pe_bonus;
102
+ if (dp_thres < 0) dp_thres = 0;
103
+ radix_sort_pair(a, a + n);
104
+
105
+ max = -1;
106
+ max_idx[0] = max_idx[1] = -1;
107
+ last[0] = last[1] = -1;
108
+ kv_resize(uint64_t, km, sc, (size_t)n);
109
+ for (i = 0; i < n; ++i) {
110
+ if (a[i].key & 1) { // reverse first read or forward second read
111
+ mm_reg1_t *q, *r;
112
+ if (last[a[i].rev] < 0) continue;
113
+ r = a[i].r;
114
+ q = a[last[a[i].rev]].r;
115
+ if (r->rid != q->rid || r->rs - q->re > max_gap_ref) continue;
116
+ for (j = last[a[i].rev]; j >= 0; --j) {
117
+ int64_t score;
118
+ if (a[j].rev != a[i].rev || a[j].s == a[i].s) continue;
119
+ q = a[j].r;
120
+ if (r->rid != q->rid || r->rs - q->re > max_gap_ref) break;
121
+ if (r->p->dp_max + q->p->dp_max < dp_thres) continue;
122
+ score = (int64_t)(r->p->dp_max + q->p->dp_max) << 32 | (r->hash + q->hash);
123
+ if (score > max)
124
+ max = score, max_idx[a[j].s] = j, max_idx[a[i].s] = i;
125
+ kv_push(uint64_t, km, sc, score);
126
+ }
127
+ } else { // forward first read or reverse second read
128
+ last[a[i].rev] = i;
129
+ }
130
+ }
131
+ if (sc.n > 1)
132
+ radix_sort_64(sc.a, sc.a + sc.n);
133
+
134
+ if (sc.n > 0 && max > 0) { // found at least one pair
135
+ int n_sub = 0, mapq_pe;
136
+ mm_reg1_t *r[2];
137
+ r[0] = a[max_idx[0]].r, r[1] = a[max_idx[1]].r;
138
+ r[0]->proper_frag = r[1]->proper_frag = 1;
139
+ for (s = 0; s < 2; ++s) {
140
+ if (r[s]->id != r[s]->parent) { // then lift to primary and update parent
141
+ mm_reg1_t *p = &regs[s][r[s]->parent];
142
+ for (i = 0; i < n_regs[s]; ++i)
143
+ if (regs[s][i].parent == p->id)
144
+ regs[s][i].parent = r[s]->id;
145
+ p->mapq = 0;
146
+ }
147
+ if (!r[s]->sam_pri) { // then sync sam_pri
148
+ for (i = 0; i < n_regs[s]; ++i)
149
+ regs[s][i].sam_pri = 0;
150
+ r[s]->sam_pri = 1;
151
+ }
152
+ }
153
+ mapq_pe = r[0]->mapq > r[1]->mapq? r[0]->mapq : r[1]->mapq;
154
+ for (i = 0; i < (int)sc.n; ++i)
155
+ if ((sc.a[i]>>32) + sub_diff >= (uint64_t)max>>32)
156
+ ++n_sub;
157
+ if (sc.n > 1) {
158
+ int mapq_pe_alt;
159
+ mapq_pe_alt = (int)(6.02f * ((max>>32) - (sc.a[sc.n - 2]>>32)) / match_sc - 4.343f * logf(n_sub)); // n_sub > 0 because it counts the optimal, too
160
+ mapq_pe = mapq_pe < mapq_pe_alt? mapq_pe : mapq_pe_alt;
161
+ }
162
+ if (r[0]->mapq < mapq_pe) r[0]->mapq = (int)(.2f * r[0]->mapq + .8f * mapq_pe + .499f);
163
+ if (r[1]->mapq < mapq_pe) r[1]->mapq = (int)(.2f * r[1]->mapq + .8f * mapq_pe + .499f);
164
+ if (sc.n == 1) {
165
+ if (r[0]->mapq < 2) r[0]->mapq = 2;
166
+ if (r[1]->mapq < 2) r[1]->mapq = 2;
167
+ } else if ((uint64_t)max>>32 > sc.a[sc.n - 2]>>32) {
168
+ if (r[0]->mapq < 1) r[0]->mapq = 1;
169
+ if (r[1]->mapq < 1) r[1]->mapq = 1;
170
+ }
171
+ }
172
+
173
+ kfree(km, a);
174
+ kfree(km, sc.a);
175
+
176
+ mm_set_pe_thru(qlens, n_regs, regs);
177
+ }