minimap2 0.2.22.0 → 0.2.24.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +60 -76
- data/ext/Rakefile +55 -0
- data/ext/cmappy/cmappy.c +129 -0
- data/ext/cmappy/cmappy.h +44 -0
- data/ext/minimap2/FAQ.md +46 -0
- data/ext/minimap2/LICENSE.txt +24 -0
- data/ext/minimap2/MANIFEST.in +10 -0
- data/ext/minimap2/Makefile +132 -0
- data/ext/minimap2/Makefile.simde +97 -0
- data/ext/minimap2/NEWS.md +821 -0
- data/ext/minimap2/README.md +403 -0
- data/ext/minimap2/align.c +1020 -0
- data/ext/minimap2/bseq.c +169 -0
- data/ext/minimap2/bseq.h +64 -0
- data/ext/minimap2/code_of_conduct.md +30 -0
- data/ext/minimap2/cookbook.md +243 -0
- data/ext/minimap2/esterr.c +64 -0
- data/ext/minimap2/example.c +63 -0
- data/ext/minimap2/format.c +559 -0
- data/ext/minimap2/hit.c +466 -0
- data/ext/minimap2/index.c +775 -0
- data/ext/minimap2/kalloc.c +205 -0
- data/ext/minimap2/kalloc.h +76 -0
- data/ext/minimap2/kdq.h +132 -0
- data/ext/minimap2/ketopt.h +120 -0
- data/ext/minimap2/khash.h +615 -0
- data/ext/minimap2/krmq.h +474 -0
- data/ext/minimap2/kseq.h +256 -0
- data/ext/minimap2/ksort.h +153 -0
- data/ext/minimap2/ksw2.h +184 -0
- data/ext/minimap2/ksw2_dispatch.c +96 -0
- data/ext/minimap2/ksw2_extd2_sse.c +402 -0
- data/ext/minimap2/ksw2_exts2_sse.c +416 -0
- data/ext/minimap2/ksw2_extz2_sse.c +313 -0
- data/ext/minimap2/ksw2_ll_sse.c +152 -0
- data/ext/minimap2/kthread.c +159 -0
- data/ext/minimap2/kthread.h +15 -0
- data/ext/minimap2/kvec.h +105 -0
- data/ext/minimap2/lchain.c +369 -0
- data/ext/minimap2/main.c +459 -0
- data/ext/minimap2/map.c +714 -0
- data/ext/minimap2/minimap.h +410 -0
- data/ext/minimap2/minimap2.1 +725 -0
- data/ext/minimap2/misc/README.md +179 -0
- data/ext/minimap2/misc/mmphase.js +335 -0
- data/ext/minimap2/misc/paftools.js +3149 -0
- data/ext/minimap2/misc.c +162 -0
- data/ext/minimap2/mmpriv.h +132 -0
- data/ext/minimap2/options.c +234 -0
- data/ext/minimap2/pe.c +177 -0
- data/ext/minimap2/python/README.rst +196 -0
- data/ext/minimap2/python/cmappy.h +152 -0
- data/ext/minimap2/python/cmappy.pxd +153 -0
- data/ext/minimap2/python/mappy.pyx +273 -0
- data/ext/minimap2/python/minimap2.py +39 -0
- data/ext/minimap2/sdust.c +213 -0
- data/ext/minimap2/sdust.h +25 -0
- data/ext/minimap2/seed.c +131 -0
- data/ext/minimap2/setup.py +55 -0
- data/ext/minimap2/sketch.c +143 -0
- data/ext/minimap2/splitidx.c +84 -0
- data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
- data/ext/minimap2/test/MT-human.fa +278 -0
- data/ext/minimap2/test/MT-orang.fa +276 -0
- data/ext/minimap2/test/q-inv.fa +4 -0
- data/ext/minimap2/test/q2.fa +2 -0
- data/ext/minimap2/test/t-inv.fa +127 -0
- data/ext/minimap2/test/t2.fa +2 -0
- data/ext/minimap2/tex/Makefile +21 -0
- data/ext/minimap2/tex/bioinfo.cls +930 -0
- data/ext/minimap2/tex/blasr-mc.eval +17 -0
- data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
- data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
- data/ext/minimap2/tex/bwa.eval +55 -0
- data/ext/minimap2/tex/eval2roc.pl +33 -0
- data/ext/minimap2/tex/graphmap.eval +4 -0
- data/ext/minimap2/tex/hs38-simu.sh +10 -0
- data/ext/minimap2/tex/minialign.eval +49 -0
- data/ext/minimap2/tex/minimap2.bib +460 -0
- data/ext/minimap2/tex/minimap2.tex +724 -0
- data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
- data/ext/minimap2/tex/mm2-update.tex +240 -0
- data/ext/minimap2/tex/mm2.approx.eval +12 -0
- data/ext/minimap2/tex/mm2.eval +13 -0
- data/ext/minimap2/tex/natbib.bst +1288 -0
- data/ext/minimap2/tex/natbib.sty +803 -0
- data/ext/minimap2/tex/ngmlr.eval +38 -0
- data/ext/minimap2/tex/roc.gp +60 -0
- data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
- data/ext/minimap2.patch +19 -0
- data/lib/minimap2/aligner.rb +4 -4
- data/lib/minimap2/alignment.rb +11 -11
- data/lib/minimap2/ffi/constants.rb +20 -16
- data/lib/minimap2/ffi/functions.rb +5 -0
- data/lib/minimap2/ffi.rb +4 -5
- data/lib/minimap2/version.rb +2 -2
- data/lib/minimap2.rb +51 -15
- metadata +97 -79
- data/lib/minimap2/ffi_helper.rb +0 -53
- data/vendor/libminimap2.so +0 -0
data/ext/minimap2/misc.c
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include "mmpriv.h"
|
3
|
+
|
4
|
+
/* Global verbosity level. Code in this project prints [ERROR] messages when it
 * is >= 1, [WARNING] messages when it is >= 2 and progress messages when it
 * is >= 3. */
int mm_verbose = 1;
|
5
|
+
/* Debugging switches; presumably a bitmask of the MM_DBG_* constants
 * (TODO confirm against the users of this variable). */
int mm_dbg_flag = 0;
|
6
|
+
/* Reference wall-clock time that log messages subtract from realtime() to
 * report elapsed time. It is not assigned in this file; presumably the
 * program entry point sets it at start-up (TODO confirm). */
double mm_realtime0;
|
7
|
+
|
8
|
+
#if defined(WIN32) || defined(_WIN32)
|
9
|
+
#include <windows.h>
|
10
|
+
|
11
|
+
/* Windows-only stand-in for struct timezone so that the gettimeofday()
 * replacement below can keep the usual two-argument signature. That
 * replacement never reads or writes this structure. */
struct timezone
|
12
|
+
{
|
13
|
+
__int32 tz_minuteswest; /* minutes W of Greenwich */
|
14
|
+
int tz_dsttime; /* type of dst correction */
|
15
|
+
};
|
16
|
+
|
17
|
+
/*
|
18
|
+
* gettimeofday.c
|
19
|
+
* Win32 gettimeofday() replacement
|
20
|
+
* taken from PostgreSQL, according to
|
21
|
+
* https://stackoverflow.com/questions/1676036/what-should-i-use-to-replace-gettimeofday-on-windows
|
22
|
+
*
|
23
|
+
* src/port/gettimeofday.c
|
24
|
+
*
|
25
|
+
* Copyright (c) 2003 SRA, Inc.
|
26
|
+
* Copyright (c) 2003 SKC, Inc.
|
27
|
+
*
|
28
|
+
* Permission to use, copy, modify, and distribute this software and
|
29
|
+
* its documentation for any purpose, without fee, and without a
|
30
|
+
* written agreement is hereby granted, provided that the above
|
31
|
+
* copyright notice and this paragraph and the following two
|
32
|
+
* paragraphs appear in all copies.
|
33
|
+
*
|
34
|
+
* IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT,
|
35
|
+
* INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
|
36
|
+
* LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
|
37
|
+
* DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED
|
38
|
+
* OF THE POSSIBILITY OF SUCH DAMAGE.
|
39
|
+
*
|
40
|
+
* THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
|
41
|
+
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
42
|
+
* A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS
|
43
|
+
* IS" BASIS, AND THE AUTHOR HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE,
|
44
|
+
* SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
|
45
|
+
*/
|
46
|
+
|
47
|
+
/* FILETIME of Jan 1 1970 00:00:00. */
|
48
|
+
static const unsigned __int64 epoch = ((unsigned __int64) 116444736000000000ULL);
|
49
|
+
|
50
|
+
/*
|
51
|
+
* timezone information is stored outside the kernel so tzp isn't used anymore.
|
52
|
+
*
|
53
|
+
* Note: this function is not for Win32 high precision timing purpose. See
|
54
|
+
* elapsed_time().
|
55
|
+
*/
|
56
|
+
int gettimeofday(struct timeval * tp, struct timezone *tzp)
|
57
|
+
{
|
58
|
+
FILETIME file_time;
|
59
|
+
SYSTEMTIME system_time;
|
60
|
+
ULARGE_INTEGER ularge;
|
61
|
+
|
62
|
+
GetSystemTime(&system_time);
|
63
|
+
SystemTimeToFileTime(&system_time, &file_time);
|
64
|
+
ularge.LowPart = file_time.dwLowDateTime;
|
65
|
+
ularge.HighPart = file_time.dwHighDateTime;
|
66
|
+
|
67
|
+
tp->tv_sec = (long) ((ularge.QuadPart - epoch) / 10000000L);
|
68
|
+
tp->tv_usec = (long) (system_time.wMilliseconds * 1000);
|
69
|
+
|
70
|
+
return 0;
|
71
|
+
}
|
72
|
+
|
73
|
+
// taken from https://stackoverflow.com/questions/5272470/c-get-cpu-usage-on-linux-and-windows
|
74
|
+
double cputime()
|
75
|
+
{
|
76
|
+
HANDLE hProcess = GetCurrentProcess();
|
77
|
+
FILETIME ftCreation, ftExit, ftKernel, ftUser;
|
78
|
+
SYSTEMTIME stKernel;
|
79
|
+
SYSTEMTIME stUser;
|
80
|
+
|
81
|
+
GetProcessTimes(hProcess, &ftCreation, &ftExit, &ftKernel, &ftUser);
|
82
|
+
FileTimeToSystemTime(&ftKernel, &stKernel);
|
83
|
+
FileTimeToSystemTime(&ftUser, &stUser);
|
84
|
+
|
85
|
+
double kernelModeTime = ((stKernel.wHour * 60.) + stKernel.wMinute * 60.) + stKernel.wSecond * 1. + stKernel.wMilliseconds / 1000.;
|
86
|
+
double userModeTime = ((stUser.wHour * 60.) + stUser.wMinute * 60.) + stUser.wSecond * 1. + stUser.wMilliseconds / 1000.;
|
87
|
+
|
88
|
+
return kernelModeTime + userModeTime;
|
89
|
+
}
|
90
|
+
|
91
|
+
/* Peak resident set size is not measured in the Windows build; this stub
 * always reports 0. */
long peakrss(void)
{
	return 0;
}
|
92
|
+
#else
|
93
|
+
#include <sys/resource.h>
|
94
|
+
#include <sys/time.h>
|
95
|
+
|
96
|
+
/*
 * CPU time consumed so far by the calling process.
 *
 * Returns user plus system time, in seconds, as reported by
 * getrusage(RUSAGE_SELF).
 */
double cputime(void)
{
	struct rusage usage;

	getrusage(RUSAGE_SELF, &usage);
	/* Whole seconds and microseconds are each summed as integers first; only
	 * the microsecond total is scaled into seconds. */
	return usage.ru_utime.tv_sec + usage.ru_stime.tv_sec
	     + 1e-6 * (usage.ru_utime.tv_usec + usage.ru_stime.tv_usec);
}
|
102
|
+
|
103
|
+
/*
 * Peak resident set size of the calling process.
 *
 * Returns ru_maxrss from getrusage(RUSAGE_SELF), multiplied by 1024 when
 * built for Linux (where getrusage(2) documents the field in kilobytes) and
 * unchanged on every other platform.
 */
long peakrss(void)
{
	struct rusage usage;
	long max_rss;

	getrusage(RUSAGE_SELF, &usage);
	max_rss = usage.ru_maxrss;
#ifdef __linux__
	max_rss *= 1024;
#endif
	return max_rss;
}
|
113
|
+
|
114
|
+
#endif /* WIN32 || _WIN32 */
|
115
|
+
|
116
|
+
/*
 * Current wall-clock time.
 *
 * Returns the seconds and microseconds reported by gettimeofday(), combined
 * into a single value in seconds.
 */
double realtime(void)
{
	struct timeval now;

	gettimeofday(&now, NULL);
	return now.tv_sec + now.tv_usec * 1e-6;
}
|
122
|
+
|
123
|
+
/*
 * Write a line to stdout with puts(), terminating the program on failure.
 *
 * str  NUL-terminated string; puts() appends the newline.
 *
 * If puts() reports EOF, a diagnostic is printed with perror() and the
 * process exits with EXIT_FAILURE.
 */
void mm_err_puts(const char *str)
{
	if (puts(str) != EOF)
		return;
	perror("[ERROR] failed to write the results");
	exit(EXIT_FAILURE);
}
|
132
|
+
|
133
|
+
/*
 * fwrite() wrapper that terminates the program when the data cannot be
 * written in full.
 *
 * p       data to write
 * size    size of one item in bytes
 * nitems  number of items to write
 * fp      destination stream
 *
 * fwrite() returns the number of items written and signals failure with a
 * short count, never with EOF. The previous check (result stored in an int
 * and compared with EOF) could therefore not detect a failed write. A request
 * for zero bytes is a no-op and is not treated as an error.
 */
void mm_err_fwrite(const void *p, size_t size, size_t nitems, FILE *fp)
{
	size_t n_written;

	if (size == 0 || nitems == 0)
		return; /* fwrite() returns 0 here, which is not a failure */
	n_written = fwrite(p, size, nitems, fp);
	if (n_written != nitems) {
		perror("[ERROR] failed to write data");
		exit(EXIT_FAILURE);
	}
}
|
142
|
+
|
143
|
+
/*
 * fread() wrapper that terminates the program when the requested data cannot
 * be read in full.
 *
 * p       destination buffer, at least size * nitems bytes
 * size    size of one item in bytes
 * nitems  number of items to read
 * fp      source stream
 *
 * fread() returns the number of items read and signals both I/O errors and
 * premature end-of-file with a short count, never with EOF. The previous
 * check (result stored in an int and compared with EOF) let truncated input
 * through with the buffer only partly filled. A request for zero bytes is a
 * no-op and is not treated as an error.
 */
void mm_err_fread(void *p, size_t size, size_t nitems, FILE *fp)
{
	size_t n_read;

	if (size == 0 || nitems == 0)
		return; /* fread() returns 0 here, which is not a failure */
	n_read = fread(p, size, nitems, fp);
	if (n_read != nitems) {
		if (ferror(fp))
			perror("[ERROR] failed to read data");
		else /* errno is not meaningful for a plain end-of-file */
			fprintf(stderr, "[ERROR] failed to read data: unexpected end of file\n");
		exit(EXIT_FAILURE);
	}
}
|
152
|
+
|
153
|
+
#include "ksort.h"
|
154
|
+
|
155
|
+
/* Sort key for mm128_t elements: the .x member. */
#define sort_key_128x(a) ((a).x)
|
156
|
+
/* Instantiate the radix sort template from ksort.h for mm128_t keyed on .x;
 * presumably this defines radix_sort_128x() as declared in mmpriv.h, with 8
 * being the key size in bytes (TODO confirm against ksort.h). */
KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8)
|
157
|
+
|
158
|
+
/* Sort key for plain 64-bit integers: the value itself. */
#define sort_key_64(x) (x)
|
159
|
+
/* Same template for uint64_t; presumably defines radix_sort_64(). */
KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8)
|
160
|
+
|
161
|
+
/* Generic ksort.h instantiations for 32- and 64-bit unsigned integers;
 * presumably the source of ks_ksmall_uint32_t() declared in mmpriv.h. */
KSORT_INIT_GENERIC(uint32_t)
|
162
|
+
KSORT_INIT_GENERIC(uint64_t)
|
@@ -0,0 +1,132 @@
|
|
1
|
+
#ifndef MMPRIV2_H
|
2
|
+
#define MMPRIV2_H
|
3
|
+
|
4
|
+
#include <assert.h>
|
5
|
+
#include "minimap.h"
|
6
|
+
#include "bseq.h"
|
7
|
+
#include "kseq.h"
|
8
|
+
|
9
|
+
#define MM_PARENT_UNSET (-1)
|
10
|
+
#define MM_PARENT_TMP_PRI (-2)
|
11
|
+
|
12
|
+
#define MM_DBG_NO_KALLOC 0x1
|
13
|
+
#define MM_DBG_PRINT_QNAME 0x2
|
14
|
+
#define MM_DBG_PRINT_SEED 0x4
|
15
|
+
#define MM_DBG_PRINT_ALN_SEQ 0x8
|
16
|
+
#define MM_DBG_PRINT_CHAIN 0x10
|
17
|
+
|
18
|
+
#define MM_SEED_LONG_JOIN (1ULL<<40)
|
19
|
+
#define MM_SEED_IGNORE (1ULL<<41)
|
20
|
+
#define MM_SEED_TANDEM (1ULL<<42)
|
21
|
+
#define MM_SEED_SELF (1ULL<<43)
|
22
|
+
|
23
|
+
#define MM_SEED_SEG_SHIFT 48
|
24
|
+
#define MM_SEED_SEG_MASK (0xffULL<<(MM_SEED_SEG_SHIFT))
|
25
|
+
|
26
|
+
#ifndef kroundup32
|
27
|
+
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
28
|
+
#endif
|
29
|
+
|
30
|
+
#define mm_seq4_set(s, i, c) ((s)[(i)>>3] |= (uint32_t)(c) << (((i)&7)<<2))
|
31
|
+
#define mm_seq4_get(s, i) ((s)[(i)>>3] >> (((i)&7)<<2) & 0xf)
|
32
|
+
|
33
|
+
#define MALLOC(type, len) ((type*)malloc((len) * sizeof(type)))
|
34
|
+
#define CALLOC(type, len) ((type*)calloc((len), sizeof(type)))
|
35
|
+
|
36
|
+
#ifdef __cplusplus
|
37
|
+
extern "C" {
|
38
|
+
#endif
|
39
|
+
|
40
|
+
/*
 * One seed record, as returned in arrays by mm_collect_matches().
 * NOTE(review): the field descriptions below are inferred from names and
 * from the mm_idx_get() prototype -- confirm against seed.c.
 */
typedef struct {
	uint32_t n;            // presumably the number of entries behind cr
	uint32_t q_pos;        // presumably the seed position on the query
	uint32_t q_span:31;    // presumably the seed length on the query
	uint32_t flt:1;        // 1-bit flag sharing a word with q_span
	uint32_t seg_id:31;    // presumably the index of the query segment
	uint32_t is_tandem:1;  // 1-bit flag sharing a word with seg_id
	const uint64_t *cr;    // read-only pointer to 64-bit entries
} mm_seed_t;
|
47
|
+
|
48
|
+
typedef struct {
|
49
|
+
int n_u, n_a;
|
50
|
+
uint64_t *u;
|
51
|
+
mm128_t *a;
|
52
|
+
} mm_seg_t;
|
53
|
+
|
54
|
+
double cputime(void);
|
55
|
+
double realtime(void);
|
56
|
+
long peakrss(void);
|
57
|
+
|
58
|
+
void radix_sort_128x(mm128_t *beg, mm128_t *end);
|
59
|
+
void radix_sort_64(uint64_t *beg, uint64_t *end);
|
60
|
+
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
|
61
|
+
|
62
|
+
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
|
63
|
+
|
64
|
+
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos);
|
65
|
+
void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac);
|
66
|
+
|
67
|
+
double mm_event_identity(const mm_reg1_t *r);
|
68
|
+
int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
|
69
|
+
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag);
|
70
|
+
void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len);
|
71
|
+
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
|
72
|
+
void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regs, const mm_reg1_t *const* regs, void *km, int64_t opt_flag);
|
73
|
+
void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regss, const mm_reg1_t *const* regss, void *km, int64_t opt_flag, int rep_len);
|
74
|
+
|
75
|
+
void mm_idxopt_init(mm_idxopt_t *opt);
|
76
|
+
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
|
77
|
+
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
|
78
|
+
int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
|
79
|
+
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
|
80
|
+
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand);
|
81
|
+
|
82
|
+
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale,
|
83
|
+
int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
|
84
|
+
mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
|
85
|
+
int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
|
86
|
+
mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
|
87
|
+
int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
|
88
|
+
|
89
|
+
void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r);
|
90
|
+
void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a, int is_qstrand);
|
91
|
+
void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs);
|
92
|
+
int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a);
|
93
|
+
int mm_set_sam_pri(int n, mm_reg1_t *r);
|
94
|
+
void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac);
|
95
|
+
void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int check_strand, int min_strand_sc, int *n_, mm_reg1_t *r);
|
96
|
+
void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r);
|
97
|
+
int mm_filter_strand_retained(int n_regs, mm_reg1_t *r);
|
98
|
+
void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs);
|
99
|
+
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac);
|
100
|
+
void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr);
|
101
|
+
void mm_update_dp_max(int qlen, int n_regs, mm_reg1_t *regs, float frac, int a, int b);
|
102
|
+
|
103
|
+
void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos);
|
104
|
+
|
105
|
+
mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a);
|
106
|
+
void mm_seg_free(void *km, int n_segs, mm_seg_t *segs);
|
107
|
+
void mm_pair(void *km, int max_gap_ref, int dp_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs);
|
108
|
+
|
109
|
+
FILE *mm_split_init(const char *prefix, const mm_idx_t *mi);
|
110
|
+
mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part);
|
111
|
+
int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_split_idx);
|
112
|
+
void mm_split_rm_tmp(const char *prefix, int n_splits);
|
113
|
+
|
114
|
+
void mm_err_puts(const char *str);
|
115
|
+
void mm_err_fwrite(const void *p, size_t size, size_t nitems, FILE *fp);
|
116
|
+
void mm_err_fread(void *p, size_t size, size_t nitems, FILE *fp);
|
117
|
+
|
118
|
+
/*
 * Fast approximate base-2 logarithm.
 *
 * x  a positive, normal, finite float. Zero, negative, subnormal, infinite
 *    and NaN inputs are not supported.
 *
 * Returns an approximation of log2(x); the quadratic used for the mantissa is
 * off by at most about 0.005 on the values checked.
 *
 * The result is the biased IEEE-754 exponent plus a quadratic fit over the
 * mantissa. The exponent is converted to a signed int before the bias is
 * removed: the earlier unsigned subtraction wrapped around for exponents
 * below the bias, which is why the function used to fail for x < 2. Results
 * for x >= 2 are unchanged.
 */
static inline float mg_log2(float x)
{
	union { float f; uint32_t i; } z = { x };
	float log_2 = (float)((int)((z.i >> 23) & 255) - 128); // signed, may be negative
	z.i &= ~(255u << 23); // clear the exponent bits ...
	z.i += 127u << 23;    // ... and set them so that z.f lies in [1,2)
	log_2 += (-0.34484843f * z.f + 2.02466578f) * z.f - 0.67487759f;
	return log_2;
}
|
127
|
+
|
128
|
+
#ifdef __cplusplus
|
129
|
+
}
|
130
|
+
#endif
|
131
|
+
|
132
|
+
#endif
|
@@ -0,0 +1,234 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <limits.h>
|
3
|
+
#include "mmpriv.h"
|
4
|
+
|
5
|
+
/*
 * Reset indexing options to the built-in defaults.
 *
 * opt  structure to initialise; every byte is cleared first, then the
 *      non-zero defaults are filled in.
 */
void mm_idxopt_init(mm_idxopt_t *opt)
{
	memset(opt, 0, sizeof(*opt));
	opt->k = 15;
	opt->w = 10;
	opt->flag = 0;
	opt->bucket_bits = 14;
	opt->mini_batch_size = 50000000;
	opt->batch_size = 4000000000ULL;
}
|
13
|
+
|
14
|
+
/*
 * Reset mapping options to the built-in defaults.
 *
 * opt  structure to initialise; every byte is cleared first, so any field not
 *      assigned below is left at zero.
 */
void mm_mapopt_init(mm_mapopt_t *opt)
{
	memset(opt, 0, sizeof(*opt));

	// seeding
	opt->seed = 11;
	opt->mid_occ_frac = 2e-4f;
	opt->min_mid_occ = 10;
	opt->max_mid_occ = 1000000;
	opt->sdust_thres = 0; // no SDUST masking
	opt->q_occ_frac = 0.01f;

	// chaining
	opt->min_cnt = 3;
	opt->min_chain_score = 40;
	opt->bw = 500;
	opt->bw_long = 20000;
	opt->max_gap = 5000;
	opt->max_gap_ref = -1;
	opt->max_chain_skip = 25;
	opt->max_chain_iter = 5000;
	opt->rmq_inner_dist = 1000;
	opt->rmq_size_cap = 100000;
	opt->rmq_rescue_size = 1000;
	opt->rmq_rescue_ratio = 0.1f;
	opt->chain_gap_scale = 0.8f;
	opt->chain_skip_scale = 0.0f;
	opt->max_max_occ = 4095;
	opt->occ_dist = 500;

	// hit selection
	opt->mask_level = 0.5f;
	opt->mask_len = INT_MAX;
	opt->pri_ratio = 0.8f;
	opt->best_n = 5;

	opt->alt_drop = 0.15f;

	// scoring and alignment
	opt->a = 2;
	opt->b = 4;
	opt->q = 4;
	opt->e = 2;
	opt->q2 = 24;
	opt->e2 = 1;
	opt->sc_ambi = 1;
	opt->zdrop = 400;
	opt->zdrop_inv = 200;
	opt->end_bonus = -1;
	opt->min_dp_max = opt->min_chain_score * opt->a; // uses the two defaults set above
	opt->min_ksw_len = 200;
	opt->anchor_ext_len = 20;
	opt->anchor_ext_shift = 6;
	opt->max_clip_ratio = 1.0f;
	opt->mini_batch_size = 500000000;
	opt->max_sw_mat = 100000000;
	opt->cap_kalloc = 1000000000;

	opt->rank_min_len = 500;
	opt->rank_frac = 0.9f;

	// paired-end
	opt->pe_ori = 0; // FF
	opt->pe_bonus = 33;
}
|
65
|
+
|
66
|
+
/*
 * Finalise mapping options once an index is available.
 *
 * opt  mapping options, updated in place
 * mi   index passed to mm_idx_cal_max_occ() when mid_occ has to be derived
 *
 * - Either splice-strand flag switches MM_F_SPLICE on.
 * - A non-positive mid_occ is replaced by mm_idx_cal_max_occ(mi,
 *   mid_occ_frac), raised to min_mid_occ and, when max_mid_occ is larger
 *   than min_mid_occ, capped at max_mid_occ.
 * - bw_long is raised to bw if it is smaller.
 * - With mm_verbose >= 3 the resulting mid_occ is logged to stderr.
 */
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
{
	if (opt->flag & (MM_F_SPLICE_FOR | MM_F_SPLICE_REV))
		opt->flag |= MM_F_SPLICE;

	if (opt->mid_occ <= 0) {
		opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac);
		if (opt->min_mid_occ > opt->mid_occ)
			opt->mid_occ = opt->min_mid_occ;
		// the upper cap only applies when it lies above the floor
		if (opt->max_mid_occ > opt->min_mid_occ && opt->max_mid_occ < opt->mid_occ)
			opt->mid_occ = opt->max_mid_occ;
	}

	if (opt->bw > opt->bw_long)
		opt->bw_long = opt->bw;

	if (mm_verbose < 3)
		return;
	fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ);
}
|
81
|
+
|
82
|
+
/*
 * Apply a maximum intron length to the mapping options.
 *
 * opt             mapping options, updated in place
 * max_intron_len  new limit; ignored unless positive
 *
 * Does nothing unless MM_F_SPLICE is set in opt->flag. Otherwise bw_long, bw
 * and max_gap_ref are all set to max_intron_len.
 */
void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len)
{
	if (max_intron_len <= 0 || !(opt->flag & MM_F_SPLICE))
		return;
	// same right-to-left order as a chained assignment
	opt->bw_long = max_intron_len;
	opt->bw = opt->bw_long;
	opt->max_gap_ref = opt->bw;
}
|
87
|
+
|
88
|
+
/*
 * Apply a named preset to the indexing and mapping options.
 *
 * preset  preset name, or NULL to load the built-in defaults via
 *         mm_idxopt_init() and mm_mapopt_init()
 * io      indexing options, updated in place
 * mo      mapping options, updated in place
 *
 * Returns 0 on success and -1 for an unknown preset.
 *
 * A non-NULL preset only overrides selected fields and does not reset the
 * structures first. For a name that starts with "asm" but is not asm5, asm10
 * or asm20, the settings shared by all asm presets have already been applied
 * by the time -1 is returned.
 */
int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
{
	if (preset == 0) {
		mm_idxopt_init(io);
		mm_mapopt_init(mo);
		return 0;
	}

	if (strcmp(preset, "map-ont") == 0) // this is the same as the default
		return 0;

	if (strcmp(preset, "ava-ont") == 0) {
		io->flag = 0;
		io->k = 15;
		io->w = 5;
		mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN;
		mo->min_chain_score = 100;
		mo->pri_ratio = 0.0f;
		mo->max_chain_skip = 25;
		mo->bw = mo->bw_long = 2000;
		mo->occ_dist = 0;
		return 0;
	}

	if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) {
		io->flag |= MM_I_HPC;
		io->k = 19;
		return 0;
	}

	if (strcmp(preset, "ava-pb") == 0) {
		io->flag |= MM_I_HPC;
		io->k = 19;
		io->w = 5;
		mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN;
		mo->min_chain_score = 100;
		mo->pri_ratio = 0.0f;
		mo->max_chain_skip = 25;
		mo->bw_long = mo->bw;
		mo->occ_dist = 0;
		return 0;
	}

	if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
		io->flag = 0;
		io->k = 19;
		io->w = 19;
		mo->max_gap = 10000;
		mo->a = 1;
		mo->b = 4;
		mo->q = 6;
		mo->q2 = 26;
		mo->e = 2;
		mo->e2 = 1;
		mo->occ_dist = 500;
		mo->min_mid_occ = 50;
		mo->max_mid_occ = 500;
		mo->min_dp_max = 200;
		return 0;
	}

	if (strncmp(preset, "asm", 3) == 0) {
		// settings shared by every asm* preset
		io->flag = 0;
		io->k = 19;
		io->w = 19;
		mo->bw = 1000;
		mo->bw_long = 100000;
		mo->max_gap = 10000;
		mo->flag |= MM_F_RMQ;
		mo->min_mid_occ = 50;
		mo->max_mid_occ = 500;
		mo->min_dp_max = 200;
		mo->best_n = 50;
		// scoring that differs between the variants
		if (strcmp(preset, "asm5") == 0) {
			mo->a = 1;
			mo->b = 19;
			mo->q = 39;
			mo->q2 = 81;
			mo->e = 3;
			mo->e2 = 1;
			mo->zdrop = mo->zdrop_inv = 200;
		} else if (strcmp(preset, "asm10") == 0) {
			mo->a = 1;
			mo->b = 9;
			mo->q = 16;
			mo->q2 = 41;
			mo->e = 2;
			mo->e2 = 1;
			mo->zdrop = mo->zdrop_inv = 200;
		} else if (strcmp(preset, "asm20") == 0) {
			mo->a = 1;
			mo->b = 4;
			mo->q = 6;
			mo->q2 = 26;
			mo->e = 2;
			mo->e2 = 1;
			mo->zdrop = mo->zdrop_inv = 200;
			io->w = 10;
		} else {
			return -1;
		}
		return 0;
	}

	if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) {
		io->flag = 0;
		io->k = 21;
		io->w = 11;
		mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT;
		mo->pe_ori = 0<<1|1; // FR
		mo->a = 2;
		mo->b = 8;
		mo->q = 12;
		mo->e = 2;
		mo->q2 = 24;
		mo->e2 = 1;
		mo->zdrop = mo->zdrop_inv = 100;
		mo->end_bonus = 10;
		mo->max_frag_len = 800;
		mo->max_gap = 100;
		mo->bw = mo->bw_long = 100;
		mo->pri_ratio = 0.5f;
		mo->min_cnt = 2;
		mo->min_chain_score = 25;
		mo->min_dp_max = 40;
		mo->best_n = 20;
		mo->mid_occ = 1000;
		mo->max_occ = 5000;
		mo->mini_batch_size = 50000000;
		return 0;
	}

	if (strncmp(preset, "splice", 6) == 0 || strcmp(preset, "cdna") == 0) {
		io->flag = 0;
		io->k = 15;
		io->w = 5;
		mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK;
		mo->max_sw_mat = 0;
		mo->max_gap = 2000;
		mo->max_gap_ref = mo->bw = mo->bw_long = 200000;
		mo->a = 1;
		mo->b = 2;
		mo->q = 2;
		mo->e = 1;
		mo->q2 = 32;
		mo->e2 = 0;
		mo->noncan = 9;
		mo->junc_bonus = 9;
		mo->zdrop = 200; // because mo->a is halved
		mo->zdrop_inv = 100;
		if (strcmp(preset, "splice:hq") == 0) {
			mo->junc_bonus = 5;
			mo->b = 4;
			mo->q = 6;
			mo->q2 = 24;
		}
		return 0;
	}

	return -1;
}
|
163
|
+
|
164
|
+
int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo)
|
165
|
+
{
|
166
|
+
if (mo->bw > mo->bw_long) {
|
167
|
+
if (mm_verbose >= 1)
|
168
|
+
fprintf(stderr, "[ERROR]\033[1;31m with '-rNUM1,NUM2', NUM1 (%d) can't be larger than NUM2 (%d)\033[0m\n", mo->bw, mo->bw_long);
|
169
|
+
return -8;
|
170
|
+
}
|
171
|
+
if ((mo->flag & MM_F_RMQ) && (mo->flag & (MM_F_SR|MM_F_SPLICE))) {
|
172
|
+
if (mm_verbose >= 1)
|
173
|
+
fprintf(stderr, "[ERROR]\033[1;31m --rmq doesn't work with --sr or --splice\033[0m\n");
|
174
|
+
return -7;
|
175
|
+
}
|
176
|
+
if (mo->split_prefix && (mo->flag & (MM_F_OUT_CS|MM_F_OUT_MD))) {
|
177
|
+
if (mm_verbose >= 1)
|
178
|
+
fprintf(stderr, "[ERROR]\033[1;31m --cs or --MD doesn't work with --split-prefix\033[0m\n");
|
179
|
+
return -6;
|
180
|
+
}
|
181
|
+
if (io->k <= 0 || io->w <= 0) {
|
182
|
+
if (mm_verbose >= 1)
|
183
|
+
fprintf(stderr, "[ERROR]\033[1;31m -k and -w must be positive\033[0m\n");
|
184
|
+
return -5;
|
185
|
+
}
|
186
|
+
if (mo->best_n < 0) {
|
187
|
+
if (mm_verbose >= 1)
|
188
|
+
fprintf(stderr, "[ERROR]\033[1;31m -N must be no less than 0\033[0m\n");
|
189
|
+
return -4;
|
190
|
+
}
|
191
|
+
if (mo->best_n == 0 && mm_verbose >= 2)
|
192
|
+
fprintf(stderr, "[WARNING]\033[1;31m '-N 0' reduces mapping accuracy. Please use '--secondary=no' instead.\033[0m\n");
|
193
|
+
if (mo->pri_ratio < 0.0f || mo->pri_ratio > 1.0f) {
|
194
|
+
if (mm_verbose >= 1)
|
195
|
+
fprintf(stderr, "[ERROR]\033[1;31m -p must be within 0 and 1 (including 0 and 1)\033[0m\n");
|
196
|
+
return -4;
|
197
|
+
}
|
198
|
+
if ((mo->flag & MM_F_FOR_ONLY) && (mo->flag & MM_F_REV_ONLY)) {
|
199
|
+
if (mm_verbose >= 1)
|
200
|
+
fprintf(stderr, "[ERROR]\033[1;31m --for-only and --rev-only can't be applied at the same time\033[0m\n");
|
201
|
+
return -3;
|
202
|
+
}
|
203
|
+
if (mo->e <= 0 || mo->q <= 0) {
|
204
|
+
if (mm_verbose >= 1)
|
205
|
+
fprintf(stderr, "[ERROR]\033[1;31m -O and -E must be positive\033[0m\n");
|
206
|
+
return -1;
|
207
|
+
}
|
208
|
+
if ((mo->q != mo->q2 || mo->e != mo->e2) && !(mo->e > mo->e2 && mo->q + mo->e < mo->q2 + mo->e2)) {
|
209
|
+
if (mm_verbose >= 1)
|
210
|
+
fprintf(stderr, "[ERROR]\033[1;31m dual gap penalties violating E1>E2 and O1+E1<O2+E2\033[0m\n");
|
211
|
+
return -2;
|
212
|
+
}
|
213
|
+
if ((mo->q + mo->e) + (mo->q2 + mo->e2) > 127) {
|
214
|
+
if (mm_verbose >= 1)
|
215
|
+
fprintf(stderr, "[ERROR]\033[1;31m scoring system violating ({-O}+{-E})+({-O2}+{-E2}) <= 127\033[0m\n");
|
216
|
+
return -1;
|
217
|
+
}
|
218
|
+
if (mo->zdrop < mo->zdrop_inv) {
|
219
|
+
if (mm_verbose >= 1)
|
220
|
+
fprintf(stderr, "[ERROR]\033[1;31m Z-drop should not be less than inversion-Z-drop\033[0m\n");
|
221
|
+
return -5;
|
222
|
+
}
|
223
|
+
if ((mo->flag & MM_F_NO_PRINT_2ND) && (mo->flag & MM_F_ALL_CHAINS)) {
|
224
|
+
if (mm_verbose >= 1)
|
225
|
+
fprintf(stderr, "[ERROR]\033[1;31m -X/-P and --secondary=no can't be applied at the same time\033[0m\n");
|
226
|
+
return -5;
|
227
|
+
}
|
228
|
+
if ((mo->flag & MM_F_QSTRAND) && ((mo->flag & (MM_F_OUT_SAM|MM_F_SPLICE|MM_F_FRAG_MODE)) || (io->flag & MM_I_HPC))) {
|
229
|
+
if (mm_verbose >= 1)
|
230
|
+
fprintf(stderr, "[ERROR]\033[1;31m --qstrand doesn't work with -a, -H, --frag or --splice\033[0m\n");
|
231
|
+
return -5;
|
232
|
+
}
|
233
|
+
return 0;
|
234
|
+
}
|