minimap2 0.2.27.0 → 0.2.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -0
  3. data/ext/cmappy/cmappy.c +3 -3
  4. data/ext/cmappy/cmappy.h +1 -1
  5. data/ext/minimap2/FAQ.md +1 -1
  6. data/ext/minimap2/Makefile +4 -3
  7. data/ext/minimap2/NEWS.md +68 -0
  8. data/ext/minimap2/README.md +30 -14
  9. data/ext/minimap2/align.c +136 -52
  10. data/ext/minimap2/cookbook.md +2 -2
  11. data/ext/minimap2/format.c +59 -5
  12. data/ext/minimap2/hit.c +14 -6
  13. data/ext/minimap2/index.c +304 -13
  14. data/ext/minimap2/jump.c +201 -0
  15. data/ext/minimap2/kalloc.h +8 -0
  16. data/ext/minimap2/ksw2.h +5 -2
  17. data/ext/minimap2/ksw2_dispatch.c +5 -5
  18. data/ext/minimap2/ksw2_exts2_sse.c +17 -6
  19. data/ext/minimap2/lchain.c +5 -5
  20. data/ext/minimap2/main.c +64 -12
  21. data/ext/minimap2/map.c +35 -8
  22. data/ext/minimap2/minimap.h +14 -3
  23. data/ext/minimap2/minimap2.1 +98 -46
  24. data/ext/minimap2/misc/README.md +2 -1
  25. data/ext/minimap2/misc/pafcluster.js +241 -0
  26. data/ext/minimap2/misc/paftools.js +17 -6
  27. data/ext/minimap2/mmpriv.h +25 -4
  28. data/ext/minimap2/options.c +36 -3
  29. data/ext/minimap2/python/cmappy.h +3 -3
  30. data/ext/minimap2/python/cmappy.pxd +5 -2
  31. data/ext/minimap2/python/mappy.pyx +20 -7
  32. data/ext/minimap2/python/minimap2.py +5 -3
  33. data/ext/minimap2/seed.c +2 -1
  34. data/ext/minimap2/setup.py +2 -2
  35. data/ext/minimap2.patch +2 -2
  36. data/lib/minimap2/aligner.rb +19 -12
  37. data/lib/minimap2/alignment.rb +1 -0
  38. data/lib/minimap2/ffi/constants.rb +10 -2
  39. data/lib/minimap2/ffi/functions.rb +145 -6
  40. data/lib/minimap2/ffi/mappy.rb +1 -1
  41. data/lib/minimap2/version.rb +1 -1
  42. data/lib/minimap2.rb +2 -2
  43. metadata +8 -7
  44. data/ext/minimap2/misc/mmphase.js +0 -335
@@ -14,6 +14,7 @@
14
14
  #define MM_DBG_PRINT_SEED 0x4
15
15
  #define MM_DBG_PRINT_ALN_SEQ 0x8
16
16
  #define MM_DBG_PRINT_CHAIN 0x10
17
+ #define MM_DBG_SEED_FREQ 0x20
17
18
 
18
19
  #define MM_SEED_LONG_JOIN (1ULL<<40)
19
20
  #define MM_SEED_IGNORE (1ULL<<41)
@@ -23,6 +24,9 @@
23
24
  #define MM_SEED_SEG_SHIFT 48
24
25
  #define MM_SEED_SEG_MASK (0xffULL<<(MM_SEED_SEG_SHIFT))
25
26
 
27
+ #define MM_JUNC_ANNO 0x1
28
+ #define MM_JUNC_MISC 0x2
29
+
26
30
  #ifndef kroundup32
27
31
  #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
28
32
  #endif
@@ -32,6 +36,7 @@
32
36
 
33
37
  #define MALLOC(type, len) ((type*)malloc((len) * sizeof(type)))
34
38
  #define CALLOC(type, len) ((type*)calloc((len), sizeof(type)))
39
+ #define REALLOC(type, ptr, cnt) ((type*)realloc((ptr), (cnt) * sizeof(type)))
35
40
 
36
41
  #ifdef __cplusplus
37
42
  extern "C" {
@@ -51,6 +56,12 @@ typedef struct {
51
56
  mm128_t *a;
52
57
  } mm_seg_t;
53
58
 
59
+ typedef struct {
60
+ int32_t off, off2, cnt;
61
+ int16_t strand;
62
+ uint16_t flag;
63
+ } mm_idx_jjump1_t;
64
+
54
65
  double cputime(void);
55
66
  double realtime(void);
56
67
  long peakrss(void);
@@ -68,19 +79,23 @@ double mm_event_identity(const mm_reg1_t *r);
68
79
  int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
69
80
  void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag);
70
81
  void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len);
82
+ void mm_write_paf4(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len, int n_seg, int seg_idx);
71
83
  void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
72
84
  void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regs, const mm_reg1_t *const* regs, void *km, int64_t opt_flag);
73
85
  void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regss, const mm_reg1_t *const* regss, void *km, int64_t opt_flag, int rep_len);
86
+ void mm_write_junc(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r);
74
87
 
88
+ // indexing related in index.c
75
89
  void mm_idxopt_init(mm_idxopt_t *opt);
76
90
  const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
77
91
  int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
78
92
  int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
79
- mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
80
93
  mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand);
94
+ int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
95
+ int mm_idx_jjump_read(mm_idx_t *mi, const char *fn, int flag, int min_sc);
96
+ const mm_idx_jjump1_t *mm_idx_jump_get(const mm_idx_t *db, int32_t cid, int32_t st, int32_t en, int32_t *n);
81
97
 
82
- mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale,
83
- int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
98
+ // chaining in lchain.c
84
99
  mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
85
100
  int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
86
101
  mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
@@ -97,8 +112,12 @@ void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int
97
112
  int mm_filter_strand_retained(int n_regs, mm_reg1_t *r);
98
113
  void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs);
99
114
  void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac);
100
- void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr);
115
+ void mm_set_mapq2(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr, int is_splice);
101
116
  void mm_update_dp_max(int qlen, int n_regs, mm_reg1_t *regs, float frac, int a, int b);
117
+ void mm_jump_split(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq, mm_reg1_t *r, int32_t ts_strand);
118
+
119
+ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
120
+ void mm_enlarge_cigar(mm_reg1_t *r, uint32_t n_cigar);
102
121
 
103
122
  void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos);
104
123
 
@@ -106,6 +125,8 @@ mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int
106
125
  void mm_seg_free(void *km, int n_segs, mm_seg_t *segs);
107
126
  void mm_pair(void *km, int max_gap_ref, int dp_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs);
108
127
 
128
+ void mm_jump_split(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq, mm_reg1_t *r, int32_t ts_strand);
129
+
109
130
  FILE *mm_split_init(const char *prefix, const mm_idx_t *mi);
110
131
  mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part);
111
132
  int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_split_idx);
@@ -55,13 +55,15 @@ void mm_mapopt_init(mm_mapopt_t *opt)
55
55
  opt->max_clip_ratio = 1.0f;
56
56
  opt->mini_batch_size = 500000000;
57
57
  opt->max_sw_mat = 100000000;
58
- opt->cap_kalloc = 1000000000;
58
+ opt->cap_kalloc = 500000000;
59
59
 
60
60
  opt->rank_min_len = 500;
61
61
  opt->rank_frac = 0.9f;
62
62
 
63
63
  opt->pe_ori = 0; // FF
64
64
  opt->pe_bonus = 33;
65
+
66
+ opt->jump_min_match = 3;
65
67
  }
66
68
 
67
69
  void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
@@ -114,6 +116,14 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
114
116
  mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1;
115
117
  mo->min_dp_max = 200;
116
118
  }
119
+ } else if (strcmp(preset, "lr:hqae") == 0) { // high-quality assembly evaluation
120
+ io->flag = 0, io->k = 25, io->w = 51;
121
+ mo->flag |= MM_F_RMQ;
122
+ mo->min_mid_occ = 50, mo->max_mid_occ = 500;
123
+ mo->rmq_inner_dist = 5000;
124
+ mo->occ_dist = 200;
125
+ mo->best_n = 100;
126
+ mo->chain_gap_scale = 5.0f;
117
127
  } else if (strcmp(preset, "map-iclr-prerender") == 0) {
118
128
  io->flag = 0, io->k = 15;
119
129
  mo->b = 6, mo->transition = 1;
@@ -156,7 +166,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
156
166
  mo->mid_occ = 1000;
157
167
  mo->max_occ = 5000;
158
168
  mo->mini_batch_size = 50000000;
159
- } else if (strncmp(preset, "splice", 6) == 0 || strcmp(preset, "cdna") == 0) {
169
+ } else if (strcmp(preset, "splice") == 0 || strcmp(preset, "splice:hq") == 0 || strcmp(preset, "splice:sr") == 0 || strcmp(preset, "cdna") == 0) {
160
170
  io->flag = 0, io->k = 15, io->w = 5;
161
171
  mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK;
162
172
  mo->max_sw_mat = 0;
@@ -164,13 +174,31 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
164
174
  mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0;
165
175
  mo->noncan = 9;
166
176
  mo->junc_bonus = 9;
177
+ mo->junc_pen = 5;
167
178
  mo->zdrop = 200, mo->zdrop_inv = 100; // because mo->a is halved
168
- if (strcmp(preset, "splice:hq") == 0)
179
+ if (strcmp(preset, "splice:hq") == 0) {
169
180
  mo->noncan = 5, mo->b = 4, mo->q = 6, mo->q2 = 24;
181
+ } else if (strcmp(preset, "splice:sr") == 0) {
182
+ mo->flag |= MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT | MM_F_FRAG_MODE | MM_F_WEAK_PAIRING | MM_F_SR_RNA;
183
+ mo->noncan = 5, mo->b = 4, mo->q = 6, mo->q2 = 24;
184
+ mo->min_chain_score = 25;
185
+ mo->min_dp_max = 40;
186
+ mo->min_ksw_len = 20;
187
+ mo->pe_ori = 0<<1|1; // FR
188
+ mo->best_n = 10;
189
+ mo->mini_batch_size = 100000000;
190
+ }
170
191
  } else return -1;
171
192
  return 0;
172
193
  }
173
194
 
195
+ int mm_max_spsc_bonus(const mm_mapopt_t *mo)
196
+ {
197
+ int max_sc = (mo->q2 + 1) / 2 - 1;
198
+ max_sc = max_sc > mo->q2 - mo->q? max_sc : mo->q2 - mo->q;
199
+ return max_sc;
200
+ }
201
+
174
202
  int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo)
175
203
  {
176
204
  if (mo->bw > mo->bw_long) {
@@ -225,6 +253,11 @@ int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo)
225
253
  fprintf(stderr, "[ERROR]\033[1;31m scoring system violating ({-O}+{-E})+({-O2}+{-E2}) <= 127\033[0m\n");
226
254
  return -1;
227
255
  }
256
+ if (mo->sc_ambi < 0 || mo->sc_ambi >= mo->b) {
257
+ if (mm_verbose >= 1)
258
+ fprintf(stderr, "[ERROR]\033[1;31m --score-N should be within [0,{-B})\033[0m\n");
259
+ return -1;
260
+ }
228
261
  if (mo->zdrop < mo->zdrop_inv) {
229
262
  if (mm_verbose >= 1)
230
263
  fprintf(stderr, "[ERROR]\033[1;31m Z-drop should not be less than inversion-Z-drop\033[0m\n");
@@ -71,13 +71,13 @@ static inline void mm_reset_timer(void)
71
71
  }
72
72
 
73
73
  extern unsigned char seq_comp_table[256];
74
- static inline mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)
74
+ static inline mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char* seqname, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)
75
75
  {
76
76
  mm_reg1_t *r;
77
77
 
78
78
  Py_BEGIN_ALLOW_THREADS
79
79
  if (seq2 == 0) {
80
- r = mm_map(mi, strlen(seq1), seq1, n_regs, b, opt, NULL);
80
+ r = mm_map(mi, strlen(seq1), seq1, n_regs, b, opt, seqname);
81
81
  } else {
82
82
  int _n_regs[2];
83
83
  mm_reg1_t *regs[2];
@@ -94,7 +94,7 @@ static inline mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const
94
94
  seq[1][i] = seq_comp_table[t];
95
95
  }
96
96
  if (len[1]&1) seq[1][len[1]>>1] = seq_comp_table[(uint8_t)seq[1][len[1]>>1]];
97
- mm_map_frag(mi, 2, len, (const char**)seq, _n_regs, regs, b, opt, NULL);
97
+ mm_map_frag(mi, 2, len, (const char**)seq, _n_regs, regs, b, opt, seqname);
98
98
  for (i = 0; i < _n_regs[1]; ++i)
99
99
  regs[1][i].rev = !regs[1][i].rev;
100
100
  *n_regs = _n_regs[0] + _n_regs[1];
@@ -36,9 +36,10 @@ cdef extern from "minimap.h":
36
36
  float alt_drop
37
37
 
38
38
  int a, b, q, e, q2, e2
39
+ int transition
39
40
  int sc_ambi
40
41
  int noncan
41
- int junc_bonus
42
+ int junc_bonus, junc_pen
42
43
  int zdrop, zdrop_inv
43
44
  int end_bonus
44
45
  int min_dp_max
@@ -51,6 +52,8 @@ cdef extern from "minimap.h":
51
52
 
52
53
  int pe_ori, pe_bonus
53
54
 
55
+ int jump_min_match;
56
+
54
57
  float mid_occ_frac
55
58
  float q_occ_frac
56
59
  int32_t min_mid_occ
@@ -128,7 +131,7 @@ cdef extern from "cmappy.h":
128
131
 
129
132
  void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
130
133
  void mm_free_reg1(mm_reg1_t *r)
131
- mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)
134
+ mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char* seqname, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)
132
135
  char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *l)
133
136
  mm_idx_t *mappy_idx_seq(int w, int k, int is_hpc, int bucket_bits, const char *seq, int l)
134
137
 
@@ -3,7 +3,7 @@ from libc.stdlib cimport free
3
3
  cimport cmappy
4
4
  import sys
5
5
 
6
- __version__ = '2.27'
6
+ __version__ = '2.29'
7
7
 
8
8
  cmappy.mm_reset_timer()
9
9
 
@@ -96,6 +96,7 @@ cdef class Alignment:
96
96
  a = [str(self._q_st), str(self._q_en), strand, self._ctg, str(self._ctg_len), str(self._r_st), str(self._r_en),
97
97
  str(self._mlen), str(self._blen), str(self._mapq), tp, ts, "cg:Z:" + self.cigar_str]
98
98
  if self._cs != "": a.append("cs:Z:" + self._cs)
99
+ if self._MD != "": a.append("MD:Z:" + self._MD)
99
100
  return "\t".join(a)
100
101
 
101
102
  cdef class ThreadBuffer:
@@ -112,7 +113,7 @@ cdef class Aligner:
112
113
  cdef cmappy.mm_idxopt_t idx_opt
113
114
  cdef cmappy.mm_mapopt_t map_opt
114
115
 
115
- def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, bw_long=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None):
116
+ def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, bw_long=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None, sc_ambi=None, max_chain_skip=None):
116
117
  self._idx = NULL
117
118
  cmappy.mm_set_opt(NULL, &self.idx_opt, &self.map_opt) # set the default options
118
119
  if preset is not None:
@@ -137,6 +138,8 @@ cdef class Aligner:
137
138
  self.map_opt.q2, self.map_opt.e2 = scoring[4], scoring[5]
138
139
  if len(scoring) >= 7:
139
140
  self.map_opt.sc_ambi = scoring[6]
141
+ if sc_ambi is not None: self.map_opt.sc_ambi = sc_ambi
142
+ if max_chain_skip is not None: self.map_opt.max_chain_skip = max_chain_skip
140
143
 
141
144
  cdef cmappy.mm_idx_reader_t *r;
142
145
 
@@ -162,7 +165,7 @@ cdef class Aligner:
162
165
  def __bool__(self):
163
166
  return (self._idx != NULL)
164
167
 
165
- def map(self, seq, seq2=None, buf=None, cs=False, MD=False, max_frag_len=None, extra_flags=None):
168
+ def map(self, seq, seq2=None, name=None, buf=None, cs=False, MD=False, max_frag_len=None, extra_flags=None):
166
169
  cdef cmappy.mm_reg1_t *regs
167
170
  cdef cmappy.mm_hitpy_t h
168
171
  cdef ThreadBuffer b
@@ -184,11 +187,20 @@ cdef class Aligner:
184
187
  km = cmappy.mm_tbuf_get_km(b._b)
185
188
 
186
189
  _seq = seq if isinstance(seq, bytes) else seq.encode()
190
+ if name is not None:
191
+ _name = name if isinstance(name, bytes) else name.encode()
192
+
187
193
  if seq2 is None:
188
- regs = cmappy.mm_map_aux(self._idx, _seq, NULL, &n_regs, b._b, &map_opt)
194
+ if name is None:
195
+ regs = cmappy.mm_map_aux(self._idx, NULL, _seq, NULL, &n_regs, b._b, &map_opt)
196
+ else:
197
+ regs = cmappy.mm_map_aux(self._idx, _name, _seq, NULL, &n_regs, b._b, &map_opt)
189
198
  else:
190
199
  _seq2 = seq2 if isinstance(seq2, bytes) else seq2.encode()
191
- regs = cmappy.mm_map_aux(self._idx, _seq, _seq2, &n_regs, b._b, &map_opt)
200
+ if name is None:
201
+ regs = cmappy.mm_map_aux(self._idx, NULL, _seq, _seq2, &n_regs, b._b, &map_opt)
202
+ else:
203
+ regs = cmappy.mm_map_aux(self._idx, _name, _seq, _seq2, &n_regs, b._b, &map_opt)
192
204
 
193
205
  try:
194
206
  i = 0
@@ -199,11 +211,12 @@ cdef class Aligner:
199
211
  c = h.cigar32[k]
200
212
  cigar.append([c>>4, c&0xf])
201
213
  if cs or MD: # generate the cs and/or the MD tag, if requested
214
+ _cur_seq = _seq2 if h.seg_id > 0 and seq2 is not None else _seq
202
215
  if cs:
203
- l_cs_str = cmappy.mm_gen_cs(km, &cs_str, &m_cs_str, self._idx, &regs[i], _seq, 1)
216
+ l_cs_str = cmappy.mm_gen_cs(km, &cs_str, &m_cs_str, self._idx, &regs[i], _cur_seq, 1)
204
217
  _cs = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
205
218
  if MD:
206
- l_cs_str = cmappy.mm_gen_MD(km, &cs_str, &m_cs_str, self._idx, &regs[i], _seq)
219
+ l_cs_str = cmappy.mm_gen_MD(km, &cs_str, &m_cs_str, self._idx, &regs[i], _cur_seq)
207
220
  _MD = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
208
221
  yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id, _cs, _MD)
209
222
  cmappy.mm_free_reg1(&regs[i])
@@ -5,7 +5,7 @@ import getopt
5
5
  import mappy as mp
6
6
 
7
7
  def main(argv):
8
- opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:c")
8
+ opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:cM")
9
9
  if len(args) < 2:
10
10
  print("Usage: minimap2.py [options] <ref.fa>|<ref.mmi> <query.fq>")
11
11
  print("Options:")
@@ -16,10 +16,11 @@ def main(argv):
16
16
  print(" -w INT minimizer window length")
17
17
  print(" -r INT band width")
18
18
  print(" -c output the cs tag")
19
+ print(" -M output the MD tag")
19
20
  sys.exit(1)
20
21
 
21
22
  preset = min_cnt = min_sc = k = w = bw = None
22
- out_cs = False
23
+ out_cs = out_MD = False
23
24
  for opt, arg in opts:
24
25
  if opt == '-x': preset = arg
25
26
  elif opt == '-n': min_cnt = int(arg)
@@ -28,11 +29,12 @@ def main(argv):
28
29
  elif opt == '-k': k = int(arg)
29
30
  elif opt == '-w': w = int(arg)
30
31
  elif opt == '-c': out_cs = True
32
+ elif opt == '-M': out_MD = True
31
33
 
32
34
  a = mp.Aligner(args[0], preset=preset, min_cnt=min_cnt, min_chain_score=min_sc, k=k, w=w, bw=bw)
33
35
  if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0]))
34
36
  for name, seq, qual in mp.fastx_read(args[1]): # read one sequence
35
- for h in a.map(seq, cs=out_cs): # traverse hits
37
+ for h in a.map(seq, cs=out_cs, MD=out_MD): # traverse hits
36
38
  print('{}\t{}\t{}'.format(name, len(seq), h))
37
39
 
38
40
  if __name__ == "__main__":
data/ext/minimap2/seed.c CHANGED
@@ -112,7 +112,8 @@ mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int ma
112
112
  }
113
113
  for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i) {
114
114
  mm_seed_t *q = &m[i];
115
- //fprintf(stderr, "X\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt);
115
+ if (mm_dbg_flag & MM_DBG_SEED_FREQ)
116
+ fprintf(stderr, "SF\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt);
116
117
  if (q->flt) {
117
118
  int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
118
119
  if (st > rep_en) {
@@ -23,7 +23,7 @@ def readme():
23
23
 
24
24
  setup(
25
25
  name = 'mappy',
26
- version = '2.27',
26
+ version = '2.29',
27
27
  url = 'https://github.com/lh3/minimap2',
28
28
  description = 'Minimap2 python binding',
29
29
  long_description = readme(),
@@ -33,7 +33,7 @@ setup(
33
33
  keywords = 'sequence-alignment',
34
34
  scripts = ['python/minimap2.py'],
35
35
  ext_modules = [Extension('mappy',
36
- sources = ['python/mappy.pyx', 'align.c', 'bseq.c', 'lchain.c', 'seed.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c',
36
+ sources = ['python/mappy.pyx', 'align.c', 'bseq.c', 'lchain.c', 'seed.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'jump.c', 'options.c',
37
37
  'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c',
38
38
  'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c', 'splitidx.c'],
39
39
  depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h',
data/ext/minimap2.patch CHANGED
@@ -6,13 +6,13 @@
6
6
  CPPFLAGS= -DHAVE_KALLOC
7
7
  INCLUDES=
8
8
  OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \
9
- lchain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \
9
+ lchain.o align.o hit.o seed.o jump.o map.o format.o pe.o esterr.o splitidx.o \
10
10
  - ksw2_ll_sse.o
11
11
  + ksw2_ll_sse.o cmappy.o
12
12
  PROG= minimap2
13
13
  PROG_EXTRA= sdust minimap2-lite
14
14
  LIBS= -lm -lz -lpthread
15
- @@ -134,3 +134,4 @@ sdust.o: kalloc.h kdq.h kvec.h sdust.h
15
+ @@ -135,3 +135,4 @@ sdust.o: kalloc.h kdq.h kvec.h sdust.h
16
16
  seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h ksort.h
17
17
  sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h
18
18
  splitidx.o: mmpriv.h minimap.h bseq.h kseq.h
@@ -54,9 +54,10 @@ module Minimap2
54
54
  fn_idx_out: nil,
55
55
  max_frag_len: nil,
56
56
  extra_flags: nil,
57
- scoring: nil
57
+ scoring: nil,
58
+ sc_ambi: nil,
59
+ max_chain_skip: nil
58
60
  )
59
-
60
61
  @idx_opt = FFI::IdxOpt.new
61
62
  @map_opt = FFI::MapOpt.new
62
63
 
@@ -91,6 +92,8 @@ module Minimap2
91
92
  map_opt[:sc_ambi] = scoring[6] if scoring.size >= 7
92
93
  end
93
94
  end
95
+ map_opt[:sc_ambi] = sc_ambi if sc_ambi
96
+ map_opt[:max_chain_skip] = max_chain_skip if max_chain_skip
94
97
 
95
98
  if fn_idx_in
96
99
  warn "Since fn_idx_in is specified, the seq argument will be ignored." if seq
@@ -134,13 +137,13 @@ module Minimap2
134
137
 
135
138
  def align(
136
139
  seq, seq2 = nil,
140
+ name: nil,
137
141
  buf: nil,
138
142
  cs: false,
139
143
  md: false,
140
144
  max_frag_len: nil,
141
145
  extra_flags: nil
142
146
  )
143
-
144
147
  return if index.null?
145
148
  return if (map_opt[:flag] & 4).zero? && (index[:flag] & 2).zero?
146
149
 
@@ -151,7 +154,7 @@ module Minimap2
151
154
  km = FFI.mm_tbuf_get_km(buf)
152
155
 
153
156
  n_regs_ptr = ::FFI::MemoryPointer.new :int
154
- regs_ptr = FFI.mm_map_aux(index, seq, seq2, n_regs_ptr, buf, map_opt)
157
+ regs_ptr = FFI.mm_map_aux(index, name, seq, seq2, n_regs_ptr, buf, map_opt)
155
158
  n_regs = n_regs_ptr.read_int
156
159
 
157
160
  regs = Array.new(n_regs) do |i|
@@ -174,15 +177,19 @@ module Minimap2
174
177
  cigar = c.map { |x| [x >> 4, x & 0xf] } # 32-bit CIGAR encoding -> Ruby array
175
178
 
176
179
  _cs = ""
177
- if cs
178
- l_cs_str = FFI.mm_gen_cs(km, cs_str, m_cs_str, @index, regs[i], seq, 1)
179
- _cs = cs_str.read_pointer.read_string(l_cs_str)
180
- end
181
-
182
180
  _md = ""
183
- if md
184
- l_cs_str = FFI.mm_gen_md(km, cs_str, m_cs_str, @index, regs[i], seq)
185
- _md = cs_str.read_pointer.read_string(l_cs_str)
181
+ if cs or md
182
+ cur_seq = hit[:seg_id] > 0 && seq2 ? seq2 : seq
183
+
184
+ if cs
185
+ l_cs_str = FFI.mm_gen_cs(km, cs_str, m_cs_str, @index, regs[i], cur_seq, 1)
186
+ _cs = cs_str.read_pointer.read_string(l_cs_str)
187
+ end
188
+
189
+ if md
190
+ l_cs_str = FFI.mm_gen_md(km, cs_str, m_cs_str, @index, regs[i], cur_seq)
191
+ _md = cs_str.read_pointer.read_string(l_cs_str)
192
+ end
186
193
  end
187
194
 
188
195
  alignments << Alignment.new(hit, cigar, _cs, _md)
@@ -107,6 +107,7 @@ module Minimap2
107
107
  a = [@q_st, @q_en, strand, @ctg, @ctg_len, @r_st, @r_en,
108
108
  @mlen, @blen, @mapq, tp, ts, "cg:Z:#{@cigar_str}"]
109
109
  a << "cs:Z:#{@cs}" if @cs
110
+ a << "MD:Z:#{@md}" if @md
110
111
  a.join("\t")
111
112
  end
112
113
  end
@@ -41,6 +41,9 @@ module Minimap2
41
41
  SPLICE_OLD = 0x800000000
42
42
  SECONDARY_SEQ = 0x1000000000 # output SEQ field for secondary alignments using hard clipping
43
43
  OUT_DS = 0x2000000000
44
+ WEAK_PAIRING = 0x4000000000
45
+ SR_RNA = 0x8000000000
46
+ OUT_JUNC = 0x10000000000
44
47
 
45
48
  HPC = 0x1
46
49
  NO_SEQ = 0x2
@@ -99,6 +102,8 @@ module Minimap2
99
102
  :S, :pointer, # 4-bit packed sequence
100
103
  :B, :pointer, # index (hidden)
101
104
  :I, :pointer, # intervals (hidden)
105
+ :spsc, :pointer, # splice score (hidden)
106
+ :J, :pointer, # junctions to create jumps (hidden)
102
107
  :km, :pointer,
103
108
  :h, :pointer
104
109
  end
@@ -113,7 +118,7 @@ module Minimap2
113
118
  :dp_max0, :int32, # DP score before mm_update_dp_max() adjustment
114
119
  :n_ambi_trans_strand, :uint32,
115
120
  :n_cigar, :uint32
116
- # :cigar, :pointer # variable length array (see cigar method below)
121
+ # :cigar, :pointer # variable length array (see cigar method below)
117
122
 
118
123
  bit_field :n_ambi_trans_strand,
119
124
  :n_ambi, 30, # number of ambiguous bases
@@ -160,7 +165,8 @@ module Minimap2
160
165
  :split_inv, 1,
161
166
  :is_alt, 1,
162
167
  :strand_retained, 1,
163
- :dummy, 5
168
+ :is_spliced, 1,
169
+ :dummy, 4
164
170
  end
165
171
 
166
172
  # indexing option
@@ -210,6 +216,7 @@ module Minimap2
210
216
  :transition, :int, # transition mismatch score (A:G, C:T)
211
217
  :sc_ambi, :int, # score when one or both bases are "N"
212
218
  :noncan, :int, # cost of non-canonical splicing sites
219
+ :junc_pen, :int,
213
220
  :junc_bonus, :int,
214
221
  :zdrop, :int, # break alignment if alignment score drops too fast along the diagonal
215
222
  :zdrop_inv, :int,
@@ -223,6 +230,7 @@ module Minimap2
223
230
  :rank_frac, :float,
224
231
  :pe_ori, :int,
225
232
  :pe_bonus, :int,
233
+ :jump_min_match, :int32,
226
234
  :mid_occ_frac, :float, # only used by mm_mapopt_update(); see below
227
235
  :q_occ_frac, :float,
228
236
  :min_mid_occ, :int32,