minimap2 0.2.27.0 → 0.2.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -0
  3. data/ext/cmappy/cmappy.c +3 -3
  4. data/ext/cmappy/cmappy.h +1 -1
  5. data/ext/minimap2/FAQ.md +1 -1
  6. data/ext/minimap2/Makefile +4 -3
  7. data/ext/minimap2/NEWS.md +68 -0
  8. data/ext/minimap2/README.md +30 -14
  9. data/ext/minimap2/align.c +136 -52
  10. data/ext/minimap2/cookbook.md +2 -2
  11. data/ext/minimap2/format.c +59 -5
  12. data/ext/minimap2/hit.c +14 -6
  13. data/ext/minimap2/index.c +304 -13
  14. data/ext/minimap2/jump.c +201 -0
  15. data/ext/minimap2/kalloc.h +8 -0
  16. data/ext/minimap2/ksw2.h +5 -2
  17. data/ext/minimap2/ksw2_dispatch.c +5 -5
  18. data/ext/minimap2/ksw2_exts2_sse.c +17 -6
  19. data/ext/minimap2/lchain.c +5 -5
  20. data/ext/minimap2/main.c +64 -12
  21. data/ext/minimap2/map.c +35 -8
  22. data/ext/minimap2/minimap.h +14 -3
  23. data/ext/minimap2/minimap2.1 +98 -46
  24. data/ext/minimap2/misc/README.md +2 -1
  25. data/ext/minimap2/misc/pafcluster.js +241 -0
  26. data/ext/minimap2/misc/paftools.js +17 -6
  27. data/ext/minimap2/mmpriv.h +25 -4
  28. data/ext/minimap2/options.c +36 -3
  29. data/ext/minimap2/python/cmappy.h +3 -3
  30. data/ext/minimap2/python/cmappy.pxd +5 -2
  31. data/ext/minimap2/python/mappy.pyx +20 -7
  32. data/ext/minimap2/python/minimap2.py +5 -3
  33. data/ext/minimap2/seed.c +2 -1
  34. data/ext/minimap2/setup.py +2 -2
  35. data/ext/minimap2.patch +2 -2
  36. data/lib/minimap2/aligner.rb +19 -12
  37. data/lib/minimap2/alignment.rb +1 -0
  38. data/lib/minimap2/ffi/constants.rb +10 -2
  39. data/lib/minimap2/ffi/functions.rb +145 -6
  40. data/lib/minimap2/ffi/mappy.rb +1 -1
  41. data/lib/minimap2/version.rb +1 -1
  42. data/lib/minimap2.rb +2 -2
  43. metadata +8 -7
  44. data/ext/minimap2/misc/mmphase.js +0 -335
@@ -253,6 +253,52 @@ static void write_cs_ds_core(kstring_t *s, const uint8_t *tseq, const uint8_t *q
253
253
  assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
254
254
  }
255
255
 
256
/*
 * Reverse-complement a two-base splice signal in place.
 * Bases are numeric codes; codes 0..3 are complemented as 3 - code,
 * any other code is replaced by 4.
 */
static inline void revcomp_splice(uint8_t s[2])
{
	const uint8_t first = s[0], second = s[1];
	s[0] = second < 4? (uint8_t)(3 - second) : 4; // new first base = complement of old second base
	s[1] = first < 4? (uint8_t)(3 - first) : 4;   // new second base = complement of old first base
}
262
+
263
+ void mm_write_junc(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r)
264
+ {
265
+ int32_t i, t_off, swritten = 0;
266
+ s->l = 0;
267
+ if (!r->is_spliced || r->p == 0) return; // no junctions
268
+ if (r->p->trans_strand != 1 && r->p->trans_strand != 2) return; // no preferred strand
269
+ for (i = 0, t_off = r->rs; i < (int)r->p->n_cigar; ++i) {
270
+ int op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
271
+ if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH || op == MM_CIGAR_DEL) {
272
+ t_off += len;
273
+ } else if (op == MM_CIGAR_N_SKIP) { // intron
274
+ uint8_t donor[2], acceptor[2];
275
+ int32_t score1 = 0, score2 = 0, rev;
276
+ assert(len >= 2);
277
+ rev = (r->p->trans_strand == 2) ^ r->rev;
278
+ if (!rev) {
279
+ mm_idx_getseq(mi, r->rid, t_off, t_off + 2, donor);
280
+ mm_idx_getseq(mi, r->rid, t_off + len - 2, t_off + len, acceptor);
281
+ } else {
282
+ mm_idx_getseq(mi, r->rid, t_off, t_off + 2, acceptor);
283
+ mm_idx_getseq(mi, r->rid, t_off + len - 2, t_off + len, donor);
284
+ revcomp_splice(donor);
285
+ revcomp_splice(acceptor);
286
+ }
287
+ //fprintf(stderr, "%c%c-%c%c\n", "ACGTN"[donor[0]], "ACGTN"[donor[1]], "ACGTN"[acceptor[0]], "ACGTN"[acceptor[1]]);
288
+ if (donor[0] == 2 && donor[1] == 3) score1 = 3;
289
+ else if (donor[0] == 2 && donor[1] == 1) score1 = 2;
290
+ else if (donor[0] == 0 && donor[1] == 3) score1 = 1;
291
+ if (acceptor[0] == 0 && acceptor[1] == 2) score2 = 3;
292
+ else if (acceptor[0] == 0 && acceptor[1] == 1) score2 = 1;
293
+ if (swritten) mm_sprintf_lite(s, "\n");
294
+ else swritten = 1;
295
+ mm_sprintf_lite(s, "%s\t%d\t%d\t%s\t%d\t%c", mi->seq[r->rid].name, t_off, t_off + len, t->name, score1 + score2, "+-"[rev]);
296
+ t_off += len;
297
+ }
298
+ }
299
+ assert(t_off == r->re);
300
+ }
301
+
256
302
  static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int write_tag)
257
303
  {
258
304
  int i, q_off, t_off, l_MD = 0;
@@ -310,7 +356,7 @@ static void write_cs_ds_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const
310
356
  }
311
357
  }
312
358
  }
313
- if (is_MD == 1) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
359
+ if (is_MD) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
314
360
  else write_cs_ds_core(s, tseq, qseq, r, tmp, no_iden, is_ds, write_tag);
315
361
  kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
316
362
  }
@@ -366,15 +412,18 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
366
412
  if (r->split) mm_sprintf_lite(s, "\tzd:i:%d", r->split);
367
413
  }
368
414
 
369
- void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len)
415
+ void mm_write_paf4(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len, int n_seg, int seg_idx)
370
416
  {
371
417
  s->l = 0;
418
+ mm_sprintf_lite(s, "%s", t->name);
419
+ if ((opt_flag & MM_F_FRAG_MODE) && n_seg >= 2 && seg_idx >= 0)
420
+ mm_sprintf_lite(s, "/%d", seg_idx + 1);
372
421
  if (r == 0) {
373
- mm_sprintf_lite(s, "%s\t%d\t0\t0\t*\t*\t0\t0\t0\t0\t0\t0", t->name, t->l_seq);
422
+ mm_sprintf_lite(s, "\t%d\t0\t0\t*\t*\t0\t0\t0\t0\t0\t0", t->l_seq);
374
423
  if (rep_len >= 0) mm_sprintf_lite(s, "\trl:i:%d", rep_len);
375
424
  return;
376
425
  }
377
- mm_sprintf_lite(s, "%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]);
426
+ mm_sprintf_lite(s, "\t%d\t%d\t%d\t%c\t", t->l_seq, r->qs, r->qe, "+-"[r->rev]);
378
427
  if (mi->seq[r->rid].name) mm_sprintf_lite(s, "%s", mi->seq[r->rid].name);
379
428
  else mm_sprintf_lite(s, "%d", r->rid);
380
429
  mm_sprintf_lite(s, "\t%d", mi->seq[r->rid].len);
@@ -393,11 +442,16 @@ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const
393
442
  mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
394
443
  }
395
444
  if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD)))
396
- write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, !!(opt_flag&MM_F_OUT_DS), 1, !!(opt_flag&MM_F_QSTRAND));
445
+ write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), !!(opt_flag&MM_F_OUT_MD), !!(opt_flag&MM_F_OUT_DS), 1, !!(opt_flag&MM_F_QSTRAND));
397
446
  if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
398
447
  mm_sprintf_lite(s, "\t%s", t->comment);
399
448
  }
400
449
 
450
+ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len)
451
+ {
452
+ mm_write_paf4(s, mi, t, r, km, opt_flag, rep_len, 0, 0);
453
+ }
454
+
401
455
  void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag)
402
456
  {
403
457
  mm_write_paf3(s, mi, t, r, km, opt_flag, -1);
data/ext/minimap2/hit.c CHANGED
@@ -55,7 +55,7 @@ mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u,
55
55
  mm_reg1_t *r;
56
56
  int i, k;
57
57
 
58
- if (n_u == 0) return 0;
58
+ if (n_u <= 0) return 0;
59
59
 
60
60
  // sort by score
61
61
  z = (mm128_t*)kmalloc(km, n_u * 16);
@@ -418,16 +418,19 @@ static void mm_set_inv_mapq(void *km, int n_regs, mm_reg1_t *regs)
418
418
  kfree(km, aux);
419
419
  }
420
420
 
421
- void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr)
421
+ void mm_set_mapq2(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr, int is_splice)
422
422
  {
423
423
  static const float q_coef = 40.0f;
424
424
  int64_t sum_sc = 0;
425
425
  float uniq_ratio;
426
- int i;
426
+ int i, n_2nd_splice = 0;
427
427
  if (n_regs == 0) return;
428
- for (i = 0; i < n_regs; ++i)
428
+ for (i = 0; i < n_regs; ++i) {
429
429
  if (regs[i].parent == regs[i].id)
430
430
  sum_sc += regs[i].score;
431
+ else if (regs[i].is_spliced)
432
+ ++n_2nd_splice;
433
+ }
431
434
  uniq_ratio = (float)sum_sc / (sum_sc + rep_len);
432
435
  for (i = 0; i < n_regs; ++i) {
433
436
  mm_reg1_t *r = &regs[i];
@@ -440,13 +443,18 @@ void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int ma
440
443
  pen_cm = pen_s1 < pen_cm? pen_s1 : pen_cm;
441
444
  subsc = r->subsc > min_chain_sc? r->subsc : min_chain_sc;
442
445
  if (r->p && r->p->dp_max2 > 0 && r->p->dp_max > 0) {
443
- float identity = (float)r->mlen / r->blen;
444
- float x = (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score0;
446
+ float x, identity = (float)r->mlen / r->blen;
447
+ if (is_sr && is_splice)
448
+ x = (float)r->p->dp_max2 / r->p->dp_max; // ignore chaining score; for short RNA-seq reads, unspliced chaining score tends to be higher
449
+ else
450
+ x = (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score0;
445
451
  mapq = (int)(identity * pen_cm * q_coef * (1.0f - x * x) * logf((float)r->p->dp_max / match_sc));
446
452
  if (!is_sr) {
447
453
  int mapq_alt = (int)(6.02f * identity * identity * (r->p->dp_max - r->p->dp_max2) / match_sc + .499f); // BWA-MEM like mapQ, mostly for short reads
448
454
  mapq = mapq < mapq_alt? mapq : mapq_alt; // in case the long-read heuristic fails
449
455
  }
456
+ if (is_splice && is_sr && r->is_spliced && n_2nd_splice == 0)
457
+ mapq += 10;
450
458
  } else {
451
459
  float x = (float)subsc / r->score0;
452
460
  if (r->p) {
data/ext/minimap2/index.c CHANGED
@@ -12,6 +12,7 @@
12
12
  #include "bseq.h"
13
13
  #include "minimap.h"
14
14
  #include "mmpriv.h"
15
+ #include "ksw2.h"
15
16
  #include "kvec.h"
16
17
  #include "khash.h"
17
18
 
@@ -32,7 +33,7 @@ typedef struct mm_idx_bucket_s {
32
33
  } mm_idx_bucket_t;
33
34
 
34
35
  typedef struct { // one interval read from a BED file
35
- int32_t st, en, max; // max is not used for now
36
+ int32_t st, en, cnt; // [st, en) coordinates; cnt is set by mm_idx_bed_read_merge() to the number of input records sharing this (st, en)
36
37
  int32_t score:30, strand:2; // BED score column (-1 when not numeric); strand is 1 for '+', -1 for '-', 0 otherwise
37
38
  } mm_idx_intv1_t;
38
39
 
@@ -41,6 +42,11 @@ typedef struct mm_idx_intv_s {
41
42
  mm_idx_intv1_t *a;
42
43
  } mm_idx_intv_t;
43
44
 
45
+ typedef struct mm_idx_jjump_s { // junction "jump" records of one contig; mi->J holds one of these per sequence
46
+ int32_t n, m; // n: number of entries in a[]; m is not assigned by the code shown here -- presumably a capacity, TODO confirm
47
+ mm_idx_jjump1_t *a; // entries, ordered by off and then off2 once sort_jjump() has run
48
+ } mm_idx_jjump_t;
49
+
44
50
  mm_idx_t *mm_idx_init(int w, int k, int b, int flag)
45
51
  {
46
52
  mm_idx_t *mi;
@@ -65,11 +71,17 @@ void mm_idx_destroy(mm_idx_t *mi)
65
71
  kh_destroy(idx, (idxhash_t*)mi->B[i].h);
66
72
  }
67
73
  }
74
+ if (mi->spsc) free(mi->spsc);
68
75
  if (mi->I) {
69
76
  for (i = 0; i < mi->n_seq; ++i)
70
77
  free(mi->I[i].a);
71
78
  free(mi->I);
72
79
  }
80
+ if (mi->J) {
81
+ for (i = 0; i < mi->n_seq; ++i)
82
+ free(mi->J[i].a);
83
+ free(mi->J);
84
+ }
73
85
  if (!mi->km) {
74
86
  for (i = 0; i < mi->n_seq; ++i)
75
87
  free(mi->seq[i].name);
@@ -99,7 +111,7 @@ const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n)
99
111
 
100
112
  void mm_idx_stat(const mm_idx_t *mi)
101
113
  {
102
- int n = 0, n1 = 0;
114
+ int64_t n = 0, n1 = 0;
103
115
  uint32_t i;
104
116
  uint64_t sum = 0, len = 0;
105
117
  fprintf(stderr, "[M::%s] kmer size: %d; skip: %d; is_hpc: %d; #seq: %d\n", __func__, mi->k, mi->w, mi->flag&MM_I_HPC, mi->n_seq);
@@ -117,8 +129,8 @@ void mm_idx_stat(const mm_idx_t *mi)
117
129
  if (kh_key(h, k)&1) ++n1;
118
130
  }
119
131
  }
120
- fprintf(stderr, "[M::%s::%.3f*%.2f] distinct minimizers: %d (%.2f%% are singletons); average occurrences: %.3lf; average spacing: %.3lf; total length: %ld\n",
121
- __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), n, 100.0*n1/n, (double)sum / n, (double)len / sum, (long)len);
132
+ fprintf(stderr, "[M::%s::%.3f*%.2f] distinct minimizers: %ld (%.2f%% are singletons); average occurrences: %.3lf; average spacing: %.3lf; total length: %ld\n",
133
+ __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), (long)n, 100.0*n1/n, (double)sum / n, (double)len / sum, (long)len);
122
134
  }
123
135
 
124
136
  int mm_idx_index_name(mm_idx_t *mi)
@@ -657,10 +669,17 @@ int mm_idx_alt_read(mm_idx_t *mi, const char *fn)
657
669
  return n_alt;
658
670
  }
659
671
 
672
+ /***************
673
+ * BED reading *
674
+ ***************/
675
+
660
676
  #define sort_key_bed(a) ((a).st)
661
677
  KRADIX_SORT_INIT(bed, mm_idx_intv1_t, sort_key_bed, 4)
662
678
 
663
- mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc)
679
+ #define sort_key_end(a) ((a).en)
680
+ KRADIX_SORT_INIT(end, mm_idx_intv1_t, sort_key_end, 4)
681
+
682
+ static mm_idx_intv_t *mm_idx_bed_read_core(const mm_idx_t *mi, const char *fn, int read_junc, int min_sc)
664
683
  {
665
684
  gzFile fp;
666
685
  kstream_t *ks;
@@ -669,7 +688,7 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
669
688
 
670
689
  fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
671
690
  if (fp == 0) return 0;
672
- I = (mm_idx_intv_t*)calloc(mi->n_seq, sizeof(*I));
691
+ I = CALLOC(mm_idx_intv_t, mi->n_seq);
673
692
  ks = ks_init(fp);
674
693
  while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
675
694
  mm_idx_intv_t *r;
@@ -690,7 +709,7 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
690
709
  t.en = atol(q);
691
710
  if (t.en < 0) break;
692
711
  } else if (i == 4) { // BED score
693
- t.score = atol(q);
712
+ t.score = *q >= '0' && *q <= '9'? atol(q) : -1;
694
713
  } else if (i == 5) { // strand
695
714
  t.strand = *q == '+'? 1 : *q == '-'? -1 : 0;
696
715
  } else if (i == 9) {
@@ -706,7 +725,8 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
706
725
  ++i, q = p + 1;
707
726
  }
708
727
  }
709
- if (id < 0 || t.st < 0 || t.st >= t.en) continue;
728
+ if (id < 0 || t.st < 0 || t.st >= t.en) continue; // contig ID not found, or other problems
729
+ if (min_sc > 0 && t.score < min_sc) continue;
710
730
  r = &I[id];
711
731
  if (i >= 11 && read_junc) { // BED12
712
732
  int32_t st, sz, en;
@@ -739,14 +759,44 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
739
759
  return I;
740
760
  }
741
761
 
742
- int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc)
762
+ static mm_idx_intv_t *mm_idx_bed_read_merge(const mm_idx_t *mi, const char *fn, int read_junc, int min_sc)
743
763
  {
764
+ long n = 0, n0 = 0;
744
765
  int32_t i;
766
+ mm_idx_intv_t *I;
767
+ I = mm_idx_bed_read_core(mi, fn, read_junc, min_sc);
768
+ if (I == 0) return 0;
769
+ for (i = 0; i < mi->n_seq; ++i) {
770
+ int32_t j, j0, k;
771
+ mm_idx_intv_t *intv = &I[i];
772
+ n0 += intv->n;
773
+ radix_sort_bed(intv->a, intv->a + intv->n); // sort by st
774
+ for (j = 1, j0 = 0; j <= intv->n; ++j) { // sort by st and then by end
775
+ if (j == intv->n || intv->a[j].st != intv->a[j0].st) {
776
+ radix_sort_end(intv->a + j0, intv->a + j);
777
+ j0 = j;
778
+ }
779
+ }
780
+ for (j = 1, j0 = 0, k = 0; j <= intv->n; ++j) { // merge intervals with the same (st, en)
781
+ if (j == intv->n || intv->a[j].st != intv->a[j0].st || intv->a[j].en != intv->a[j0].en) {
782
+ intv->a[k] = intv->a[j0];
783
+ intv->a[k++].cnt = j - j0;
784
+ j0 = j;
785
+ }
786
+ }
787
+ intv->a = REALLOC(mm_idx_intv1_t, intv->a, k);
788
+ intv->n = intv->m = k;
789
+ n += k;
790
+ }
791
+ if (mm_verbose >= 3)
792
+ fprintf(stderr, "[%s] read %ld introns, %ld of which are non-redundant\n", __func__, n0, n);
793
+ return I;
794
+ }
795
+
796
+ int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc)
797
+ {
745
798
  if (mi->h == 0) mm_idx_index_name(mi);
746
- mi->I = mm_idx_read_bed(mi, fn, read_junc);
747
- if (mi->I == 0) return -1;
748
- for (i = 0; i < mi->n_seq; ++i) // TODO: eliminate redundant intervals
749
- radix_sort_bed(mi->I[i].a, mi->I[i].a + mi->I[i].n);
799
+ mi->I = mm_idx_bed_read_merge(mi, fn, read_junc, -1);
750
800
  return 0;
751
801
  }
752
802
 
@@ -774,3 +824,244 @@ int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uin
774
824
  }
775
825
  return left;
776
826
  }
827
+
828
+ /*********************************
829
+ * Reading junctions for jumping *
830
+ *********************************/
831
+
832
+ #define sort_key_jj(a) ((a).off)
833
+ KRADIX_SORT_INIT(jj, mm_idx_jjump1_t, sort_key_jj, 4)
834
+
835
+ #define sort_key_jj2(a) ((a).off2)
836
+ KRADIX_SORT_INIT(jj2, mm_idx_jjump1_t, sort_key_jj2, 4)
837
+
838
+ static void sort_jjump(mm_idx_jjump_t *jj2)
839
+ {
840
+ int32_t j0, j, k;
841
+ if (jj2 == 0 || jj2->n == 0) return;
842
+ radix_sort_jj(jj2->a, jj2->a + jj2->n);
843
+ for (j0 = 0, j = 1; j <= jj2->n; ++j) {
844
+ if (j == jj2->n || jj2->a[j0].off != jj2->a[j].off) {
845
+ radix_sort_jj2(jj2->a + j0, jj2->a + j);
846
+ j0 = j;
847
+ }
848
+ }
849
+ // the actual merge
850
+ for (j0 = 0, j = 1, k = 0; j <= jj2->n; ++j) {
851
+ if (j == jj2->n || jj2->a[j0].off != jj2->a[j].off || jj2->a[j0].off2 != jj2->a[j].off2) {
852
+ int32_t t, cnt = 0;
853
+ uint16_t flag = 0;
854
+ for (t = j0; t < j; ++t) cnt += jj2->a[t].cnt, flag |= jj2->a[t].flag;
855
+ jj2->a[k] = jj2->a[j0];
856
+ jj2->a[k].cnt = cnt;
857
+ jj2->a[k++].flag = flag;
858
+ j0 = j;
859
+ }
860
+ }
861
+ jj2->n = k;
862
+ jj2->a = REALLOC(mm_idx_jjump1_t, jj2->a, k);
863
+ }
864
+
865
+ static mm_idx_jjump_t *mm_idx_bed2jjump(const mm_idx_t *mi, const mm_idx_intv_t *I, uint16_t flag)
866
+ {
867
+ int32_t i;
868
+ mm_idx_jjump_t *J;
869
+ J = CALLOC(mm_idx_jjump_t, mi->n_seq);
870
+ for (i = 0; i < mi->n_seq; ++i) {
871
+ int32_t j, k;
872
+ const mm_idx_intv_t *intv = &I[i];
873
+ mm_idx_jjump_t *jj = &J[i];
874
+ jj->n = intv->n * 2;
875
+ jj->a = CALLOC(mm_idx_jjump1_t, jj->n);
876
+ for (j = k = 0; j < intv->n; ++j) {
877
+ jj->a[k].off = intv->a[j].st, jj->a[k].off2 = intv->a[j].en, jj->a[k].cnt = intv->a[j].cnt, jj->a[k].strand = intv->a[j].strand, jj->a[k++].flag = flag;
878
+ jj->a[k].off = intv->a[j].en, jj->a[k].off2 = intv->a[j].st, jj->a[k].cnt = intv->a[j].cnt, jj->a[k].strand = intv->a[j].strand, jj->a[k++].flag = flag;
879
+ }
880
+ sort_jjump(jj);
881
+ }
882
+ return J;
883
+ }
884
+
885
+ static mm_idx_jjump_t *mm_idx_jjump_merge(const mm_idx_t *mi, const mm_idx_jjump_t *J0, const mm_idx_jjump_t *J1)
886
+ {
887
+ int32_t i;
888
+ mm_idx_jjump_t *J2;
889
+ J2 = CALLOC(mm_idx_jjump_t, mi->n_seq);
890
+ for (i = 0; i < mi->n_seq; ++i) {
891
+ int32_t j, k;
892
+ const mm_idx_jjump_t *jj0 = &J0[i], *jj1 = &J1[i];
893
+ mm_idx_jjump_t *jj2 = &J2[i];
894
+ jj2->n = jj0->n + jj1->n;
895
+ jj2->a = CALLOC(mm_idx_jjump1_t, jj2->n);
896
+ for (j = k = 0; j < jj0->n; ++j) jj2->a[k++] = jj0->a[j];
897
+ for (j = 0; j < jj1->n; ++j) jj2->a[k++] = jj1->a[j];
898
+ sort_jjump(jj2);
899
+ }
900
+ return J2;
901
+ }
902
+
903
+ int mm_idx_jjump_read(mm_idx_t *mi, const char *fn, int flag, int min_sc)
904
+ {
905
+ int32_t i, j, n_anno = 0, n_misc = 0;
906
+ mm_idx_intv_t *I;
907
+ mm_idx_jjump_t *J;
908
+ if (mi->h == 0) mm_idx_index_name(mi);
909
+ I = mm_idx_bed_read_merge(mi, fn, 1, min_sc);
910
+ J = mm_idx_bed2jjump(mi, I, flag);
911
+ for (i = 0; i < mi->n_seq; ++i) free(I[i].a);
912
+ free(I);
913
+ if (mi->J) {
914
+ mm_idx_jjump_t *J2;
915
+ J2 = mm_idx_jjump_merge(mi, mi->J, J);
916
+ for (i = 0; i < mi->n_seq; ++i) {
917
+ free(mi->J[i].a); free(J[i].a);
918
+ }
919
+ free(mi->J); free(J);
920
+ mi->J = J2;
921
+ } else mi->J = J;
922
+ for (i = 0; i < mi->n_seq; ++i) {
923
+ for (j = 0; j < mi->J[i].n; ++j)
924
+ if (mi->J[i].a[j].flag & MM_JUNC_ANNO) ++n_anno;
925
+ else ++n_misc;
926
+ }
927
+ if (mm_verbose >= 3)
928
+ fprintf(stderr, "[%s] there are %d annotated and %d other splice positions in the index\n", __func__, n_anno, n_misc);
929
+ return 0;
930
+ }
931
+
932
+ static int32_t mm_idx_jump_get_core(int32_t n, const mm_idx_jjump1_t *a, int32_t x) // similar to mm_idx_find_intv()
933
+ {
934
+ int32_t s = 0, e = n;
935
+ if (n == 0) return -1;
936
+ if (x < a[0].off) return -1;
937
+ while (s < e) {
938
+ int32_t mid = s + (e - s) / 2;
939
+ if (x >= a[mid].off && (mid + 1 >= n || x < a[mid+1].off)) return mid;
940
+ else if (x < a[mid].off) e = mid;
941
+ else s = mid + 1;
942
+ }
943
+ assert(0);
944
+ }
945
+
946
+ const mm_idx_jjump1_t *mm_idx_jump_get(const mm_idx_t *db, int32_t cid, int32_t st, int32_t en, int32_t *n)
947
+ {
948
+ mm_idx_jjump_t *s;
949
+ int32_t l, r;
950
+ *n = 0;
951
+ if (cid >= db->n_seq || cid < 0 || db->J == 0) return 0;
952
+ if (en < 0 || en > db->seq[cid].len) en = db->seq[cid].len;
953
+ s = &db->J[cid];
954
+ if (s->n == 0) return 0;
955
+ l = mm_idx_jump_get_core(s->n, s->a, st);
956
+ r = mm_idx_jump_get_core(s->n, s->a, en);
957
+ *n = r - l;
958
+ return &s->a[l + 1];
959
+ }
960
+
961
+ /****************
962
+ * splice score *
963
+ ****************/
964
+
965
+ typedef struct mm_idx_spsc_s { // splice scores of one contig strand; idx->spsc[cid<<1|0] is '+', idx->spsc[cid<<1|1] is '-'
966
+ uint32_t n, m; // n: entries stored in a[]; m: presumably the capacity maintained by Kgrow() -- TODO confirm
967
+ uint64_t *a; // pos<<8 | (score + KSW_SPSC_OFFSET)<<1 | type, where type is 0 for 'D' and 1 for 'A' (see mm_idx_spsc_read())
968
+ } mm_idx_spsc_t;
969
+
970
+ int32_t mm_idx_spsc_read(mm_idx_t *idx, const char *fn, int32_t max_sc)
971
+ {
972
+ gzFile fp;
973
+ kstring_t str = {0,0,0};
974
+ kstream_t *ks;
975
+ int32_t dret, j;
976
+ int64_t n_read = 0;
977
+
978
+ fp = fn && strcmp(fn, "-") != 0? gzopen(fn, "rb") : gzdopen(0, "rb");
979
+ if (fp == 0) return -1;
980
+ if (idx->h == 0) mm_idx_index_name(idx);
981
+ if (max_sc > 63) max_sc = 63;
982
+ idx->spsc = Kcalloc(0, mm_idx_spsc_t, idx->n_seq * 2);
983
+ ks = ks_init(fp);
984
+ while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
985
+ mm_idx_spsc_t *s;
986
+ char *p, *q, *name = 0;
987
+ int32_t i, type = -1, strand = 0, cid = -1, score = -1;
988
+ int64_t pos = -1;
989
+ for (i = 0, p = q = str.s;; ++p) {
990
+ if (*p == '\t' || *p == 0) {
991
+ int c = *p;
992
+ *p = 0;
993
+ if (i == 0) {
994
+ name = q;
995
+ } else if (i == 1) {
996
+ pos = atol(q);
997
+ } else if (i == 2) {
998
+ strand = *q == '+'? 1 : '-'? -1 : 0;
999
+ } else if (i == 3) {
1000
+ type = *q == 'D'? 0 : *q == 'A'? 1 : -1;
1001
+ } else if (i == 4) {
1002
+ score = atoi(q);
1003
+ break;
1004
+ }
1005
+ if (c == 0) break;
1006
+ q = p + 1, ++i;
1007
+ }
1008
+ }
1009
+ if (i < 4) continue; // not enough fields
1010
+ if (score > max_sc) score = max_sc;
1011
+ if (score < -max_sc) score = -max_sc;
1012
+ cid = mm_idx_name2id(idx, name);
1013
+ if (cid < 0 || type < 0 || strand == 0 || pos < 0) continue; // FIXME: give a warning!
1014
+ s = &idx->spsc[cid << 1 | (strand > 0? 0 : 1)];
1015
+ Kgrow(0, uint64_t, s->a, s->n, s->m);
1016
+ if (pos > 0 && pos < idx->seq[cid].len) { // ignore scores at the ends
1017
+ s->a[s->n++] = (uint64_t)pos << 8 | (score + KSW_SPSC_OFFSET) << 1 | type;
1018
+ ++n_read;
1019
+ }
1020
+ }
1021
+ ks_destroy(ks);
1022
+ gzclose(fp);
1023
+ for (j = 0; j < idx->n_seq * 2; ++j) {
1024
+ mm_idx_spsc_t *s = &idx->spsc[j];
1025
+ if (s->n > 0)
1026
+ radix_sort_64(s->a, s->a + s->n);
1027
+ }
1028
+ if (mm_verbose >= 3)
1029
+ fprintf(stderr, "[M::%s] read %ld splice scores\n", __func__, (long)n_read);
1030
+ return 0;
1031
+ }
1032
+
1033
/*
 * Return the index of the last entry in a[0..n) whose position (a[i]>>8) is
 * <= x, or -1 when there is none (n == 0, or x precedes every entry).
 * a[] must be sorted in ascending order.
 *
 * Positions are compared as signed 64-bit values, so a negative x yields -1
 * instead of being converted to a huge unsigned number. Every path returns a
 * value, including builds where assert() is compiled out.
 */
static int32_t mm_idx_find_intv(int32_t n, const uint64_t *a, int64_t x)
{
	int32_t lo = 0, hi = n; // invariant: pos(a[i]) <= x for all i < lo, and pos(a[i]) > x for all i >= hi
	while (lo < hi) {
		int32_t mid = lo + (hi - lo) / 2;
		if ((int64_t)(a[mid]>>8) <= x) lo = mid + 1;
		else hi = mid;
	}
	return lo - 1;
}
1046
+
1047
+ int64_t mm_idx_spsc_get(const mm_idx_t *db, int32_t cid, int64_t st, int64_t en, int32_t rev, uint8_t *sc)
1048
+ {
1049
+ const mm_idx_spsc_t *s;
1050
+ if (cid >= db->n_seq || cid < 0 || db->spsc == 0) return -1;
1051
+ if (en < 0 || en > db->seq[cid].len) en = db->seq[cid].len;
1052
+ memset(sc, 0xff, en - st);
1053
+ s = &db->spsc[cid << 1 | (!!rev)];
1054
+ if (s->n > 0) {
1055
+ int32_t j, l, r;
1056
+ l = mm_idx_find_intv(s->n, s->a, st);
1057
+ r = mm_idx_find_intv(s->n, s->a, en);
1058
+ for (j = l + 1; j <= r; ++j) {
1059
+ int64_t x = (s->a[j]>>8) - st;
1060
+ uint8_t score = s->a[j] & 0xff;
1061
+ assert(x <= en - st);
1062
+ if (x == en - st) continue;
1063
+ if (sc[x] == 0xff || sc[x] < score) sc[x] = score;
1064
+ }
1065
+ }
1066
+ return en - st;
1067
+ }