minimap2 0.2.28.0 → 0.2.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,7 @@
5
5
  #include <stdio.h>
6
6
  #include <sys/types.h>
7
7
 
8
- #define MM_VERSION "2.28-r1209"
8
+ #define MM_VERSION "2.29-r1283"
9
9
 
10
10
  #define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
11
11
  #define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
@@ -45,6 +45,9 @@
45
45
  #define MM_F_SPLICE_OLD (0x800000000LL)
46
46
  #define MM_F_SECONDARY_SEQ (0x1000000000LL) //output SEQ field for secondary alignments using hard clipping
47
47
  #define MM_F_OUT_DS (0x2000000000LL)
48
+ #define MM_F_WEAK_PAIRING (0x4000000000LL)
49
+ #define MM_F_SR_RNA (0x8000000000LL)
50
+ #define MM_F_OUT_JUNC (0x10000000000LL)
48
51
 
49
52
  #define MM_I_HPC 0x1
50
53
  #define MM_I_NO_SEQ 0x2
@@ -91,6 +94,8 @@ typedef struct {
91
94
  uint32_t *S; // 4-bit packed sequence
92
95
  struct mm_idx_bucket_s *B; // index (hidden)
93
96
  struct mm_idx_intv_s *I; // intervals (hidden)
97
+ struct mm_idx_spsc_s *spsc;// splice score (hidden)
98
+ struct mm_idx_jjump_s *J; // junctions to create jumps (hidden)
94
99
  void *km, *h;
95
100
  } mm_idx_t;
96
101
 
@@ -115,7 +120,7 @@ typedef struct {
115
120
  int32_t mlen, blen; // seeded exact match length; seeded alignment block length
116
121
  int32_t n_sub; // number of suboptimal mappings
117
122
  int32_t score0; // initial chaining score (before chain merging/splitting)
118
- uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, strand_retained:1, dummy:5;
123
+ uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, strand_retained:1, is_spliced:1, dummy:4;
119
124
  uint32_t hash;
120
125
  float div;
121
126
  mm_extra_t *p;
@@ -158,7 +163,7 @@ typedef struct {
158
163
  int transition; // transition mismatch score (A:G, C:T)
159
164
  int sc_ambi; // score when one or both bases are "N"
160
165
  int noncan; // cost of non-canonical splicing sites
161
- int junc_bonus;
166
+ int junc_bonus, junc_pen;
162
167
  int zdrop, zdrop_inv; // break alignment if alignment score drops too fast along the diagonal
163
168
  int end_bonus;
164
169
  int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold
@@ -171,6 +176,8 @@ typedef struct {
171
176
 
172
177
  int pe_ori, pe_bonus;
173
178
 
179
+ int32_t jump_min_match;
180
+
174
181
  float mid_occ_frac; // only used by mm_mapopt_update(); see below
175
182
  float q_occ_frac;
176
183
  int32_t min_mid_occ, max_mid_occ;
@@ -411,6 +418,10 @@ int mm_idx_alt_read(mm_idx_t *mi, const char *fn);
411
418
  int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
412
419
  int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s);
413
420
 
421
+ int mm_max_spsc_bonus(const mm_mapopt_t *mo);
422
+ int32_t mm_idx_spsc_read(mm_idx_t *idx, const char *fn, int32_t max_sc);
423
+ int64_t mm_idx_spsc_get(const mm_idx_t *db, int32_t cid, int64_t st0, int64_t en0, int32_t rev, uint8_t *sc);
424
+
414
425
  // deprecated APIs for backward compatibility
415
426
  void mm_mapopt_init(mm_mapopt_t *opt);
416
427
  mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);
@@ -1,4 +1,4 @@
1
- .TH minimap2 1 "12 March 2024" "minimap2-2.28 (r1209)" "Bioinformatics tools"
1
+ .TH minimap2 1 "18 April 2025" "minimap2-2.29 (r1283)" "Bioinformatics tools"
2
2
  .SH NAME
3
3
  .PP
4
4
  minimap2 - mapping and alignment between collections of DNA sequences
@@ -79,19 +79,6 @@ Minimizer k-mer length [15]
79
79
  .BI -w \ INT
80
80
  Minimizer window size [10]. A minimizer is the smallest k-mer
81
81
  in a window of w consecutive k-mers.
82
- .TP
83
- .BI -j \ INT
84
- Syncmer submer size [10]. Option
85
- .B -j
86
- and
87
- .B -w
88
- will override each: if
89
- .B -w
90
- is applied after
91
- .BR -j ,
92
- .B -j
93
- will have no effect, and vice versa.
94
-
95
82
  .TP
96
83
  .B -H
97
84
  Use homopolymer-compressed (HPC) minimizers. An HPC sequence is constructed by
@@ -310,11 +297,13 @@ maximum alignment gap is mostly controlled by
310
297
  .B --splice
311
298
  Enable the splice alignment mode.
312
299
  .TP
313
- .B --sr
314
- Enable short-read alignment heuristics. In the short-read mode, minimap2
315
- applies a second round of chaining with a higher minimizer occurrence threshold
316
- if no good chain is found. In addition, minimap2 attempts to patch gaps between
317
- seeds with ungapped alignment.
300
+ .BR --sr [= no | dna | rna ]
301
+ Enable short-read alignment heuristics [no]. If this option is used with no argument,
302
+ .RB ` dna '
303
+ is set. In the DNA short-read mode, minimap2 applies a second round of chaining
304
+ with a higher minimizer occurrence threshold if no good chain is found. In
305
+ addition, minimap2 attempts to patch gaps between seeds with ungapped
306
+ alignment.
318
307
  .TP
319
308
  .BI --split-prefix \ STR
320
309
  Prefix to create temporary files. Typically used for a multi-part index.
@@ -334,10 +323,6 @@ Only map to the reverse complement strand of the reference sequences.
334
323
  If yes, sort anchors with heap merge, instead of radix sort. Heap merge is
335
324
  faster for short reads, but slower for long reads. [no]
336
325
  .TP
337
- .B --no-pairing
338
- Treat two reads in a pair as independent reads. The mate related fields in SAM
339
- are still properly populated.
340
- .TP
341
326
  .B --no-hash-name
342
327
  Produce the same alignment for identical sequences regardless of their sequence names.
343
328
  .SS Alignment options
@@ -371,7 +356,16 @@ Splice model [1]. 0 for the original minimap2 splice model that always penalizes
371
356
  .B -C
372
357
  has no effect with the default
373
358
  .BR -J1 .
374
- .BR -J0 .
359
+ .TP
360
+ .BR -j \ FILE
361
+ Junctions used to extend alignment towards ends of reads [].
362
+ .I FILE
363
+ can be gene annotations in the BED12 format (aka 12-column BED), or intron
364
+ positions in 5-column BED with the strand column required. BED12 file can be
365
+ converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf'. This option is
366
+ intended for short RNA-seq reads, while
367
+ .B --junc-bed
368
+ for long noisy RNA-seq reads.
375
369
  .TP
376
370
  .BI -C \ INT
377
371
  Cost for a non-canonical GT-AG splicing (effective with
@@ -414,7 +408,16 @@ no attempt to match GT-AG [n]
414
408
  Score bonus when alignment extends to the end of the query sequence [0].
415
409
  .TP
416
410
  .BI --score-N \ INT
417
- Score of a mismatch involving ambiguous bases [1].
411
+ Penalty of a mismatch involving ambiguous bases [1].
412
+ .TP
413
+ .BR --pairing = strong | weak | no
414
+ How to pair paired-end reads [strong].
415
+ .RB ` no '
416
+ for aligning the two ends in a pair independently with no `properly paired' set.
417
+ .RB ` weak '
418
+ for aligning the two ends independently and then pairing the hits.
419
+ .RB ` strong '
420
+ for jointly aligning and pairing the two ends.
418
421
  .TP
419
422
  .BR --splice-flank = yes | no
420
423
  Assume the next base to a
@@ -433,16 +436,40 @@ on SIRV data, please add
433
436
  .B --splice-flank=no
434
437
  to the command line.
435
438
  .TP
439
+ .BR --spsc \ FILE
440
+ Splice scores []. Each line consists of five fields: 1) contig, 2) offset, 3) `+' or `-', 4) `D' or `A', and 5) score,
441
+ where offset is the number of bases before a splice junction, `D' indicates the
442
+ line corresponds to a donor site and `A' for an acceptor site.
443
+ A positive score suggests the junction is preferred and a negative score
444
+ suggests the junction is not preferred.
445
+ .TP
446
+ .BR --junc-pen \ INT
447
+ Penalty for a position not in FILE specified by
448
+ .B --spsc
449
+ [5]. Effective with
450
+ .B --spsc
451
+ but not
452
+ .BR --junc-bed .
453
+ .TP
436
454
  .BR --junc-bed \ FILE
437
- Gene annotations in the BED12 format (aka 12-column BED), or intron positions
438
- in 5-column BED. With this option, minimap2 prefers splicing in annotations.
439
- BED12 file can be converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf'
440
- [].
455
+ Junctions to prefer during base alignment [].
456
+ Same format as
457
+ .BR -j .
458
+ It is
459
+ .I NOT
460
+ recommended to apply this option to short RNA-seq reads. This would increase
461
+ run time with little improvement to junction accuracy.
441
462
  .TP
442
463
  .BR --junc-bonus \ INT
443
- Score bonus for a splice donor or acceptor found in annotation (effective with
444
- .BR --junc-bed )
445
- [9].
464
+ Score bonus for a splice donor or acceptor found in annotation [9]. Effective with
465
+ .B --junc-bed
466
+ but not
467
+ .BR --spsc .
468
+ .TP
469
+ .BR --jump-min-match \ INT
470
+ Minimum matching length to create a jump [3]. Equivalent to
471
+ .B STAR
472
+ .BR --alignSJDBoverhangMin .
446
473
  .TP
447
474
  .BI --end-seed-pen \ INT
448
475
  Drop a terminal anchor if
@@ -500,20 +527,13 @@ Copy input FASTA/Q comments to output.
500
527
  .B -c
501
528
  Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag.
502
529
  .TP
503
- .BI --cs[= STR ]
530
+ .BR --cs [= short | long ]
504
531
  Output the
505
532
  .B cs
506
533
  tag.
507
- .I STR
508
- can be either
509
- .I short
510
- or
511
- .IR long .
512
- If no
513
- .I STR
514
- is given,
515
- .I short
516
- is assumed. [none]
534
+ If no argument is given,
535
+ .RB ` short '
536
+ is set. [none]
517
537
  .TP
518
538
  .B --MD
519
539
  Output the MD tag (see the SAM spec).
@@ -527,6 +547,26 @@ In SAM output, use soft clipping for supplementary alignments.
527
547
  .B --secondary-seq
528
548
  In SAM output, show query sequences for secondary alignments.
529
549
  .TP
550
+ .B --write-junc
551
+ Output splice junctions in 6-column BED: contig name, start, end,
552
+ read name, score and strand. Score is the sum of donor and acceptor scores,
553
+ where GT gets 3, GC gets 2 and AT gets 1 at donor sites,
554
+ while AG gets 3 and AC gets 1 at acceptor sites.
555
+ Alignments with mapping quality below 10 are ignored.
556
+ .TP
557
+ .BI --pass1 \ FILE
558
+ Junctions BED file outputted by
559
+ .B --write-junc
560
+ []. Rows with scores lower than 5 are ignored. When both
561
+ .B -j
562
+ and
563
+ .B --pass1
564
+ are present, junctions in
565
+ .B -j
566
+ are preferred over those in
567
+ .BR --pass1
568
+ when there is ambiguity.
569
+ .TP
530
570
  .BI --seed \ INT
531
571
  Integer seed for randomizing equally best hits. Minimap2 hashes
532
572
  .I INT
@@ -666,10 +706,16 @@ Spliced alignment for accurate long RNA-seq reads such as PacBio iso-seq
666
706
  .B -C5 -O6,24
667
707
  .BR -B4 ).
668
708
  .TP
709
+ .B splice:sr
710
+ Spliced alignment for short RNA-seq reads
711
+ .RB ( -xsplice:hq
712
+ .B --frag=yes -m25 -s40 -2K100m --heap-sort=yes --pairing=weak --sr=rna --min-dp-len=20
713
+ .BR --secondary=no ).
714
+ .TP
669
715
  .B sr
670
716
  Short-read alignment without splicing
671
717
  .RB ( -k21
672
- .B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m25
718
+ .B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r100 -p.5 -N20 -f1000,5000 -n2 -m25
673
719
  .B -s40 -g100 -2K50m --heap-sort=yes
674
720
  .BR --secondary=no ).
675
721
  .TP
@@ -742,7 +788,7 @@ s2 i Chaining score of the best secondary chain
742
788
  NM i Total number of mismatches and gaps in the alignment
743
789
  MD Z To generate the ref sequence in the alignment
744
790
  AS i DP alignment score
745
- SA Z List of other supplementary alignments
791
+ SA Z List of other supplementary alignments (with approximate CIGAR strings)
746
792
  ms i DP score of the max scoring segment in the alignment
747
793
  nn i Number of ambiguous bases in the alignment
748
794
  ts A Transcript strand (splice mode only)
@@ -751,6 +797,7 @@ cs Z Difference string
751
797
  dv f Approximate per-base sequence divergence
752
798
  de f Gap-compressed per-base sequence divergence
753
799
  rl i Length of query regions harboring repetitive seeds
800
+ zd i Alignment broken due to Z-drop; bit 1: left broken; bit 2: right broken
754
801
  .TE
755
802
 
756
803
  .PP
@@ -16,7 +16,8 @@ minimap2 -c test/MT-human.fa test/MT-orang.fa \
16
16
  | paftools.js liftover -l10000 - <(echo -e "MT_orang\t2000\t5000") # liftOver
17
17
  # no test data for the following examples
18
18
  paftools.js junceval -e anno.gtf splice.sam > out.txt # compare splice junctions to annotations
19
- paftools.js splice2bed anno.gtf > anno.bed # convert GTF/GFF3 to BED12
19
+ paftools.js splice2bed splice.sam > splice.bed # convert PAF/SAM to BED12
20
+ paftools.js gff2bed anno.gtf > anno.bed # convert GTF/GFF3 to BED12
20
21
  ```
21
22
 
22
23
  ## Table of Contents
@@ -0,0 +1,241 @@
1
+ #!/usr/bin/env k8
2
+
3
+ "use strict";
4
+
5
/**
 * Remove the element at index `i` in place, shifting every later element one
 * slot to the left and shrinking the array by one.
 *
 * An index outside [0, length) is ignored. Without this guard the array would
 * still be truncated by one (dropping an unrelated trailing element), and an
 * empty array would throw a RangeError when its length went negative.
 */
Array.prototype.delete_at = function(i) {
	if (i < 0 || i >= this.length) return; // nothing to delete at this index
	for (let j = i; j < this.length - 1; ++j)
		this[j] = this[j + 1];
	--this.length;
}
10
+
11
/**
 * Generator-based command-line option parser.
 *
 * Each parsed option is yielded as `{ opt, arg }`, where `opt` is "-x" for a
 * short option, "--name" for a long option, or "?" when the option is unknown
 * (or a long-option prefix is ambiguous). Consumed options and their arguments
 * are removed from `argv` via `Array.prototype.delete_at`, so that once the
 * generator is exhausted `argv` holds only what was not consumed.
 *
 * @param argv      array of argument strings; mutated in place
 * @param ostr      short-option spec; a letter followed by ":" takes an argument
 * @param longopts  array of long-option names; a trailing "=" means the option
 *                  takes an argument. Unambiguous prefixes are accepted.
 *
 * A short option that requires an argument but is the last element of `argv`
 * yields an empty-string argument and leaves the rest of `argv` untouched
 * (matching the long-option branch, which already guards this case).
 */
function* getopt(argv, ostr, longopts) {
	if (argv.length == 0) return;
	let pos = 0, cur = 0; // pos: offset inside a bundle of short options; cur: index into argv
	while (cur < argv.length) {
		let lopt = "", opt = "?", arg = "";
		while (cur < argv.length) { // skip non-option arguments
			if (argv[cur][0] == "-" && argv[cur].length > 1) {
				if (argv[cur] == "--") cur = argv.length; // "--" stops option parsing
				break;
			} else ++cur;
		}
		if (cur == argv.length) break;
		let a = argv[cur];
		if (a[0] == "-" && a[1] == "-") { // a long option
			pos = -1;
			let c = 0, k = -1, tmp = "", o;
			const pos_eq = a.indexOf("=");
			if (pos_eq > 0) {
				o = a.substring(2, pos_eq);
				arg = a.substring(pos_eq + 1);
			} else o = a.substring(2);
			for (let i = 0; i < longopts.length; ++i) {
				let y = longopts[i];
				if (y[y.length - 1] == "=") y = y.substring(0, y.length - 1);
				if (o.length <= y.length && o == y.substring(0, o.length)) {
					k = i, tmp = y;
					++c; // c is the number of matches
					if (o == y) { // exact match
						c = 1;
						break;
					}
				}
			}
			if (c == 1) { // find a unique match
				lopt = tmp;
				if (pos_eq < 0 && longopts[k][longopts[k].length-1] == "=" && cur + 1 < argv.length) {
					arg = argv[cur+1];
					argv.delete_at(cur + 1);
				}
			}
		} else { // a short option
			if (pos == 0) pos = 1;
			opt = a[pos++];
			let k = ostr.indexOf(opt);
			if (k < 0) {
				opt = "?";
			} else if (k + 1 < ostr.length && ostr[k+1] == ":") { // requiring an argument
				if (pos >= a.length) {
					// only consume the next element when there is one; otherwise
					// arg stays "" and argv is not truncated past its end
					if (cur + 1 < argv.length) {
						arg = argv[cur+1];
						argv.delete_at(cur + 1);
					}
				} else arg = a.substring(pos);
				pos = -1;
			}
		}
		if (pos < 0 || pos >= argv[cur].length) { // done with argv[cur]
			argv.delete_at(cur);
			pos = 0;
		}
		if (lopt != "") yield { opt: `--${lopt}`, arg: arg };
		else if (opt != "?") yield { opt: `-${opt}`, arg: arg };
		else yield { opt: "?", arg: "" };
	}
}
74
+
75
/**
 * Yield the lines of file `fn` one at a time as strings.
 *
 * Uses the `Bytes` and `File` objects provided by the runtime. The file is
 * closed and the buffer destroyed in a `finally` clause, so the resources are
 * released both when the file is read to the end and when the consumer stops
 * iterating early (e.g. `break` out of a `for...of`).
 */
function* k8_readline(fn) {
	let buf = new Bytes();
	let file = new File(fn);
	try {
		while (file.readline(buf) >= 0) {
			yield buf.toString();
		}
	} finally {
		file.close();
		buf.destroy();
	}
}
84
+
85
/**
 * Collapse all hits between one pair of sequences into a single summary record.
 *
 * @param b  non-empty array of hit objects for the same (name1, name2) pair,
 *           each carrying name1/name2, len1/len2, st1/en1, st2/en2, s1, dv,
 *           blen, cov1/cov2 and min_cov/max_cov. Sorted in place by st1.
 * @return   { name1, name2, len1, len2, min_cov, max_cov, cov1, cov2, s1, dv }
 *
 * A single hit is passed through unchanged (summary fields only). Otherwise a
 * dynamic program picks the best-scoring chain of hits whose st2, en1 and en2
 * all increase along the chain. When a hit overlaps its predecessor, its score
 * is prorated by the non-overlapping fraction on each sequence and the smaller
 * of the two prorated scores is used. The result reports the chain score, the
 * blen-weighted mean dv, and the fraction of each sequence covered by the
 * union of the chained intervals.
 */
function merge_hits(b) {
	if (b.length == 1)
		return { name1:b[0].name1, name2:b[0].name2, len1:b[0].len1, len2:b[0].len2, min_cov:b[0].min_cov, max_cov:b[0].max_cov, cov1:b[0].cov1, cov2:b[0].cov2, s1:b[0].s1, dv:b[0].dv };
	b.sort(function(x, y) { return x.st1 - y.st1 });
	let f = [], bt = []; // f[i]: best chain score ending at hit i; bt[i]: predecessor or -1
	for (let i = 0; i < b.length; ++i)
		f[i] = b[i].s1, bt[i] = -1;
	for (let i = 0; i < b.length; ++i) {
		for (let j = 0; j < i; ++j) {
			if (b[j].st2 < b[i].st2) {
				if (b[j].en1 >= b[i].en1) continue;
				if (b[j].en2 >= b[i].en2) continue;
				// overlap length is (end of predecessor) - (start of current); it must
				// be non-negative so that the score below is discounted, not inflated
				const ov1 = b[j].en1 <= b[i].st1? 0 : b[j].en1 - b[i].st1;
				const li1 = b[i].en1 - b[i].st1;
				const s11 = b[i].s1 / li1 * (li1 - ov1);
				const ov2 = b[j].en2 <= b[i].st2? 0 : b[j].en2 - b[i].st2;
				const li2 = b[i].en2 - b[i].st2;
				const s12 = b[i].s1 / li2 * (li2 - ov2);
				const s1 = s11 < s12? s11 : s12;
				if (f[i] < f[j] + s1)
					f[i] = f[j] + s1, bt[i] = j;
			}
		}
	}
	// locate the best chain end and backtrack to recover the chain in order
	let max_i = -1, max_f = 0, d = [];
	for (let i = 0; i < b.length; ++i)
		if (max_f < f[i])
			max_f = f[i], max_i = i;
	for (let k = max_i; k >= 0; k = bt[k])
		d.push(k);
	d = d.reverse();
	// accumulate weighted divergence and the union of covered intervals
	let dv = 0, tot = 0, cov1 = 0, cov2 = 0, st1 = 0, en1 = 0, st2 = 0, en2 = 0;
	for (let k = 0; k < d.length; ++k) {
		const i = d[k];
		tot += b[i].blen;
		dv += b[i].dv * b[i].blen;
		if (b[i].st1 > en1) { // disjoint from the running interval: flush it
			cov1 += en1 - st1;
			st1 = b[i].st1, en1 = b[i].en1;
		} else en1 = en1 > b[i].en1? en1 : b[i].en1;
		if (b[i].st2 > en2) {
			cov2 += en2 - st2;
			st2 = b[i].st2, en2 = b[i].en2;
		} else en2 = en2 > b[i].en2? en2 : b[i].en2;
	}
	dv /= tot;
	cov1 = (cov1 + (en1 - st1)) / b[0].len1;
	cov2 = (cov2 + (en2 - st2)) / b[0].len2;
	const min_cov = cov1 < cov2? cov1 : cov2;
	const max_cov = cov1 > cov2? cov1 : cov2;
	return { name1:b[0].name1, name2:b[0].name2, len1:b[0].len1, len2:b[0].len2, min_cov:min_cov, max_cov:max_cov, cov1:cov1, cov2:cov2, s1:max_f, dv:dv };
}
138
+
139
/**
 * Entry point: cluster sequences from an all-vs-all PAF file.
 *
 * Options -c/-d/-e override the minimum coverage, maximum divergence and
 * maximum uncovered length. Forward-strand hits carrying both s1 and dv tags
 * are read, hits between the same ordered pair are merged with merge_hits(),
 * and then clusters are peeled off greedily: the sequence with the largest
 * summed s1 (as name1) seeds a cluster, every sequence linked to it by a hit
 * passing all thresholds joins, and hits touching the cluster are dropped.
 * Each cluster is printed as an "SD" line, one "CL" line per member and "//".
 * Sequences never placed in a cluster are printed as singletons at the end.
 */
function main(args) {
	const opt = { min_cov:.9, max_dv:.015, max_diff:20000 };
	for (const o of getopt(args, "c:d:e:", [])) {
		switch (o.opt) {
			case '-c': opt.min_cov = parseFloat(o.arg); break;
			case '-d': opt.max_dv = parseFloat(o.arg); break;
			case '-e': opt.max_diff = parseFloat(o.arg); break;
		}
	}
	if (args.length == 0) {
		print("Usage: pafcluster.js [options] <ava.paf>");
		print("Options:");
		print(` -c FLOAT min coverage [${opt.min_cov}]`);
		print(` -d FLOAT max divergence [${opt.max_dv}]`);
		print(` -e FLOAT max difference [${opt.max_diff}]`);
		return;
	}

	// load hits; len and name2len both map sequence name -> length, but
	// name2len is consumed as sequences get assigned to clusters
	let hits = [];
	const len = {}, name2len = {};
	for (const line of k8_readline(args[0])) {
		const t = line.split("\t");
		if (t[4] != "+") continue;
		for (let i = 1; i < 4; ++i) t[i] = parseInt(t[i]);
		for (let i = 6; i < 11; ++i) t[i] = parseInt(t[i]);
		const len1 = t[1], len2 = t[6];
		let s1 = -1, dv = -1.0;
		for (let i = 12; i < t.length; ++i) {
			const m = /^(s1|dv):\S:(\S+)/.exec(t[i]);
			if (m == null) continue;
			if (m[1] == "s1") s1 = parseInt(m[2]);
			else if (m[1] == "dv") dv = parseFloat(m[2]);
		}
		if (s1 < 0 || dv < 0) continue; // both tags are required
		const cov1 = (parseInt(t[3]) - parseInt(t[2])) / len1;
		const cov2 = (parseInt(t[8]) - parseInt(t[7])) / len2;
		const min_cov = cov1 < cov2? cov1 : cov2;
		const max_cov = cov1 > cov2? cov1 : cov2;
		name2len[t[0]] = len1;
		name2len[t[5]] = len2;
		hits.push({ name1:t[0], name2:t[5], len1:len1, len2:len2, min_cov:min_cov, max_cov:max_cov, s1:s1, dv:dv, cov1:cov1, cov2:cov2, st1:t[2], en1:t[3], st2:t[7], en2:t[8], blen:t[10] });
		len[t[0]] = len1;
		len[t[5]] = len2;
	}
	warn(`Read ${hits.length} hits`);

	// collapse multiple hits between the same ordered pair of sequences
	const by_pair = {};
	for (const hit of hits) {
		const key = `${hit.name1}\t${hit.name2}`;
		if (by_pair[key] == null) by_pair[key] = [];
		by_pair[key].push(hit);
	}
	hits = [];
	for (const key in by_pair)
		hits.push(merge_hits(by_pair[key]));

	// greedy clustering
	while (hits.length > 1) {
		// seed: the name1 with the highest total s1
		const score_sum = {};
		for (const hit of hits) {
			if (score_sum[hit.name1] == null) score_sum[hit.name1] = 0;
			score_sum[hit.name1] += hit.s1;
		}
		let max_s1 = 0, max_name = "";
		for (const name in score_sum) {
			if (max_s1 < score_sum[name]) {
				max_s1 = score_sum[name];
				max_name = name;
			}
		}
		// members: everything linked to the seed by a hit passing all thresholds
		const member = {};
		member[max_name] = 1;
		for (const hit of hits) {
			if (hit.name1 != max_name && hit.name2 != max_name)
				continue;
			const diff1 = hit.len1 * (1.0 - hit.cov1);
			const diff2 = hit.len2 * (1.0 - hit.cov2);
			if (hit.min_cov >= opt.min_cov && hit.dv <= opt.max_dv && diff1 <= opt.max_diff && diff2 <= opt.max_diff) {
				member[hit.name2] = 1;
				member[hit.name1] = 1;
			}
		}
		let n = 0;
		for (const key in member) {
			++n;
			delete name2len[key]; // clustered, so no longer a singleton candidate
		}
		print(`SD\t${max_name}\t${n}`);
		for (const key in member) print(`CL\t${key}\t${len[key]}`);
		print("//");
		// keep only hits that touch no member of this cluster
		const rest = [];
		for (const hit of hits) {
			if (member[hit.name1] == null && member[hit.name2] == null)
				rest.push(hit);
		}
		warn(`Reduced the number of hits from ${hits.length} to ${rest.length}`);
		hits = rest;
	}

	// output remaining singletons
	for (const key in name2len) {
		print(`SD\t${key}\t1`);
		print(`CL\t${key}\t${name2len[key]}`);
		print(`//`);
	}
}
240
+
241
+ main(arguments);
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env k8
2
2
 
3
- var paftools_version = '2.28-r1209';
3
+ var paftools_version = '2.29-r1283';
4
4
 
5
5
  /*****************************
6
6
  ***** Library functions *****
@@ -2187,7 +2187,7 @@ function paf_mapeval(args)
2187
2187
  }
2188
2188
 
2189
2189
  var lineno = 0, last = null, a = [], n_unmapped = null;
2190
- var re_cigar = /(\d+)([MIDSHN])/g;
2190
+ var re_cigar = /(\d+)([MIDSHN=X])/g;
2191
2191
  while (file.readline(buf) >= 0) {
2192
2192
  var m, line = buf.toString();
2193
2193
  ++lineno;
@@ -2225,7 +2225,7 @@ function paf_mapeval(args)
2225
2225
  var n_gap = 0, mlen = 0;
2226
2226
  while ((m = re_cigar.exec(t[5])) != null) {
2227
2227
  var len = parseInt(m[1]);
2228
- if (m[2] == 'M') pos_end += len, mlen += len;
2228
+ if (m[2] == 'M' || m[2] == 'X' || m[2] == '=') pos_end += len, mlen += len;
2229
2229
  else if (m[2] == 'I') n_gap += len;
2230
2230
  else if (m[2] == 'D') n_gap += len, pos_end += len;
2231
2231
  }
@@ -2494,6 +2494,10 @@ function paf_junceval(args)
2494
2494
  } else { // SAM
2495
2495
  ctg_name = t[2], pos = parseInt(t[3]) - 1, cigar = t[5];
2496
2496
  var flag = parseInt(t[1]);
2497
+ if (flag & 1) {
2498
+ if (flag & 0x40) qname += '/1';
2499
+ else if (flag & 0x80) qname += '/2';
2500
+ }
2497
2501
  if (flag&0x100) continue; // secondary
2498
2502
  }
2499
2503
 
@@ -3240,6 +3244,7 @@ function paf_sveval(args)
3240
3244
  if (bed != null && bed[t[0]] == null) continue;
3241
3245
  if (t[4] == '<INV>' || t[4] == '<INVDUP>') continue; // no inversion
3242
3246
  if (/[\[\]]/.test(t[4])) continue; // no break points
3247
+ if (t[6] != "." && t[6] != "PASS") continue;
3243
3248
  var st = parseInt(t[1]) - 1, en = st + t[3].length;
3244
3249
  // parse svlen
3245
3250
  var b = _paf_get_alen(t), svlen = b[0];
@@ -24,6 +24,9 @@
24
24
  #define MM_SEED_SEG_SHIFT 48
25
25
  #define MM_SEED_SEG_MASK (0xffULL<<(MM_SEED_SEG_SHIFT))
26
26
 
27
+ #define MM_JUNC_ANNO 0x1
28
+ #define MM_JUNC_MISC 0x2
29
+
27
30
  #ifndef kroundup32
28
31
  #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
29
32
  #endif
@@ -33,6 +36,7 @@
33
36
 
34
37
  #define MALLOC(type, len) ((type*)malloc((len) * sizeof(type)))
35
38
  #define CALLOC(type, len) ((type*)calloc((len), sizeof(type)))
39
+ #define REALLOC(type, ptr, cnt) ((type*)realloc((ptr), (cnt) * sizeof(type)))
36
40
 
37
41
  #ifdef __cplusplus
38
42
  extern "C" {
@@ -52,6 +56,12 @@ typedef struct {
52
56
  mm128_t *a;
53
57
  } mm_seg_t;
54
58
 
59
+ typedef struct {
60
+ int32_t off, off2, cnt;
61
+ int16_t strand;
62
+ uint16_t flag;
63
+ } mm_idx_jjump1_t;
64
+
55
65
  double cputime(void);
56
66
  double realtime(void);
57
67
  long peakrss(void);
@@ -69,17 +79,23 @@ double mm_event_identity(const mm_reg1_t *r);
69
79
  int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
70
80
  void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag);
71
81
  void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len);
82
+ void mm_write_paf4(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len, int n_seg, int seg_idx);
72
83
  void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
73
84
  void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regs, const mm_reg1_t *const* regs, void *km, int64_t opt_flag);
74
85
  void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regss, const mm_reg1_t *const* regss, void *km, int64_t opt_flag, int rep_len);
86
+ void mm_write_junc(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r);
75
87
 
88
+ // indexing related in index.c
76
89
  void mm_idxopt_init(mm_idxopt_t *opt);
77
90
  const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
78
91
  int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
79
92
  int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
80
- mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
81
93
  mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand);
94
+ int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
95
+ int mm_idx_jjump_read(mm_idx_t *mi, const char *fn, int flag, int min_sc);
96
+ const mm_idx_jjump1_t *mm_idx_jump_get(const mm_idx_t *db, int32_t cid, int32_t st, int32_t en, int32_t *n);
82
97
 
98
+ // chaining in lchain.c
83
99
  mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
84
100
  int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
85
101
  mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
@@ -96,8 +112,12 @@ void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int
96
112
  int mm_filter_strand_retained(int n_regs, mm_reg1_t *r);
97
113
  void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs);
98
114
  void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac);
99
- void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr);
115
+ void mm_set_mapq2(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr, int is_splice);
100
116
  void mm_update_dp_max(int qlen, int n_regs, mm_reg1_t *regs, float frac, int a, int b);
117
+ void mm_jump_split(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq, mm_reg1_t *r, int32_t ts_strand);
118
+
119
+ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
120
+ void mm_enlarge_cigar(mm_reg1_t *r, uint32_t n_cigar);
101
121
 
102
122
  void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos);
103
123
 
@@ -105,6 +125,8 @@ mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int
105
125
  void mm_seg_free(void *km, int n_segs, mm_seg_t *segs);
106
126
  void mm_pair(void *km, int max_gap_ref, int dp_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs);
107
127
 
128
+ void mm_jump_split(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq, mm_reg1_t *r, int32_t ts_strand);
129
+
108
130
  FILE *mm_split_init(const char *prefix, const mm_idx_t *mi);
109
131
  mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part);
110
132
  int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_split_idx);