minimap2 0.2.27.0 → 0.2.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -0
  3. data/ext/cmappy/cmappy.c +3 -3
  4. data/ext/cmappy/cmappy.h +1 -1
  5. data/ext/minimap2/FAQ.md +1 -1
  6. data/ext/minimap2/Makefile +4 -3
  7. data/ext/minimap2/NEWS.md +68 -0
  8. data/ext/minimap2/README.md +30 -14
  9. data/ext/minimap2/align.c +136 -52
  10. data/ext/minimap2/cookbook.md +2 -2
  11. data/ext/minimap2/format.c +59 -5
  12. data/ext/minimap2/hit.c +14 -6
  13. data/ext/minimap2/index.c +304 -13
  14. data/ext/minimap2/jump.c +201 -0
  15. data/ext/minimap2/kalloc.h +8 -0
  16. data/ext/minimap2/ksw2.h +5 -2
  17. data/ext/minimap2/ksw2_dispatch.c +5 -5
  18. data/ext/minimap2/ksw2_exts2_sse.c +17 -6
  19. data/ext/minimap2/lchain.c +5 -5
  20. data/ext/minimap2/main.c +64 -12
  21. data/ext/minimap2/map.c +35 -8
  22. data/ext/minimap2/minimap.h +14 -3
  23. data/ext/minimap2/minimap2.1 +98 -46
  24. data/ext/minimap2/misc/README.md +2 -1
  25. data/ext/minimap2/misc/pafcluster.js +241 -0
  26. data/ext/minimap2/misc/paftools.js +17 -6
  27. data/ext/minimap2/mmpriv.h +25 -4
  28. data/ext/minimap2/options.c +36 -3
  29. data/ext/minimap2/python/cmappy.h +3 -3
  30. data/ext/minimap2/python/cmappy.pxd +5 -2
  31. data/ext/minimap2/python/mappy.pyx +20 -7
  32. data/ext/minimap2/python/minimap2.py +5 -3
  33. data/ext/minimap2/seed.c +2 -1
  34. data/ext/minimap2/setup.py +2 -2
  35. data/ext/minimap2.patch +2 -2
  36. data/lib/minimap2/aligner.rb +19 -12
  37. data/lib/minimap2/alignment.rb +1 -0
  38. data/lib/minimap2/ffi/constants.rb +10 -2
  39. data/lib/minimap2/ffi/functions.rb +145 -6
  40. data/lib/minimap2/ffi/mappy.rb +1 -1
  41. data/lib/minimap2/version.rb +1 -1
  42. data/lib/minimap2.rb +2 -2
  43. metadata +8 -7
  44. data/ext/minimap2/misc/mmphase.js +0 -335
@@ -5,7 +5,7 @@
5
5
  #include <stdio.h>
6
6
  #include <sys/types.h>
7
7
 
8
- #define MM_VERSION "2.27-r1193"
8
+ #define MM_VERSION "2.29-r1283"
9
9
 
10
10
  #define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
11
11
  #define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
@@ -45,6 +45,9 @@
45
45
  #define MM_F_SPLICE_OLD (0x800000000LL)
46
46
  #define MM_F_SECONDARY_SEQ (0x1000000000LL) //output SEQ field for secondary alignments using hard clipping
47
47
  #define MM_F_OUT_DS (0x2000000000LL)
48
+ #define MM_F_WEAK_PAIRING (0x4000000000LL)
49
+ #define MM_F_SR_RNA (0x8000000000LL)
50
+ #define MM_F_OUT_JUNC (0x10000000000LL)
48
51
 
49
52
  #define MM_I_HPC 0x1
50
53
  #define MM_I_NO_SEQ 0x2
@@ -91,6 +94,8 @@ typedef struct {
91
94
  uint32_t *S; // 4-bit packed sequence
92
95
  struct mm_idx_bucket_s *B; // index (hidden)
93
96
  struct mm_idx_intv_s *I; // intervals (hidden)
97
+ struct mm_idx_spsc_s *spsc;// splice score (hidden)
98
+ struct mm_idx_jjump_s *J; // junctions to create jumps (hidden)
94
99
  void *km, *h;
95
100
  } mm_idx_t;
96
101
 
@@ -115,7 +120,7 @@ typedef struct {
115
120
  int32_t mlen, blen; // seeded exact match length; seeded alignment block length
116
121
  int32_t n_sub; // number of suboptimal mappings
117
122
  int32_t score0; // initial chaining score (before chain merging/splitting)
118
- uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, strand_retained:1, dummy:5;
123
+ uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, strand_retained:1, is_spliced:1, dummy:4;
119
124
  uint32_t hash;
120
125
  float div;
121
126
  mm_extra_t *p;
@@ -158,7 +163,7 @@ typedef struct {
158
163
  int transition; // transition mismatch score (A:G, C:T)
159
164
  int sc_ambi; // score when one or both bases are "N"
160
165
  int noncan; // cost of non-canonical splicing sites
161
- int junc_bonus;
166
+ int junc_bonus, junc_pen;
162
167
  int zdrop, zdrop_inv; // break alignment if alignment score drops too fast along the diagonal
163
168
  int end_bonus;
164
169
  int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold
@@ -171,6 +176,8 @@ typedef struct {
171
176
 
172
177
  int pe_ori, pe_bonus;
173
178
 
179
+ int32_t jump_min_match;
180
+
174
181
  float mid_occ_frac; // only used by mm_mapopt_update(); see below
175
182
  float q_occ_frac;
176
183
  int32_t min_mid_occ, max_mid_occ;
@@ -411,6 +418,10 @@ int mm_idx_alt_read(mm_idx_t *mi, const char *fn);
411
418
  int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
412
419
  int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s);
413
420
 
421
+ int mm_max_spsc_bonus(const mm_mapopt_t *mo);
422
+ int32_t mm_idx_spsc_read(mm_idx_t *idx, const char *fn, int32_t max_sc);
423
+ int64_t mm_idx_spsc_get(const mm_idx_t *db, int32_t cid, int64_t st0, int64_t en0, int32_t rev, uint8_t *sc);
424
+
414
425
  // deprecated APIs for backward compatibility
415
426
  void mm_mapopt_init(mm_mapopt_t *opt);
416
427
  mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);
@@ -1,4 +1,4 @@
1
- .TH minimap2 1 "12 March 2024" "minimap2-2.27 (r1193)" "Bioinformatics tools"
1
+ .TH minimap2 1 "18 April 2025" "minimap2-2.29 (r1283)" "Bioinformatics tools"
2
2
  .SH NAME
3
3
  .PP
4
4
  minimap2 - mapping and alignment between collections of DNA sequences
@@ -79,19 +79,6 @@ Minimizer k-mer length [15]
79
79
  .BI -w \ INT
80
80
  Minimizer window size [10]. A minimizer is the smallest k-mer
81
81
  in a window of w consecutive k-mers.
82
- .TP
83
- .BI -j \ INT
84
- Syncmer submer size [10]. Option
85
- .B -j
86
- and
87
- .B -w
88
- will override each: if
89
- .B -w
90
- is applied after
91
- .BR -j ,
92
- .B -j
93
- will have no effect, and vice versa.
94
-
95
82
  .TP
96
83
  .B -H
97
84
  Use homopolymer-compressed (HPC) minimizers. An HPC sequence is constructed by
@@ -268,6 +255,11 @@ or more of the shorter chain [0.5]
268
255
  Use the minigraph chaining algorithm [no]. The minigraph algorithm is better
269
256
  for aligning contigs through long INDELs.
270
257
  .TP
258
+ .BI --rmq-inner \ NUM
259
+ Apply full dynamic programming for anchors within distance
260
+ .I NUM
261
+ [1000].
262
+ .TP
271
263
  .B --hard-mask-level
272
264
  Honor option
273
265
  .B -M
@@ -305,11 +297,13 @@ maximum alignment gap is mostly controlled by
305
297
  .B --splice
306
298
  Enable the splice alignment mode.
307
299
  .TP
308
- .B --sr
309
- Enable short-read alignment heuristics. In the short-read mode, minimap2
310
- applies a second round of chaining with a higher minimizer occurrence threshold
311
- if no good chain is found. In addition, minimap2 attempts to patch gaps between
312
- seeds with ungapped alignment.
300
+ .BR --sr [= no | dna | rna ]
301
+ Enable short-read alignment heuristics [no]. If this option is used with no argument,
302
+ .RB ` dna '
303
+ is set. In the DNA short-read mode, minimap2 applies a second round of chaining
304
+ with a higher minimizer occurrence threshold if no good chain is found. In
305
+ addition, minimap2 attempts to patch gaps between seeds with ungapped
306
+ alignment.
313
307
  .TP
314
308
  .BI --split-prefix \ STR
315
309
  Prefix to create temporary files. Typically used for a multi-part index.
@@ -329,10 +323,6 @@ Only map to the reverse complement strand of the reference sequences.
329
323
  If yes, sort anchors with heap merge, instead of radix sort. Heap merge is
330
324
  faster for short reads, but slower for long reads. [no]
331
325
  .TP
332
- .B --no-pairing
333
- Treat two reads in a pair as independent reads. The mate related fields in SAM
334
- are still properly populated.
335
- .TP
336
326
  .B --no-hash-name
337
327
  Produce the same alignment for identical sequences regardless of their sequence names.
338
328
  .SS Alignment options
@@ -366,7 +356,16 @@ Splice model [1]. 0 for the original minimap2 splice model that always penalizes
366
356
  .B -C
367
357
  has no effect with the default
368
358
  .BR -J1 .
369
- .BR -J0 .
359
+ .TP
360
+ .BR -j \ FILE
361
+ Junctions used to extend alignment towards ends of reads [].
362
+ .I FILE
363
+ can be gene annotations in the BED12 format (aka 12-column BED), or intron
364
+ positions in 5-column BED with the strand column required. BED12 file can be
365
+ converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf'. This option is
366
+ intended for short RNA-seq reads, while
367
+ .B --junc-bed
368
+ is intended for long noisy RNA-seq reads.
370
369
  .TP
371
370
  .BI -C \ INT
372
371
  Cost for a non-canonical GT-AG splicing (effective with
@@ -409,7 +408,16 @@ no attempt to match GT-AG [n]
409
408
  Score bonus when alignment extends to the end of the query sequence [0].
410
409
  .TP
411
410
  .BI --score-N \ INT
412
- Score of a mismatch involving ambiguous bases [1].
411
+ Penalty of a mismatch involving ambiguous bases [1].
412
+ .TP
413
+ .BR --pairing = strong | weak | no
414
+ How to pair paired-end reads [strong].
415
+ .RB ` no '
416
+ for aligning the two ends in a pair independently with no `properly paired' set.
417
+ .RB ` weak '
418
+ for aligning the two ends independently and then pairing the hits.
419
+ .RB ` strong '
420
+ for jointly aligning and pairing the two ends.
413
421
  .TP
414
422
  .BR --splice-flank = yes | no
415
423
  Assume the next base to a
@@ -428,16 +436,40 @@ on SIRV data, please add
428
436
  .B --splice-flank=no
429
437
  to the command line.
430
438
  .TP
439
+ .BR --spsc \ FILE
440
+ Splice scores []. Each line consists of five fields: 1) contig, 2) offset, 3) `+' or `-', 4) `D' or `A', and 5) score,
441
+ where offset is the number of bases before a splice junction, `D' indicates the
442
+ line corresponds to a donor site and `A' for an acceptor site.
443
+ A positive score suggests the junction is preferred and a negative score
444
+ suggests the junction is not preferred.
445
+ .TP
446
+ .BR --junc-pen \ INT
447
+ Penalty for a position not in FILE specified by
448
+ .B --spsc
449
+ [5]. Effective with
450
+ .B --spsc
451
+ but not
452
+ .BR --junc-bed .
453
+ .TP
431
454
  .BR --junc-bed \ FILE
432
- Gene annotations in the BED12 format (aka 12-column BED), or intron positions
433
- in 5-column BED. With this option, minimap2 prefers splicing in annotations.
434
- BED12 file can be converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf'
435
- [].
455
+ Junctions to prefer during base alignment [].
456
+ Same format as
457
+ .BR -j .
458
+ It is
459
+ .I NOT
460
+ recommended to apply this option to short RNA-seq reads. This would increase
461
+ run time with little improvement to junction accuracy.
436
462
  .TP
437
463
  .BR --junc-bonus \ INT
438
- Score bonus for a splice donor or acceptor found in annotation (effective with
439
- .BR --junc-bed )
440
- [9].
464
+ Score bonus for a splice donor or acceptor found in annotation [9]. Effective with
465
+ .B --junc-bed
466
+ but not
467
+ .BR --spsc .
468
+ .TP
469
+ .BR --jump-min-match \ INT
470
+ Minimum matching length to create a jump [3]. Equivalent to
471
+ .B STAR
472
+ .BR --alignSJDBoverhangMin .
441
473
  .TP
442
474
  .BI --end-seed-pen \ INT
443
475
  Drop a terminal anchor if
@@ -463,7 +495,7 @@ Set 0 to disable [100m].
463
495
  .BI --cap-kalloc \ NUM
464
496
  Free the thread-local kalloc memory reservoir if, after the alignment, the size of the reservoir is above
465
497
  .IR NUM .
466
- Set 0 to disable [0].
498
+ Set 0 to disable [500m].
467
499
  .SS Input/output options
468
500
  .TP 10
469
501
  .B -a
@@ -495,20 +527,13 @@ Copy input FASTA/Q comments to output.
495
527
  .B -c
496
528
  Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag.
497
529
  .TP
498
- .BI --cs[= STR ]
530
+ .BR --cs [= short | long ]
499
531
  Output the
500
532
  .B cs
501
533
  tag.
502
- .I STR
503
- can be either
504
- .I short
505
- or
506
- .IR long .
507
- If no
508
- .I STR
509
- is given,
510
- .I short
511
- is assumed. [none]
534
+ If no argument is given,
535
+ .RB ` short '
536
+ is set. [none]
512
537
  .TP
513
538
  .B --MD
514
539
  Output the MD tag (see the SAM spec).
@@ -522,6 +547,26 @@ In SAM output, use soft clipping for supplementary alignments.
522
547
  .B --secondary-seq
523
548
  In SAM output, show query sequences for secondary alignments.
524
549
  .TP
550
+ .B --write-junc
551
+ Output splice junctions in 6-column BED: contig name, start, end,
552
+ read name, score and strand. Score is the sum of donor and acceptor scores,
553
+ where GT gets 3, GC gets 2 and AT gets 1 at donor sites,
554
+ while AG gets 3 and AC gets 1 at acceptor sites.
555
+ Alignments with mapping quality below 10 are ignored.
556
+ .TP
557
+ .BI --pass1 \ FILE
558
+ Junction BED file written by
559
+ .B --write-junc
560
+ []. Rows with scores lower than 5 are ignored. When both
561
+ .B -j
562
+ and
563
+ .B --pass1
564
+ are present, junctions in
565
+ .B -j
566
+ are preferred over those in
567
+ .BR --pass1
568
+ when there is ambiguity.
569
+ .TP
525
570
  .BI --seed \ INT
526
571
  Integer seed for randomizing equally best hits. Minimap2 hashes
527
572
  .I INT
@@ -661,10 +706,16 @@ Spliced alignment for accurate long RNA-seq reads such as PacBio iso-seq
661
706
  .B -C5 -O6,24
662
707
  .BR -B4 ).
663
708
  .TP
709
+ .B splice:sr
710
+ Spliced alignment for short RNA-seq reads
711
+ .RB ( -xsplice:hq
712
+ .B --frag=yes -m25 -s40 -2K100m --heap-sort=yes --pairing=weak --sr=rna --min-dp-len=20
713
+ .BR --secondary=no ).
714
+ .TP
664
715
  .B sr
665
716
  Short-read alignment without splicing
666
717
  .RB ( -k21
667
- .B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m25
718
+ .B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r100 -p.5 -N20 -f1000,5000 -n2 -m25
668
719
  .B -s40 -g100 -2K50m --heap-sort=yes
669
720
  .BR --secondary=no ).
670
721
  .TP
@@ -737,7 +788,7 @@ s2 i Chaining score of the best secondary chain
737
788
  NM i Total number of mismatches and gaps in the alignment
738
789
  MD Z To generate the ref sequence in the alignment
739
790
  AS i DP alignment score
740
- SA Z List of other supplementary alignments
791
+ SA Z List of other supplementary alignments (with approximate CIGAR strings)
741
792
  ms i DP score of the max scoring segment in the alignment
742
793
  nn i Number of ambiguous bases in the alignment
743
794
  ts A Transcript strand (splice mode only)
@@ -746,6 +797,7 @@ cs Z Difference string
746
797
  dv f Approximate per-base sequence divergence
747
798
  de f Gap-compressed per-base sequence divergence
748
799
  rl i Length of query regions harboring repetitive seeds
800
+ zd i Alignment broken due to Z-drop; bit 1: left broken; bit 2: right broken
749
801
  .TE
750
802
 
751
803
  .PP
@@ -16,7 +16,8 @@ minimap2 -c test/MT-human.fa test/MT-orang.fa \
16
16
  | paftools.js liftover -l10000 - <(echo -e "MT_orang\t2000\t5000") # liftOver
17
17
  # no test data for the following examples
18
18
  paftools.js junceval -e anno.gtf splice.sam > out.txt # compare splice junctions to annotations
19
- paftools.js splice2bed anno.gtf > anno.bed # convert GTF/GFF3 to BED12
19
+ paftools.js splice2bed splice.sam > splice.bed # convert PAF/SAM to BED12
20
+ paftools.js gff2bed anno.gtf > anno.bed # convert GTF/GFF3 to BED12
20
21
  ```
21
22
 
22
23
  ## Table of Contents
@@ -0,0 +1,241 @@
1
+ #!/usr/bin/env k8
2
+
3
+ "use strict";
4
+
5
+ Array.prototype.delete_at = function(i) {
6
+ for (let j = i; j < this.length - 1; ++j)
7
+ this[j] = this[j + 1];
8
+ --this.length;
9
+ }
10
+
11
+ function* getopt(argv, ostr, longopts) {
12
+ if (argv.length == 0) return;
13
+ let pos = 0, cur = 0;
14
+ while (cur < argv.length) {
15
+ let lopt = "", opt = "?", arg = "";
16
+ while (cur < argv.length) { // skip non-option arguments
17
+ if (argv[cur][0] == "-" && argv[cur].length > 1) {
18
+ if (argv[cur] == "--") cur = argv.length;
19
+ break;
20
+ } else ++cur;
21
+ }
22
+ if (cur == argv.length) break;
23
+ let a = argv[cur];
24
+ if (a[0] == "-" && a[1] == "-") { // a long option
25
+ pos = -1;
26
+ let c = 0, k = -1, tmp = "", o;
27
+ const pos_eq = a.indexOf("=");
28
+ if (pos_eq > 0) {
29
+ o = a.substring(2, pos_eq);
30
+ arg = a.substring(pos_eq + 1);
31
+ } else o = a.substring(2);
32
+ for (let i = 0; i < longopts.length; ++i) {
33
+ let y = longopts[i];
34
+ if (y[y.length - 1] == "=") y = y.substring(0, y.length - 1);
35
+ if (o.length <= y.length && o == y.substring(0, o.length)) {
36
+ k = i, tmp = y;
37
+ ++c; // c is the number of matches
38
+ if (o == y) { // exact match
39
+ c = 1;
40
+ break;
41
+ }
42
+ }
43
+ }
44
+ if (c == 1) { // find a unique match
45
+ lopt = tmp;
46
+ if (pos_eq < 0 && longopts[k][longopts[k].length-1] == "=" && cur + 1 < argv.length) {
47
+ arg = argv[cur+1];
48
+ argv.delete_at(cur + 1);
49
+ }
50
+ }
51
+ } else { // a short option
52
+ if (pos == 0) pos = 1;
53
+ opt = a[pos++];
54
+ let k = ostr.indexOf(opt);
55
+ if (k < 0) {
56
+ opt = "?";
57
+ } else if (k + 1 < ostr.length && ostr[k+1] == ":") { // requiring an argument
58
+ if (pos >= a.length) {
59
+ arg = argv[cur+1];
60
+ argv.delete_at(cur + 1);
61
+ } else arg = a.substring(pos);
62
+ pos = -1;
63
+ }
64
+ }
65
+ if (pos < 0 || pos >= argv[cur].length) {
66
+ argv.delete_at(cur);
67
+ pos = 0;
68
+ }
69
+ if (lopt != "") yield { opt: `--${lopt}`, arg: arg };
70
+ else if (opt != "?") yield { opt: `-${opt}`, arg: arg };
71
+ else yield { opt: "?", arg: "" };
72
+ }
73
+ }
74
+
75
+ function* k8_readline(fn) {
76
+ let buf = new Bytes();
77
+ let file = new File(fn);
78
+ while (file.readline(buf) >= 0) {
79
+ yield buf.toString();
80
+ }
81
+ file.close();
82
+ buf.destroy();
83
+ }
84
+
85
+ function merge_hits(b) {
86
+ if (b.length == 1)
87
+ return { name1:b[0].name1, name2:b[0].name2, len1:b[0].len1, len2:b[0].len2, min_cov:b[0].min_cov, max_cov:b[0].max_cov, cov1:b[0].cov1, cov2:b[0].cov2, s1:b[0].s1, dv:b[0].dv };
88
+ b.sort(function(x, y) { return x.st1 - y.st1 });
89
+ let f = [], bt = [];
90
+ for (let i = 0; i < b.length; ++i)
91
+ f[i] = b[i].s1, bt[i] = -1;
92
+ for (let i = 0; i < b.length; ++i) {
93
+ for (let j = 0; j < i; ++j) {
94
+ if (b[j].st2 < b[i].st2) {
95
+ if (b[j].en1 >= b[i].en1) continue;
96
+ if (b[j].en2 >= b[i].en2) continue;
97
+ const ov1 = b[j].en1 <= b[i].st1? 0 : b[i].st1 - b[j].en1;
98
+ const li1 = b[i].en1 - b[i].st1;
99
+ const s11 = b[i].s1 / li1 * (li1 - ov1);
100
+ const ov2 = b[j].en2 <= b[i].st2? 0 : b[i].st2 - b[j].en2;
101
+ const li2 = b[i].en2 - b[i].st2;
102
+ const s12 = b[i].s1 / li2 * (li2 - ov2);
103
+ const s1 = s11 < s12? s11 : s12;
104
+ if (f[i] < f[j] + s1)
105
+ f[i] = f[j] + s1, bt[i] = j;
106
+ }
107
+ }
108
+ }
109
+ let max_i = -1, max_f = 0, d = [];
110
+ for (let i = 0; i < b.length; ++i)
111
+ if (max_f < f[i])
112
+ max_f = f[i], max_i = i;
113
+ for (let k = max_i; k >= 0; k = bt[k])
114
+ d.push(k);
115
+ d = d.reverse();
116
+ let dv = 0, tot = 0, cov1 = 0, cov2 = 0, st1 = 0, en1 = 0, st2 = 0, en2 = 0;
117
+ for (let k = 0; k < d.length; ++k) {
118
+ const i = d[k];
119
+ tot += b[i].blen;
120
+ dv += b[i].dv * b[i].blen;
121
+ if (b[i].st1 > en1) {
122
+ cov1 += en1 - st1;
123
+ st1 = b[i].st1, en1 = b[i].en1;
124
+ } else en1 = en1 > b[i].en1? en1 : b[i].en1;
125
+ if (b[i].st2 > en2) {
126
+ cov2 += en2 - st2;
127
+ st2 = b[i].st2, en2 = b[i].en2;
128
+ } else en2 = en2 > b[i].en2? en2 : b[i].en2;
129
+ }
130
+ dv /= tot;
131
+ cov1 = (cov1 + (en1 - st1)) / b[0].len1;
132
+ cov2 = (cov2 + (en2 - st2)) / b[0].len2;
133
+ const min_cov = cov1 < cov2? cov1 : cov2;
134
+ const max_cov = cov1 > cov2? cov1 : cov2;
135
+ //warn(d.length, b[0].name1, b[0].name2, min_cov, max_cov);
136
+ return { name1:b[0].name1, name2:b[0].name2, len1:b[0].len1, len2:b[0].len2, min_cov:min_cov, max_cov:max_cov, cov1:cov1, cov2:cov2, s1:max_f, dv:dv };
137
+ }
138
+
139
+ function main(args) {
140
+ let opt = { min_cov:.9, max_dv:.015, max_diff:20000 };
141
+ for (const o of getopt(args, "c:d:e:", [])) {
142
+ if (o.opt == '-c') opt.min_cov = parseFloat(o.arg);
143
+ else if (o.opt == '-d') opt.max_dv = parseFloat(o.arg);
144
+ else if (o.opt == '-e') opt.max_diff = parseFloat(o.arg);
145
+ }
146
+ if (args.length == 0) {
147
+ print("Usage: pafcluster.js [options] <ava.paf>");
148
+ print("Options:");
149
+ print(` -c FLOAT min coverage [${opt.min_cov}]`);
150
+ print(` -d FLOAT max divergence [${opt.max_dv}]`);
151
+ print(` -e FLOAT max difference [${opt.max_diff}]`);
152
+ return;
153
+ }
154
+
155
+ // read
156
+ let a = [], len = {}, name2len = {};
157
+ for (const line of k8_readline(args[0])) {
158
+ let m, t = line.split("\t");
159
+ if (t[4] != "+") continue;
160
+ for (let i = 1; i < 4; ++i) t[i] = parseInt(t[i]);
161
+ for (let i = 6; i < 11; ++i) t[i] = parseInt(t[i]);
162
+ const len1 = t[1], len2 = t[6];
163
+ let s1 = -1, dv = -1.0;
164
+ for (let i = 12; i < t.length; ++i) {
165
+ if ((m = /^(s1|dv):\S:(\S+)/.exec(t[i])) != null) {
166
+ if (m[1] == "s1") s1 = parseInt(m[2]);
167
+ else if (m[1] == "dv") dv = parseFloat(m[2]);
168
+ }
169
+ }
170
+ if (s1 < 0 || dv < 0) continue;
171
+ const cov1 = (parseInt(t[3]) - parseInt(t[2])) / len1;
172
+ const cov2 = (parseInt(t[8]) - parseInt(t[7])) / len2;
173
+ const min_cov = cov1 < cov2? cov1 : cov2;
174
+ const max_cov = cov1 > cov2? cov1 : cov2;
175
+ name2len[t[0]] = len1;
176
+ name2len[t[5]] = len2;
177
+ a.push({ name1:t[0], name2:t[5], len1:len1, len2:len2, min_cov:min_cov, max_cov:max_cov, s1:s1, dv:dv, cov1:cov1, cov2:cov2, st1:t[2], en1:t[3], st2:t[7], en2:t[8], blen:t[10] });
178
+ len[t[0]] = len1, len[t[5]] = len2;
179
+ }
180
+ warn(`Read ${a.length} hits`);
181
+
182
+ // merge duplicated hits
183
+ let h = {};
184
+ for (let i = 0; i < a.length; ++i) {
185
+ const key = `${a[i].name1}\t${a[i].name2}`;
186
+ if (h[key] == null) h[key] = [];
187
+ h[key].push(a[i]);
188
+ }
189
+ a = [];
190
+ for (const key in h)
191
+ a.push(merge_hits(h[key]));
192
+
193
+ // core loop
194
+ while (a.length > 1) {
195
+ // select the sequence with the highest sum of s1
196
+ let h = {};
197
+ for (let i = 0; i < a.length; ++i) {
198
+ if (h[a[i].name1] == null) h[a[i].name1] = 0;
199
+ h[a[i].name1] += a[i].s1;
200
+ }
201
+ let max_s1 = 0, max_name = "";
202
+ for (const name in h)
203
+ if (max_s1 < h[name])
204
+ max_s1 = h[name], max_name = name;
205
+ // find contigs in the same group
206
+ h = {};
207
+ h[max_name] = 1;
208
+ for (let i = 0; i < a.length; ++i) {
209
+ if (a[i].name1 != max_name && a[i].name2 != max_name)
210
+ continue;
211
+ const diff1 = a[i].len1 * (1.0 - a[i].cov1);
212
+ const diff2 = a[i].len2 * (1.0 - a[i].cov2);
213
+ if (a[i].min_cov >= opt.min_cov && a[i].dv <= opt.max_dv && diff1 <= opt.max_diff && diff2 <= opt.max_diff)
214
+ h[a[i].name1] = h[a[i].name2] = 1;
215
+ }
216
+ let n = 0;
217
+ for (const key in h) {
218
+ ++n;
219
+ delete name2len[key];
220
+ }
221
+ print(`SD\t${max_name}\t${n}`);
222
+ for (const key in h) print(`CL\t${key}\t${len[key]}`);
223
+ print("//");
224
+ // filter out redundant hits
225
+ let b = [];
226
+ for (let i = 0; i < a.length; ++i)
227
+ if (h[a[i].name1] == null && h[a[i].name2] == null)
228
+ b.push(a[i]);
229
+ warn(`Reduced the number of hits from ${a.length} to ${b.length}`);
230
+ a = b;
231
+ }
232
+
233
+ // output remaining singletons
234
+ for (const key in name2len) {
235
+ print(`SD\t${key}\t1`);
236
+ print(`CL\t${key}\t${name2len[key]}`);
237
+ print(`//`);
238
+ }
239
+ }
240
+
241
+ main(arguments);
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env k8
2
2
 
3
- var paftools_version = '2.27-r1193';
3
+ var paftools_version = '2.29-r1283';
4
4
 
5
5
  /*****************************
6
6
  ***** Library functions *****
@@ -1740,15 +1740,17 @@ function paf_gff2bed(args)
1740
1740
 
1741
1741
  function paf_sam2paf(args)
1742
1742
  {
1743
- var c, pri_only = false, long_cs = false;
1744
- while ((c = getopt(args, "pL")) != null) {
1743
+ var c, pri_only = false, long_cs = false, pri_pri_only = false;
1744
+ while ((c = getopt(args, "pPL")) != null) {
1745
1745
  if (c == 'p') pri_only = true;
1746
+ else if (c == 'P') pri_pri_only = pri_only = true;
1746
1747
  else if (c == 'L') long_cs = true;
1747
1748
  }
1748
1749
  if (args.length == getopt.ind) {
1749
1750
  print("Usage: paftools.js sam2paf [options] <in.sam>");
1750
1751
  print("Options:");
1751
1752
  print(" -p convert primary or supplementary alignments only");
1753
+ print(" -P convert primary alignments only");
1752
1754
  print(" -L output the cs tag in the long form");
1753
1755
  exit(1);
1754
1756
  }
@@ -1775,6 +1777,7 @@ function paf_sam2paf(args)
1775
1777
  throw Error("at line " + lineno + ": inconsistent SEQ and QUAL lengths - " + t[9].length + " != " + t[10].length);
1776
1778
  if (t[2] == '*' || (flag&4) || t[5] == '*') continue;
1777
1779
  if (pri_only && (flag&0x100)) continue;
1780
+ if (pri_pri_only && (flag&0x900)) continue;
1778
1781
  var tlen = ctg_len[t[2]];
1779
1782
  if (tlen == null) throw Error("at line " + lineno + ": can't find the length of contig " + t[2]);
1780
1783
  // find tags
@@ -1887,7 +1890,10 @@ function paf_sam2paf(args)
1887
1890
  // optional tags
1888
1891
  var type = flag&0x100? 'S' : 'P';
1889
1892
  var tags = ["tp:A:" + type];
1890
- if (NM != null) tags.push("mm:i:"+mm);
1893
+ if (NM != null) {
1894
+ tags.push("NM:i:"+NM);
1895
+ tags.push("mm:i:"+mm);
1896
+ }
1891
1897
  tags.push("gn:i:"+(I[1]+D[1]), "go:i:"+(I[0]+D[0]), "cg:Z:" + t[5].replace(/\d+[SH]/g, ''));
1892
1898
  if (cs_str != null) tags.push("cs:Z:" + cs_str);
1893
1899
  else if (cs.length > 0) tags.push("cs:Z:" + cs.join(""));
@@ -2181,7 +2187,7 @@ function paf_mapeval(args)
2181
2187
  }
2182
2188
 
2183
2189
  var lineno = 0, last = null, a = [], n_unmapped = null;
2184
- var re_cigar = /(\d+)([MIDSHN])/g;
2190
+ var re_cigar = /(\d+)([MIDSHN=X])/g;
2185
2191
  while (file.readline(buf) >= 0) {
2186
2192
  var m, line = buf.toString();
2187
2193
  ++lineno;
@@ -2219,7 +2225,7 @@ function paf_mapeval(args)
2219
2225
  var n_gap = 0, mlen = 0;
2220
2226
  while ((m = re_cigar.exec(t[5])) != null) {
2221
2227
  var len = parseInt(m[1]);
2222
- if (m[2] == 'M') pos_end += len, mlen += len;
2228
+ if (m[2] == 'M' || m[2] == 'X' || m[2] == '=') pos_end += len, mlen += len;
2223
2229
  else if (m[2] == 'I') n_gap += len;
2224
2230
  else if (m[2] == 'D') n_gap += len, pos_end += len;
2225
2231
  }
@@ -2488,6 +2494,10 @@ function paf_junceval(args)
2488
2494
  } else { // SAM
2489
2495
  ctg_name = t[2], pos = parseInt(t[3]) - 1, cigar = t[5];
2490
2496
  var flag = parseInt(t[1]);
2497
+ if (flag & 1) {
2498
+ if (flag & 0x40) qname += '/1';
2499
+ else if (flag & 0x80) qname += '/2';
2500
+ }
2491
2501
  if (flag&0x100) continue; // secondary
2492
2502
  }
2493
2503
 
@@ -3234,6 +3244,7 @@ function paf_sveval(args)
3234
3244
  if (bed != null && bed[t[0]] == null) continue;
3235
3245
  if (t[4] == '<INV>' || t[4] == '<INVDUP>') continue; // no inversion
3236
3246
  if (/[\[\]]/.test(t[4])) continue; // no break points
3247
+ if (t[6] != "." && t[6] != "PASS") continue;
3237
3248
  var st = parseInt(t[1]) - 1, en = st + t[3].length;
3238
3249
  // parse svlen
3239
3250
  var b = _paf_get_alen(t), svlen = b[0];