miga-base 0.7.26.2 → 1.0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/classify_wf.rb +2 -2
  7. data/lib/miga/cli/action/derep_wf.rb +1 -1
  8. data/lib/miga/cli/action/doctor.rb +57 -14
  9. data/lib/miga/cli/action/doctor/base.rb +47 -23
  10. data/lib/miga/cli/action/env.rb +26 -0
  11. data/lib/miga/cli/action/init.rb +11 -7
  12. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  13. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  14. data/lib/miga/cli/action/tax_dist.rb +2 -2
  15. data/lib/miga/cli/action/wf.rb +5 -4
  16. data/lib/miga/cli/base.rb +1 -0
  17. data/lib/miga/common.rb +1 -0
  18. data/lib/miga/daemon.rb +11 -4
  19. data/lib/miga/dataset/result.rb +10 -6
  20. data/lib/miga/json.rb +5 -4
  21. data/lib/miga/metadata.rb +5 -1
  22. data/lib/miga/parallel.rb +36 -0
  23. data/lib/miga/project.rb +8 -8
  24. data/lib/miga/project/base.rb +4 -4
  25. data/lib/miga/project/result.rb +2 -2
  26. data/lib/miga/sqlite.rb +10 -2
  27. data/lib/miga/version.rb +23 -9
  28. data/scripts/aai_distances.bash +16 -18
  29. data/scripts/ani_distances.bash +16 -17
  30. data/scripts/assembly.bash +31 -16
  31. data/scripts/haai_distances.bash +3 -27
  32. data/scripts/miga.bash +12 -8
  33. data/scripts/p.bash +1 -1
  34. data/scripts/read_quality.bash +9 -18
  35. data/scripts/trimmed_fasta.bash +14 -30
  36. data/scripts/trimmed_reads.bash +36 -36
  37. data/test/parallel_test.rb +31 -0
  38. data/test/project_test.rb +2 -1
  39. data/test/remote_dataset_test.rb +1 -1
  40. data/utils/distance/commands.rb +1 -0
  41. data/utils/distance/database.rb +0 -1
  42. data/utils/distance/runner.rb +2 -4
  43. data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
  44. data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
  45. data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
  46. data/utils/enveomics/Manifest/Tasks/other.json +77 -0
  47. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
  48. data/utils/enveomics/Manifest/categories.json +13 -4
  49. data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
  50. data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
  51. data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
  52. data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
  53. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  54. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  55. data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
  56. data/utils/enveomics/Scripts/SRA.download.bash +6 -8
  57. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  58. data/utils/enveomics/Scripts/aai.rb +3 -2
  59. data/utils/enveomics/Scripts/anir.rb +137 -0
  60. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  61. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  62. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
  63. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  64. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  65. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  66. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  67. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  68. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  69. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  70. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  71. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  72. data/utils/enveomics/Scripts/rbm.rb +87 -133
  73. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  74. data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
  75. data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
  76. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  77. data/utils/enveomics/enveomics.R/R/utils.R +30 -0
  78. data/utils/enveomics/enveomics.R/README.md +1 -0
  79. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
  80. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
  81. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
  82. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
  83. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
  84. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
  85. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
  86. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
  87. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
  88. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
  89. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  90. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
  91. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
  93. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
  94. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
  95. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
  96. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
  97. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
  98. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
  99. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
  100. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  101. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
  102. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
  103. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
  104. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
  105. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
  106. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  107. data/utils/multitrim/README.md +67 -0
  108. data/utils/multitrim/multitrim.py +1555 -0
  109. data/utils/multitrim/multitrim.yml +13 -0
  110. data/utils/requirements.txt +4 -3
  111. data/utils/subclade/pipeline.rb +2 -2
  112. metadata +33 -4
  113. data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -824,6 +824,83 @@
824
824
  "description": "Features to map in GFF."
825
825
  }
826
826
  ]
827
+ },
828
+ {
829
+ "task": "Table.prefScore.R",
830
+ "description": ["Estimate preference score of species based on occupancy",
831
+ "in biased sample sets."],
832
+ "help_arg": "--help",
833
+ "requires": [ { "r_package": "optparse" } ],
834
+ "options": [
835
+ {
836
+ "name": "Occupancy matrix",
837
+ "opt": "--x",
838
+ "arg": "in_file",
839
+ "description": ["A tab-delimited table of presence/absence (1/0)",
840
+ "with species as rows and samples as columns."],
841
+ "mandatory": true
842
+ },
843
+ {
844
+ "name": "Sample set",
845
+ "opt": "--set",
846
+ "arg": "in_file",
847
+ "description": ["A list of sample names that constitute the test",
848
+ "set, one per line."],
849
+ "mandatory": true
850
+ },
851
+ {
852
+ "opt": "--ignore",
853
+ "arg": "in_file",
854
+ "description": ["A list of species to exclude from the analysis,",
855
+ "one per line."]
856
+ },
857
+ {
858
+ "name": "Significance threshold",
859
+ "opt": "--signif-thr",
860
+ "arg": "float",
861
+ "description": "Absolute value of the significance threshold."
862
+ },
863
+ {
864
+ "opt": "--col-above",
865
+ "arg": "string",
866
+ "description": "Color for points significantly above zero.",
867
+ "default": "#941100"
868
+ },
869
+ {
870
+ "opt": "--col-equal",
871
+ "arg": "string",
872
+ "description": ["Color for points not significantly different from",
873
+ "zero."],
874
+ "default": "#BDBDBD"
875
+ },
876
+ {
877
+ "opt": "--col-below",
878
+ "arg": "string",
879
+ "description": "Color for points significantly below zero.",
880
+ "default": "#2F5496"
881
+ },
882
+ {
883
+ "name": "Output preference scores",
884
+ "arg": "out_file",
885
+ "description": "Output raw-text file with preference scores.",
886
+ "mandatory": true
887
+ },
888
+ {
889
+ "name": "Graphical output",
890
+ "arg": "out_file",
891
+ "description": "Output PDF file with preference scores plot."
892
+ },
893
+ {
894
+ "name": "Width",
895
+ "arg": "float",
896
+ "description": "Width of the plot in inches (7 by default)."
897
+ },
898
+ {
899
+ "name": "Height",
900
+ "arg": "float",
901
+ "description": "Height of the plot in inches (7 by default)."
902
+ }
903
+ ]
827
904
  }
828
905
  ]
829
906
  }
@@ -362,6 +362,139 @@
362
362
  }
363
363
  ]
364
364
  },
365
+ {
366
+ "task": "anir.rb",
367
+ "description": ["Estimates ANIr: the Average Nucleotide Identity of",
368
+ "reads against a genome."],
369
+ "help_arg": "--help",
370
+ "see_also": ["ani.rb", "sam.filter.rb"],
371
+ "options": [
372
+ {
373
+ "opt": "--reads",
374
+ "arg": "in_file",
375
+ "description": "Metagenomic reads."
376
+ },
377
+ {
378
+ "opt": "--genome",
379
+ "arg": "in_file",
380
+ "description": "Genome assembly."
381
+ },
382
+ {
383
+ "opt": "--mapping",
384
+ "arg": "in_file",
385
+ "description": "Mapping file."
386
+ },
387
+ {
388
+ "opt": "--list",
389
+ "arg": "in_file",
390
+ "description": "Output file with identities."
391
+ },
392
+ {
393
+ "opt": "--hist",
394
+ "arg": "in_file",
395
+ "description": "Output file with histogram."
396
+ },
397
+ {
398
+ "opt": "--tab",
399
+ "arg": "out_file",
400
+ "description": "Output file with results in tabular format."
401
+ },
402
+ {
403
+ "name": "Reads format",
404
+ "opt": "--r-format",
405
+ "arg": "select",
406
+ "description": ["Metagenomic reads format: fastq or fasta.",
407
+ "Both options support compression with .gz file extension."],
408
+ "values": ["fastq", "fasta"],
409
+ "default": "fastq"
410
+ },
411
+ {
412
+ "name": "Reads type",
413
+ "opt": "--r-type",
414
+ "arg": "select",
415
+ "description": ["Type of metagenomic reads: Single reads (single),",
416
+ "coupled reads in separate files (-m must be comma-delimited;",
417
+ "coupled), or coupled reads in a single interposed file",
418
+ "(interleaved)."],
419
+ "values": ["single", "coupled", "interleaved"],
420
+ "default": "single"
421
+ },
422
+ {
423
+ "name": "Genome format",
424
+ "opt": "--g-format",
425
+ "arg": "select",
426
+ "description": ["Genome assembly format: fasta or list.",
427
+ "Both options support compression with .gz file extension.",
428
+ "If passed in mapping-read mode, filters only matches to these",
429
+ "contigs."],
430
+ "values": ["fasta", "list"],
431
+ "default": "fasta"
432
+ },
433
+ {
434
+ "name": "Mapping format",
435
+ "opt": "--m-format",
436
+ "arg": "select",
437
+ "description": ["Mapping file format: sam, bam, tab, or list.",
438
+ "All except bam support compression with .gz file extension."],
439
+ "values": ["sam", "bam", "tab", "list"],
440
+ "default": "sam"
441
+ },
442
+ {
443
+ "opt": "--identity",
444
+ "arg": "float",
445
+ "description": "Set a fixed threshold of percent identity.",
446
+ "default": 95.0
447
+ },
448
+ {
449
+ "opt": "--algorithm",
450
+ "arg": "select",
451
+ "description": ["Set an algorithm to automatically detect identity",
452
+ "threshold: Valley detection by E-M of Gaussian Mixture Model",
453
+ "(gmm), fixed threshold (see Identity; fix),",
454
+ "Pick gmm or fix depending on bimodality (see Bimodality; auto)."],
455
+ "values": ["gmm", "fix", "auto"],
456
+ "default": "auto"
457
+ },
458
+ {
459
+ "opt": "--bimodality",
460
+ "arg": "float",
461
+ "description": ["Threshold of bimodality below which the algorithm",
462
+ "is set to fix. The coefficient used is the de Michele & Accatino",
463
+ "(2014) B index."],
464
+ "default": 0.5
465
+ },
466
+ {
467
+ "opt": "--coefficient",
468
+ "arg": "select",
469
+ "description": ["Coefficient of bimodality for Algorithm auto: ",
470
+ "Sarle's bimodality coefficient b (sarle), or",
471
+ "de Michele and Accatino (2014 PLoS ONE) B index",
472
+ "(use with Bimodality 0.1, dma)."],
473
+ "values": ["sarle", "dma"],
474
+ "default": "sarle"
475
+ },
476
+ {
477
+ "opt": "--bin-size",
478
+ "arg": "float",
479
+ "description": "Width of histogram bins (in percent identity).",
480
+ "default": 1.0
481
+ },
482
+ {
483
+ "opt": "--threads",
484
+ "arg": "integer",
485
+ "description": "Threads to use."
486
+ },
487
+ {
488
+ "opt": "--log",
489
+ "arg": "out_file",
490
+ "description": "Log file to save output."
491
+ },
492
+ {
493
+ "opt": "--quiet",
494
+ "description": "Run quietly."
495
+ }
496
+ ]
497
+ },
365
498
  {
366
499
  "task": "HMM.haai.rb",
367
500
  "description": ["Estimates Average Amino Acid Identity (AAI) from the",
@@ -407,10 +540,14 @@
407
540
  "sequences."],
408
541
  "help_arg": "--help",
409
542
  "cite":[
543
+ ["Camacho et al, 2009, BMC Bioinf (BLAST+)",
544
+ "https://doi.org/10.1186/1471-2105-10-421"],
410
545
  ["Altschul et al, 2000, JMB (BLAST)",
411
546
  "http://dx.doi.org/10.1016/S0022-2836(05)80360-2"],
412
547
  ["Buchfink B, Xie C, Huson D, 2015, Nat Meth (Diamond)",
413
- "https://dx.doi.org/10.1038/nmeth.3176"]
548
+ "https://dx.doi.org/10.1038/nmeth.3176"],
549
+ ["Kent, 2002, Genome Res (BLAT)",
550
+ "https://doi.org/10.1101/gr.229202"]
414
551
  ],
415
552
  "options": [
416
553
  {
@@ -24,11 +24,13 @@
24
24
  "BlastTab.pairedHits.rb",
25
25
  "BlastTab.subsample.pl",
26
26
  "BlastTab.taxid2taxrank.pl",
27
- "BlastTab.topHits_sorted.rb"
27
+ "BlastTab.topHits_sorted.rb",
28
+ "sam.filter.rb"
28
29
  ],
29
30
  "Execution": [
30
31
  "aai.rb",
31
32
  "ani.rb",
33
+ "anir.rb",
32
34
  "HMM.haai.rb",
33
35
  "rbm.rb"
34
36
  ]
@@ -58,9 +60,11 @@
58
60
  "FastA.split.rb",
59
61
  "FastA.subsample.pl",
60
62
  "FastA.tag.rb",
63
+ "FastA.toFastQ.rb",
61
64
  "FastA.wrap.rb",
62
65
  "FastQ.filter.pl",
63
66
  "FastQ.interpose.pl",
67
+ "FastQ.maskQual.rb",
64
68
  "FastQ.offset.pl",
65
69
  "FastQ.split.pl",
66
70
  "FastQ.tag.rb",
@@ -71,11 +75,13 @@
71
75
  "Community": [
72
76
  "AlphaDiversity.pl",
73
77
  "Chao1.pl",
74
- "Table.barplot.R"
78
+ "Table.barplot.R",
79
+ "Table.prefScore.R"
75
80
  ],
76
81
  "Population": [
77
82
  "VCF.SNPs.rb",
78
- "VCF.KaKs.rb"
83
+ "VCF.KaKs.rb",
84
+ "Table.prefScore.R"
79
85
  ]
80
86
  },
81
87
  "Annotation": {
@@ -143,13 +149,16 @@
143
149
  "clust.rand.rb"
144
150
  ],
145
151
  "Read recruitments": [
152
+ "anir.rb",
146
153
  "BedGraph.tad.rb",
147
154
  "BedGraph.window.rb",
148
155
  "BlastTab.catsbj.pl",
149
156
  "BlastTab.pairedHits.rb",
150
157
  "BlastTab.recplot2.R",
158
+ "FastQ.test-error.rb",
151
159
  "GFF.catsbj.pl",
152
- "RecPlot2.compareIdentities.R"
160
+ "RecPlot2.compareIdentities.R",
161
+ "sam.filter.rb"
153
162
  ]
154
163
  }
155
164
  }
@@ -1,163 +1,221 @@
1
1
  #!/usr/bin/env ruby
2
- #
2
+
3
3
  # @author Luis M. Rodriguez-R
4
- # @update Nov-30-2015
5
4
  # @license artistic license 2.0
6
- #
7
5
 
8
- $:.push File.expand_path(File.dirname(__FILE__) + "/lib")
9
- require "enveomics_rb/enveomics"
6
+ $VERSION = 1.0
7
+ $:.push File.expand_path('../lib', __FILE__)
8
+ require 'enveomics_rb/enveomics'
9
+
10
+ o = {
11
+ q: false, missing: '-', model: 'AUTO', removeinvar: false, undefined: '-.Xx?'
12
+ }
10
13
 
11
- o = {:q=>false, :missing=>"-", :model=>"AUTO", :removeinvar=>false,
12
- :undefined=>"-.Xx?"}
13
14
  OptionParser.new do |opt|
14
- opt.banner = "
15
- Concatenates several multiple alignments in FastA format into a single
16
- multiple alignment. The IDs of the sequences (or the ID prefixes, if using
17
- --ignore-after) must coincide across files.
18
-
19
- Usage: #{$0} [options] aln1.fa aln2.fa ... > aln.fa".gsub(/^ +/,"")
20
- opt.separator ""
21
- opt.on("-c", "--coords FILE",
22
- "Output file of coordinates in RAxML-compliant format."
23
- ){ |v| o[:coords]=v }
24
- opt.on("-i", "--ignore-after STRING",
25
- "Remove everything in the IDs after the specified string."
26
- ){ |v| o[:ignoreafter]=v }
27
- opt.on("-I", "--remove-invariable", "Remove invariable sites.",
28
- "Note: Invariable sites are defined as columns with only one state and",
29
- "undefined characters. Additional ambiguous characters may exist and",
30
- "should be declared using --undefined."){ |v| o[:removeinvar]=v }
31
- opt.on("-u", "--missing-char CHAR",
32
- "Character denoting missing data. By default: '#{o[:missing]}'.") do |v|
33
- abort "Missing positions can only be denoted by single characters, " +
34
- "offending value: '#{v}'." if v.length != 1
35
- o[:missing]=v
36
- end
37
- opt.on("-m", "--model STRING",
38
- "Name of the model to use if --coords is used. See RAxML's docs; ",
39
- "supported values in v8+ include:",
40
- "o For DNA alignments:",
41
- " 'DNA[F|X]', or 'DNA[F|X]/3' (to estimate rates per codon position,",
42
- " particular notation for this script).",
43
- "o General protein alignments:",
44
- " 'AUTO' (default in this script), 'DAYHOFF' (1978), 'DCMUT' (MBE 2005;",
45
- " 22(2):193-199), 'JTT' (Nat 1992;358:86-89), 'VT' (JCompBiol 2000;",
46
- " 7(6):761-776), 'BLOSUM62' (PNAS 1992;89:10915), and 'LG' (MBE 2008;",
47
- " 25(7):1307-1320).",
48
- "o Specialized protein alignments:",
49
- " 'MTREV' (mitochondrial, JME 1996;42(4):459-468), 'WAG' (globular, MBE",
50
- " 2001;18(5):691-699), 'RTREV' (retrovirus, JME 2002;55(1):65-73), ",
51
- " 'CPREV' (chloroplast, JME 2000;50(4):348-358), and 'MTMAM' (nuclear",
52
- " mammal proteins, JME 1998;46(4):409-418)."){|v| o[:model]=v}
53
- opt.on("--undefined STRING",
54
- "All characters to be regarded as 'undefined'. It should include all",
55
- "ambiguous and missing data chars. Ignored unless --remove-invariable.",
56
- "By default: '#{o[:undefined]}'."){|v| o[:undefined]=v}
57
- opt.on("-q", "--quiet", "Run quietly (no STDERR output)."){ o[:q] = TRUE }
58
- opt.on("-h", "--help", "Display this screen.") do
59
- puts opt
60
- exit
61
- end
62
- opt.separator ""
15
+ cmd = File.basename($0)
16
+ opt.banner = <<~BANNER
17
+
18
+ [Enveomics Collection: #{cmd} v#{$VERSION}]
19
+
20
+ Concatenates several multiple alignments in FastA format into a single
21
+ multiple alignment. The IDs of the sequences (or the ID prefixes, if using
22
+ --ignore-after) must coincide across files.
23
+
24
+ Usage: #{cmd} [options] aln1.fa aln2.fa ... > aln.fa
25
+
26
+ BANNER
27
+ opt.on(
28
+ '-c', '--coords FILE',
29
+ 'Output file of coordinates in RAxML-compliant format'
30
+ ) { |v| o[:coords] = v }
31
+ opt.on(
32
+ '-i', '--ignore-after STRING',
33
+ 'Remove everything in the IDs after the specified string'
34
+ ) { |v| o[:ignoreafter] = v }
35
+ opt.on(
36
+ '-I', '--remove-invariable', 'Remove invariable sites',
37
+ 'Note: Invariable sites are defined as columns with only one state and',
38
+ 'undefined characters. Additional ambiguous characters may exist and',
39
+ 'should be declared using --undefined'
40
+ ) { |v| o[:removeinvar] = v }
41
+ opt.on(
42
+ '-u', '--missing-char CHAR',
43
+ "Character denoting missing data. By default: '#{o[:missing]}'"
44
+ ) do |v|
45
+ if v.length != 1
46
+ abort "-missing-char can only be denoted by single characters: #{v}"
47
+ end
48
+ o[:missing] = v
49
+ end
50
+ opt.on(
51
+ '-m', '--model STRING',
52
+ 'Name of the model to use if --coords is used. See RAxML docs;',
53
+ 'supported values in v8+ include:',
54
+ '~ For DNA alignments:',
55
+ ' "DNA[F|X]", or "DNA[F|X]/3" (to estimate rates per codon position,',
56
+ ' particular notation for this script)',
57
+ '~ General protein alignments:',
58
+ ' "AUTO" (default in this script), "DAYHOFF" (1978), "DCMUT" (MBE 2005;',
59
+ ' 22(2):193-199), "JTT" (Nat 1992;358:86-89), "VT" (JCompBiol 2000;',
60
+ ' 7(6):761-776), "BLOSUM62" (PNAS 1992;89:10915), and "LG" (MBE 2008;',
61
+ ' 25(7):1307-1320)',
62
+ '~ Specialized protein alignments:',
63
+ ' "MTREV" (mitochondrial, JME 1996;42(4):459-468), "WAG" (globular, MBE',
64
+ ' 2001;18(5):691-699), "RTREV" (retrovirus, JME 2002;55(1):65-73),',
65
+ ' "CPREV" (chloroplast, JME 2000;50(4):348-358), and "MTMAM" (nuclear',
66
+ ' mammal proteins, JME 1998;46(4):409-418)'
67
+ ) { |v| o[:model] = v }
68
+ opt.on(
69
+ '--undefined STRING',
70
+ 'All characters to be regarded as "undefined". It should include all',
71
+ 'ambiguous and missing data chars. Ignored unless --remove-invariable',
72
+ "By default: '#{o[:undefined]}'"
73
+ ) { |v| o[:undefined] = v }
74
+ opt.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
75
+ opt.on('-V', '--version', 'Returns version') { puts $VERSION ; exit }
76
+ opt.on('-h', '--help', 'Display this screen') { puts opt ; exit }
77
+ opt.separator ''
63
78
  end.parse!
64
- alns = ARGV
65
- abort "Alignment files are mandatory" if alns.nil? or alns.empty?
79
+ files = ARGV
80
+ abort 'Alignment files are mandatory' if files.nil? || files.empty?
81
+ $QUIET = o[:q]
66
82
 
67
- ##### MAIN:
68
- begin
69
- $stderr.puts "Reading." unless o[:q]
70
- a = {}
71
- n = alns.size-1
72
- lengths = []
73
- (0 .. n).each do |i|
74
- key = nil
75
- File.open(alns[i],"r").each do |ln|
76
- ln.chomp!
77
- if ln =~ /^>(\S+)/
78
- key = $1
79
- key.sub!(/#{o[:ignoreafter]}.*/,"") unless o[:ignoreafter].nil?
80
- a[key] ||= []
81
- a[key][i] = ""
82
- else
83
- abort "#{alns[i]}: Leading line is not a def-line, is this a "+
84
- "valid FastA file?" if key.nil?
85
- ln.gsub!(/\s/,"")
86
- a[key][i] += ln
87
- end
83
+ # Read individual gene alignments and return them as a single hash with genome
84
+ # IDs as keys and arrays of single-line strings as values
85
+ #
86
+ # IDs are trimmed after the first occurrence of +ignoreafter+, if defined
87
+ def read_alignments(files, ignoreafter = nil)
88
+ aln = {}
89
+ files.each_with_index do |file, i|
90
+ key = nil
91
+ File.open(file, 'r').each do |ln|
92
+ ln.chomp!
93
+ if ln =~ /^>(\S+)/
94
+ key = $1
95
+ key.sub!(/#{ignoreafter}.*/, '') if ignoreafter
96
+ aln[key] ||= []
97
+ aln[key][i] = ''
98
+ else
99
+ if key.nil?
100
+ abort "Invalid FastA file: #{file}: Leading line not a def-line"
101
+ end
102
+ ln.gsub!(/\s/, '')
103
+ aln[key][i] += ln
88
104
  end
89
- abort "#{alns[i]}: Empty alignment?" if key.nil?
90
- lengths[i] = a[key][i].length
91
- end
92
- if o[:removeinvar]
93
- $stderr.puts "Removing invariable sites." unless o[:q]
94
- invs = 0
95
- (0 .. n).each do |i|
96
- olen = lengths[i]
97
- (0 .. (lengths[i]-1)).each do |pos|
98
- chr = nil
99
- inv = true
100
- a.keys.each do |key|
101
- next if a[key][i].nil?
102
- chr = a[key][i][pos] if
103
- chr.nil? or o[:undefined].chars.include? chr
104
- if chr != a[key][i][pos] and
105
- not o[:undefined].chars.include? a[key][i][pos]
106
- inv = false
107
- break
108
- end
109
- end
110
- if inv
111
- a.keys.each{|key| a[key][i][pos]="!" unless a[key][i].nil?}
112
- lengths[i] -= 1
113
- invs += 1
114
- end
115
- end
116
- a.keys.each{|key| a[key][i].gsub!("!", "") unless a[key][i].nil?}
105
+ end
106
+ abort "Empty alignment file: #{file}" if key.nil?
107
+ end
108
+ aln
109
+ end
110
+
111
+ # Remove invariable sites from the alignment hash +aln+, using +undefined+ as
112
+ # a string including all characters representing undefined positions (e.g., X)
113
+ #
114
+ # Returns number of columns removed
115
+ def remove_invariable(aln, undefined)
116
+ invs = 0
117
+ lengths = aln.values.first.map(&:length)
118
+ undef_chars = undefined.chars
119
+
120
+ lengths.each_with_index do |len, i|
121
+ (0 .. len - 1).each do |pos|
122
+ chr = nil
123
+ inv = true
124
+ aln.each_key do |key|
125
+ next if aln[key][i].nil?
126
+ chr = aln[key][i][pos] if chr.nil? || undefined.chars.include?(chr)
127
+ if chr != aln[key][i][pos] && !undef_chars.include?(aln[key][i][pos])
128
+ inv = false
129
+ break
130
+ end
117
131
  end
118
- $stderr.puts " Removed #{invs} sites." unless o[:q]
119
- end
120
- $stderr.puts "Concatenating." unless o[:q]
121
- a.keys.each do |key|
122
- (0 .. n).each do |i|
123
- a[key][i] = (o[:missing] * lengths[i]) if a[key][i].nil?
132
+ if inv
133
+ aln.each_key { |key| aln[key][i][pos] = '!' unless aln[key][i].nil? }
134
+ lengths[i] -= 1
135
+ invs += 1
124
136
  end
125
- abort "Inconsistent lengths in '#{key}'
126
- exp:#{lengths.join(" ")}
127
- obs:#{a[key].map{|i| i.length}.join(" ")}." unless
128
- lengths == a[key].map{|i| i.length}
129
- puts ">#{key}", a[key].join("").gsub(/(.{1,60})/, "\\1\n")
130
- a.delete(key)
131
- end
132
- $stderr.puts " #{lengths.inject(:+)} columns." unless o[:q]
133
- unless o[:coords].nil?
134
- $stderr.puts "Generating coordinates." unless o[:q]
135
- coords = File.open(o[:coords],"w")
136
- s = 0
137
- names = (alns.map do |a|
138
- File.basename(a).gsub(/\..*/,"").gsub(/[^A-Za-z0-9_]/,"_")
139
- end)
140
- (0 .. n).each do |i|
141
- l = lengths[i]
142
- next unless l > 0
143
- names[i] += "_#{i}" while names.count(names[i])>1
144
- if o[:model] =~ /(DNA.?)\/3/
145
- coords.puts "#{$1}, #{names[i]}codon1 = #{s+1}-#{s+l}\\3"
146
- coords.puts "#{$1}, #{names[i]}codon2 = #{s+2}-#{s+l}\\3"
147
- coords.puts "#{$1}, #{names[i]}codon3 = #{s+3}-#{s+l}\\3"
148
- else
149
- coords.puts "#{o[:model]}, #{names[i]} = #{s+1}-#{s+l}"
150
- end
151
- s += l
137
+ end
138
+ aln.each_key { |key| aln[key][i].gsub!('!', '') unless aln[key][i].nil? }
139
+ end
140
+ invs
141
+ end
142
+
143
+ # Concatenate the alignments hash +aln+ using the character +missing+ to
144
+ # indicate missing alignments, and send each entry in the concatenated alignment
145
+ # to +blk+ as two variables: key (name) and value (alignment string)
146
+ #
147
+ # Returns an array with the lengths of each individual alignment
148
+ def concatenate(aln, missing, &blk)
149
+ say 'Concatenating'
150
+ lengths = aln.values.first.map(&:length)
151
+ aln.each_key do |key|
152
+ # Pad missing entries
153
+ lengths.each_with_index { |len, i| aln[key][i] ||= missing * len }
154
+
155
+ # Check length
156
+ obs_len = aln[key].map(&:length)
157
+ unless lengths == obs_len
158
+ abort "Inconsistent lengths in '#{key}'\nexp: #{lengths}\nobs: #{obs_len}"
159
+ end
160
+
161
+ # Pass entry to the block and remove from alignment hash
162
+ blk[key, aln[key].join('')]
163
+ aln.delete(key)
164
+ end
165
+ lengths
166
+ end
167
+
168
+ # Save the coordinates in +file+ based on +files+ paths (for the names), and
169
+ # using +lengths+ individual alignment lengths
170
+ #
171
+ # The saved format is RAxML coords, including the +model+ for each alignment
172
+ def save_coords(file, names, lengths, model)
173
+ File.open(file, 'w') do |fh|
174
+ s = 0
175
+ names.each_with_index do |name, i|
176
+ l = lengths[i]
177
+ next unless l > 0
178
+ name += "_#{i}" while names.count(name) > 1
179
+ if model =~ /(DNA.?)\/3/
180
+ fh.puts "#{$1}, #{name}codon1 = #{s + 1}-#{s + l}\\3"
181
+ fh.puts "#{$1}, #{name}codon2 = #{s + 2}-#{s + l}\\3"
182
+ fh.puts "#{$1}, #{name}codon3 = #{s + 3}-#{s + l}\\3"
183
+ else
184
+ fh.puts "#{model}, #{name} = #{s + 1}-#{s + l}"
152
185
  end
153
- coords.close
154
- end
155
- # Save the output matrix
156
- $stderr.puts "Done.\n" unless o[:q]
157
- rescue => err
158
- $stderr.puts "Exception: #{err}\n\n"
159
- err.backtrace.each { |l| $stderr.puts l + "\n" }
160
- err
186
+ s += l
187
+ end
188
+ end
161
189
  end
162
190
 
191
+ # ------ MAIN ------
192
+ begin
193
+ say 'Reading'
194
+ alignments = read_alignments(files, o[:ignoreafter])
195
+
196
+ if o[:removeinvar]
197
+ say 'Removing invariable sites'
198
+ inv = remove_invariable(alignments, o[:undefined])
199
+ say " Removed #{inv} sites"
200
+ end
201
+
202
+ lengths = concatenate(alignments, o[:missing]) do |name, seq|
203
+ puts ">#{name}", seq.gsub(/(.{1,60})/, "\\1\n")
204
+ end
205
+ say " #{lengths.inject(:+)} columns"
206
+
207
+ unless o[:coords].nil?
208
+ say 'Generating coordinates'
209
+ names = files.map do |i|
210
+ File.basename(i).gsub(/\..*/, '').gsub(/[^A-Za-z0-9_]/, '_')
211
+ end
212
+ save_coords(o[:coords], names, lengths, o[:model])
213
+ end
214
+
215
+ $stderr.puts 'Done' unless o[:q]
216
+ rescue => err
217
+ $stderr.puts "Exception: #{err}\n\n"
218
+ err.backtrace.each { |l| $stderr.puts l + "\n" }
219
+ err
220
+ end
163
221