miga-base 0.7.26.3 → 1.0.0.sr1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (105) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/doctor.rb +50 -19
  7. data/lib/miga/cli/action/doctor/base.rb +20 -18
  8. data/lib/miga/cli/action/init.rb +11 -7
  9. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  10. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  11. data/lib/miga/cli/action/tax_dist.rb +2 -2
  12. data/lib/miga/cli/action/wf.rb +5 -4
  13. data/lib/miga/daemon.rb +11 -4
  14. data/lib/miga/dataset/result.rb +10 -6
  15. data/lib/miga/json.rb +1 -2
  16. data/lib/miga/metadata.rb +5 -1
  17. data/lib/miga/parallel.rb +11 -6
  18. data/lib/miga/project.rb +8 -8
  19. data/lib/miga/project/base.rb +4 -4
  20. data/lib/miga/project/result.rb +2 -2
  21. data/lib/miga/sqlite.rb +7 -0
  22. data/lib/miga/version.rb +23 -9
  23. data/scripts/aai_distances.bash +16 -18
  24. data/scripts/ani_distances.bash +16 -17
  25. data/scripts/assembly.bash +31 -16
  26. data/scripts/haai_distances.bash +3 -27
  27. data/scripts/miga.bash +6 -4
  28. data/scripts/p.bash +1 -1
  29. data/scripts/read_quality.bash +9 -18
  30. data/scripts/trimmed_fasta.bash +14 -30
  31. data/scripts/trimmed_reads.bash +36 -36
  32. data/test/parallel_test.rb +31 -0
  33. data/test/project_test.rb +2 -1
  34. data/utils/distance/commands.rb +1 -0
  35. data/utils/distance/runner.rb +2 -4
  36. data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
  37. data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
  38. data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
  39. data/utils/enveomics/Manifest/Tasks/other.json +77 -0
  40. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
  41. data/utils/enveomics/Manifest/categories.json +13 -4
  42. data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
  43. data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
  44. data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
  45. data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
  46. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  47. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  48. data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
  49. data/utils/enveomics/Scripts/SRA.download.bash +6 -8
  50. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  51. data/utils/enveomics/Scripts/aai.rb +3 -2
  52. data/utils/enveomics/Scripts/anir.rb +137 -0
  53. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  54. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  55. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
  56. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  57. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  58. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  59. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  60. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  61. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  62. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  63. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  64. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  65. data/utils/enveomics/Scripts/rbm.rb +87 -133
  66. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  67. data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
  68. data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
  69. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  70. data/utils/enveomics/enveomics.R/R/utils.R +30 -0
  71. data/utils/enveomics/enveomics.R/README.md +1 -0
  72. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
  73. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
  74. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
  75. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
  76. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
  77. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
  78. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
  79. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
  80. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
  81. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
  82. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  83. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
  84. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
  85. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
  86. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
  87. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
  88. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
  89. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
  90. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
  91. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
  93. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  94. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
  95. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
  96. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
  97. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
  98. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
  99. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  100. data/utils/multitrim/README.md +67 -0
  101. data/utils/multitrim/multitrim.py +1555 -0
  102. data/utils/multitrim/multitrim.yml +13 -0
  103. data/utils/requirements.txt +4 -3
  104. metadata +33 -6
  105. data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -24,11 +24,13 @@
24
24
  "BlastTab.pairedHits.rb",
25
25
  "BlastTab.subsample.pl",
26
26
  "BlastTab.taxid2taxrank.pl",
27
- "BlastTab.topHits_sorted.rb"
27
+ "BlastTab.topHits_sorted.rb",
28
+ "sam.filter.rb"
28
29
  ],
29
30
  "Execution": [
30
31
  "aai.rb",
31
32
  "ani.rb",
33
+ "anir.rb",
32
34
  "HMM.haai.rb",
33
35
  "rbm.rb"
34
36
  ]
@@ -58,9 +60,11 @@
58
60
  "FastA.split.rb",
59
61
  "FastA.subsample.pl",
60
62
  "FastA.tag.rb",
63
+ "FastA.toFastQ.rb",
61
64
  "FastA.wrap.rb",
62
65
  "FastQ.filter.pl",
63
66
  "FastQ.interpose.pl",
67
+ "FastQ.maskQual.rb",
64
68
  "FastQ.offset.pl",
65
69
  "FastQ.split.pl",
66
70
  "FastQ.tag.rb",
@@ -71,11 +75,13 @@
71
75
  "Community": [
72
76
  "AlphaDiversity.pl",
73
77
  "Chao1.pl",
74
- "Table.barplot.R"
78
+ "Table.barplot.R",
79
+ "Table.prefScore.R"
75
80
  ],
76
81
  "Population": [
77
82
  "VCF.SNPs.rb",
78
- "VCF.KaKs.rb"
83
+ "VCF.KaKs.rb",
84
+ "Table.prefScore.R"
79
85
  ]
80
86
  },
81
87
  "Annotation": {
@@ -143,13 +149,16 @@
143
149
  "clust.rand.rb"
144
150
  ],
145
151
  "Read recruitments": [
152
+ "anir.rb",
146
153
  "BedGraph.tad.rb",
147
154
  "BedGraph.window.rb",
148
155
  "BlastTab.catsbj.pl",
149
156
  "BlastTab.pairedHits.rb",
150
157
  "BlastTab.recplot2.R",
158
+ "FastQ.test-error.rb",
151
159
  "GFF.catsbj.pl",
152
- "RecPlot2.compareIdentities.R"
160
+ "RecPlot2.compareIdentities.R",
161
+ "sam.filter.rb"
153
162
  ]
154
163
  }
155
164
  }
@@ -1,163 +1,221 @@
1
1
  #!/usr/bin/env ruby
2
- #
2
+
3
3
  # @author Luis M. Rodriguez-R
4
- # @update Nov-30-2015
5
4
  # @license artistic license 2.0
6
- #
7
5
 
8
- $:.push File.expand_path(File.dirname(__FILE__) + "/lib")
9
- require "enveomics_rb/enveomics"
6
+ $VERSION = 1.0
7
+ $:.push File.expand_path('../lib', __FILE__)
8
+ require 'enveomics_rb/enveomics'
9
+
10
+ o = {
11
+ q: false, missing: '-', model: 'AUTO', removeinvar: false, undefined: '-.Xx?'
12
+ }
10
13
 
11
- o = {:q=>false, :missing=>"-", :model=>"AUTO", :removeinvar=>false,
12
- :undefined=>"-.Xx?"}
13
14
  OptionParser.new do |opt|
14
- opt.banner = "
15
- Concatenates several multiple alignments in FastA format into a single
16
- multiple alignment. The IDs of the sequences (or the ID prefixes, if using
17
- --ignore-after) must coincide across files.
18
-
19
- Usage: #{$0} [options] aln1.fa aln2.fa ... > aln.fa".gsub(/^ +/,"")
20
- opt.separator ""
21
- opt.on("-c", "--coords FILE",
22
- "Output file of coordinates in RAxML-compliant format."
23
- ){ |v| o[:coords]=v }
24
- opt.on("-i", "--ignore-after STRING",
25
- "Remove everything in the IDs after the specified string."
26
- ){ |v| o[:ignoreafter]=v }
27
- opt.on("-I", "--remove-invariable", "Remove invariable sites.",
28
- "Note: Invariable sites are defined as columns with only one state and",
29
- "undefined characters. Additional ambiguous characters may exist and",
30
- "should be declared using --undefined."){ |v| o[:removeinvar]=v }
31
- opt.on("-u", "--missing-char CHAR",
32
- "Character denoting missing data. By default: '#{o[:missing]}'.") do |v|
33
- abort "Missing positions can only be denoted by single characters, " +
34
- "offending value: '#{v}'." if v.length != 1
35
- o[:missing]=v
36
- end
37
- opt.on("-m", "--model STRING",
38
- "Name of the model to use if --coords is used. See RAxML's docs; ",
39
- "supported values in v8+ include:",
40
- "o For DNA alignments:",
41
- " 'DNA[F|X]', or 'DNA[F|X]/3' (to estimate rates per codon position,",
42
- " particular notation for this script).",
43
- "o General protein alignments:",
44
- " 'AUTO' (default in this script), 'DAYHOFF' (1978), 'DCMUT' (MBE 2005;",
45
- " 22(2):193-199), 'JTT' (Nat 1992;358:86-89), 'VT' (JCompBiol 2000;",
46
- " 7(6):761-776), 'BLOSUM62' (PNAS 1992;89:10915), and 'LG' (MBE 2008;",
47
- " 25(7):1307-1320).",
48
- "o Specialized protein alignments:",
49
- " 'MTREV' (mitochondrial, JME 1996;42(4):459-468), 'WAG' (globular, MBE",
50
- " 2001;18(5):691-699), 'RTREV' (retrovirus, JME 2002;55(1):65-73), ",
51
- " 'CPREV' (chloroplast, JME 2000;50(4):348-358), and 'MTMAM' (nuclear",
52
- " mammal proteins, JME 1998;46(4):409-418)."){|v| o[:model]=v}
53
- opt.on("--undefined STRING",
54
- "All characters to be regarded as 'undefined'. It should include all",
55
- "ambiguous and missing data chars. Ignored unless --remove-invariable.",
56
- "By default: '#{o[:undefined]}'."){|v| o[:undefined]=v}
57
- opt.on("-q", "--quiet", "Run quietly (no STDERR output)."){ o[:q] = TRUE }
58
- opt.on("-h", "--help", "Display this screen.") do
59
- puts opt
60
- exit
61
- end
62
- opt.separator ""
15
+ cmd = File.basename($0)
16
+ opt.banner = <<~BANNER
17
+
18
+ [Enveomics Collection: #{cmd} v#{$VERSION}]
19
+
20
+ Concatenates several multiple alignments in FastA format into a single
21
+ multiple alignment. The IDs of the sequences (or the ID prefixes, if using
22
+ --ignore-after) must coincide across files.
23
+
24
+ Usage: #{cmd} [options] aln1.fa aln2.fa ... > aln.fa
25
+
26
+ BANNER
27
+ opt.on(
28
+ '-c', '--coords FILE',
29
+ 'Output file of coordinates in RAxML-compliant format'
30
+ ) { |v| o[:coords] = v }
31
+ opt.on(
32
+ '-i', '--ignore-after STRING',
33
+ 'Remove everything in the IDs after the specified string'
34
+ ) { |v| o[:ignoreafter] = v }
35
+ opt.on(
36
+ '-I', '--remove-invariable', 'Remove invariable sites',
37
+ 'Note: Invariable sites are defined as columns with only one state and',
38
+ 'undefined characters. Additional ambiguous characters may exist and',
39
+ 'should be declared using --undefined'
40
+ ) { |v| o[:removeinvar] = v }
41
+ opt.on(
42
+ '-u', '--missing-char CHAR',
43
+ "Character denoting missing data. By default: '#{o[:missing]}'"
44
+ ) do |v|
45
+ if v.length != 1
46
+ abort "-missing-char can only be denoted by single characters: #{v}"
47
+ end
48
+ o[:missing] = v
49
+ end
50
+ opt.on(
51
+ '-m', '--model STRING',
52
+ 'Name of the model to use if --coords is used. See RAxML docs;',
53
+ 'supported values in v8+ include:',
54
+ '~ For DNA alignments:',
55
+ ' "DNA[F|X]", or "DNA[F|X]/3" (to estimate rates per codon position,',
56
+ ' particular notation for this script)',
57
+ '~ General protein alignments:',
58
+ ' "AUTO" (default in this script), "DAYHOFF" (1978), "DCMUT" (MBE 2005;',
59
+ ' 22(2):193-199), "JTT" (Nat 1992;358:86-89), "VT" (JCompBiol 2000;',
60
+ ' 7(6):761-776), "BLOSUM62" (PNAS 1992;89:10915), and "LG" (MBE 2008;',
61
+ ' 25(7):1307-1320)',
62
+ '~ Specialized protein alignments:',
63
+ ' "MTREV" (mitochondrial, JME 1996;42(4):459-468), "WAG" (globular, MBE',
64
+ ' 2001;18(5):691-699), "RTREV" (retrovirus, JME 2002;55(1):65-73),',
65
+ ' "CPREV" (chloroplast, JME 2000;50(4):348-358), and "MTMAM" (nuclear',
66
+ ' mammal proteins, JME 1998;46(4):409-418)'
67
+ ) { |v| o[:model] = v }
68
+ opt.on(
69
+ '--undefined STRING',
70
+ 'All characters to be regarded as "undefined". It should include all',
71
+ 'ambiguous and missing data chars. Ignored unless --remove-invariable',
72
+ "By default: '#{o[:undefined]}'"
73
+ ) { |v| o[:undefined] = v }
74
+ opt.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
75
+ opt.on('-V', '--version', 'Returns version') { puts $VERSION ; exit }
76
+ opt.on('-h', '--help', 'Display this screen') { puts opt ; exit }
77
+ opt.separator ''
63
78
  end.parse!
64
- alns = ARGV
65
- abort "Alignment files are mandatory" if alns.nil? or alns.empty?
79
+ files = ARGV
80
+ abort 'Alignment files are mandatory' if files.nil? || files.empty?
81
+ $QUIET = o[:q]
66
82
 
67
- ##### MAIN:
68
- begin
69
- $stderr.puts "Reading." unless o[:q]
70
- a = {}
71
- n = alns.size-1
72
- lengths = []
73
- (0 .. n).each do |i|
74
- key = nil
75
- File.open(alns[i],"r").each do |ln|
76
- ln.chomp!
77
- if ln =~ /^>(\S+)/
78
- key = $1
79
- key.sub!(/#{o[:ignoreafter]}.*/,"") unless o[:ignoreafter].nil?
80
- a[key] ||= []
81
- a[key][i] = ""
82
- else
83
- abort "#{alns[i]}: Leading line is not a def-line, is this a "+
84
- "valid FastA file?" if key.nil?
85
- ln.gsub!(/\s/,"")
86
- a[key][i] += ln
87
- end
83
+ # Read individual gene alignments and return them as a single hash with genome
84
+ # IDs as keys and arrays of single-line strings as values
85
+ #
86
+ # IDs are trimmed after the first occurrence of +ignoreafter+, if defined
87
+ def read_alignments(files, ignoreafter = nil)
88
+ aln = {}
89
+ files.each_with_index do |file, i|
90
+ key = nil
91
+ File.open(file, 'r').each do |ln|
92
+ ln.chomp!
93
+ if ln =~ /^>(\S+)/
94
+ key = $1
95
+ key.sub!(/#{ignoreafter}.*/, '') if ignoreafter
96
+ aln[key] ||= []
97
+ aln[key][i] = ''
98
+ else
99
+ if key.nil?
100
+ abort "Invalid FastA file: #{file}: Leading line not a def-line"
101
+ end
102
+ ln.gsub!(/\s/, '')
103
+ aln[key][i] += ln
88
104
  end
89
- abort "#{alns[i]}: Empty alignment?" if key.nil?
90
- lengths[i] = a[key][i].length
91
- end
92
- if o[:removeinvar]
93
- $stderr.puts "Removing invariable sites." unless o[:q]
94
- invs = 0
95
- (0 .. n).each do |i|
96
- olen = lengths[i]
97
- (0 .. (lengths[i]-1)).each do |pos|
98
- chr = nil
99
- inv = true
100
- a.keys.each do |key|
101
- next if a[key][i].nil?
102
- chr = a[key][i][pos] if
103
- chr.nil? or o[:undefined].chars.include? chr
104
- if chr != a[key][i][pos] and
105
- not o[:undefined].chars.include? a[key][i][pos]
106
- inv = false
107
- break
108
- end
109
- end
110
- if inv
111
- a.keys.each{|key| a[key][i][pos]="!" unless a[key][i].nil?}
112
- lengths[i] -= 1
113
- invs += 1
114
- end
115
- end
116
- a.keys.each{|key| a[key][i].gsub!("!", "") unless a[key][i].nil?}
105
+ end
106
+ abort "Empty alignment file: #{file}" if key.nil?
107
+ end
108
+ aln
109
+ end
110
+
111
+ # Remove invariable sites from the alignment hash +aln+, using +undefined+ as
112
+ # a string including all characters representing undefined positions (e.g., X)
113
+ #
114
+ # Returns number of columns removed
115
+ def remove_invariable(aln, undefined)
116
+ invs = 0
117
+ lengths = aln.values.first.map(&:length)
118
+ undef_chars = undefined.chars
119
+
120
+ lengths.each_with_index do |len, i|
121
+ (0 .. len - 1).each do |pos|
122
+ chr = nil
123
+ inv = true
124
+ aln.each_key do |key|
125
+ next if aln[key][i].nil?
126
+ chr = aln[key][i][pos] if chr.nil? || undefined.chars.include?(chr)
127
+ if chr != aln[key][i][pos] && !undef_chars.include?(aln[key][i][pos])
128
+ inv = false
129
+ break
130
+ end
117
131
  end
118
- $stderr.puts " Removed #{invs} sites." unless o[:q]
119
- end
120
- $stderr.puts "Concatenating." unless o[:q]
121
- a.keys.each do |key|
122
- (0 .. n).each do |i|
123
- a[key][i] = (o[:missing] * lengths[i]) if a[key][i].nil?
132
+ if inv
133
+ aln.each_key { |key| aln[key][i][pos] = '!' unless aln[key][i].nil? }
134
+ lengths[i] -= 1
135
+ invs += 1
124
136
  end
125
- abort "Inconsistent lengths in '#{key}'
126
- exp:#{lengths.join(" ")}
127
- obs:#{a[key].map{|i| i.length}.join(" ")}." unless
128
- lengths == a[key].map{|i| i.length}
129
- puts ">#{key}", a[key].join("").gsub(/(.{1,60})/, "\\1\n")
130
- a.delete(key)
131
- end
132
- $stderr.puts " #{lengths.inject(:+)} columns." unless o[:q]
133
- unless o[:coords].nil?
134
- $stderr.puts "Generating coordinates." unless o[:q]
135
- coords = File.open(o[:coords],"w")
136
- s = 0
137
- names = (alns.map do |a|
138
- File.basename(a).gsub(/\..*/,"").gsub(/[^A-Za-z0-9_]/,"_")
139
- end)
140
- (0 .. n).each do |i|
141
- l = lengths[i]
142
- next unless l > 0
143
- names[i] += "_#{i}" while names.count(names[i])>1
144
- if o[:model] =~ /(DNA.?)\/3/
145
- coords.puts "#{$1}, #{names[i]}codon1 = #{s+1}-#{s+l}\\3"
146
- coords.puts "#{$1}, #{names[i]}codon2 = #{s+2}-#{s+l}\\3"
147
- coords.puts "#{$1}, #{names[i]}codon3 = #{s+3}-#{s+l}\\3"
148
- else
149
- coords.puts "#{o[:model]}, #{names[i]} = #{s+1}-#{s+l}"
150
- end
151
- s += l
137
+ end
138
+ aln.each_key { |key| aln[key][i].gsub!('!', '') unless aln[key][i].nil? }
139
+ end
140
+ invs
141
+ end
142
+
143
+ # Concatenate the alignments hash +aln+ using the character +missing+ to
144
+ # indicate missing alignments, and send each entry in the concatenated alignment
145
+ # to +blk+ as two variables: key (name) and value (alignment string)
146
+ #
147
+ # Returns an array with the lengths of each individual alignment
148
+ def concatenate(aln, missing, &blk)
149
+ say 'Concatenating'
150
+ lengths = aln.values.first.map(&:length)
151
+ aln.each_key do |key|
152
+ # Pad missing entries
153
+ lengths.each_with_index { |len, i| aln[key][i] ||= missing * len }
154
+
155
+ # Check length
156
+ obs_len = aln[key].map(&:length)
157
+ unless lengths == obs_len
158
+ abort "Inconsistent lengths in '#{key}'\nexp: #{lengths}\nobs: #{obs_len}"
159
+ end
160
+
161
+ # Pass entry to the block and remove from alignment hash
162
+ blk[key, aln[key].join('')]
163
+ aln.delete(key)
164
+ end
165
+ lengths
166
+ end
167
+
168
+ # Save the coordinates in +file+ based on +files+ paths (for the names), and
169
+ # using +lengths+ individual alignment lengths
170
+ #
171
+ # The saved format is RAxML coords, including the +model+ for each alignment
172
+ def save_coords(file, names, lengths, model)
173
+ File.open(file, 'w') do |fh|
174
+ s = 0
175
+ names.each_with_index do |name, i|
176
+ l = lengths[i]
177
+ next unless l > 0
178
+ name += "_#{i}" while names.count(name) > 1
179
+ if model =~ /(DNA.?)\/3/
180
+ fh.puts "#{$1}, #{name}codon1 = #{s + 1}-#{s + l}\\3"
181
+ fh.puts "#{$1}, #{name}codon2 = #{s + 2}-#{s + l}\\3"
182
+ fh.puts "#{$1}, #{name}codon3 = #{s + 3}-#{s + l}\\3"
183
+ else
184
+ fh.puts "#{model}, #{name} = #{s + 1}-#{s + l}"
152
185
  end
153
- coords.close
154
- end
155
- # Save the output matrix
156
- $stderr.puts "Done.\n" unless o[:q]
157
- rescue => err
158
- $stderr.puts "Exception: #{err}\n\n"
159
- err.backtrace.each { |l| $stderr.puts l + "\n" }
160
- err
186
+ s += l
187
+ end
188
+ end
161
189
  end
162
190
 
191
+ # ------ MAIN ------
192
+ begin
193
+ say 'Reading'
194
+ alignments = read_alignments(files, o[:ignoreafter])
195
+
196
+ if o[:removeinvar]
197
+ say 'Removing invariable sites'
198
+ inv = remove_invariable(alignments, o[:undefined])
199
+ say " Removed #{inv} sites"
200
+ end
201
+
202
+ lengths = concatenate(alignments, o[:missing]) do |name, seq|
203
+ puts ">#{name}", seq.gsub(/(.{1,60})/, "\\1\n")
204
+ end
205
+ say " #{lengths.inject(:+)} columns"
206
+
207
+ unless o[:coords].nil?
208
+ say 'Generating coordinates'
209
+ names = files.map do |i|
210
+ File.basename(i).gsub(/\..*/, '').gsub(/[^A-Za-z0-9_]/, '_')
211
+ end
212
+ save_coords(o[:coords], names, lengths, o[:model])
213
+ end
214
+
215
+ $stderr.puts 'Done' unless o[:q]
216
+ rescue => err
217
+ $stderr.puts "Exception: #{err}\n\n"
218
+ err.backtrace.each { |l| $stderr.puts l + "\n" }
219
+ err
220
+ end
163
221
 
@@ -1,9 +1,8 @@
1
1
  #!/usr/bin/env perl
2
- #
2
+
3
3
  # @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
- # @update: Oct 07 2015
5
- # @license: artistic license 2.0
6
- #
4
+ # @license: Artistic-2.0
5
+
7
6
  use strict;
8
7
  use warnings;
9
8
  use List::Util qw/sum min max/;
@@ -11,46 +10,51 @@ use List::Util qw/sum min max/;
11
10
  my ($seqs, $minlen, $n__) = @ARGV;
12
11
  $seqs or die "
13
12
  Description:
14
- Calculates the N50 value of a set of sequences. Alternatively, it
15
- can calculate other N** values. It also calculates the total number
16
- of sequences and the total added length.
17
-
13
+ Calculates the N50 value of a set of sequences. Alternatively, it
14
+ can calculate other N** values. It also calculates the total number
15
+ of sequences, the total added length, and the longest sequence length.
16
+
18
17
  Usage:
19
- $0 seqs.fa[ minlen[ **]]
18
+ $0 seqs.fa [minlen [**]]
19
+
20
+ seqs.fa A FastA file containing the sequences
21
+ minlen (optional) The minimum length to take into consideration
22
+ By default: 0
23
+ ** (optional) Value N** to calculate. By default: 50 (N50)
20
24
 
21
- seqs.fa A FastA file containing the sequences.
22
- minlen (optional) The minimum length to take into consideration.
23
- By default: 0.
24
- ** Value N** to calculate. By default: 50 (N50).
25
25
  ";
26
+
26
27
  $minlen ||= 0;
27
28
  $n__ ||= 50;
28
29
 
29
30
  my @len = ();
30
31
  open SEQ, "<", $seqs or die "Cannot open file: $seqs: $!\n";
31
32
  while(<SEQ>){
32
- if(/^>/){
33
- push @len, 0;
34
- }else{
35
- next if /^;/;
36
- chomp;
37
- s/\W//g;
38
- $len[-1]+=length $_;
39
- }
33
+ if(/^>/){
34
+ push @len, 0;
35
+ }else{
36
+ next if /^;/;
37
+ chomp;
38
+ s/\W//g;
39
+ $len[-1] += length $_;
40
+ }
40
41
  }
41
42
  close SEQ;
42
- @len = sort { $a <=> $b } map { $_>=$minlen?$_:() } @len;
43
+
44
+ @len = sort { $a <=> $b } map { $_ >= $minlen ? $_ : () } @len;
43
45
  my $tot = (sum(@len) || 0);
44
46
 
45
- my $thr = $n__*$tot/100;
47
+ my $thr = $n__ * $tot / 100;
46
48
  my $pos = 0;
47
49
  for(@len){
48
- $pos+= $_;
49
- if($pos>=$thr){
50
- print "N$n__: $_\n";
51
- last;
52
- }
50
+ $pos += $_;
51
+ if($pos >= $thr){
52
+ print "N$n__: $_\n";
53
+ last;
54
+ }
53
55
  }
54
- print "Sequences: ".scalar(@len)."\n";
56
+
57
+ print "Sequences: " . scalar(@len) . "\n";
55
58
  print "Total length: $tot\n";
59
+ print "Longest sequence: " . pop(@len) . "\n";
56
60