miga-base 0.7.26.3 → 1.0.0.sr1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
- data/lib/miga/cli/action/doctor.rb +50 -19
- data/lib/miga/cli/action/doctor/base.rb +20 -18
- data/lib/miga/cli/action/init.rb +11 -7
- data/lib/miga/cli/action/init/files_helper.rb +1 -0
- data/lib/miga/cli/action/ncbi_get.rb +3 -3
- data/lib/miga/cli/action/tax_dist.rb +2 -2
- data/lib/miga/cli/action/wf.rb +5 -4
- data/lib/miga/daemon.rb +11 -4
- data/lib/miga/dataset/result.rb +10 -6
- data/lib/miga/json.rb +1 -2
- data/lib/miga/metadata.rb +5 -1
- data/lib/miga/parallel.rb +11 -6
- data/lib/miga/project.rb +8 -8
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -2
- data/lib/miga/sqlite.rb +7 -0
- data/lib/miga/version.rb +23 -9
- data/scripts/aai_distances.bash +16 -18
- data/scripts/ani_distances.bash +16 -17
- data/scripts/assembly.bash +31 -16
- data/scripts/haai_distances.bash +3 -27
- data/scripts/miga.bash +6 -4
- data/scripts/p.bash +1 -1
- data/scripts/read_quality.bash +9 -18
- data/scripts/trimmed_fasta.bash +14 -30
- data/scripts/trimmed_reads.bash +36 -36
- data/test/parallel_test.rb +31 -0
- data/test/project_test.rb +2 -1
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/runner.rb +2 -4
- data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
- data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
- data/utils/enveomics/Manifest/Tasks/other.json +77 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
- data/utils/enveomics/Manifest/categories.json +13 -4
- data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
- data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
- data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
- data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
- data/utils/enveomics/Scripts/SRA.download.bash +6 -8
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/aai.rb +3 -2
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +87 -133
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
- data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/utils.R +30 -0
- data/utils/enveomics/enveomics.R/README.md +1 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- data/utils/requirements.txt +4 -3
- metadata +33 -6
- data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -24,11 +24,13 @@
|
|
24
24
|
"BlastTab.pairedHits.rb",
|
25
25
|
"BlastTab.subsample.pl",
|
26
26
|
"BlastTab.taxid2taxrank.pl",
|
27
|
-
"BlastTab.topHits_sorted.rb"
|
27
|
+
"BlastTab.topHits_sorted.rb",
|
28
|
+
"sam.filter.rb"
|
28
29
|
],
|
29
30
|
"Execution": [
|
30
31
|
"aai.rb",
|
31
32
|
"ani.rb",
|
33
|
+
"anir.rb",
|
32
34
|
"HMM.haai.rb",
|
33
35
|
"rbm.rb"
|
34
36
|
]
|
@@ -58,9 +60,11 @@
|
|
58
60
|
"FastA.split.rb",
|
59
61
|
"FastA.subsample.pl",
|
60
62
|
"FastA.tag.rb",
|
63
|
+
"FastA.toFastQ.rb",
|
61
64
|
"FastA.wrap.rb",
|
62
65
|
"FastQ.filter.pl",
|
63
66
|
"FastQ.interpose.pl",
|
67
|
+
"FastQ.maskQual.rb",
|
64
68
|
"FastQ.offset.pl",
|
65
69
|
"FastQ.split.pl",
|
66
70
|
"FastQ.tag.rb",
|
@@ -71,11 +75,13 @@
|
|
71
75
|
"Community": [
|
72
76
|
"AlphaDiversity.pl",
|
73
77
|
"Chao1.pl",
|
74
|
-
"Table.barplot.R"
|
78
|
+
"Table.barplot.R",
|
79
|
+
"Table.prefScore.R"
|
75
80
|
],
|
76
81
|
"Population": [
|
77
82
|
"VCF.SNPs.rb",
|
78
|
-
"VCF.KaKs.rb"
|
83
|
+
"VCF.KaKs.rb",
|
84
|
+
"Table.prefScore.R"
|
79
85
|
]
|
80
86
|
},
|
81
87
|
"Annotation": {
|
@@ -143,13 +149,16 @@
|
|
143
149
|
"clust.rand.rb"
|
144
150
|
],
|
145
151
|
"Read recruitments": [
|
152
|
+
"anir.rb",
|
146
153
|
"BedGraph.tad.rb",
|
147
154
|
"BedGraph.window.rb",
|
148
155
|
"BlastTab.catsbj.pl",
|
149
156
|
"BlastTab.pairedHits.rb",
|
150
157
|
"BlastTab.recplot2.R",
|
158
|
+
"FastQ.test-error.rb",
|
151
159
|
"GFF.catsbj.pl",
|
152
|
-
"RecPlot2.compareIdentities.R"
|
160
|
+
"RecPlot2.compareIdentities.R",
|
161
|
+
"sam.filter.rb"
|
153
162
|
]
|
154
163
|
}
|
155
164
|
}
|
@@ -1,163 +1,221 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
2
|
+
|
3
3
|
# @author Luis M. Rodriguez-R
|
4
|
-
# @update Nov-30-2015
|
5
4
|
# @license artistic license 2.0
|
6
|
-
#
|
7
5
|
|
8
|
-
|
9
|
-
|
6
|
+
$VERSION = 1.0
$:.push File.expand_path('../lib', __FILE__)
# OptionParser is part of the standard library; it is required explicitly so
# the script does not depend on another file loading it first
require 'optparse'
require 'enveomics_rb/enveomics'

# Default values of all the command-line options
o = {
  q: false, missing: '-', model: 'AUTO', removeinvar: false, undefined: '-.Xx?'
}

# Parse the command line. Whatever remains in ARGV afterwards is taken as the
# list of alignment files to concatenate
OptionParser.new do |opt|
  cmd = File.basename($0)
  opt.banner = <<~BANNER

    [Enveomics Collection: #{cmd} v#{$VERSION}]

    Concatenates several multiple alignments in FastA format into a single
    multiple alignment. The IDs of the sequences (or the ID prefixes, if using
    --ignore-after) must coincide across files.

    Usage: #{cmd} [options] aln1.fa aln2.fa ... > aln.fa

  BANNER
  opt.on(
    '-c', '--coords FILE',
    'Output file of coordinates in RAxML-compliant format'
  ) { |v| o[:coords] = v }
  opt.on(
    '-i', '--ignore-after STRING',
    'Remove everything in the IDs after the specified string'
  ) { |v| o[:ignoreafter] = v }
  opt.on(
    '-I', '--remove-invariable', 'Remove invariable sites',
    'Note: Invariable sites are defined as columns with only one state and',
    'undefined characters. Additional ambiguous characters may exist and',
    'should be declared using --undefined'
  ) { |v| o[:removeinvar] = v }
  opt.on(
    '-u', '--missing-char CHAR',
    "Character denoting missing data. By default: '#{o[:missing]}'"
  ) do |v|
    # The padding of missing alignments multiplies this string by the
    # alignment length, so it must be exactly one character long
    if v.length != 1
      abort "--missing-char can only be denoted by single characters: #{v}"
    end
    o[:missing] = v
  end
  opt.on(
    '-m', '--model STRING',
    'Name of the model to use if --coords is used. See RAxML docs;',
    'supported values in v8+ include:',
    '~ For DNA alignments:',
    ' "DNA[F|X]", or "DNA[F|X]/3" (to estimate rates per codon position,',
    ' particular notation for this script)',
    '~ General protein alignments:',
    ' "AUTO" (default in this script), "DAYHOFF" (1978), "DCMUT" (MBE 2005;',
    ' 22(2):193-199), "JTT" (Nat 1992;358:86-89), "VT" (JCompBiol 2000;',
    ' 7(6):761-776), "BLOSUM62" (PNAS 1992;89:10915), and "LG" (MBE 2008;',
    ' 25(7):1307-1320)',
    '~ Specialized protein alignments:',
    ' "MTREV" (mitochondrial, JME 1996;42(4):459-468), "WAG" (globular, MBE',
    ' 2001;18(5):691-699), "RTREV" (retrovirus, JME 2002;55(1):65-73),',
    ' "CPREV" (chloroplast, JME 2000;50(4):348-358), and "MTMAM" (nuclear',
    ' mammal proteins, JME 1998;46(4):409-418)'
  ) { |v| o[:model] = v }
  opt.on(
    '--undefined STRING',
    'All characters to be regarded as "undefined". It should include all',
    'ambiguous and missing data chars. Ignored unless --remove-invariable',
    "is set. By default: '#{o[:undefined]}'"
  ) { |v| o[:undefined] = v }
  opt.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
  opt.on('-V', '--version', 'Returns version') { puts $VERSION ; exit }
  opt.on('-h', '--help', 'Display this screen') { puts opt ; exit }
  opt.separator ''
end.parse!

# parse! removes the recognized options from ARGV, leaving only the file paths
files = ARGV
abort 'Alignment files are mandatory' if files.empty?
$QUIET = o[:q]
|
66
82
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
83
|
+
# Read individual gene alignments and return them as a single hash with genome
# IDs as keys and arrays of single-line strings as values. The array index of
# each string is the index of the file it was read from in +files+, so genomes
# absent from a file have +nil+ (or no entry at all) at that index
#
# IDs are trimmed after the first occurrence of +ignoreafter+, if defined.
# +ignoreafter+ is matched as a literal string, not as a regular expression
#
# Aborts if a file has sequence data before its first def-line, or if it has no
# def-lines at all
def read_alignments(files, ignoreafter = nil)
  # Escape the separator: unescaped, a value such as '|' would match the empty
  # string at the start of the ID and erase the whole ID
  trim = ignoreafter ? /#{Regexp.escape(ignoreafter)}.*/ : nil
  aln = {}
  files.each_with_index do |file, i|
    key = nil
    # File.foreach closes the file handle once the file has been read
    File.foreach(file) do |ln|
      ln.chomp!
      if ln =~ /^>(\S+)/
        key = Regexp.last_match(1)
        key.sub!(trim, '') if trim
        aln[key] ||= []
        aln[key][i] = +''
      else
        if key.nil?
          abort "Invalid FastA file: #{file}: Leading line not a def-line"
        end
        # Append in place to avoid re-copying the sequence on every line
        aln[key][i] << ln.gsub(/\s/, '')
      end
    end
    abort "Empty alignment file: #{file}" if key.nil?
  end
  aln
end
|
110
|
+
|
111
|
+
# Remove invariable sites from the alignment hash +aln+, using +undefined+ as
# a string including all characters representing undefined positions (e.g., X)
#
# A column is invariable when, ignoring undefined characters, it has at most
# one distinct state across all the genomes present in that alignment. Genomes
# missing from an alignment (+nil+ entries) are skipped and left as +nil+
#
# The strings in +aln+ are replaced by their trimmed versions
#
# Returns number of columns removed
def remove_invariable(aln, undefined)
  return 0 if aln.empty?

  undef_chars = undefined.chars
  # Number of individual alignments: not every genome is in every file, so no
  # single entry can be trusted to span all of them
  n_aln = aln.values.map(&:size).max
  invs = 0

  n_aln.times do |i|
    seqs = aln.values.map { |entry| entry[i] }.compact
    next if seqs.empty?

    # Flag the columns to drop instead of overwriting them with a sentinel
    # character that could also be legitimate alignment content
    len = seqs.first.length
    drop = Array.new(len) do |pos|
      state = nil
      seqs.all? do |seq|
        chr = seq[pos]
        # Positions past the end of a shorter sequence carry no information
        next true if chr.nil? || undef_chars.include?(chr)

        state ||= chr
        state == chr
      end
    end

    n_drop = drop.count(true)
    next if n_drop.zero?

    invs += n_drop
    aln.each_value do |entry|
      next if entry[i].nil?

      entry[i] = entry[i].each_char.reject.with_index { |_, pos| drop[pos] }.join
    end
  end
  invs
end
|
142
|
+
|
143
|
+
# Concatenate the alignments hash +aln+ using the character +missing+ to
# indicate missing alignments, and send each entry in the concatenated alignment
# to +blk+ as two variables: key (name) and value (alignment string)
#
# The expected length of each individual alignment is taken from the first
# genome that has a sequence in it, so any genome (including the first one) may
# be missing from any alignment. Aborts if a genome has a sequence whose length
# differs from the expected one
#
# Entries are removed from +aln+ as they are processed, leaving it empty
#
# Returns an array with the lengths of each individual alignment
def concatenate(aln, missing, &blk)
  say 'Concatenating'
  n_aln = aln.values.map(&:size).max || 0
  lengths = Array.new(n_aln) do |i|
    ref = aln.each_value.find { |entry| entry[i] }
    ref ? ref[i].length : 0
  end

  # Iterate over a snapshot of the keys, since entries are deleted on the go
  aln.keys.each do |key|
    # Pad missing entries
    lengths.each_with_index { |len, i| aln[key][i] ||= missing * len }

    # Check length
    obs_len = aln[key].map(&:length)
    unless lengths == obs_len
      abort "Inconsistent lengths in '#{key}'\nexp: #{lengths}\nobs: #{obs_len}"
    end

    # Pass entry to the block and remove from alignment hash
    blk[key, aln[key].join('')]
    aln.delete(key)
  end
  lengths
end
|
167
|
+
|
168
|
+
# Save the coordinates in +file+ using +names+ as the name of each individual
# alignment and +lengths+ as their lengths (both arrays in the same order)
#
# The saved format is RAxML coords, including the +model+ for each alignment.
# If +model+ has the form "DNA?/3", three entries are saved per alignment, one
# for each codon position
#
# Alignments with zero (or unknown) length are not saved. Names occurring more
# than once are suffixed with their index until they are unique
def save_coords(file, names, lengths, model)
  # Non-nil only for the per-codon notation; holds the actual model name
  codon_model = model[%r{(DNA.?)/3}, 1]
  used = {}
  File.open(file, 'w') do |fh|
    start = 0
    names.each_with_index do |name, i|
      len = lengths[i].to_i
      next unless len > 0

      if names.count(name) > 1
        name += "_#{i}"
        # Keep suffixing if the new name clashes with another alignment
        name += "_#{i}" while names.include?(name) || used[name]
      end
      used[name] = true

      if codon_model
        (1..3).each do |codon|
          fh.puts "#{codon_model}, #{name}codon#{codon} = " \
                  "#{start + codon}-#{start + len}\\3"
        end
      else
        fh.puts "#{model}, #{name} = #{start + 1}-#{start + len}"
      end
      start += len
    end
  end
end
|
162
190
|
|
191
|
+
# ------ MAIN ------
# Reads all the alignments, optionally removes invariable sites, prints the
# concatenated alignment to STDOUT as FastA wrapped at 60 columns, and
# optionally saves the coordinates file
begin
  say 'Reading'
  alignments = read_alignments(files, o[:ignoreafter])

  if o[:removeinvar]
    say 'Removing invariable sites'
    inv = remove_invariable(alignments, o[:undefined])
    say " Removed #{inv} sites"
  end

  lengths = concatenate(alignments, o[:missing]) do |name, seq|
    puts ">#{name}", seq.gsub(/(.{1,60})/, "\\1\n")
  end
  say " #{lengths.inject(0, :+)} columns"

  unless o[:coords].nil?
    say 'Generating coordinates'
    # Alignment names: file names up to the first dot, with any character
    # other than letters, digits, and underscores replaced by underscores
    names = files.map do |i|
      File.basename(i).gsub(/\..*/, '').gsub(/[^A-Za-z0-9_]/, '_')
    end
    save_coords(o[:coords], names, lengths, o[:model])
  end

  $stderr.puts 'Done' unless o[:q]
rescue => err
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  # Report the failure to the calling process instead of exiting with status 0
  exit 1
end
|
163
221
|
|
@@ -1,9 +1,8 @@
|
|
1
1
|
#!/usr/bin/env perl
|
2
|
-
|
2
|
+
|
3
3
|
# @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
-
# @
|
5
|
-
|
6
|
-
#
|
4
|
+
# @license: Artistic-2.0
|
5
|
+
|
7
6
|
use strict;
use warnings;
use List::Util qw/sum min max/;

my ($seqs, $minlen, $n__) = @ARGV;
$seqs or die "
Description:
Calculates the N50 value of a set of sequences. Alternatively, it
can calculate other N** values. It also calculates the total number
of sequences, the total added length, and the longest sequence length.

Usage:
$0 seqs.fa [minlen [**]]

seqs.fa A FastA file containing the sequences
minlen (optional) The minimum length to take into consideration
By default: 0
** (optional) Value N** to calculate. By default: 50 (N50)

";

$minlen ||= 0;
$n__ ||= 50;

# Collect the length of every sequence: one entry per def-line, counting only
# word characters in the lines that follow it
my @len = ();
open my $fh, '<', $seqs or die "Cannot open file: $seqs: $!\n";
while (my $ln = <$fh>) {
    if ($ln =~ /^>/) {
        push @len, 0;
    }
    else {
        next if $ln =~ /^;/;    # comment lines
        chomp $ln;
        $ln =~ s/\W//g;
        next unless length $ln;
        @len or die "Invalid FastA file: $seqs: Leading line not a def-line\n";
        $len[-1] += length $ln;
    }
}
close $fh;

# Longest first: N** is the length of the shortest sequence in the smallest
# set of longest sequences that adds up to at least **% of the total length
@len = sort { $b <=> $a } grep { $_ >= $minlen } @len;
my $tot = (sum(@len) || 0);

my $thr = $n__ * $tot / 100;
my $pos = 0;
for my $seq_len (@len) {
    $pos += $seq_len;
    if ($pos >= $thr) {
        print "N$n__: $seq_len\n";
        last;
    }
}

print "Sequences: " . scalar(@len) . "\n";
print "Total length: $tot\n";
# The list is sorted in decreasing order; report 0 if no sequences remain
print "Longest sequence: " . (@len ? $len[0] : 0) . "\n";
|
56
60
|
|