miga-base 0.7.26.3 → 1.0.0.sr1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
- data/lib/miga/cli/action/doctor.rb +50 -19
- data/lib/miga/cli/action/doctor/base.rb +20 -18
- data/lib/miga/cli/action/init.rb +11 -7
- data/lib/miga/cli/action/init/files_helper.rb +1 -0
- data/lib/miga/cli/action/ncbi_get.rb +3 -3
- data/lib/miga/cli/action/tax_dist.rb +2 -2
- data/lib/miga/cli/action/wf.rb +5 -4
- data/lib/miga/daemon.rb +11 -4
- data/lib/miga/dataset/result.rb +10 -6
- data/lib/miga/json.rb +1 -2
- data/lib/miga/metadata.rb +5 -1
- data/lib/miga/parallel.rb +11 -6
- data/lib/miga/project.rb +8 -8
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -2
- data/lib/miga/sqlite.rb +7 -0
- data/lib/miga/version.rb +23 -9
- data/scripts/aai_distances.bash +16 -18
- data/scripts/ani_distances.bash +16 -17
- data/scripts/assembly.bash +31 -16
- data/scripts/haai_distances.bash +3 -27
- data/scripts/miga.bash +6 -4
- data/scripts/p.bash +1 -1
- data/scripts/read_quality.bash +9 -18
- data/scripts/trimmed_fasta.bash +14 -30
- data/scripts/trimmed_reads.bash +36 -36
- data/test/parallel_test.rb +31 -0
- data/test/project_test.rb +2 -1
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/runner.rb +2 -4
- data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
- data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
- data/utils/enveomics/Manifest/Tasks/other.json +77 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
- data/utils/enveomics/Manifest/categories.json +13 -4
- data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
- data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
- data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
- data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
- data/utils/enveomics/Scripts/SRA.download.bash +6 -8
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/aai.rb +3 -2
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +87 -133
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
- data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/utils.R +30 -0
- data/utils/enveomics/enveomics.R/README.md +1 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- data/utils/requirements.txt +4 -3
- metadata +33 -6
- data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
|
@@ -24,11 +24,13 @@
|
|
|
24
24
|
"BlastTab.pairedHits.rb",
|
|
25
25
|
"BlastTab.subsample.pl",
|
|
26
26
|
"BlastTab.taxid2taxrank.pl",
|
|
27
|
-
"BlastTab.topHits_sorted.rb"
|
|
27
|
+
"BlastTab.topHits_sorted.rb",
|
|
28
|
+
"sam.filter.rb"
|
|
28
29
|
],
|
|
29
30
|
"Execution": [
|
|
30
31
|
"aai.rb",
|
|
31
32
|
"ani.rb",
|
|
33
|
+
"anir.rb",
|
|
32
34
|
"HMM.haai.rb",
|
|
33
35
|
"rbm.rb"
|
|
34
36
|
]
|
|
@@ -58,9 +60,11 @@
|
|
|
58
60
|
"FastA.split.rb",
|
|
59
61
|
"FastA.subsample.pl",
|
|
60
62
|
"FastA.tag.rb",
|
|
63
|
+
"FastA.toFastQ.rb",
|
|
61
64
|
"FastA.wrap.rb",
|
|
62
65
|
"FastQ.filter.pl",
|
|
63
66
|
"FastQ.interpose.pl",
|
|
67
|
+
"FastQ.maskQual.rb",
|
|
64
68
|
"FastQ.offset.pl",
|
|
65
69
|
"FastQ.split.pl",
|
|
66
70
|
"FastQ.tag.rb",
|
|
@@ -71,11 +75,13 @@
|
|
|
71
75
|
"Community": [
|
|
72
76
|
"AlphaDiversity.pl",
|
|
73
77
|
"Chao1.pl",
|
|
74
|
-
"Table.barplot.R"
|
|
78
|
+
"Table.barplot.R",
|
|
79
|
+
"Table.prefScore.R"
|
|
75
80
|
],
|
|
76
81
|
"Population": [
|
|
77
82
|
"VCF.SNPs.rb",
|
|
78
|
-
"VCF.KaKs.rb"
|
|
83
|
+
"VCF.KaKs.rb",
|
|
84
|
+
"Table.prefScore.R"
|
|
79
85
|
]
|
|
80
86
|
},
|
|
81
87
|
"Annotation": {
|
|
@@ -143,13 +149,16 @@
|
|
|
143
149
|
"clust.rand.rb"
|
|
144
150
|
],
|
|
145
151
|
"Read recruitments": [
|
|
152
|
+
"anir.rb",
|
|
146
153
|
"BedGraph.tad.rb",
|
|
147
154
|
"BedGraph.window.rb",
|
|
148
155
|
"BlastTab.catsbj.pl",
|
|
149
156
|
"BlastTab.pairedHits.rb",
|
|
150
157
|
"BlastTab.recplot2.R",
|
|
158
|
+
"FastQ.test-error.rb",
|
|
151
159
|
"GFF.catsbj.pl",
|
|
152
|
-
"RecPlot2.compareIdentities.R"
|
|
160
|
+
"RecPlot2.compareIdentities.R",
|
|
161
|
+
"sam.filter.rb"
|
|
153
162
|
]
|
|
154
163
|
}
|
|
155
164
|
}
|
|
@@ -1,163 +1,221 @@
|
|
|
1
1
|
#!/usr/bin/env ruby
|
|
2
|
-
|
|
2
|
+
|
|
3
3
|
# @author Luis M. Rodriguez-R
|
|
4
|
-
# @update Nov-30-2015
|
|
5
4
|
# @license artistic license 2.0
|
|
6
|
-
#
|
|
7
5
|
|
|
8
|
-
|
|
9
|
-
|
|
6
|
+
$VERSION = 1.0
|
|
7
|
+
$:.push File.expand_path('../lib', __FILE__)
|
|
8
|
+
require 'enveomics_rb/enveomics'
|
|
9
|
+
|
|
10
|
+
o = {
|
|
11
|
+
q: false, missing: '-', model: 'AUTO', removeinvar: false, undefined: '-.Xx?'
|
|
12
|
+
}
|
|
10
13
|
|
|
11
|
-
o = {:q=>false, :missing=>"-", :model=>"AUTO", :removeinvar=>false,
|
|
12
|
-
:undefined=>"-.Xx?"}
|
|
13
14
|
OptionParser.new do |opt|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
"
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
15
|
+
cmd = File.basename($0)
|
|
16
|
+
opt.banner = <<~BANNER
|
|
17
|
+
|
|
18
|
+
[Enveomics Collection: #{cmd} v#{$VERSION}]
|
|
19
|
+
|
|
20
|
+
Concatenates several multiple alignments in FastA format into a single
|
|
21
|
+
multiple alignment. The IDs of the sequences (or the ID prefixes, if using
|
|
22
|
+
--ignore-after) must coincide across files.
|
|
23
|
+
|
|
24
|
+
Usage: #{cmd} [options] aln1.fa aln2.fa ... > aln.fa
|
|
25
|
+
|
|
26
|
+
BANNER
|
|
27
|
+
opt.on(
|
|
28
|
+
'-c', '--coords FILE',
|
|
29
|
+
'Output file of coordinates in RAxML-compliant format'
|
|
30
|
+
) { |v| o[:coords] = v }
|
|
31
|
+
opt.on(
|
|
32
|
+
'-i', '--ignore-after STRING',
|
|
33
|
+
'Remove everything in the IDs after the specified string'
|
|
34
|
+
) { |v| o[:ignoreafter] = v }
|
|
35
|
+
opt.on(
|
|
36
|
+
'-I', '--remove-invariable', 'Remove invariable sites',
|
|
37
|
+
'Note: Invariable sites are defined as columns with only one state and',
|
|
38
|
+
'undefined characters. Additional ambiguous characters may exist and',
|
|
39
|
+
'should be declared using --undefined'
|
|
40
|
+
) { |v| o[:removeinvar] = v }
|
|
41
|
+
opt.on(
|
|
42
|
+
'-u', '--missing-char CHAR',
|
|
43
|
+
"Character denoting missing data. By default: '#{o[:missing]}'"
|
|
44
|
+
) do |v|
|
|
45
|
+
if v.length != 1
|
|
46
|
+
abort "-missing-char can only be denoted by single characters: #{v}"
|
|
47
|
+
end
|
|
48
|
+
o[:missing] = v
|
|
49
|
+
end
|
|
50
|
+
opt.on(
|
|
51
|
+
'-m', '--model STRING',
|
|
52
|
+
'Name of the model to use if --coords is used. See RAxML docs;',
|
|
53
|
+
'supported values in v8+ include:',
|
|
54
|
+
'~ For DNA alignments:',
|
|
55
|
+
' "DNA[F|X]", or "DNA[F|X]/3" (to estimate rates per codon position,',
|
|
56
|
+
' particular notation for this script)',
|
|
57
|
+
'~ General protein alignments:',
|
|
58
|
+
' "AUTO" (default in this script), "DAYHOFF" (1978), "DCMUT" (MBE 2005;',
|
|
59
|
+
' 22(2):193-199), "JTT" (Nat 1992;358:86-89), "VT" (JCompBiol 2000;',
|
|
60
|
+
' 7(6):761-776), "BLOSUM62" (PNAS 1992;89:10915), and "LG" (MBE 2008;',
|
|
61
|
+
' 25(7):1307-1320)',
|
|
62
|
+
'~ Specialized protein alignments:',
|
|
63
|
+
' "MTREV" (mitochondrial, JME 1996;42(4):459-468), "WAG" (globular, MBE',
|
|
64
|
+
' 2001;18(5):691-699), "RTREV" (retrovirus, JME 2002;55(1):65-73),',
|
|
65
|
+
' "CPREV" (chloroplast, JME 2000;50(4):348-358), and "MTMAM" (nuclear',
|
|
66
|
+
' mammal proteins, JME 1998;46(4):409-418)'
|
|
67
|
+
) { |v| o[:model] = v }
|
|
68
|
+
opt.on(
|
|
69
|
+
'--undefined STRING',
|
|
70
|
+
'All characters to be regarded as "undefined". It should include all',
|
|
71
|
+
'ambiguous and missing data chars. Ignored unless --remove-invariable',
|
|
72
|
+
"By default: '#{o[:undefined]}'"
|
|
73
|
+
) { |v| o[:undefined] = v }
|
|
74
|
+
opt.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
|
75
|
+
opt.on('-V', '--version', 'Returns version') { puts $VERSION ; exit }
|
|
76
|
+
opt.on('-h', '--help', 'Display this screen') { puts opt ; exit }
|
|
77
|
+
opt.separator ''
|
|
63
78
|
end.parse!
|
|
64
|
-
|
|
65
|
-
abort
|
|
79
|
+
files = ARGV
|
|
80
|
+
abort 'Alignment files are mandatory' if files.nil? || files.empty?
|
|
81
|
+
$QUIET = o[:q]
|
|
66
82
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
83
|
+
# Read individual gene alignments and return them as a single hash with genome
|
|
84
|
+
# IDs as keys and arrays of single-line strings as values
|
|
85
|
+
#
|
|
86
|
+
# IDs are trimmed after the first occurrence of +ignoreafter+, if defined
|
|
87
|
+
def read_alignments(files, ignoreafter = nil)
|
|
88
|
+
aln = {}
|
|
89
|
+
files.each_with_index do |file, i|
|
|
90
|
+
key = nil
|
|
91
|
+
File.open(file, 'r').each do |ln|
|
|
92
|
+
ln.chomp!
|
|
93
|
+
if ln =~ /^>(\S+)/
|
|
94
|
+
key = $1
|
|
95
|
+
key.sub!(/#{ignoreafter}.*/, '') if ignoreafter
|
|
96
|
+
aln[key] ||= []
|
|
97
|
+
aln[key][i] = ''
|
|
98
|
+
else
|
|
99
|
+
if key.nil?
|
|
100
|
+
abort "Invalid FastA file: #{file}: Leading line not a def-line"
|
|
101
|
+
end
|
|
102
|
+
ln.gsub!(/\s/, '')
|
|
103
|
+
aln[key][i] += ln
|
|
88
104
|
end
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
end
|
|
116
|
-
a.keys.each{|key| a[key][i].gsub!("!", "") unless a[key][i].nil?}
|
|
105
|
+
end
|
|
106
|
+
abort "Empty alignment file: #{file}" if key.nil?
|
|
107
|
+
end
|
|
108
|
+
aln
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
# Remove invariable sites from the alignment hash +aln+, using +undefined+ as
|
|
112
|
+
# a string including all characters representing undefined positions (e.g., X)
|
|
113
|
+
#
|
|
114
|
+
# Returns number of columns removed
|
|
115
|
+
def remove_invariable(aln, undefined)
|
|
116
|
+
invs = 0
|
|
117
|
+
lengths = aln.values.first.map(&:length)
|
|
118
|
+
undef_chars = undefined.chars
|
|
119
|
+
|
|
120
|
+
lengths.each_with_index do |len, i|
|
|
121
|
+
(0 .. len - 1).each do |pos|
|
|
122
|
+
chr = nil
|
|
123
|
+
inv = true
|
|
124
|
+
aln.each_key do |key|
|
|
125
|
+
next if aln[key][i].nil?
|
|
126
|
+
chr = aln[key][i][pos] if chr.nil? || undefined.chars.include?(chr)
|
|
127
|
+
if chr != aln[key][i][pos] && !undef_chars.include?(aln[key][i][pos])
|
|
128
|
+
inv = false
|
|
129
|
+
break
|
|
130
|
+
end
|
|
117
131
|
end
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
(0 .. n).each do |i|
|
|
123
|
-
a[key][i] = (o[:missing] * lengths[i]) if a[key][i].nil?
|
|
132
|
+
if inv
|
|
133
|
+
aln.each_key { |key| aln[key][i][pos] = '!' unless aln[key][i].nil? }
|
|
134
|
+
lengths[i] -= 1
|
|
135
|
+
invs += 1
|
|
124
136
|
end
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
137
|
+
end
|
|
138
|
+
aln.each_key { |key| aln[key][i].gsub!('!', '') unless aln[key][i].nil? }
|
|
139
|
+
end
|
|
140
|
+
invs
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
# Concatenate the alignments hash +aln+ using the character +missing+ to
|
|
144
|
+
# indicate missing alignments, and send each entry in the concatenated alignment
|
|
145
|
+
# to +blk+ as two variables: key (name) and value (alignment string)
|
|
146
|
+
#
|
|
147
|
+
# Returns an array with the lengths of each individual alignment
|
|
148
|
+
def concatenate(aln, missing, &blk)
|
|
149
|
+
say 'Concatenating'
|
|
150
|
+
lengths = aln.values.first.map(&:length)
|
|
151
|
+
aln.each_key do |key|
|
|
152
|
+
# Pad missing entries
|
|
153
|
+
lengths.each_with_index { |len, i| aln[key][i] ||= missing * len }
|
|
154
|
+
|
|
155
|
+
# Check length
|
|
156
|
+
obs_len = aln[key].map(&:length)
|
|
157
|
+
unless lengths == obs_len
|
|
158
|
+
abort "Inconsistent lengths in '#{key}'\nexp: #{lengths}\nobs: #{obs_len}"
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
# Pass entry to the block and remove from alignment hash
|
|
162
|
+
blk[key, aln[key].join('')]
|
|
163
|
+
aln.delete(key)
|
|
164
|
+
end
|
|
165
|
+
lengths
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
# Save the coordinates in +file+ based on +files+ paths (for the names), and
|
|
169
|
+
# using +lengths+ individual alignment lengths
|
|
170
|
+
#
|
|
171
|
+
# The saved format is RAxML coords, including the +model+ for each alignment
|
|
172
|
+
def save_coords(file, names, lengths, model)
|
|
173
|
+
File.open(file, 'w') do |fh|
|
|
174
|
+
s = 0
|
|
175
|
+
names.each_with_index do |name, i|
|
|
176
|
+
l = lengths[i]
|
|
177
|
+
next unless l > 0
|
|
178
|
+
name += "_#{i}" while names.count(name) > 1
|
|
179
|
+
if model =~ /(DNA.?)\/3/
|
|
180
|
+
fh.puts "#{$1}, #{name}codon1 = #{s + 1}-#{s + l}\\3"
|
|
181
|
+
fh.puts "#{$1}, #{name}codon2 = #{s + 2}-#{s + l}\\3"
|
|
182
|
+
fh.puts "#{$1}, #{name}codon3 = #{s + 3}-#{s + l}\\3"
|
|
183
|
+
else
|
|
184
|
+
fh.puts "#{model}, #{name} = #{s + 1}-#{s + l}"
|
|
152
185
|
end
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
$stderr.puts "Done.\n" unless o[:q]
|
|
157
|
-
rescue => err
|
|
158
|
-
$stderr.puts "Exception: #{err}\n\n"
|
|
159
|
-
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
|
160
|
-
err
|
|
186
|
+
s += l
|
|
187
|
+
end
|
|
188
|
+
end
|
|
161
189
|
end
|
|
162
190
|
|
|
191
|
+
# ------ MAIN ------
|
|
192
|
+
begin
|
|
193
|
+
say 'Reading'
|
|
194
|
+
alignments = read_alignments(files, o[:ignoreafter])
|
|
195
|
+
|
|
196
|
+
if o[:removeinvar]
|
|
197
|
+
say 'Removing invariable sites'
|
|
198
|
+
inv = remove_invariable(alignments, o[:undefined])
|
|
199
|
+
say " Removed #{inv} sites"
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
lengths = concatenate(alignments, o[:missing]) do |name, seq|
|
|
203
|
+
puts ">#{name}", seq.gsub(/(.{1,60})/, "\\1\n")
|
|
204
|
+
end
|
|
205
|
+
say " #{lengths.inject(:+)} columns"
|
|
206
|
+
|
|
207
|
+
unless o[:coords].nil?
|
|
208
|
+
say 'Generating coordinates'
|
|
209
|
+
names = files.map do |i|
|
|
210
|
+
File.basename(i).gsub(/\..*/, '').gsub(/[^A-Za-z0-9_]/, '_')
|
|
211
|
+
end
|
|
212
|
+
save_coords(o[:coords], names, lengths, o[:model])
|
|
213
|
+
end
|
|
214
|
+
|
|
215
|
+
$stderr.puts 'Done' unless o[:q]
|
|
216
|
+
rescue => err
|
|
217
|
+
$stderr.puts "Exception: #{err}\n\n"
|
|
218
|
+
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
|
219
|
+
err
|
|
220
|
+
end
|
|
163
221
|
|
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
#!/usr/bin/env perl
|
|
2
|
-
|
|
2
|
+
|
|
3
3
|
# @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
|
4
|
-
# @
|
|
5
|
-
|
|
6
|
-
#
|
|
4
|
+
# @license: Artistic-2.0
|
|
5
|
+
|
|
7
6
|
use strict;
|
|
8
7
|
use warnings;
|
|
9
8
|
use List::Util qw/sum min max/;
|
|
@@ -11,46 +10,51 @@ use List::Util qw/sum min max/;
|
|
|
11
10
|
my ($seqs, $minlen, $n__) = @ARGV;
|
|
12
11
|
$seqs or die "
|
|
13
12
|
Description:
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
13
|
+
Calculates the N50 value of a set of sequences. Alternatively, it
|
|
14
|
+
can calculate other N** values. It also calculates the total number
|
|
15
|
+
of sequences, the total added length, and the longest sequence length.
|
|
16
|
+
|
|
18
17
|
Usage:
|
|
19
|
-
|
|
18
|
+
$0 seqs.fa [minlen [**]]
|
|
19
|
+
|
|
20
|
+
seqs.fa A FastA file containing the sequences
|
|
21
|
+
minlen (optional) The minimum length to take into consideration
|
|
22
|
+
By default: 0
|
|
23
|
+
** (optional) Value N** to calculate. By default: 50 (N50)
|
|
20
24
|
|
|
21
|
-
seqs.fa A FastA file containing the sequences.
|
|
22
|
-
minlen (optional) The minimum length to take into consideration.
|
|
23
|
-
By default: 0.
|
|
24
|
-
** Value N** to calculate. By default: 50 (N50).
|
|
25
25
|
";
|
|
26
|
+
|
|
26
27
|
$minlen ||= 0;
|
|
27
28
|
$n__ ||= 50;
|
|
28
29
|
|
|
29
30
|
my @len = ();
|
|
30
31
|
open SEQ, "<", $seqs or die "Cannot open file: $seqs: $!\n";
|
|
31
32
|
while(<SEQ>){
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
33
|
+
if(/^>/){
|
|
34
|
+
push @len, 0;
|
|
35
|
+
}else{
|
|
36
|
+
next if /^;/;
|
|
37
|
+
chomp;
|
|
38
|
+
s/\W//g;
|
|
39
|
+
$len[-1] += length $_;
|
|
40
|
+
}
|
|
40
41
|
}
|
|
41
42
|
close SEQ;
|
|
42
|
-
|
|
43
|
+
|
|
44
|
+
@len = sort { $a <=> $b } map { $_ >= $minlen ? $_ : () } @len;
|
|
43
45
|
my $tot = (sum(@len) || 0);
|
|
44
46
|
|
|
45
|
-
my $thr = $n__
|
|
47
|
+
my $thr = $n__ * $tot / 100;
|
|
46
48
|
my $pos = 0;
|
|
47
49
|
for(@len){
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
50
|
+
$pos += $_;
|
|
51
|
+
if($pos >= $thr){
|
|
52
|
+
print "N$n__: $_\n";
|
|
53
|
+
last;
|
|
54
|
+
}
|
|
53
55
|
}
|
|
54
|
-
|
|
56
|
+
|
|
57
|
+
print "Sequences: " . scalar(@len) . "\n";
|
|
55
58
|
print "Total length: $tot\n";
|
|
59
|
+
print "Longest sequence: " . pop(@len) . "\n";
|
|
56
60
|
|