miga-base 0.3.1.7 → 0.3.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/actions/ncbi_get.rb +8 -0
- data/lib/miga/common.rb +9 -215
- data/lib/miga/common/base.rb +49 -0
- data/lib/miga/common/format.rb +135 -0
- data/lib/miga/common/path.rb +49 -0
- data/lib/miga/daemon.rb +3 -60
- data/lib/miga/daemon/base.rb +69 -0
- data/lib/miga/dataset.rb +3 -3
- data/lib/miga/dataset/result.rb +5 -5
- data/lib/miga/result.rb +5 -0
- data/lib/miga/version.rb +7 -5
- data/scripts/distances.bash +2 -19
- data/scripts/taxonomy.bash +2 -21
- data/test/common_test.rb +9 -0
- data/utils/distance/base.rb +6 -0
- data/utils/distance/commands.rb +82 -0
- data/utils/distance/database.rb +86 -0
- data/utils/distance/pipeline.rb +98 -0
- data/utils/distance/runner.rb +104 -0
- data/utils/distance/temporal.rb +37 -0
- data/utils/distances.rb +9 -0
- data/utils/enveomics/Docs/recplot2.md +233 -0
- data/utils/enveomics/Makefile +1 -1
- data/utils/enveomics/Manifest/Tasks/blasttab.json +66 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +10 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +4 -4
- data/utils/enveomics/Manifest/Tasks/mapping.json +38 -1
- data/utils/enveomics/Manifest/categories.json +11 -1
- data/utils/enveomics/Manifest/examples.json +2 -2
- data/utils/enveomics/README.md +2 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +1 -0
- data/utils/enveomics/Scripts/BedGraph.tad.rb +52 -30
- data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +7 -2
- data/utils/enveomics/Scripts/FastA.interpose.pl +26 -20
- data/utils/enveomics/Scripts/FastQ.interpose.pl +20 -20
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
- data/utils/enveomics/Scripts/SRA.download.bash +28 -21
- data/utils/enveomics/Scripts/Table.barplot.R +1 -0
- data/utils/enveomics/Scripts/aai.rb +4 -2
- data/utils/enveomics/build_enveomics_r.bash +5 -5
- data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
- data/utils/enveomics/enveomics.R/NAMESPACE +6 -2
- data/utils/enveomics/enveomics.R/R/recplot2.R +471 -71
- data/utils/enveomics/enveomics.R/README.md +26 -17
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -1
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +6 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +32 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +12 -7
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +8 -37
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +20 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +20 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +42 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +33 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +56 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +3 -1
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +20 -14
- data/utils/requirements.txt +1 -1
- metadata +28 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeak.Rd +0 -40
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeaks.Rd +0 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c395d2565cacafe425c91a277c03cdeaa3ac9ece
|
4
|
+
data.tar.gz: 1978e8a2df4d0646bc884ce9921bf8de2bc13a25
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fff79ff971c5fc9e0e0684585b9b35abe5377cff0742c772fc1a7c58005b224ea18b36e7fe489b78c6103f6c4b9ffc90be75e7a385d5fce59a917e2d68c3765a
|
7
|
+
data.tar.gz: cda85047eabd8ba1c76bb5fe690247a3e71583e034f3c5dbef9178776d4c5da02bd3075cfc0e3154e1e97aa69da8ddea85a00ef99efa065e867950c10a860a71
|
data/actions/ncbi_get.rb
CHANGED
@@ -34,6 +34,8 @@ OptionParser.new do |opt|
|
|
34
34
|
opt.on('--no-version-name',
|
35
35
|
'Do not add sequence version to the dataset name.',
|
36
36
|
'Only affects --complete and --chromosome.'){ |v| o[:add_version]=v }
|
37
|
+
opt.on('--blacklist PATH',
|
38
|
+
'A file with dataset names to blacklist.'){ |v| o[:blacklist] = v }
|
37
39
|
opt.on('--dry', 'Do not download or save the datasets.'){ |v| o[:dry] = v }
|
38
40
|
opt.on('-q', '--query',
|
39
41
|
'Register the datasets as queries, not reference datasets.'
|
@@ -135,6 +137,12 @@ if o[:scaffold] or o[:contig]
|
|
135
137
|
end
|
136
138
|
end
|
137
139
|
|
140
|
+
# Discard blacklisted
|
141
|
+
unless o[:blacklist].nil?
|
142
|
+
$stderr.puts "Discarding datasets in #{o[:blacklist]}." unless o[:q]
|
143
|
+
File.readlines(o[:blacklist]).map(&:chomp).each{ |i| ds.delete i }
|
144
|
+
end
|
145
|
+
|
138
146
|
# Download entries
|
139
147
|
$stderr.puts "Downloading #{ds.size} #{ds.size>1 ? "entries" : "entry"}." unless o[:q]
|
140
148
|
ds.each do |name,body|
|
data/lib/miga/common.rb
CHANGED
@@ -1,62 +1,24 @@
|
|
1
1
|
# @package MiGA
|
2
2
|
# @license Artistic-2.0
|
3
3
|
|
4
|
-
require 'miga/version'
|
5
4
|
require 'json'
|
6
|
-
require '
|
7
|
-
require '
|
5
|
+
require 'miga/version'
|
6
|
+
require 'miga/common/base'
|
7
|
+
require 'miga/common/path'
|
8
|
+
require 'miga/common/format'
|
8
9
|
|
9
10
|
##
|
10
11
|
# Generic class used to handle system-wide information and methods, and parent
|
11
12
|
# of all other MiGA::* classes.
|
12
13
|
class MiGA::MiGA
|
14
|
+
|
15
|
+
include MiGA::Common
|
16
|
+
|
17
|
+
extend MiGA::Common::Path
|
18
|
+
extend MiGA::Common::Format
|
13
19
|
|
14
20
|
ENV['MIGA_HOME'] ||= ENV['HOME']
|
15
21
|
|
16
|
-
##
|
17
|
-
# Root path to MiGA (as estimated from the location of the current file).
|
18
|
-
def self.root_path
|
19
|
-
File.expand_path('../../..', __FILE__)
|
20
|
-
end
|
21
|
-
|
22
|
-
##
|
23
|
-
# Should debugging information be reported?
|
24
|
-
@@DEBUG = false
|
25
|
-
|
26
|
-
##
|
27
|
-
# Should the trace of debugging information be reported?
|
28
|
-
@@DEBUG_TRACE = false
|
29
|
-
|
30
|
-
##
|
31
|
-
# Turn on debugging.
|
32
|
-
def self.DEBUG_ON() @@DEBUG=true end
|
33
|
-
|
34
|
-
##
|
35
|
-
# Turn off debugging.
|
36
|
-
def self.DEBUG_OFF() @@DEBUG=false end
|
37
|
-
|
38
|
-
##
|
39
|
-
# Turn on debug tracing (and debugging).
|
40
|
-
def self.DEBUG_TRACE_ON
|
41
|
-
@@DEBUG_TRACE=true
|
42
|
-
self.DEBUG_ON
|
43
|
-
end
|
44
|
-
|
45
|
-
##
|
46
|
-
# Turn off debug tracing (but not debugging).
|
47
|
-
def self.DEBUG_TRACE_OFF
|
48
|
-
@@DEBUG_TRACE=false
|
49
|
-
end
|
50
|
-
|
51
|
-
##
|
52
|
-
# Send debug message.
|
53
|
-
def self.DEBUG(*args)
|
54
|
-
$stderr.puts(*args) if @@DEBUG
|
55
|
-
if @@DEBUG_TRACE
|
56
|
-
$stderr.puts caller.map{ |v| v.gsub(/^/,' ') }.join("\n")
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
22
|
##
|
61
23
|
# Has MiGA been initialized?
|
62
24
|
def self.initialized?
|
@@ -64,121 +26,6 @@ class MiGA::MiGA
|
|
64
26
|
File.exist?(File.expand_path('.miga_daemon.json', ENV['MIGA_HOME']))
|
65
27
|
end
|
66
28
|
|
67
|
-
##
|
68
|
-
# Tabulates an +values+, and Array of Arrays, all with the same number of
|
69
|
-
# entries as +header+. Returns an Array of String, one per line.
|
70
|
-
def self.tabulate(header, values)
|
71
|
-
fields = [header.map(&:to_s)]
|
72
|
-
fields << fields.first.map{ |h| h.gsub(/\S/, '-') }
|
73
|
-
fields += values.map{ |row| row.map{ |cell| cell.nil? ? '?' : cell.to_s } }
|
74
|
-
clen = fields.map{ |row| row.map(&:length) }.transpose.map(&:max)
|
75
|
-
fields.map do |row|
|
76
|
-
(0 .. clen.size-1).map do |col_n|
|
77
|
-
col_n==0 ? row[col_n].rjust(clen[col_n]) : row[col_n].ljust(clen[col_n])
|
78
|
-
end.join(' ')
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
##
|
83
|
-
# Cleans a FastA file in place.
|
84
|
-
def self.clean_fasta_file(file)
|
85
|
-
tmp_fh = nil
|
86
|
-
begin
|
87
|
-
if file =~ /\.gz/
|
88
|
-
tmp_path = Tempfile.new('MiGA.gz').tap(&:close).path
|
89
|
-
tmp_fh = Zlib::GzipWriter.open(tmp_path)
|
90
|
-
fh = Zlib::GzipReader.open(file)
|
91
|
-
else
|
92
|
-
tmp_fh = Tempfile.new('MiGA')
|
93
|
-
tmp_path = tmp_fh.path
|
94
|
-
fh = File.open(file, 'r')
|
95
|
-
end
|
96
|
-
buffer = ''
|
97
|
-
fh.each_line do |ln|
|
98
|
-
ln.chomp!
|
99
|
-
if ln =~ /^>\s*(\S+)(.*)/
|
100
|
-
(id, df) = [$1, $2]
|
101
|
-
tmp_fh.print buffer.wrap_width(80)
|
102
|
-
buffer = ''
|
103
|
-
tmp_fh.puts ">#{id.gsub(/[^A-Za-z0-9_\|\.]/, "_")}#{df}"
|
104
|
-
else
|
105
|
-
buffer << ln.gsub(/[^A-Za-z\.\-]/, '')
|
106
|
-
end
|
107
|
-
end
|
108
|
-
tmp_fh.print buffer.wrap_width(80)
|
109
|
-
tmp_fh.close
|
110
|
-
fh.close
|
111
|
-
FileUtils.cp(tmp_path, file)
|
112
|
-
ensure
|
113
|
-
begin
|
114
|
-
tmp_fh.close unless tmp_fh.nil?
|
115
|
-
File.unlink(tmp_path) unless tmp_path.nil?
|
116
|
-
rescue
|
117
|
-
end
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
|
-
##
|
122
|
-
# Calculates the average and standard deviation of the sequence lengths in
|
123
|
-
# a FastA or FastQ file (supports gzipped files). The +format+ must be a
|
124
|
-
# Symbol, one of +:fasta+ or +:fastq+. Additional estimations can be
|
125
|
-
# controlled via the +opts+ Hash. Supported options include:
|
126
|
-
# - +:n50+: If true, it also returns the N50 and the median (in bp).
|
127
|
-
# - +gc+: If true, it also returns the G+C content (in %).
|
128
|
-
def self.seqs_length(file, format, opts={})
|
129
|
-
fh = (file =~ /\.gz/) ? Zlib::GzipReader.open(file) : File.open(file, 'r')
|
130
|
-
l = []
|
131
|
-
gc = 0
|
132
|
-
i = 0 # <- Zlib::GzipReader doesn't set $.
|
133
|
-
fh.each_line do |ln|
|
134
|
-
i += 1
|
135
|
-
if (format==:fasta and ln =~ /^>/) or (format==:fastq and (i % 4)==1)
|
136
|
-
l << 0
|
137
|
-
elsif format==:fasta or (i % 4)==2
|
138
|
-
l[l.size-1] += ln.chomp.size
|
139
|
-
gc += ln.scan(/[GCgc]/).count if opts[:gc]
|
140
|
-
end
|
141
|
-
end
|
142
|
-
fh.close
|
143
|
-
|
144
|
-
o = { n: l.size, tot: l.inject(:+) }
|
145
|
-
o[:avg] = o[:tot].to_f/l.size
|
146
|
-
o[:var] = l.map{ |a| a ** 2 }.inject(:+).to_f/l.size - o[:avg]**2
|
147
|
-
o[:sd] = Math.sqrt o[:var]
|
148
|
-
o[:gc] = 100.0*gc/o[:tot] if opts[:gc]
|
149
|
-
if opts[:n50]
|
150
|
-
l.sort!
|
151
|
-
thr = o[:tot]/2
|
152
|
-
pos = 0
|
153
|
-
l.each do |a|
|
154
|
-
pos += a
|
155
|
-
o[:n50] = a
|
156
|
-
break if pos >= thr
|
157
|
-
end
|
158
|
-
o[:med] = o[:n].even? ?
|
159
|
-
0.5*l[o[:n]/2-1,2].inject(:+) : l[(o[:n]-1)/2]
|
160
|
-
end
|
161
|
-
o
|
162
|
-
end
|
163
|
-
|
164
|
-
##
|
165
|
-
# Path to a script to be executed for +task+. Supported +opts+ are:
|
166
|
-
# - +:miga+ Path to the MiGA home to use. If not passed, the home of the
|
167
|
-
# library is used).
|
168
|
-
# - +:project+ MiGA::Project object to check within plugins. If not passed,
|
169
|
-
# only core scripts are supported.
|
170
|
-
def self.script_path(task, opts={})
|
171
|
-
opts[:miga] ||= root_path
|
172
|
-
unless opts[:project].nil?
|
173
|
-
opts[:project].plugins.each do |pl|
|
174
|
-
if File.exist? File.expand_path("scripts/#{task}.bash", pl)
|
175
|
-
opts[:miga] = pl
|
176
|
-
end
|
177
|
-
end
|
178
|
-
end
|
179
|
-
File.expand_path("scripts/#{task}.bash", opts[:miga])
|
180
|
-
end
|
181
|
-
|
182
29
|
##
|
183
30
|
# Check if the result files exist with +base+ name (String) followed by the
|
184
31
|
# +ext+ values (Array of String).
|
@@ -188,59 +35,6 @@ class MiGA::MiGA
|
|
188
35
|
File.exist?(base + f) or File.exist?("#{base}#{f}.gz")
|
189
36
|
end
|
190
37
|
end
|
191
|
-
end
|
192
|
-
|
193
|
-
##
|
194
|
-
# MiGA extensions to the File class.
|
195
|
-
class File
|
196
38
|
|
197
|
-
##
|
198
|
-
# Method to transfer a file from +old_name+ to +new_name+, using a +method+
|
199
|
-
# that can be one of :symlink for File#symlink, :hardlink for File#link, or
|
200
|
-
# :copy for FileUtils#cp_r.
|
201
|
-
def self.generic_transfer(old_name, new_name, method)
|
202
|
-
return nil if exist? new_name
|
203
|
-
case method
|
204
|
-
when :symlink
|
205
|
-
File.symlink(old_name, new_name)
|
206
|
-
when :hardlink
|
207
|
-
File.link(old_name, new_name)
|
208
|
-
when :copy
|
209
|
-
FileUtils.cp_r(old_name, new_name)
|
210
|
-
else
|
211
|
-
raise "Unknown transfer method: #{method}."
|
212
|
-
end
|
213
|
-
end
|
214
|
-
end
|
215
|
-
|
216
|
-
##
|
217
|
-
# MiGA extensions to the String class.
|
218
|
-
class String
|
219
|
-
|
220
|
-
##
|
221
|
-
# Replace any character not allowed in a MiGA name for underscore (_). This
|
222
|
-
# results in a MiGA-compliant name EXCEPT for empty strings, that results in
|
223
|
-
# empty strings.
|
224
|
-
def miga_name
|
225
|
-
gsub(/[^A-Za-z0-9_]/, '_')
|
226
|
-
end
|
227
|
-
|
228
|
-
##
|
229
|
-
# Is the string a MiGA-compliant name?
|
230
|
-
def miga_name?
|
231
|
-
!(self !~ /^[A-Za-z0-9_]+$/)
|
232
|
-
end
|
233
|
-
|
234
|
-
##
|
235
|
-
# Replace underscores by spaces or dots (depending on context).
|
236
|
-
def unmiga_name
|
237
|
-
gsub(/_(str|sp|subsp|pv)__/,"_\\1._").tr('_', ' ')
|
238
|
-
end
|
239
|
-
|
240
|
-
##
|
241
|
-
# Wraps the string with fixed Integer +width+.
|
242
|
-
def wrap_width(width)
|
243
|
-
gsub(/([^\n\r]{1,#{width}})/,"\\1\n")
|
244
|
-
end
|
245
39
|
end
|
246
40
|
|
@@ -0,0 +1,49 @@
|
|
1
|
+
|
2
|
+
class MiGA::MiGA
|
3
|
+
|
4
|
+
# Class-level
|
5
|
+
class << self
|
6
|
+
##
|
7
|
+
# Turn on debugging.
|
8
|
+
def DEBUG_ON ; @@DEBUG=true end
|
9
|
+
|
10
|
+
##
|
11
|
+
# Turn off debugging.
|
12
|
+
def DEBUG_OFF ; @@DEBUG=false end
|
13
|
+
|
14
|
+
##
|
15
|
+
# Turn on debug tracing (and debugging).
|
16
|
+
def DEBUG_TRACE_ON
|
17
|
+
@@DEBUG_TRACE=true
|
18
|
+
DEBUG_ON()
|
19
|
+
end
|
20
|
+
|
21
|
+
##
|
22
|
+
# Turn off debug tracing (but not debugging).
|
23
|
+
def DEBUG_TRACE_OFF
|
24
|
+
@@DEBUG_TRACE=false
|
25
|
+
end
|
26
|
+
|
27
|
+
##
|
28
|
+
# Send debug message.
|
29
|
+
def DEBUG(*args)
|
30
|
+
$stderr.puts(*args) if @@DEBUG
|
31
|
+
$stderr.puts(
|
32
|
+
caller.map{ |v| v.gsub(/^/,' ') }.join("\n") ) if @@DEBUG_TRACE
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
|
38
|
+
module MiGA::Common
|
39
|
+
|
40
|
+
##
|
41
|
+
# Should debugging information be reported?
|
42
|
+
@@DEBUG = false
|
43
|
+
|
44
|
+
##
|
45
|
+
# Should the trace of debugging information be reported?
|
46
|
+
@@DEBUG_TRACE = false
|
47
|
+
|
48
|
+
end
|
49
|
+
|
@@ -0,0 +1,135 @@
|
|
1
|
+
|
2
|
+
require 'tempfile'
|
3
|
+
require 'zlib'
|
4
|
+
|
5
|
+
module MiGA::Common::Format
|
6
|
+
|
7
|
+
##
|
8
|
+
# Tabulates +values+, an Array of Arrays, all with the same number of
|
9
|
+
# entries as +header+. Returns an Array of String, one per line.
|
10
|
+
def tabulate(header, values)
|
11
|
+
fields = [header.map(&:to_s)]
|
12
|
+
fields << fields.first.map{ |h| h.gsub(/\S/, '-') }
|
13
|
+
fields += values.map{ |row| row.map{ |cell| cell.nil? ? '?' : cell.to_s } }
|
14
|
+
clen = fields.map{ |row| row.map(&:length) }.transpose.map(&:max)
|
15
|
+
fields.map do |row|
|
16
|
+
(0 .. clen.size-1).map do |col_n|
|
17
|
+
col_n==0 ? row[col_n].rjust(clen[col_n]) : row[col_n].ljust(clen[col_n])
|
18
|
+
end.join(' ')
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
##
|
23
|
+
# Cleans a FastA file in place.
|
24
|
+
def clean_fasta_file(file)
|
25
|
+
tmp_fh = nil
|
26
|
+
begin
|
27
|
+
if file =~ /\.gz/
|
28
|
+
tmp_path = Tempfile.new('MiGA.gz').tap(&:close).path
|
29
|
+
tmp_fh = Zlib::GzipWriter.open(tmp_path)
|
30
|
+
fh = Zlib::GzipReader.open(file)
|
31
|
+
else
|
32
|
+
tmp_fh = Tempfile.new('MiGA')
|
33
|
+
tmp_path = tmp_fh.path
|
34
|
+
fh = File.open(file, 'r')
|
35
|
+
end
|
36
|
+
buffer = ''
|
37
|
+
fh.each_line do |ln|
|
38
|
+
ln.chomp!
|
39
|
+
if ln =~ /^>\s*(\S+)(.*)/
|
40
|
+
(id, df) = [$1, $2]
|
41
|
+
tmp_fh.print buffer.wrap_width(80)
|
42
|
+
buffer = ''
|
43
|
+
tmp_fh.puts ">#{id.gsub(/[^A-Za-z0-9_\|\.]/, "_")}#{df}"
|
44
|
+
else
|
45
|
+
buffer << ln.gsub(/[^A-Za-z\.\-]/, '')
|
46
|
+
end
|
47
|
+
end
|
48
|
+
tmp_fh.print buffer.wrap_width(80)
|
49
|
+
tmp_fh.close
|
50
|
+
fh.close
|
51
|
+
FileUtils.cp(tmp_path, file)
|
52
|
+
ensure
|
53
|
+
begin
|
54
|
+
tmp_fh.close unless tmp_fh.nil?
|
55
|
+
File.unlink(tmp_path) unless tmp_path.nil?
|
56
|
+
rescue
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
##
|
62
|
+
# Calculates the average and standard deviation of the sequence lengths in
|
63
|
+
# a FastA or FastQ file (supports gzipped files). The +format+ must be a
|
64
|
+
# Symbol, one of +:fasta+ or +:fastq+. Additional estimations can be
|
65
|
+
# controlled via the +opts+ Hash. Supported options include:
|
66
|
+
# - +:n50+: If true, it also returns the N50 and the median (in bp).
|
67
|
+
# - +:gc+: If true, it also returns the G+C content (in %).
|
68
|
+
def seqs_length(file, format, opts={})
|
69
|
+
fh = (file =~ /\.gz/) ? Zlib::GzipReader.open(file) : File.open(file, 'r')
|
70
|
+
l = []
|
71
|
+
gc = 0
|
72
|
+
i = 0 # <- Zlib::GzipReader doesn't set $.
|
73
|
+
fh.each_line do |ln|
|
74
|
+
i += 1
|
75
|
+
if (format==:fasta and ln =~ /^>/) or (format==:fastq and (i % 4)==1)
|
76
|
+
l << 0
|
77
|
+
elsif format==:fasta or (i % 4)==2
|
78
|
+
l[l.size-1] += ln.chomp.size
|
79
|
+
gc += ln.scan(/[GCgc]/).count if opts[:gc]
|
80
|
+
end
|
81
|
+
end
|
82
|
+
fh.close
|
83
|
+
|
84
|
+
o = { n: l.size, tot: l.inject(:+) }
|
85
|
+
o[:avg] = o[:tot].to_f/l.size
|
86
|
+
o[:var] = l.map{ |a| a ** 2 }.inject(:+).to_f/l.size - o[:avg]**2
|
87
|
+
o[:sd] = Math.sqrt o[:var]
|
88
|
+
o[:gc] = 100.0*gc/o[:tot] if opts[:gc]
|
89
|
+
if opts[:n50]
|
90
|
+
l.sort!
|
91
|
+
thr = o[:tot]/2
|
92
|
+
pos = 0
|
93
|
+
l.each do |a|
|
94
|
+
pos += a
|
95
|
+
o[:n50] = a
|
96
|
+
break if pos >= thr
|
97
|
+
end
|
98
|
+
o[:med] = o[:n].even? ?
|
99
|
+
0.5*l[o[:n]/2-1,2].inject(:+) : l[(o[:n]-1)/2]
|
100
|
+
end
|
101
|
+
o
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
##
|
106
|
+
# MiGA extensions to the String class.
|
107
|
+
class String
|
108
|
+
|
109
|
+
##
|
110
|
+
# Replace any character not allowed in a MiGA name with an underscore (_). This
|
111
|
+
# results in a MiGA-compliant name EXCEPT for empty strings, which result in
|
112
|
+
# empty strings.
|
113
|
+
def miga_name
|
114
|
+
gsub(/[^A-Za-z0-9_]/, '_')
|
115
|
+
end
|
116
|
+
|
117
|
+
##
|
118
|
+
# Is the string a MiGA-compliant name?
|
119
|
+
def miga_name?
|
120
|
+
!(self !~ /^[A-Za-z0-9_]+$/)
|
121
|
+
end
|
122
|
+
|
123
|
+
##
|
124
|
+
# Replace underscores by spaces or dots (depending on context).
|
125
|
+
def unmiga_name
|
126
|
+
gsub(/_(str|sp|subsp|pv)__/,"_\\1._").tr('_', ' ')
|
127
|
+
end
|
128
|
+
|
129
|
+
##
|
130
|
+
# Wraps the string with fixed Integer +width+.
|
131
|
+
def wrap_width(width)
|
132
|
+
gsub(/([^\n\r]{1,#{width}})/,"\\1\n")
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|