miga-base 0.3.1.7 → 0.3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/actions/ncbi_get.rb +8 -0
- data/lib/miga/common.rb +9 -215
- data/lib/miga/common/base.rb +49 -0
- data/lib/miga/common/format.rb +135 -0
- data/lib/miga/common/path.rb +49 -0
- data/lib/miga/daemon.rb +3 -60
- data/lib/miga/daemon/base.rb +69 -0
- data/lib/miga/dataset.rb +3 -3
- data/lib/miga/dataset/result.rb +5 -5
- data/lib/miga/result.rb +5 -0
- data/lib/miga/version.rb +7 -5
- data/scripts/distances.bash +2 -19
- data/scripts/taxonomy.bash +2 -21
- data/test/common_test.rb +9 -0
- data/utils/distance/base.rb +6 -0
- data/utils/distance/commands.rb +82 -0
- data/utils/distance/database.rb +86 -0
- data/utils/distance/pipeline.rb +98 -0
- data/utils/distance/runner.rb +104 -0
- data/utils/distance/temporal.rb +37 -0
- data/utils/distances.rb +9 -0
- data/utils/enveomics/Docs/recplot2.md +233 -0
- data/utils/enveomics/Makefile +1 -1
- data/utils/enveomics/Manifest/Tasks/blasttab.json +66 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +10 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +4 -4
- data/utils/enveomics/Manifest/Tasks/mapping.json +38 -1
- data/utils/enveomics/Manifest/categories.json +11 -1
- data/utils/enveomics/Manifest/examples.json +2 -2
- data/utils/enveomics/README.md +2 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +1 -0
- data/utils/enveomics/Scripts/BedGraph.tad.rb +52 -30
- data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +7 -2
- data/utils/enveomics/Scripts/FastA.interpose.pl +26 -20
- data/utils/enveomics/Scripts/FastQ.interpose.pl +20 -20
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
- data/utils/enveomics/Scripts/SRA.download.bash +28 -21
- data/utils/enveomics/Scripts/Table.barplot.R +1 -0
- data/utils/enveomics/Scripts/aai.rb +4 -2
- data/utils/enveomics/build_enveomics_r.bash +5 -5
- data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
- data/utils/enveomics/enveomics.R/NAMESPACE +6 -2
- data/utils/enveomics/enveomics.R/R/recplot2.R +471 -71
- data/utils/enveomics/enveomics.R/README.md +26 -17
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -1
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +6 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +32 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +12 -7
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +8 -37
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +20 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +20 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +42 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +33 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +56 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +3 -1
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +20 -14
- data/utils/requirements.txt +1 -1
- metadata +28 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeak.Rd +0 -40
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeaks.Rd +0 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c395d2565cacafe425c91a277c03cdeaa3ac9ece
|
4
|
+
data.tar.gz: 1978e8a2df4d0646bc884ce9921bf8de2bc13a25
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fff79ff971c5fc9e0e0684585b9b35abe5377cff0742c772fc1a7c58005b224ea18b36e7fe489b78c6103f6c4b9ffc90be75e7a385d5fce59a917e2d68c3765a
|
7
|
+
data.tar.gz: cda85047eabd8ba1c76bb5fe690247a3e71583e034f3c5dbef9178776d4c5da02bd3075cfc0e3154e1e97aa69da8ddea85a00ef99efa065e867950c10a860a71
|
data/actions/ncbi_get.rb
CHANGED
@@ -34,6 +34,8 @@ OptionParser.new do |opt|
|
|
34
34
|
opt.on('--no-version-name',
|
35
35
|
'Do not add sequence version to the dataset name.',
|
36
36
|
'Only affects --complete and --chromosome.'){ |v| o[:add_version]=v }
|
37
|
+
opt.on('--blacklist PATH',
|
38
|
+
'A file with dataset names to blacklist.'){ |v| o[:blacklist] = v }
|
37
39
|
opt.on('--dry', 'Do not download or save the datasets.'){ |v| o[:dry] = v }
|
38
40
|
opt.on('-q', '--query',
|
39
41
|
'Register the datasets as queries, not reference datasets.'
|
@@ -135,6 +137,12 @@ if o[:scaffold] or o[:contig]
|
|
135
137
|
end
|
136
138
|
end
|
137
139
|
|
140
|
+
# Remove from the set of datasets every name listed in the blacklist file
# (one name per line). Nothing happens when --blacklist was not passed.
if not o[:blacklist].nil?
  unless o[:q]
    $stderr.puts "Discarding datasets in #{o[:blacklist]}."
  end
  File.readlines(o[:blacklist]).each do |line|
    ds.delete(line.chomp)
  end
end
|
145
|
+
|
138
146
|
# Download entries
|
139
147
|
$stderr.puts "Downloading #{ds.size} #{ds.size>1 ? "entries" : "entry"}." unless o[:q]
|
140
148
|
ds.each do |name,body|
|
data/lib/miga/common.rb
CHANGED
@@ -1,62 +1,24 @@
|
|
1
1
|
# @package MiGA
|
2
2
|
# @license Artistic-2.0
|
3
3
|
|
4
|
-
require 'miga/version'
|
5
4
|
require 'json'
|
6
|
-
require '
|
7
|
-
require '
|
5
|
+
require 'miga/version'
|
6
|
+
require 'miga/common/base'
|
7
|
+
require 'miga/common/path'
|
8
|
+
require 'miga/common/format'
|
8
9
|
|
9
10
|
##
|
10
11
|
# Generic class used to handle system-wide information and methods, and parent
|
11
12
|
# of all other MiGA::* classes.
|
12
13
|
class MiGA::MiGA
|
14
|
+
|
15
|
+
include MiGA::Common
|
16
|
+
|
17
|
+
extend MiGA::Common::Path
|
18
|
+
extend MiGA::Common::Format
|
13
19
|
|
14
20
|
ENV['MIGA_HOME'] ||= ENV['HOME']
|
15
21
|
|
16
|
-
##
|
17
|
-
# Root path to MiGA (as estimated from the location of the current file).
|
18
|
-
def self.root_path
|
19
|
-
File.expand_path('../../..', __FILE__)
|
20
|
-
end
|
21
|
-
|
22
|
-
##
|
23
|
-
# Should debugging information be reported?
|
24
|
-
@@DEBUG = false
|
25
|
-
|
26
|
-
##
|
27
|
-
# Should the trace of debugging information be reported?
|
28
|
-
@@DEBUG_TRACE = false
|
29
|
-
|
30
|
-
##
|
31
|
-
# Turn on debugging.
|
32
|
-
def self.DEBUG_ON() @@DEBUG=true end
|
33
|
-
|
34
|
-
##
|
35
|
-
# Turn off debugging.
|
36
|
-
def self.DEBUG_OFF() @@DEBUG=false end
|
37
|
-
|
38
|
-
##
|
39
|
-
# Turn on debug tracing (and debugging).
|
40
|
-
def self.DEBUG_TRACE_ON
|
41
|
-
@@DEBUG_TRACE=true
|
42
|
-
self.DEBUG_ON
|
43
|
-
end
|
44
|
-
|
45
|
-
##
|
46
|
-
# Turn off debug tracing (but not debugging).
|
47
|
-
def self.DEBUG_TRACE_OFF
|
48
|
-
@@DEBUG_TRACE=false
|
49
|
-
end
|
50
|
-
|
51
|
-
##
|
52
|
-
# Send debug message.
|
53
|
-
def self.DEBUG(*args)
|
54
|
-
$stderr.puts(*args) if @@DEBUG
|
55
|
-
if @@DEBUG_TRACE
|
56
|
-
$stderr.puts caller.map{ |v| v.gsub(/^/,' ') }.join("\n")
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
22
|
##
|
61
23
|
# Has MiGA been initialized?
|
62
24
|
def self.initialized?
|
@@ -64,121 +26,6 @@ class MiGA::MiGA
|
|
64
26
|
File.exist?(File.expand_path('.miga_daemon.json', ENV['MIGA_HOME']))
|
65
27
|
end
|
66
28
|
|
67
|
-
##
|
68
|
-
# Tabulates an +values+, and Array of Arrays, all with the same number of
|
69
|
-
# entries as +header+. Returns an Array of String, one per line.
|
70
|
-
def self.tabulate(header, values)
|
71
|
-
fields = [header.map(&:to_s)]
|
72
|
-
fields << fields.first.map{ |h| h.gsub(/\S/, '-') }
|
73
|
-
fields += values.map{ |row| row.map{ |cell| cell.nil? ? '?' : cell.to_s } }
|
74
|
-
clen = fields.map{ |row| row.map(&:length) }.transpose.map(&:max)
|
75
|
-
fields.map do |row|
|
76
|
-
(0 .. clen.size-1).map do |col_n|
|
77
|
-
col_n==0 ? row[col_n].rjust(clen[col_n]) : row[col_n].ljust(clen[col_n])
|
78
|
-
end.join(' ')
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
##
|
83
|
-
# Cleans a FastA file in place.
|
84
|
-
def self.clean_fasta_file(file)
|
85
|
-
tmp_fh = nil
|
86
|
-
begin
|
87
|
-
if file =~ /\.gz/
|
88
|
-
tmp_path = Tempfile.new('MiGA.gz').tap(&:close).path
|
89
|
-
tmp_fh = Zlib::GzipWriter.open(tmp_path)
|
90
|
-
fh = Zlib::GzipReader.open(file)
|
91
|
-
else
|
92
|
-
tmp_fh = Tempfile.new('MiGA')
|
93
|
-
tmp_path = tmp_fh.path
|
94
|
-
fh = File.open(file, 'r')
|
95
|
-
end
|
96
|
-
buffer = ''
|
97
|
-
fh.each_line do |ln|
|
98
|
-
ln.chomp!
|
99
|
-
if ln =~ /^>\s*(\S+)(.*)/
|
100
|
-
(id, df) = [$1, $2]
|
101
|
-
tmp_fh.print buffer.wrap_width(80)
|
102
|
-
buffer = ''
|
103
|
-
tmp_fh.puts ">#{id.gsub(/[^A-Za-z0-9_\|\.]/, "_")}#{df}"
|
104
|
-
else
|
105
|
-
buffer << ln.gsub(/[^A-Za-z\.\-]/, '')
|
106
|
-
end
|
107
|
-
end
|
108
|
-
tmp_fh.print buffer.wrap_width(80)
|
109
|
-
tmp_fh.close
|
110
|
-
fh.close
|
111
|
-
FileUtils.cp(tmp_path, file)
|
112
|
-
ensure
|
113
|
-
begin
|
114
|
-
tmp_fh.close unless tmp_fh.nil?
|
115
|
-
File.unlink(tmp_path) unless tmp_path.nil?
|
116
|
-
rescue
|
117
|
-
end
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
|
-
##
|
122
|
-
# Calculates the average and standard deviation of the sequence lengths in
|
123
|
-
# a FastA or FastQ file (supports gzipped files). The +format+ must be a
|
124
|
-
# Symbol, one of +:fasta+ or +:fastq+. Additional estimations can be
|
125
|
-
# controlled via the +opts+ Hash. Supported options include:
|
126
|
-
# - +:n50+: If true, it also returns the N50 and the median (in bp).
|
127
|
-
# - +gc+: If true, it also returns the G+C content (in %).
|
128
|
-
def self.seqs_length(file, format, opts={})
|
129
|
-
fh = (file =~ /\.gz/) ? Zlib::GzipReader.open(file) : File.open(file, 'r')
|
130
|
-
l = []
|
131
|
-
gc = 0
|
132
|
-
i = 0 # <- Zlib::GzipReader doesn't set $.
|
133
|
-
fh.each_line do |ln|
|
134
|
-
i += 1
|
135
|
-
if (format==:fasta and ln =~ /^>/) or (format==:fastq and (i % 4)==1)
|
136
|
-
l << 0
|
137
|
-
elsif format==:fasta or (i % 4)==2
|
138
|
-
l[l.size-1] += ln.chomp.size
|
139
|
-
gc += ln.scan(/[GCgc]/).count if opts[:gc]
|
140
|
-
end
|
141
|
-
end
|
142
|
-
fh.close
|
143
|
-
|
144
|
-
o = { n: l.size, tot: l.inject(:+) }
|
145
|
-
o[:avg] = o[:tot].to_f/l.size
|
146
|
-
o[:var] = l.map{ |a| a ** 2 }.inject(:+).to_f/l.size - o[:avg]**2
|
147
|
-
o[:sd] = Math.sqrt o[:var]
|
148
|
-
o[:gc] = 100.0*gc/o[:tot] if opts[:gc]
|
149
|
-
if opts[:n50]
|
150
|
-
l.sort!
|
151
|
-
thr = o[:tot]/2
|
152
|
-
pos = 0
|
153
|
-
l.each do |a|
|
154
|
-
pos += a
|
155
|
-
o[:n50] = a
|
156
|
-
break if pos >= thr
|
157
|
-
end
|
158
|
-
o[:med] = o[:n].even? ?
|
159
|
-
0.5*l[o[:n]/2-1,2].inject(:+) : l[(o[:n]-1)/2]
|
160
|
-
end
|
161
|
-
o
|
162
|
-
end
|
163
|
-
|
164
|
-
##
|
165
|
-
# Path to a script to be executed for +task+. Supported +opts+ are:
|
166
|
-
# - +:miga+ Path to the MiGA home to use. If not passed, the home of the
|
167
|
-
# library is used).
|
168
|
-
# - +:project+ MiGA::Project object to check within plugins. If not passed,
|
169
|
-
# only core scripts are supported.
|
170
|
-
def self.script_path(task, opts={})
|
171
|
-
opts[:miga] ||= root_path
|
172
|
-
unless opts[:project].nil?
|
173
|
-
opts[:project].plugins.each do |pl|
|
174
|
-
if File.exist? File.expand_path("scripts/#{task}.bash", pl)
|
175
|
-
opts[:miga] = pl
|
176
|
-
end
|
177
|
-
end
|
178
|
-
end
|
179
|
-
File.expand_path("scripts/#{task}.bash", opts[:miga])
|
180
|
-
end
|
181
|
-
|
182
29
|
##
|
183
30
|
# Check if the result files exist with +base+ name (String) followed by the
|
184
31
|
# +ext+ values (Array of String).
|
@@ -188,59 +35,6 @@ class MiGA::MiGA
|
|
188
35
|
File.exist?(base + f) or File.exist?("#{base}#{f}.gz")
|
189
36
|
end
|
190
37
|
end
|
191
|
-
end
|
192
|
-
|
193
|
-
##
|
194
|
-
# MiGA extensions to the File class.
|
195
|
-
class File
|
196
38
|
|
197
|
-
##
|
198
|
-
# Method to transfer a file from +old_name+ to +new_name+, using a +method+
|
199
|
-
# that can be one of :symlink for File#symlink, :hardlink for File#link, or
|
200
|
-
# :copy for FileUtils#cp_r.
|
201
|
-
def self.generic_transfer(old_name, new_name, method)
|
202
|
-
return nil if exist? new_name
|
203
|
-
case method
|
204
|
-
when :symlink
|
205
|
-
File.symlink(old_name, new_name)
|
206
|
-
when :hardlink
|
207
|
-
File.link(old_name, new_name)
|
208
|
-
when :copy
|
209
|
-
FileUtils.cp_r(old_name, new_name)
|
210
|
-
else
|
211
|
-
raise "Unknown transfer method: #{method}."
|
212
|
-
end
|
213
|
-
end
|
214
|
-
end
|
215
|
-
|
216
|
-
##
|
217
|
-
# MiGA extensions to the String class.
|
218
|
-
class String
|
219
|
-
|
220
|
-
##
|
221
|
-
# Replace any character not allowed in a MiGA name for underscore (_). This
|
222
|
-
# results in a MiGA-compliant name EXCEPT for empty strings, that results in
|
223
|
-
# empty strings.
|
224
|
-
def miga_name
|
225
|
-
gsub(/[^A-Za-z0-9_]/, '_')
|
226
|
-
end
|
227
|
-
|
228
|
-
##
|
229
|
-
# Is the string a MiGA-compliant name?
|
230
|
-
def miga_name?
|
231
|
-
!(self !~ /^[A-Za-z0-9_]+$/)
|
232
|
-
end
|
233
|
-
|
234
|
-
##
|
235
|
-
# Replace underscores by spaces or dots (depending on context).
|
236
|
-
def unmiga_name
|
237
|
-
gsub(/_(str|sp|subsp|pv)__/,"_\\1._").tr('_', ' ')
|
238
|
-
end
|
239
|
-
|
240
|
-
##
|
241
|
-
# Wraps the string with fixed Integer +width+.
|
242
|
-
def wrap_width(width)
|
243
|
-
gsub(/([^\n\r]{1,#{width}})/,"\\1\n")
|
244
|
-
end
|
245
39
|
end
|
246
40
|
|
@@ -0,0 +1,49 @@
|
|
1
|
+
|
2
|
+
class MiGA::MiGA

  # Class-level debugging switches and reporter.
  class << self

    ##
    # Enables the reporting of debug messages. Returns +true+.
    def DEBUG_ON
      @@DEBUG = true
    end

    ##
    # Disables the reporting of debug messages. Returns +false+.
    def DEBUG_OFF
      @@DEBUG = false
    end

    ##
    # Enables the debug trace and, with it, the debug messages.
    def DEBUG_TRACE_ON
      @@DEBUG_TRACE = true
      self.DEBUG_ON
    end

    ##
    # Disables the debug trace. The reporting of debug messages is left
    # untouched.
    def DEBUG_TRACE_OFF
      @@DEBUG_TRACE = false
    end

    ##
    # Prints +args+ to the standard error if debugging is on. If the debug
    # trace is on, the call stack is printed afterwards, one frame per line.
    def DEBUG(*args)
      $stderr.puts(*args) if @@DEBUG
      return unless @@DEBUG_TRACE
      $stderr.puts caller.map{ |frame| frame.gsub(/^/, ' ') }.join("\n")
    end

  end

end
|
37
|
+
|
38
|
+
module MiGA::Common

  ##
  # Debugging switch: when true, debug messages are reported. It starts
  # turned off.
  @@DEBUG = false

  ##
  # Debug trace switch: when true, the call stack is reported together with
  # the debugging information. It starts turned off.
  @@DEBUG_TRACE = false

end
|
49
|
+
|
@@ -0,0 +1,135 @@
|
|
1
|
+
|
2
|
+
require 'tempfile'
|
3
|
+
require 'zlib'
|
4
|
+
|
5
|
+
module MiGA::Common::Format

  ##
  # Tabulates +values+, an Array of Arrays, all with the same number of
  # entries as +header+. Cells that are +nil+ are displayed as '?'. The first
  # column is right-justified and all other columns are left-justified.
  # Returns an Array of String, one per line: the header, a ruler of dashes,
  # and one line per row in +values+.
  def tabulate(header, values)
    fields = [header.map(&:to_s)]
    fields << fields.first.map{ |h| h.gsub(/\S/, '-') }
    fields += values.map{ |row| row.map{ |cell| cell.nil? ? '?' : cell.to_s } }
    # Width of each column: the longest cell across header, ruler, and values
    clen = fields.map{ |row| row.map(&:length) }.transpose.map(&:max)
    fields.map do |row|
      (0 .. clen.size-1).map do |col_n|
        col_n==0 ? row[col_n].rjust(clen[col_n]) : row[col_n].ljust(clen[col_n])
      end.join(' ')
    end
  end

  ##
  # Cleans a FastA file in place. In the identifier of each entry, any
  # character other than letters, digits, underscores, pipes, or dots is
  # replaced by an underscore. In the sequences, any character other than
  # letters, dots, or dashes is removed, and the sequence is wrapped at 80
  # columns. Files with names ending in +.gz+ are read and written as gzip.
  #
  # The cleaned copy is first written to a temporary file that only replaces
  # +file+ once it's complete. The temporary file is always removed, and any
  # exception raised while reading or writing is propagated to the caller.
  def clean_fasta_file(file)
    # Anchored at the end, so '.gz' elsewhere in the path (e.g., in the name
    # of a directory) doesn't cause a plain file to be opened as gzip
    gzipped = !(file !~ /\.gz\z/)
    # The Tempfile object must remain referenced until the copy is done: the
    # temporary file is unlinked when the object is garbage-collected
    tmp = Tempfile.new(gzipped ? 'MiGA.gz' : 'MiGA')
    begin
      tmp.close
      in_fh = gzipped ? Zlib::GzipReader.open(file) : File.open(file, 'r')
      begin
        out_fh = gzipped ?
          Zlib::GzipWriter.open(tmp.path) : File.open(tmp.path, 'w')
        begin
          buffer = ''
          in_fh.each_line do |ln|
            ln.chomp!
            if ln =~ /^>\s*(\S+)(.*)/
              (id, df) = [$1, $2]
              # Flush the sequence of the previous entry before the new header
              out_fh.print buffer.wrap_width(80)
              buffer = ''
              out_fh.puts ">#{id.gsub(/[^A-Za-z0-9_\|\.]/, "_")}#{df}"
            else
              buffer << ln.gsub(/[^A-Za-z\.\-]/, '')
            end
          end
          out_fh.print buffer.wrap_width(80)
        ensure
          out_fh.close
        end
      ensure
        in_fh.close
      end
      FileUtils.cp(tmp.path, file)
    ensure
      tmp.close! # <- Closes (if still open) and unlinks the temporary file
    end
  end

  ##
  # Calculates the average and standard deviation of the sequence lengths in
  # a FastA or FastQ file (supports gzipped files, identified by names ending
  # in +.gz+). The +format+ must be a Symbol, one of +:fasta+ or +:fastq+.
  # Additional estimations can be controlled via the +opts+ Hash. Supported
  # options include:
  # - +:n50+: If true, it also returns the N50 and the median (in bp).
  # - +:gc+: If true, it also returns the G+C content (in %).
  #
  # Returns a Hash with keys +:n+ (number of sequences), +:tot+ (total
  # length), +:avg+, +:var+, and +:sd+, plus +:gc+, +:n50+, and +:med+ if
  # requested. For files without sequences +:n+ and +:tot+ are zero, the
  # averaged values are NaN, and +:n50+ and +:med+ are not set.
  def seqs_length(file, format, opts={})
    l = []
    gc = 0
    i = 0 # <- Zlib::GzipReader doesn't set $.
    fh = (file =~ /\.gz\z/) ? Zlib::GzipReader.open(file) : File.open(file, 'r')
    begin
      fh.each_line do |ln|
        i += 1
        if (format==:fasta and ln =~ /^>/) or (format==:fastq and (i % 4)==1)
          l << 0
        elsif format==:fasta or (i % 4)==2
          # FastA lines before the first header belong to no sequence
          next if format==:fasta and l.empty?
          l[l.size-1] += ln.chomp.size
          gc += ln.scan(/[GCgc]/).count if opts[:gc]
        end
      end
    ensure
      fh.close
    end

    o = { n: l.size, tot: l.inject(0, :+) }
    o[:avg] = o[:tot].to_f/l.size
    o[:var] = l.map{ |a| a ** 2 }.inject(0, :+).to_f/l.size - o[:avg]**2
    # Floating-point cancellation can yield a tiny negative variance, for
    # which Math.sqrt would raise
    o[:var] = 0.0 if o[:var] < 0
    o[:sd] = Math.sqrt o[:var]
    o[:gc] = 100.0*gc/o[:tot] if opts[:gc]
    if opts[:n50] && !l.empty?
      l.sort!
      pos = 0
      l.each do |a|
        pos += a
        o[:n50] = a
        # Traversing from the shortest sequence, the N50 is the first length
        # at which strictly more than half of the total has been seen, so
        # that sequences of that length or longer cover at least half of it
        break if 2*pos > o[:tot]
      end
      o[:med] = o[:n].even? ?
        0.5*l[o[:n]/2-1,2].inject(:+) : l[(o[:n]-1)/2]
    end
    o
  end
end
|
104
|
+
|
105
|
+
##
|
106
|
+
# MiGA extensions to the String class.
|
107
|
+
class String

  ##
  # Replaces any character not allowed in a MiGA name with an underscore (_).
  # The result is a MiGA-compliant name, EXCEPT for empty strings, which
  # result in empty strings.
  def miga_name
    gsub(/[^A-Za-z0-9_]/, '_')
  end

  ##
  # Is the string a MiGA-compliant name? That is, is it a non-empty string
  # containing only letters, digits, and underscores? Returns a Boolean.
  def miga_name?
    # \A and \z anchor the whole string. The line anchors ^ and $ would accept
    # any multi-line string with at least one compliant line.
    !(self !~ /\A[A-Za-z0-9_]+\z/)
  end

  ##
  # Replaces underscores by spaces or dots (depending on context): a double
  # underscore after the abbreviations str, sp, subsp, or pv becomes a dot
  # followed by a space, and any other underscore becomes a space.
  def unmiga_name
    gsub(/_(str|sp|subsp|pv)__/,"_\\1._").tr('_', ' ')
  end

  ##
  # Wraps the string with fixed Integer +width+, appending a line break after
  # each run of up to +width+ characters that aren't line breaks.
  def wrap_width(width)
    gsub(/([^\n\r]{1,#{width}})/,"\\1\n")
  end
end
|
135
|
+
|