bio-ngs 0.3.2.alpha.01
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +39 -0
- data/Gemfile.lock +81 -0
- data/LICENSE.txt +28 -0
- data/README.rdoc +240 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/bin/biongs +35 -0
- data/bio-ngs.gemspec +215 -0
- data/ext/mkrf_conf.rb +87 -0
- data/lib/bio-ngs.rb +54 -0
- data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
- data/lib/bio/appl/ngs/blast.rb +36 -0
- data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
- data/lib/bio/appl/ngs/cufflinks.rb +489 -0
- data/lib/bio/appl/ngs/fastx.rb +170 -0
- data/lib/bio/appl/ngs/samtools.rb +118 -0
- data/lib/bio/appl/ngs/sff_extract.rb +23 -0
- data/lib/bio/appl/ngs/tophat.rb +158 -0
- data/lib/bio/ngs/converter.rb +100 -0
- data/lib/bio/ngs/core_ext.rb +12 -0
- data/lib/bio/ngs/db.rb +66 -0
- data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
- data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
- data/lib/bio/ngs/db/models.rb +1 -0
- data/lib/bio/ngs/db/models/homology.rb +8 -0
- data/lib/bio/ngs/db/models/ontology.rb +16 -0
- data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
- data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
- data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
- data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
- data/lib/bio/ngs/ext/versions.yaml +73 -0
- data/lib/bio/ngs/graphics.rb +189 -0
- data/lib/bio/ngs/homology.rb +102 -0
- data/lib/bio/ngs/ontology.rb +103 -0
- data/lib/bio/ngs/quality.rb +64 -0
- data/lib/bio/ngs/record.rb +50 -0
- data/lib/bio/ngs/task.rb +46 -0
- data/lib/bio/ngs/utils.rb +176 -0
- data/lib/development_tasks.rb +34 -0
- data/lib/enumerable.rb +37 -0
- data/lib/tasks/bwa.thor +126 -0
- data/lib/tasks/convert.thor +454 -0
- data/lib/tasks/history.thor +51 -0
- data/lib/tasks/homology.thor +121 -0
- data/lib/tasks/ontology.thor +93 -0
- data/lib/tasks/project.thor +51 -0
- data/lib/tasks/quality.thor +142 -0
- data/lib/tasks/rna.thor +126 -0
- data/lib/tasks/sff_extract.thor +9 -0
- data/lib/templates/README.tt +43 -0
- data/lib/templates/db.tt +6 -0
- data/lib/wrapper.rb +225 -0
- data/spec/converter_qseq_spec.rb +56 -0
- data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
- data/spec/quality_spec.rb +40 -0
- data/spec/sff_extract_spec.rb +98 -0
- data/spec/spec_helper.rb +55 -0
- data/spec/tophat_spec.rb +99 -0
- data/spec/utils_spec.rb +22 -0
- data/test/conf/test_db.yml +4 -0
- data/test/data/blastoutput.xml +69 -0
- data/test/data/gene-GO.json +1 -0
- data/test/data/goa_uniprot +27 -0
- data/test/data/goslim_goa.obo +1763 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-ngs.rb +17 -0
- data/test/test_db.rb +21 -0
- data/test/test_homology.rb +102 -0
- data/test/test_ngs.rb +21 -0
- data/test/test_ontology.rb +74 -0
- data/test/test_utils.rb +29 -0
- metadata +460 -0
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
#
|
|
2
|
+
#
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2011
|
|
5
|
+
# Francesco Strozzi <francesco.strozzi@gmail.com>
|
|
6
|
+
# Raoul J.P. Bonnal <r@bioruby.org>
|
|
7
|
+
# License:: The Ruby License
|
|
8
|
+
#
|
|
9
|
+
#
|
|
10
|
+
|
|
11
|
+
require 'ostruct'
|
|
12
|
+
|
|
13
|
+
module Bio
  module Ngs
    # Computes per-position quality statistics over the reads of a FASTQ file.
    #
    # The file is opened through Bio::FlatFile.auto and every read is asked for
    # its quality_scores after being told which FASTQ variant to use.
    class FastQuality

      require 'matrix'

      # FASTQ variant passed to every read before its scores are decoded.
      attr_accessor :format

      # As reported in http://dx.doi.org/10.1093/nar/gkp1137 the default is
      # fastq_sanger; it is a better policy to ALWAYS specify the format.
      #
      # file   - whatever Bio::FlatFile.auto accepts (typically a path).
      # format - FASTQ variant name, default :fastq_sanger.
      #
      # Raises ArgumentError when the detected record class is not Bio::Fastq.
      def initialize(file, format=:fastq_sanger)
        @file = file
        @stream = Bio::FlatFile.auto(file)
        @format = format
        @consumed = false
        raise ArgumentError, "the method only accepts FASTQ" unless @stream.dbclass == Bio::Fastq
      end

      # Average quality score at each read position.
      #
      # Returns an Array of Floats, or an empty Array when the file holds no
      # reads. Scores are summed as Vectors, so reads of different lengths
      # make Vector addition raise.
      def quality_profile
        totals = nil
        tot_reads = 0
        each_read do |read|
          scores = Vector[*read.quality_scores]
          totals = totals ? totals + scores : scores
          tot_reads += 1
        end
        return [] if totals.nil?
        (totals / tot_reads.to_f).to_a
      end

      # Counts, for each read position, how many reads have a quality score
      # equal to 2 there (presumably the Illumina "B" quality indicator).
      #
      # Returns an OpenStruct with n_reads (number of reads scanned) and
      # b_profile (Array of [position, count] pairs sorted by position; only
      # positions with at least one hit are present).
      def track_b_count
        quals = Hash.new(0) # a new element is initialized at zero
        reads_count = 0
        each_read do |read|
          reads_count += 1
          read.quality_scores.each_with_index do |score, position|
            quals[position] += 1 if score == 2
          end
        end
        OpenStruct.new(:n_reads=>reads_count, :b_profile=>quals.sort)
      end

      private

      # Yields every read with its format already set. The stream opened by
      # the constructor serves the first pass; later passes reopen the file so
      # each public method always sees all the reads.
      def each_read
        @stream = Bio::FlatFile.auto(@file) if @consumed
        @consumed = true
        @stream.each do |read|
          read.format = format
          yield read
        end
      end
    end #FastQuality
  end #Ngs
end #Bio
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
#
|
|
2
|
+
#
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2011
|
|
5
|
+
# Francesco Strozzi <francesco.strozzi@gmail.com>
|
|
6
|
+
# License:: The Ruby License
|
|
7
|
+
#
|
|
8
|
+
#
|
|
9
|
+
|
|
10
|
+
require 'yaml'

module Bio
  module Ngs
    # Append-only history of executed tasks, stored as a stream of YAML
    # documents (one {:name, :args} hash per task) in a single file.
    #
    # The file is opened only for the duration of each operation, so an
    # instance can be used for any number of save/load calls.
    class Record

      # file - path of the history file; it is created when missing.
      def initialize(file)
        @filename = file
        File.open(@filename, "a") {}
      end

      # Appends the task to the history unless an identical entry is already
      # stored or the task name matches /history/.
      #
      # name - task name.
      # args - any further arguments, stored as given under :args.
      #
      # Returns nil.
      def save(name,*args)
        params = {:name => name, :args => args }
        return if params[:name] =~ /history/ || is_saved?(params)
        File.open(@filename, "a") { |file| file.write(params.to_yaml) }
        nil
      end

      # Returns every stored task as a Hash whose :args Array is flattened.
      def load
        documents.each { |ydoc| ydoc[:args].flatten! }
      end

      # Deletes the history file.
      def clear
        File.delete(@filename)
      end

      private

      # True when an entry equal to params is already in the history file.
      def is_saved?(params)
        documents.include?(params)
      end

      # All YAML documents currently in the history file; empty when the file
      # does not exist or has no content.
      def documents
        return [] unless File.exist?(@filename)
        YAML.load_stream(File.read(@filename))
      end

    end
  end
end
|
data/lib/bio/ngs/task.rb
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
|
|
2
|
+
#
|
|
3
|
+
#
|
|
4
|
+
#
|
|
5
|
+
# Copyright:: Copyright (C) 2011
|
|
6
|
+
# Francesco Strozzi <francesco.strozzi@gmail.com>
|
|
7
|
+
# License:: The Ruby License
|
|
8
|
+
#
|
|
9
|
+
#
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# opening class Thor::Task to add a save_history method
|
|
13
|
+
class Thor
  class Task

    # Runs this task on +instance+ with +args+ and then records the
    # invocation in the BioNGS history, except when the instance is a
    # Bio::Ngs::Runner or the History task class itself.
    #
    # ArgumentError and NoMethodError raised along the way are handed to the
    # instance's class handlers when the matching handle_*? predicate says
    # so; otherwise they are re-raised.
    def run(instance, args=[])
      if public_method?(instance)
        instance.send(name, *args)
      else
        instance.class.handle_no_task_error(name)
      end
      skip_history = instance.class == Bio::Ngs::Runner || instance.class == Thor::Sandbox::History
      save_history(instance, args) unless skip_history
    rescue ArgumentError => e
      if handle_argument_error?(instance, e, caller)
        instance.class.handle_argument_error(self, e)
      else
        raise e
      end
    rescue NoMethodError => e
      if handle_no_method_error?(instance, e, caller)
        instance.class.handle_no_task_error(name)
      else
        raise e
      end
    end

    private

    # Builds the "namespace:task" name from the first entry of the
    # instance's @_invocations and stores it, with the call arguments and
    # the instance options, in the history file.
    def save_history(instance, args)
      first_invocation = instance.instance_variable_get("@_invocations").to_a.first
      namespace = first_invocation[0].to_s.gsub(/Thor::Sandbox::/, "").gsub(/::/, ":").downcase
      task_name = first_invocation[1].first
      recorder = Bio::Ngs::Record.new(Bio::Ngs::HISTORY_FILE)
      recorder.save(namespace + ":" + task_name, [args, instance.options])
    end

  end

end
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
#
|
|
2
|
+
#
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2011
|
|
5
|
+
# Francesco Strozzi <francesco.strozzi@gmail.com>
|
|
6
|
+
# Raoul J.P. Bonnal <r@bioruby.org>
|
|
7
|
+
# License:: The Ruby License
|
|
8
|
+
#
|
|
9
|
+
#
|
|
10
|
+
require 'find'
|
|
11
|
+
|
|
12
|
+
module Bio
  module Ngs
    # Class-level helpers used to locate the binaries bundled with the gem
    # and to download, unpack, compile and install external tools.
    class Utils
      # Raised when a requested binary can not be located.
      class BinaryNotFound < StandardError
        # opts - either an options Hash (:skip_task, optional :message) or a
        #        plain message, so that `raise BinaryNotFound, "msg"` works too.
        def initialize(opts={})
          if opts.is_a?(Hash)
            @skip_task = opts[:skip_task]
            opts[:message] ? super(opts[:message]) : super()
          else
            @skip_task = nil
            super(opts)
          end
        end

        # Value of the :skip_task option given at construction.
        def skip_task?
          @skip_task
        end
      end

      class << self
        # Full path of the binary called +name+: the first one bundled with
        # the gem, otherwise whatever `which` reports. When nothing is found
        # a warning is printed and nil is returned.
        def binary(name)
          bundled = find_binary_files(name)
          return bundled.first unless bundled.empty?

          os_binary = Bio::Command.query_command ["which", name]
          return os_binary.tr("\n","") unless os_binary == ""

          warn "No binary found with this name: #{name}"
          nil
        end #binary

        # "osx" or "linux" according to the host operating system; nil for
        # any other non-Windows system.
        #
        # Raises NotImplementedError on Windows.
        def os_type
          require 'rbconfig'
          # RbConfig replaces the Config constant removed from current Rubies.
          case RbConfig::CONFIG['host_os']
          when /darwin/ then "osx"
          when /linux/ then "linux"
          when /mswin|mingw/ then raise NotImplementedError, "This plugin does not run on Windows"
          end
        end

        # Removes from the file name everything from its first dot on, then
        # appends "_<tag>.<extension>". Only the last path component is
        # touched, so dots in directory names are left alone.
        def tag_filename(filename, tag, extension)
          last_separator = filename.rindex(File::SEPARATOR)
          directory = last_separator ? filename[0..last_separator] : ""
          basename = last_separator ? filename[(last_separator + 1)..-1] : filename
          "#{directory}#{basename.sub(/\..*/, "")}_#{tag}.#{extension}"
        end #tag_filename

        # Appends the gem's bundled binary directories (common, OS specific
        # and the OS specific first-level sub-directories) to ENV["PATH"].
        # Directories already listed are not added again.
        #
        # Returns the resulting PATH string.
        def extend_system_path
          path = File.expand_path(File.dirname(__FILE__))
          common_dir = File.join(path,"ext","bin","common")
          os_dir = File.join(path,"ext","bin",self.os_type)
          sub_dirs = Dir[os_dir+"/*"].select { |entry| File.directory?(entry) }
          listed = ENV["PATH"].to_s.split(File::PATH_SEPARATOR)
          additions = [common_dir, os_dir, *sub_dirs] - listed
          ENV["PATH"] = [ENV["PATH"], *additions].compact.join(File::PATH_SEPARATOR)
        end #extend_system_path

        # Downloads opts[:url] showing a progress bar.
        #
        # opts - :url      remote location (required)
        #        :mode     extra open mode flag, e.g. "b" (default "")
        #        :filename destination; defaults to the last URL segment
        #
        # Raises ArgumentError when no :url is given.
        def download_with_progress(opts = {:url => nil, :mode => "", :filename => nil})
          require "open-uri"
          require "progressbar"
          url = opts[:url]
          raise ArgumentError, "download_with_progress requires a :url" if url.nil?
          puts "Downloading from #{url}"
          filename = opts[:filename] || url.split('/')[-1]
          mode = opts[:mode] || ""
          pbar = nil
          # Kernel#open no longer opens URLs on current Rubies; URI.open does.
          # Older Rubies only offer the Kernel form patched by open-uri.
          opener = URI.respond_to?(:open) ? URI : Kernel
          opener.open(url,"r"+mode,
                      :content_length_proc => lambda {|total|
                        if total && 0 < total
                          pbar = ProgressBar.new('', total)
                          pbar.file_transfer_mode
                        end
                      },
                      :progress_proc => lambda {|size|
                        pbar.set size if pbar
                      }) do |remote|
            File.open(filename,"w"+mode) {|file| file.write remote.read(16384) until remote.eof?}
          end
          puts "\nDone"
        end

        # Expands the gzip file +file_in+ next to it, dropping the trailing
        # ".gz" from the name. The data is streamed, never loaded whole.
        #
        # Raises ArgumentError when the name does not end in ".gz": the output
        # would then be the input itself and get truncated.
        def uncompress_gz_file(file_in)
          require 'zlib'
          file_out = file_in.sub(/\.gz\z/,"")
          raise ArgumentError, "not a .gz file name: #{file_in}" if file_out == file_in
          puts "Uncompressing file #{file_in}"
          Zlib::GzipReader.open(file_in) do |gz|
            File.open(file_out,"wb") { |file| IO.copy_stream(gz, file) }
          end
          puts "Done\n"
        end

        # Downloads +url+ into +fileout+ (binary mode) and gunzips it.
        def download_and_uncompress(url,fileout)
          self.download_with_progress(:url => url,:mode => "b",:filename => fileout)
          self.uncompress_gz_file(fileout)
        end

        # Shell command that unpacks an archive with the given suffix
        # ("tar.bz2", "tar.gz" or "zip").
        #
        # Raises RuntimeError for any other suffix.
        def uncompress_command(suffix)
          case suffix
          when "tar.bz2" then "tar xvfj"
          when "tar.gz" then "tar xvfz"
          when "zip" then "unzip"
          else
            raise "Unknown suffix: #{suffix}"
          end
        end #uncompress_command

        # Unpacks "<basename>.<suffix>" described by +tool_record+ in the
        # current directory and returns the name of the directory it created:
        # the record's basename, or "<tool_name>-<version>".
        #
        # Raises RuntimeError when neither directory exists afterwards.
        def uncompress_any(tool_name, tool_record)
          tool_file_name = "#{tool_record["basename"]}.#{tool_record["suffix"]}"
          tool_dir_name = tool_record["basename"]
          uncompress = uncompress_command(tool_record["suffix"])
          STDERR.puts "#{uncompress} #{tool_file_name}"
          # argument list form: the archive name never goes through a shell
          system(*uncompress.split, tool_file_name)
          versioned_dir_name = "#{tool_name}-#{tool_record['version']}"
          if Dir.exist?(tool_dir_name)
            tool_dir_name
          elsif Dir.exist?(versioned_dir_name)
            versioned_dir_name
          else
            raise "BioNGS can not identify the uncompressed destination folder"
          end
        end #uncompress

        # Unpacks the tool sources and runs configure, make and make install
        # inside them, installing under +path_binary+. Each step runs only if
        # the previous one succeeded.
        #
        # Returns the result of the last step that ran (true on success).
        def compile_source(tool_name, tool_record, path_external, path_binary)
          puts "Uncompressing #{tool_name}..."
          tool_dir_name = uncompress_any(tool_name, tool_record)
          puts "Compiling #{tool_name}..."
          Dir.chdir(tool_dir_name) do
            configure_env = {"PKG_CONFIG_PATH" => "#{path_external}/bin/common/lib/pkgconfig"}
            system(configure_env, "./configure", "--prefix=#{path_binary}", "--bindir=#{path_binary}") &&
              system("make") &&
              system("make", "install")
          end
        end #uncompress_compile

        # Unpacks a pre-built tool and copies its content, preserving file
        # attributes, into "<path_binary>/<tool_name>", replacing any previous
        # installation.
        def install_binary(tool_name, tool_record, path_external, path_binary)
          require 'fileutils'
          puts "Uncompressing #{tool_name}"
          uncompressed_tool_dir_name = uncompress_any(tool_name, tool_record)
          puts "Installing #{tool_name}"
          path_binary_tool = File.join(path_binary,tool_name)
          FileUtils.remove_dir(path_binary_tool) if Dir.exist?(path_binary_tool)
          FileUtils.mkdir_p(path_binary_tool)
          FileUtils.cp_r "#{uncompressed_tool_dir_name}/.", path_binary_tool, :preserve=>true
        end #uncompress install binary

        # Searches the gem's own directories for installed binaries named
        # binary_name. The search is recursive in the common and OS specific
        # directories; directories that do not exist are skipped.
        # Returns an Array of full paths, empty when nothing is found. It is
        # up to the caller to choose which one to use; taking the first gives
        # a behaviour similar to the search PATH.
        def find_binary_files(binary_name)
          path = File.expand_path(File.dirname(__FILE__))
          os_name = self.os_type
          roots = [File.join(path,"ext","bin","common")]
          roots << File.join(path,"ext","bin",os_name) if os_name
          roots = roots.select { |dir| File.directory?(dir) }
          return [] if roots.empty?
          Find.find(*roots).select do |candidate|
            File.file?(candidate) && File.basename(candidate) == binary_name
          end
        end #find_binary_file

      end #eigenclass

    end # end Utils
  end # end NGS
end # end Bio
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
class BioNgs
  # Rake tasks inspired by the Jeweler approach.
  # They wrap the tasks used during gem installation, for a developer who
  # wants a ready to go environment, with the bioinformatics software
  # supported by biongs, inside the cloned directory.
  class BioNgsTasks < ::Rake::TaskLib
    attr_writer :biongs

    # Yields the new task library for configuration, when a block is given,
    # and then defines the tasks.
    def initialize
      yield self if block_given?

      define
    end

    # The configured biongs object; this task library itself by default.
    def biongs
      @biongs ||= self
    end

    # Defines devenv:bio_tools, which generates and runs the Rakefile of the
    # ext directory and removes it afterwards, and the devenv shortcut task.
    def define
      namespace :devenv do
        desc "install external bioinformatics tools, for development, locally -in this directory, cloned from github?-"
        task :bio_tools do
          Dir.chdir("ext") do
            load 'mkrf_conf.rb'
            %x(rake -f Rakefile)
            FileUtils.remove("Rakefile")
          end
        end
      end

      task :devenv => 'devenv:bio_tools'
    end
  end
end
|
data/lib/enumerable.rb
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
#
|
|
2
|
+
# enumerable.rb - Add Math functionalities to enumerable
|
|
3
|
+
#
|
|
4
|
+
# ToDo: refactor, it's not the right approach: used in Bio::Ngs::Cufflinks::Diff.process_de
|
|
5
|
+
#
|
|
6
|
+
# Copyright:: Copyright (C) 2011
|
|
7
|
+
# Raoul Bonnal <r@bioruby.org>,
|
|
8
|
+
# Francesco Strozzi <francesco.strozzi@gmail.com>
|
|
9
|
+
# License:: The Ruby License
|
|
10
|
+
#
|
|
11
|
+
#
|
|
12
|
+
|
|
13
|
+
module Enumerable

  # sum of a collection of numbers
  #
  # Defined only where Ruby does not already provide Enumerable#sum, so the
  # native method, with its optional initial value and block, is never
  # replaced.
  unless method_defined?(:sum)
    def sum
      inject(0) { |acc, i| acc + i }
    end
  end

  # average of a collection of numbers, as a Float (NaN when empty)
  #
  # Uses count so that it works with any Enumerable, not only with those
  # that respond to length.
  def average
    sum / count.to_f
  end

  # variance of a collection of numbers
  #
  # NOTE(review): the sum of squared deviations is divided by n, not n - 1,
  # i.e. this is the population variance despite the method name - confirm
  # what the callers expect before changing it.
  def sample_variance
    avg = average
    squared_deviations = inject(0) { |acc, i| acc + (i - avg)**2 }
    1 / count.to_f * squared_deviations
  end

  # standard deviation of a collection of numbers
  def standard_deviation
    Math.sqrt(sample_variance)
  end

end # module Enumerable
|
data/lib/tasks/bwa.thor
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
#
|
|
2
|
+
#
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2011
|
|
5
|
+
# Francesco Strozzi <francesco.strozzi@gmail.com>
|
|
6
|
+
# License:: The Ruby License
|
|
7
|
+
#
|
|
8
|
+
#
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Thor tasks wrapping the Bio::BWA bindings: index creation, alignment and
# conversion of SAI alignments to SAM.
class Bwa < Thor

  # Tasks building the BWT index of a FASTA database.
  class Index < Bwa

    desc "short [FASTA]", "Make the BWT index for a SHORT FASTA database"
    method_option :colorspace, type: :boolean, desc: "Index in Colorspace"
    method_option :prefix, type: :string, desc: "Database index name"
    # The index is named after the FASTA file unless a prefix is given.
    def short(fasta)
      index_name = options[:prefix] || fasta
      Bio::BWA.make_index(file_in: fasta, c: options[:colorspace], prefix: index_name)
    end

    desc "long [FASTA]", "Make the BWT index for a LONG FASTA database"
    method_option :colorspace, type: :boolean, desc: "Index in Colorspace"
    method_option :prefix, type: :string, desc: "Database index name"
    # Same as short, asking for the "bwtsw" algorithm.
    def long(fasta)
      index_name = options[:prefix] || fasta
      Bio::BWA.make_index(file_in: fasta, a: "bwtsw", c: options[:colorspace], prefix: index_name)
    end

  end


  # Tasks aligning query sequences against an indexed database.
  class Aln < Bwa

    desc "short [FASTQ]", "Run the aligment for SHORT query sequences"
    method_option :file_out, type: :string, desc: "file to write output to instead of stdout", required: true
    method_option :prefix, type: :string, desc: "Database prefix", required: true
    method_option :n, type: :numeric, desc: "max #diff (int) or missing prob under 0.02 err rate (float) [0.04]"
    method_option :o, type: :numeric, desc: "maximum number or fraction of gap opens [1]"
    method_option :e, type: :numeric, desc: "maximum number of gap extensions, -1 for disabling long gaps [-1]"
    method_option :i, type: :numeric, desc: "do not put an indel within INT bp towards the ends [5]"
    method_option :d, type: :numeric, desc: "maximum occurrences for extending a long deletion [10]"
    method_option :l, type: :numeric, desc: "seed length [32]"
    method_option :k, type: :numeric, desc: "maximum differences in the seed [2]"
    method_option :m, type: :numeric, desc: "maximum entries in the queue [2000000]"
    method_option :t, type: :numeric, desc: "number of threads [1]"
    method_option :M, type: :numeric, desc: "mismatch penalty [3]"
    method_option :O, type: :numeric, desc: "gap open penalty [11]"
    method_option :E, type: :numeric, desc: "gap extension penalty [4]"
    method_option :R, type: :numeric, desc: "stop searching when there are >INT equally best hits [30]"
    method_option :q, type: :numeric, desc: "quality threshold for read trimming down to 35bp [0]"
    method_option :B, type: :numeric, desc: "length of barcode"
    method_option :c, type: :boolean, desc: "input sequences are in the color space"
    method_option :L, type: :boolean, desc: "log-scaled gap penalty for long deletions"
    method_option :N, type: :boolean, desc: "non-iterative mode: search for all n-difference hits"
    method_option :I, type: :boolean, desc: "the input is in the Illumina 1.3+ FASTQ-like format"
    method_option :b, type: :boolean, desc: "the input read file is in the BAM format"
    method_option :single, type: :boolean, desc: "use single-end reads only (effective with -b)"
    method_option :first, type: :boolean, desc: "use the 1st read in a pair (effective with -b)"
    method_option :second, type: :boolean, desc: "use the 2nd read in a pair (effective with -b)"
    # Passes every command line option, plus the FASTQ file as :file_in.
    def short(fastq)
      alignment_options = options.dup
      alignment_options[:file_in] = fastq
      Bio::BWA.short_read_alignment(alignment_options.symbolize_keys)
    end

    desc "long [FASTQ]", "Run the aligment for LONG query sequences"
    method_option :file_out, type: :string, desc: "file to output results to instead of stdout", required: true
    method_option :prefix, type: :string, desc: "Database prefix", required: true
    method_option :a, type: :numeric, desc: "score for a match [1]"
    method_option :b, type: :numeric, desc: "mismatch penalty [3]"
    method_option :q, type: :numeric, desc: "gap open penalty [5]"
    method_option :r, type: :numeric, desc: "gap extension penalty [2]"
    method_option :t, type: :numeric, desc: "number of threads [1]"
    method_option :w, type: :numeric, desc: "band width [50]"
    method_option :m, type: :numeric, desc: "mask level [0.50]"
    method_option :T, type: :numeric, desc: "score threshold divided by a [30]"
    method_option :s, type: :numeric, desc: "maximum seeding interval size [3]"
    method_option :z, type: :numeric, desc: "Z-best [1]"
    method_option :N, type: :numeric, desc: "seeds to trigger reverse alignment [5]"
    method_option :c, type: :numeric, desc: "coefficient of length-threshold adjustment [5.5]"
    method_option :H, type: :boolean, desc: "in SAM output, use hard clipping rather than soft"
    # Passes every command line option, plus the FASTQ file as :file_in.
    def long(fastq)
      alignment_options = options.dup
      alignment_options[:file_in] = fastq
      Bio::BWA.long_read_alignment(alignment_options.symbolize_keys)
    end

  end


  # Tasks converting SAI alignment output into SAM.
  class Sam < Bwa

    desc "single [SAI]", "Convert SAI alignment output into SAM format (single end)"
    method_option :prefix, type: :string, required: true, desc: "Database prefix"
    method_option :fastq, type: :string, required: true, desc: "FastQ file"
    method_option :file_out, type: :string, required: true, desc: "File to save the output"
    method_options n: :numeric, r: :string
    # Passes every command line option, plus the SAI file as :sai.
    def single(sai)
      conversion_options = options.dup
      conversion_options[:sai] = sai
      Bio::BWA.sai_to_sam_single(conversion_options.symbolize_keys)
    end

    desc "paired", "Convert SAI alignment output into SAM format (paired ends)"
    method_option :prefix, type: :string, required: true, desc: "Database prefix"
    method_option :file_out, type: :string, required: true, desc: "File to save the output"
    method_option :sai, type: :array, required: true, desc: "The 2 SAI files"
    method_option :fastq, type: :array, required: true, desc: "The 2 Fasta/Q files"
    method_option :a, type: :numeric, desc: "maximum insert size [500]"
    method_option :o, type: :numeric, desc: "maximum occurrences for one end [100000]"
    method_option :n, type: :numeric, desc: "maximum hits to output for paired reads [3]"
    method_option :N, type: :numeric, desc: "maximum hits to output for discordant pairs [10]"
    method_option :c, type: :numeric, desc: "prior of chimeric rate (lower bound) [1.0e-05]"
    method_option :r, type: :string, desc: "read group header line such as `@RG\tID:foo\tSM:bar' [null]"
    method_option :P, type: :boolean, desc: "preload index into memory (for base-space reads only)"
    method_option :s, type: :boolean, desc: "disable Smith-Waterman for the unmapped mate"
    method_option :A, type: :boolean, desc: "disable insert size estimate (force -s)"
    # Every input comes from the command line options.
    def paired
      Bio::BWA.sai_to_sam_paired(options.dup.symbolize_keys)
    end

  end

end
|