assemblotron 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +39 -0
- data/Rakefile +8 -0
- data/lib/assemblotron.rb +210 -0
- data/lib/assemblotron/assemblers/constructors/abyss.rb +7 -0
- data/lib/assemblotron/assemblers/constructors/read_joiner.rb +7 -0
- data/lib/assemblotron/assemblers/constructors/sga.rb +7 -0
- data/lib/assemblotron/assemblers/constructors/soap_denovo_trans.rb +69 -0
- data/lib/assemblotron/assemblers/constructors/velvet_oases.rb +7 -0
- data/lib/assemblotron/assemblers/soap_denovo_trans.rb +120 -0
- data/lib/assemblotron/assemblers/soap_denovo_trans.yml +72 -0
- data/lib/assemblotron/objectives/assembly_score.rb +7 -0
- data/lib/assemblotron/sample.rb +150 -0
- data/lib/assemblotron/system.rb +39 -0
- data/lib/assemblotron/version.rb +12 -0
- data/test/data/subset.thousand_reads_l.fq +2000 -0
- data/test/data/subset.thousand_reads_r.fq +2000 -0
- data/test/data/thousand_reads_l.fq +4000 -0
- data/test/data/thousand_reads_r.fq +4000 -0
- data/test/helper.rb +16 -0
- data/test/test_installer.rb +27 -0
- data/test/test_sample.rb +28 -0
- data/test/test_soap_denovo_trans.rb +52 -0
- metadata +205 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 714db5f952949fb16921c707434b7ba07797c3aa
|
4
|
+
data.tar.gz: 407ce9019b37b5492a41d4b72e0101ea57f52c0e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e586502ee078aaee565064d21adc443ffff78bed298ed4a40b3b01208077129e738d3d126115597324551fe6767a99f70760e12fc4352214705373d5ac8d9150
|
7
|
+
data.tar.gz: b6e575f209ca04870fe1589dc3cdba1ce061664f859130889c5893c826560435dd7038c09e9d06272d14ea100ff73d220605b0cdccb8bcc55022cc048ce6b717
|
data/README.md
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
assemblotron
|
2
|
+
============
|
3
|
+
|
4
|
+
Automated optimal *de-novo* assembly.
|
5
|
+
|
6
|
+
Transcriptome assembly takes a *long* time and a *lot* of computational resources. The software is complex, and the best settings to use depend heavily on the organism being studied.
|
7
|
+
|
8
|
+
Assemblotron solves this problem by rapidly discovering the optimal settings for an assembler or assembly pipeline and then performing the best possible assembly using the tools available.
|
9
|
+
|
10
|
+
A typical Assemblotron run takes only 3-4 hours on 8 cores of a desktop PC with an i7 processor, and greatly improves the accuracy of expression quantification and gene reconstruction in de-novo transcriptome analysis (blog posts/paper with evidence to follow shortly).
|
11
|
+
|
12
|
+
## Explanation
|
13
|
+
|
14
|
+
Assemblotron takes a small subsample of the available reads and runs an assembly. The assembly is thoroughly analysed and scored using [transrate](https://github.com/Blahah/transrate). Then the optimisation system [biopsy](https://github.com/Blahah/biopsy) is used to select new assembler settings to test, and another assembly is performed. This process is repeated until an estimate for the best possible assembly is found.
|
15
|
+
|
16
|
+
Further documentation will be provided when the software enters beta.
|
17
|
+
|
18
|
+
## Development status
|
19
|
+
|
20
|
+
[![Gem Version](https://badge.fury.io/rb/assemblotron.png)][gem]
|
21
|
+
[![Build Status](https://secure.travis-ci.org/Blahah/assemblotron.png?branch=master)][travis]
|
22
|
+
[![Dependency Status](https://gemnasium.com/Blahah/assemblotron.png?travis)][gemnasium]
|
23
|
+
[![Code Climate](https://codeclimate.com/github/Blahah/assemblotron.png)][codeclimate]
|
24
|
+
[![Coverage Status](https://coveralls.io/repos/Blahah/assemblotron/badge.png?branch=master)][coveralls]
|
25
|
+
|
26
|
+
[gem]: https://badge.fury.io/rb/assemblotron
|
27
|
+
[travis]: https://travis-ci.org/Blahah/assemblotron
|
28
|
+
[gemnasium]: https://gemnasium.com/Blahah/assemblotron
|
29
|
+
[codeclimate]: https://codeclimate.com/github/Blahah/assemblotron
|
30
|
+
[coveralls]: https://coveralls.io/r/Blahah/assemblotron
|
31
|
+
|
32
|
+
This software is in pre-alpha development and is not yet ready for deployment.
|
33
|
+
Please don't report issues or request documentation until we are ready for beta release (see below for estimated timeframe).
|
34
|
+
|
35
|
+
## Expectations for imminent versions
|
36
|
+
|
37
|
+
* **v0.1.0**: allow optimisation of SoapDenovoTrans, VelvetOases (due 10th November 2013)
|
38
|
+
* **v0.2.0**: add Abyss, SGA and ReadJoiner
|
39
|
+
* **v0.3.0**: allow optimising pipelines
|
data/Rakefile
ADDED
data/lib/assemblotron.rb
ADDED
@@ -0,0 +1,210 @@
|
|
1
|
+
require 'biopsy'
|
2
|
+
require 'logger'
|
3
|
+
require 'transrate'
|
4
|
+
require 'assemblotron/version'
|
5
|
+
require 'assemblotron/sample'
|
6
|
+
require 'pp'
|
7
|
+
require 'json'
|
8
|
+
|
9
|
+
module Assemblotron
|
10
|
+
|
11
|
+
include Transrate
|
12
|
+
|
13
|
+
|
14
|
+
class Controller
|
15
|
+
|
16
|
+
attr_accessor :global_opts
|
17
|
+
attr_accessor :assembler_opts
|
18
|
+
|
19
|
+
# Return a new Assemblotron::Controller
|
20
|
+
def initialize
|
21
|
+
@log = Logger.new(STDOUT)
|
22
|
+
@log.level = Logger::INFO
|
23
|
+
self.load_config
|
24
|
+
self.init_settings
|
25
|
+
@assemblers = []
|
26
|
+
self.load_assemblers
|
27
|
+
end # initialize
|
28
|
+
|
29
|
+
def self.header
|
30
|
+
"Assemblotron v#{VERSION::STRING.dup}"
|
31
|
+
end
|
32
|
+
|
33
|
+
# Initialise the Biopsy settings
|
34
|
+
def init_settings
|
35
|
+
s = Biopsy::Settings.instance
|
36
|
+
s.set_defaults
|
37
|
+
libdir = File.dirname(__FILE__)
|
38
|
+
s.target_dir = [File.join(libdir, 'assemblotron/assemblers/')]
|
39
|
+
s.objectives_dir = [File.join(libdir, 'assemblotron/objectives/')]
|
40
|
+
@log.debug "initialised Biopsy settings"
|
41
|
+
end # init_settings
|
42
|
+
|
43
|
+
# Load global configuration from the config file at
|
44
|
+
# +~/.assemblotron+, if it exists.
|
45
|
+
def load_config
|
46
|
+
config_file = File.join(Dir.home, ".assemblotron")
|
47
|
+
if File.exists? config_file
|
48
|
+
@log.debug "config file found at #{config_file}"
|
49
|
+
config = YAML::load_file(config_file)
|
50
|
+
if config.nil?
|
51
|
+
@log.warn 'config file malformed or empty'
|
52
|
+
return
|
53
|
+
end
|
54
|
+
@config = config.deep_symbolize
|
55
|
+
end
|
56
|
+
end # parse_config
|
57
|
+
|
58
|
+
# Discover and load available assemblers.
|
59
|
+
#
|
60
|
+
# Loads all assemblers provided by the program, and
|
61
|
+
# then searches any directories listed in the config
|
62
|
+
# file (+~/.assemblotron+) setting +assembler_dirs+.
|
63
|
+
#
|
64
|
+
# Directories listed in +assembler_dirs+ must contain:
|
65
|
+
#
|
66
|
+
# +definitions+:: Directory with one +.yml+ definition per assembler.
|
67
|
+
# See the documentation for Definition.
|
68
|
+
# +constructors+:: Directory with one +.rb+ file per assembler.
|
69
|
+
# See the documentation for Constructor.
|
70
|
+
def load_assemblers
|
71
|
+
Biopsy::Settings.instance.target_dir.each do |dir|
|
72
|
+
Dir.chdir dir do
|
73
|
+
Dir['*.yml'].each do |file|
|
74
|
+
name = File.basename(file, '.yml')
|
75
|
+
target = Biopsy::Target.new
|
76
|
+
target.load_by_name name
|
77
|
+
@assemblers << target
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end # load_assemblers
|
82
|
+
|
83
|
+
# Return an array of the names of available assemblers
|
84
|
+
def assemblers
|
85
|
+
a = []
|
86
|
+
@assemblers.each do |t|
|
87
|
+
a << t.name
|
88
|
+
a << t.shortname if t.shortname
|
89
|
+
end
|
90
|
+
a
|
91
|
+
end # assemblers
|
92
|
+
|
93
|
+
def list_assemblers
|
94
|
+
puts Controller.header
|
95
|
+
puts <<-EOS
|
96
|
+
|
97
|
+
Available assemblers are listed below.
|
98
|
+
Shortnames are shown in brackets if available.
|
99
|
+
|
100
|
+
Usage:
|
101
|
+
atron [global options] <assembler> [assembler options]
|
102
|
+
|
103
|
+
Assemblers:
|
104
|
+
EOS
|
105
|
+
@assemblers.each do |a|
|
106
|
+
p = " - #{a.name}"
|
107
|
+
p += " (#{a.shortname})" if a.respond_to? :shortname
|
108
|
+
puts p
|
109
|
+
end
|
110
|
+
end # list_assemblers
|
111
|
+
|
112
|
+
def options_for_assembler assembler
|
113
|
+
a = self.get_assembler assembler
|
114
|
+
parser = Trollop::Parser.new do
|
115
|
+
banner <<-EOS
|
116
|
+
#{Controller.header}
|
117
|
+
|
118
|
+
Options for assembler #{assembler}
|
119
|
+
EOS
|
120
|
+
opt :reference, "Path to reference proteome file in FASTA format",
|
121
|
+
:type => String,
|
122
|
+
:required => true
|
123
|
+
a.options.each_pair do |param, opts|
|
124
|
+
opt param,
|
125
|
+
opts[:desc],
|
126
|
+
:type => Controller.class_from_type(opts[:type])
|
127
|
+
end
|
128
|
+
end
|
129
|
+
Trollop::with_standard_exception_handling parser do
|
130
|
+
raise Trollop::HelpNeeded if ARGV.empty? # show help screen
|
131
|
+
parser.parse ARGV
|
132
|
+
end
|
133
|
+
end # options_for_assembler
|
134
|
+
|
135
|
+
def get_assembler assembler
|
136
|
+
ret = @assemblers.find do |a|
|
137
|
+
a.name == assembler ||
|
138
|
+
a.shortname == assembler
|
139
|
+
end
|
140
|
+
raise "couldn't find assembler #{assembler}" if ret.nil?
|
141
|
+
ret
|
142
|
+
end
|
143
|
+
|
144
|
+
def self.class_from_type type
|
145
|
+
case type
|
146
|
+
when 'string'
|
147
|
+
String
|
148
|
+
when 'int'
|
149
|
+
Integer
|
150
|
+
when 'float'
|
151
|
+
Float
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
def subsample_input
|
156
|
+
l = @assembler_opts[:left]
|
157
|
+
r = @assembler_opts[:right]
|
158
|
+
size = @global_opts[:subsample_size]
|
159
|
+
|
160
|
+
s = Sample.new(l, r)
|
161
|
+
ls, rs = s.subsample size
|
162
|
+
|
163
|
+
@assembler_opts[:left_subset] = ls
|
164
|
+
@assembler_opts[:right_subset] = rs
|
165
|
+
end
|
166
|
+
|
167
|
+
def final_assembly assembler, result
|
168
|
+
Dir.mkdir('final_assembly')
|
169
|
+
Dir.chdir('final_assembly') do
|
170
|
+
assembler.run result
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
174
|
+
def run assembler
|
175
|
+
# subsampling
|
176
|
+
if @global_opts[:skip_subsample]
|
177
|
+
@assembler_opts[:left_subset] = assembler_opts[:left]
|
178
|
+
@assembler_opts[:right_subset] = assembler_opts[:right]
|
179
|
+
else
|
180
|
+
subsample_input
|
181
|
+
end
|
182
|
+
|
183
|
+
# load reference and create ublast DB
|
184
|
+
@assembler_opts[:reference] = Transrate::Assembly.new(@assembler_opts[:reference])
|
185
|
+
ra = Transrate::ReciprocalAnnotation.new(@assembler_opts[:reference], @assembler_opts[:reference])
|
186
|
+
ra.make_reference_db
|
187
|
+
|
188
|
+
# setup the assembler
|
189
|
+
a = self.get_assembler assembler
|
190
|
+
a.setup_optim(@global_opts, @assembler_opts)
|
191
|
+
|
192
|
+
# run the optimisation
|
193
|
+
e = Biopsy::Experiment.new(a, options: @assembler_opts, threads: @global_opts[:threads])
|
194
|
+
res = e.run
|
195
|
+
|
196
|
+
# write out the result
|
197
|
+
File.open(@global_opts[:output_parameters], 'wb') do |f|
|
198
|
+
f.write(JSON.pretty_generate(res))
|
199
|
+
end
|
200
|
+
|
201
|
+
# run the final assembly
|
202
|
+
a.setup_final(@global_opts, @assembler_opts)
|
203
|
+
unless @global_opts[:skip_final]
|
204
|
+
final_assembly a, res
|
205
|
+
end
|
206
|
+
end # run
|
207
|
+
|
208
|
+
end # Controller
|
209
|
+
|
210
|
+
end # Assemblotron
|
@@ -0,0 +1,69 @@
|
|
1
|
+
class SoapDenovoTrans
|
2
|
+
|
3
|
+
def initialize
|
4
|
+
@count = 0
|
5
|
+
end
|
6
|
+
|
7
|
+
def run params
|
8
|
+
self.setup_soap(params) if @count == 0
|
9
|
+
self.run_soap params
|
10
|
+
@count += 1
|
11
|
+
end
|
12
|
+
|
13
|
+
# soapdt.config file only generated on first run
|
14
|
+
def setup_soap params
|
15
|
+
# make config file
|
16
|
+
rf = params[:readformat] == 'fastq' ? 'q' : 'f'
|
17
|
+
File.open("soapdt.config", "w") do |conf|
|
18
|
+
conf.puts "max_rd_len=20000"
|
19
|
+
conf.puts "[LIB]"
|
20
|
+
conf.puts "avg_ins=#{params[:insertsize]}"
|
21
|
+
conf.puts "reverse_seq=0"
|
22
|
+
conf.puts "asm_flags=3"
|
23
|
+
conf.puts "rank=2"
|
24
|
+
conf.puts "#{rf}1=#{params[:l]}"
|
25
|
+
conf.puts "#{rf}2=#{params[:r]}"
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
def include_defaults params
|
30
|
+
defaults = {
|
31
|
+
:K => 23,
|
32
|
+
:p => 8,
|
33
|
+
:d => 0,
|
34
|
+
:e => 2,
|
35
|
+
:M => 1,
|
36
|
+
:F => true,
|
37
|
+
:L => 100,
|
38
|
+
:t => 5,
|
39
|
+
:G => 50
|
40
|
+
}
|
41
|
+
defaults.merge params
|
42
|
+
end
|
43
|
+
|
44
|
+
def construct_command(params)
|
45
|
+
params = self.include_defaults params
|
46
|
+
cmd = "#{params[:path]} all"
|
47
|
+
# generic
|
48
|
+
cmd += " -s soapdt.config" # config file
|
49
|
+
cmd += " -a #{params[:memory]}" # memory assumption
|
50
|
+
cmd += " -o #{params[:out]}" # output directory
|
51
|
+
cmd += " -p #{params[:threads]}" # number of threads
|
52
|
+
# specific
|
53
|
+
cmd += " -K #{params[:K]}" # kmer size
|
54
|
+
cmd += " -d #{params[:d]}" # minimum kmer frequency
|
55
|
+
cmd += " -F" if params[:F] # fill gaps in scaffold
|
56
|
+
cmd += " -M #{params[:M]}" # strength of contig flattening
|
57
|
+
cmd += " -L #{params[:L]}" # minimum contig length
|
58
|
+
cmd += " -e #{params[:e]}" # delete contigs with coverage no greater than
|
59
|
+
cmd += " -t #{params[:t]}" # maximum number of transcripts from one locus
|
60
|
+
cmd += " -G #{params[:G]}" # allowed length difference between estimated and filled gap
|
61
|
+
end
|
62
|
+
|
63
|
+
# runs SOAPdt script
|
64
|
+
def run_soap(params)
|
65
|
+
cmd = self.construct_command(params)
|
66
|
+
`#{cmd} > #{@count}.log`
|
67
|
+
end
|
68
|
+
|
69
|
+
end # SoapDenovoTrans
|
@@ -0,0 +1,120 @@
|
|
1
|
+
require 'transrate'
|
2
|
+
|
3
|
+
class SoapDenovoTrans
|
4
|
+
|
5
|
+
include Which
|
6
|
+
|
7
|
+
# Return a new SoapDenovoTrans object
|
8
|
+
def initialize
|
9
|
+
@count = 0
|
10
|
+
@path = which('SOAPdenovo-Trans-127mer')
|
11
|
+
raise "SOAPdenovo-Trans-127mer was not in the PATH" if @path.empty?
|
12
|
+
@path = @path.first
|
13
|
+
end
|
14
|
+
|
15
|
+
# Run the assembler with the provided parameters,
|
16
|
+
# returning a Transrate::ComparativeMetrics object
|
17
|
+
# containing a score for the generated assembly
|
18
|
+
# compared to the reference.
|
19
|
+
def run params
|
20
|
+
# run the assembly
|
21
|
+
self.run_soap params
|
22
|
+
# retrieve output
|
23
|
+
scaffolds = Dir['*.scafSeq']
|
24
|
+
return nil if scaffolds.empty?
|
25
|
+
scaffolds = scaffolds.first
|
26
|
+
return nil if File.size(scaffolds) == 0
|
27
|
+
# return a ComparativeMetrics object
|
28
|
+
assembly = Transrate::Assembly.new(scaffolds)
|
29
|
+
Transrate::ComparativeMetrics.new(assembly, params[:reference])
|
30
|
+
end
|
31
|
+
|
32
|
+
# Perform any necessary setup for the assembler
|
33
|
+
# prior to running the parameter optimisation.
|
34
|
+
# This includes modifying the config file to point
|
35
|
+
# to the subsetted reads rather than the full set.
|
36
|
+
def setup_optim(global_opts, assembler_opts)
|
37
|
+
# setup config file for subsetted reads
|
38
|
+
left = assembler_opts[:left_subset]
|
39
|
+
right = assembler_opts[:right_subset]
|
40
|
+
f = create_config(left, right, assembler_opts)
|
41
|
+
assembler_opts[:config] = f
|
42
|
+
end
|
43
|
+
|
44
|
+
# Perform any necessary setup for the assembler
|
45
|
+
# prior to running the full optimal assembly.
|
46
|
+
# This includes resetting the config to refer
|
47
|
+
# to the full set of reads.
|
48
|
+
def setup_full(global_opts, assembler_opts)
|
49
|
+
# set config file for full read set
|
50
|
+
left = assembler_opts[:left]
|
51
|
+
right = assembler_opts[:right]
|
52
|
+
f = create_config(left, right, assembler_opts)
|
53
|
+
assembler_opts[:config] = f
|
54
|
+
end
|
55
|
+
|
56
|
+
# Generate a config file with the specified left and right
|
57
|
+
# read input files, returning the full path to the config file.
|
58
|
+
def create_config left, right, assembler_opts
|
59
|
+
# create the config file
|
60
|
+
filename = "#{Time.now}.full.config"
|
61
|
+
File.open(filename) do |f|
|
62
|
+
f << 'max_rd_len=5000'
|
63
|
+
f << '[LIB]'
|
64
|
+
f << "avg_ins=#{assembler_opts[:insertsize]}"
|
65
|
+
f << "reverse_seq=0" # don't reverse complement the reads
|
66
|
+
f << "asm_flags=3" # use the reads for assembly and scaffolding
|
67
|
+
f << "q1=#{left}"
|
68
|
+
f << "q2=#{right}"
|
69
|
+
end
|
70
|
+
File.expand_path filename
|
71
|
+
end
|
72
|
+
|
73
|
+
# Merge the default parameters with the hash provided
|
74
|
+
def include_defaults params
|
75
|
+
defaults = {
|
76
|
+
:K => 23,
|
77
|
+
:threads => 8,
|
78
|
+
:out => 'sdt',
|
79
|
+
:d => 0,
|
80
|
+
:e => 2,
|
81
|
+
:M => 1,
|
82
|
+
:F => true,
|
83
|
+
:L => 100,
|
84
|
+
:t => 5,
|
85
|
+
:G => 50
|
86
|
+
}
|
87
|
+
defaults.merge params
|
88
|
+
end
|
89
|
+
|
90
|
+
# Given a set of parameters, fill in any missing
|
91
|
+
# parameters with defaults and construct a command
|
92
|
+
# to run the target assembler. Return the command
|
93
|
+
# as a string.
|
94
|
+
def construct_command params
|
95
|
+
params = self.include_defaults params
|
96
|
+
cmd = "#{@path} all"
|
97
|
+
# generic
|
98
|
+
cmd += " -s #{params[:config]}" # config file
|
99
|
+
cmd += " -a #{params[:memory]}" if params.has_key? :memory # memory assumption
|
100
|
+
cmd += " -o #{params[:out]}" if params.has_key? :out # output directory
|
101
|
+
cmd += " -p #{params[:threads]}" # number of threads
|
102
|
+
# specific
|
103
|
+
cmd += " -K #{params[:K]}" # kmer size
|
104
|
+
cmd += " -d #{params[:d]}" # minimum kmer frequency
|
105
|
+
cmd += " -F" if params[:F] # fill gaps in scaffold
|
106
|
+
cmd += " -M #{params[:M]}" # strength of contig flattening
|
107
|
+
cmd += " -L #{params[:L]}" # minimum contig length
|
108
|
+
cmd += " -e #{params[:e]}" # delete contigs with coverage no greater than
|
109
|
+
cmd += " -t #{params[:t]}" # maximum number of transcripts from one locus
|
110
|
+
cmd += " -G #{params[:G]}" # allowed length difference between estimated and filled gap
|
111
|
+
end
|
112
|
+
|
113
|
+
# Run the SOAPdenovo-trans assembler with the provided
|
114
|
+
# parameters. Standard output is redirected to +<count>.log+.
|
115
|
+
def run_soap(params)
|
116
|
+
cmd = self.construct_command(params)
|
117
|
+
`#{cmd} > #{@count}.log`
|
118
|
+
end
|
119
|
+
|
120
|
+
end # SoapDenovoTrans
|