assemblotron 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +39 -0
- data/Rakefile +8 -0
- data/lib/assemblotron.rb +210 -0
- data/lib/assemblotron/assemblers/constructors/abyss.rb +7 -0
- data/lib/assemblotron/assemblers/constructors/read_joiner.rb +7 -0
- data/lib/assemblotron/assemblers/constructors/sga.rb +7 -0
- data/lib/assemblotron/assemblers/constructors/soap_denovo_trans.rb +69 -0
- data/lib/assemblotron/assemblers/constructors/velvet_oases.rb +7 -0
- data/lib/assemblotron/assemblers/soap_denovo_trans.rb +120 -0
- data/lib/assemblotron/assemblers/soap_denovo_trans.yml +72 -0
- data/lib/assemblotron/objectives/assembly_score.rb +7 -0
- data/lib/assemblotron/sample.rb +150 -0
- data/lib/assemblotron/system.rb +39 -0
- data/lib/assemblotron/version.rb +12 -0
- data/test/data/subset.thousand_reads_l.fq +2000 -0
- data/test/data/subset.thousand_reads_r.fq +2000 -0
- data/test/data/thousand_reads_l.fq +4000 -0
- data/test/data/thousand_reads_r.fq +4000 -0
- data/test/helper.rb +16 -0
- data/test/test_installer.rb +27 -0
- data/test/test_sample.rb +28 -0
- data/test/test_soap_denovo_trans.rb +52 -0
- metadata +205 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 714db5f952949fb16921c707434b7ba07797c3aa
|
4
|
+
data.tar.gz: 407ce9019b37b5492a41d4b72e0101ea57f52c0e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e586502ee078aaee565064d21adc443ffff78bed298ed4a40b3b01208077129e738d3d126115597324551fe6767a99f70760e12fc4352214705373d5ac8d9150
|
7
|
+
data.tar.gz: b6e575f209ca04870fe1589dc3cdba1ce061664f859130889c5893c826560435dd7038c09e9d06272d14ea100ff73d220605b0cdccb8bcc55022cc048ce6b717
|
data/README.md
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
assemblotron
|
2
|
+
============
|
3
|
+
|
4
|
+
Automated optimal *de-novo* assembly.
|
5
|
+
|
6
|
+
Transcriptome assembly takes a *long* time and a *lot* of computational resources. The software is complex, and the best settings to use depend heavily on the organism being studied.
|
7
|
+
|
8
|
+
Assemblotron solves this problem by rapidly discovering the optimal settings for an assembler or assembly pipeline, and then performing the best possible assembly using the tools available.
|
9
|
+
|
10
|
+
A typical Assemblotron run takes only 3-4 hours on 8 cores of a desktop PC with an i7 processor, and greatly improves the accuracy of expression quantification and gene reconstruction in de-novo transcriptome analysis (blog posts/paper with evidence to follow shortly).
|
11
|
+
|
12
|
+
## Explanation
|
13
|
+
|
14
|
+
Assemblotron takes a small subsample of the available reads and runs an assembly. The assembly is thoroughly analysed and scored using [transrate](https://github.com/Blahah/transrate). Then the optimisation system [biopsy](https://github.com/Blahah/biopsy) is used to select new assembler settings to test, and another assembly is performed. This process is repeated until an estimate for the best possible assembly is found.
|
15
|
+
|
16
|
+
Further documentation will be provided when the software enters beta.
|
17
|
+
|
18
|
+
## Development status
|
19
|
+
|
20
|
+
[][gem]
|
21
|
+
[][travis]
|
22
|
+
[][gemnasium]
|
23
|
+
[][codeclimate]
|
24
|
+
[][coveralls]
|
25
|
+
|
26
|
+
[gem]: https://badge.fury.io/rb/assemblotron
|
27
|
+
[travis]: https://travis-ci.org/Blahah/assemblotron
|
28
|
+
[gemnasium]: https://gemnasium.com/Blahah/assemblotron
|
29
|
+
[codeclimate]: https://codeclimate.com/github/Blahah/assemblotron
|
30
|
+
[coveralls]: https://coveralls.io/r/Blahah/assemblotron
|
31
|
+
|
32
|
+
This software is in pre-alpha development and is not yet ready for deployment.
|
33
|
+
Please don't report issues or request documentation until we are ready for beta release (see below for estimated timeframe).
|
34
|
+
|
35
|
+
## Expectations for imminent versions
|
36
|
+
|
37
|
+
* **v0.1.0**: allow optimisation of SoapDenovoTrans, VelvetOases (due 10th November 2013)
|
38
|
+
* **v0.2.0**: add Abyss, SGA and ReadJoiner
|
39
|
+
* **v0.3.0**: allow optimising pipelines
|
data/Rakefile
ADDED
data/lib/assemblotron.rb
ADDED
@@ -0,0 +1,210 @@
|
|
1
|
+
require 'biopsy'
|
2
|
+
require 'logger'
|
3
|
+
require 'transrate'
|
4
|
+
require 'assemblotron/version'
|
5
|
+
require 'assemblotron/sample'
|
6
|
+
require 'pp'
|
7
|
+
require 'json'
|
8
|
+
|
9
|
+
module Assemblotron
|
10
|
+
|
11
|
+
include Transrate
|
12
|
+
|
13
|
+
|
14
|
+
class Controller
|
15
|
+
|
16
|
+
attr_accessor :global_opts
|
17
|
+
attr_accessor :assembler_opts
|
18
|
+
|
19
|
+
# Return a new Assemblotron
|
20
|
+
# Build a controller: create an INFO-level logger on STDOUT, read the
# user's config file, initialise the Biopsy settings and then collect
# the available assemblers into @assemblers.
def initialize
  @log = Logger.new(STDOUT).tap { |logger| logger.level = Logger::INFO }
  load_config
  init_settings
  @assemblers = []
  load_assemblers
end # initialize
|
28
|
+
|
29
|
+
# Banner line shown at the top of help and listing output,
# built from a copy of VERSION::STRING.
def self.header
  format('Assemblotron v%s', VERSION::STRING.dup)
end
|
32
|
+
|
33
|
+
# Initialise the Biopsy settings
|
34
|
+
# Reset the Biopsy settings singleton to its defaults, then point its
# target and objective search paths at the directories that sit next
# to this file.
def init_settings
  here = File.dirname(__FILE__)
  Biopsy::Settings.instance.tap do |settings|
    settings.set_defaults
    settings.target_dir = [File.join(here, 'assemblotron/assemblers/')]
    settings.objectives_dir = [File.join(here, 'assemblotron/objectives/')]
  end
  @log.debug "initialised Biopsy settings"
end # init_settings
|
42
|
+
|
43
|
+
# Load global configuration from the config file at
|
44
|
+
# +~/.assemblotron+, if it exists.
|
45
|
+
# Load global configuration from +~/.assemblotron+, if it exists.
#
# On success the parsed mapping is stored in @config after being passed
# through +deep_symbolize+. Returns nil, leaving @config untouched, when
# the file is absent or does not contain a YAML mapping.
def load_config
  config_file = File.join(Dir.home, ".assemblotron")
  # File.exists? is the deprecated spelling and is gone from current Rubies
  return unless File.exist? config_file

  @log.debug "config file found at #{config_file}"
  config = YAML::load_file(config_file)
  # An empty file parses to nil or false depending on the YAML library
  # version, and a malformed one may parse to a scalar or an array;
  # only a mapping can be symbolised and used as configuration.
  unless config.is_a? Hash
    @log.warn 'config file malformed or empty'
    return
  end
  @config = config.deep_symbolize
end # load_config
|
57
|
+
|
58
|
+
# Discover and load available assemblers.
|
59
|
+
#
|
60
|
+
# Loads all assemblers provided by the program, and
|
61
|
+
# then searches any directories listed in the config
|
62
|
+
# file (+~/.assemblotron+) setting +assembler_dirs+.
|
63
|
+
#
|
64
|
+
# Directories listed in +assembler_dirs+ must contain:
|
65
|
+
#
|
66
|
+
# +definitions+:: Directory with one +.yml+ definition per assembler.
|
67
|
+
# See the documentation for Definition.
|
68
|
+
# +constructors+:: Directory with one +.rb+ file per assembler.
|
69
|
+
# See the documentation for Constructor.
|
70
|
+
# For every directory in the Biopsy target_dir setting, create one
# Biopsy::Target per +.yml+ file found there (loaded by the file's
# basename) and append it to @assemblers.
def load_assemblers
  Biopsy::Settings.instance.target_dir.each do |dir|
    Dir.chdir(dir) do
      definitions = Dir['*.yml']
      definitions.each do |definition|
        target = Biopsy::Target.new
        target.load_by_name(File.basename(definition, '.yml'))
        @assemblers << target
      end
    end
  end
end # load_assemblers
|
82
|
+
|
83
|
+
# Return an array of the names of available assemblers
|
84
|
+
# List every name an assembler can be requested by: each assembler's
# name, immediately followed by its shortname when it has one.
def assemblers
  @assemblers.flat_map do |target|
    target.shortname ? [target.name, target.shortname] : [target.name]
  end
end # assemblers
|
92
|
+
|
93
|
+
# Print the program header, a short usage message and one line per
# loaded assembler to standard output.
#
# The shortname is appended in brackets only when the assembler both
# exposes one and it is set, which matches how #assemblers decides
# whether a shortname exists, so an unset shortname no longer prints
# as empty brackets.
def list_assemblers
  puts Controller.header
  puts <<-EOS

Available assemblers are listed below.
Shortnames are shown in brackets if available.

Usage:
atron [global options] <assembler> [assembler options]

Assemblers:
EOS
  @assemblers.each do |a|
    line = " - #{a.name}"
    line += " (#{a.shortname})" if a.respond_to?(:shortname) && a.shortname
    puts line
  end
end # list_assemblers
|
111
|
+
|
112
|
+
# Build a Trollop parser for the named assembler and parse ARGV with it.
#
# The parser always has a required +reference+ option, plus one option
# per entry in the assembler's +options+, typed via
# Controller.class_from_type. The help screen is shown when ARGV is
# empty. Raises if no assembler matches (see #get_assembler).
def options_for_assembler(assembler)
  target = get_assembler(assembler)
  parser = Trollop::Parser.new do
    banner <<-EOS
#{Controller.header}

Options for assembler #{assembler}
EOS
    opt :reference, "Path to reference proteome file in FASTA format",
        :type => String,
        :required => true
    target.options.each_pair do |name, spec|
      opt name, spec[:desc], :type => Controller.class_from_type(spec[:type])
    end
  end
  Trollop::with_standard_exception_handling(parser) do
    raise Trollop::HelpNeeded if ARGV.empty? # show help screen
    parser.parse(ARGV)
  end
end # options_for_assembler
|
134
|
+
|
135
|
+
# Find the loaded assembler whose name or shortname equals +assembler+.
# Raises a RuntimeError when none matches.
def get_assembler(assembler)
  found = @assemblers.find do |candidate|
    candidate.name == assembler || candidate.shortname == assembler
  end
  return found unless found.nil?

  raise "couldn't find assembler #{assembler}"
end
|
143
|
+
|
144
|
+
# Translate a type name from an assembler definition ('string', 'int'
# or 'float') into the matching Ruby class. Any other value gives nil.
def self.class_from_type(type)
  { 'string' => String, 'int' => Integer, 'float' => Float }[type]
end
|
154
|
+
|
155
|
+
# Subsample the left/right read files named in @assembler_opts to the
# size given by @global_opts[:subsample_size], and record the two
# values returned by Sample#subsample under :left_subset and
# :right_subset.
def subsample_input
  sampler = Sample.new(@assembler_opts[:left], @assembler_opts[:right])
  left_subset, right_subset = sampler.subsample(@global_opts[:subsample_size])

  @assembler_opts[:left_subset] = left_subset
  @assembler_opts[:right_subset] = right_subset
end
|
166
|
+
|
167
|
+
# Run +assembler+ with the parameter set +result+ inside the
# +final_assembly+ directory (relative to the current directory) and
# return whatever the assembler's +run+ returns.
#
# The directory is created only when missing, so a directory left over
# from a previous run no longer aborts with Errno::EEXIST.
def final_assembly(assembler, result)
  outdir = 'final_assembly'
  Dir.mkdir(outdir) unless Dir.exist?(outdir)
  Dir.chdir(outdir) do
    assembler.run result
  end
end
|
173
|
+
|
174
|
+
# Drive a complete optimisation for the named assembler:
#
# 1. pick the reads to optimise on (full input when :skip_subsample is
#    set, otherwise a subsample),
# 2. load the reference and build its search database,
# 3. run the Biopsy experiment,
# 4. write the resulting parameters as JSON to
#    @global_opts[:output_parameters],
# 5. call the assembler's setup_final hook and, unless :skip_final is
#    set, perform the final assembly.
def run(assembler)
  # subsampling
  if @global_opts[:skip_subsample]
    [:left, :right].each do |side|
      @assembler_opts[:"#{side}_subset"] = assembler_opts[side]
    end
  else
    subsample_input
  end

  # load reference and create ublast DB
  reference = Transrate::Assembly.new(@assembler_opts[:reference])
  @assembler_opts[:reference] = reference
  Transrate::ReciprocalAnnotation.new(reference, reference).make_reference_db

  # setup the assembler
  target = get_assembler(assembler)
  target.setup_optim(@global_opts, @assembler_opts)

  # run the optimisation
  experiment = Biopsy::Experiment.new(target,
                                      options: @assembler_opts,
                                      threads: @global_opts[:threads])
  best = experiment.run

  # write out the result
  File.open(@global_opts[:output_parameters], 'wb') do |out|
    out.write(JSON.pretty_generate(best))
  end

  # run the final assembly
  target.setup_final(@global_opts, @assembler_opts)
  final_assembly(target, best) unless @global_opts[:skip_final]
end # run
|
207
|
+
|
208
|
+
end # Controller
|
209
|
+
|
210
|
+
end # Assemblotron
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# Thin wrapper that builds and runs a SOAPdenovo-Trans command line.
#
# The read library description is written to +soapdt.config+ in the
# current directory the first time #run is called; each run's standard
# output is redirected to +<run index>.log+.
class SoapDenovoTrans

  def initialize
    @count = 0 # number of completed runs; also names the log file
  end

  # Run one assembly with +params+, writing the config file first if
  # this is the first run. Returns the number of runs performed so far.
  def run(params)
    setup_soap(params) if @count == 0
    run_soap(params)
    @count += 1
  end

  # Write soapdt.config describing the paired read files.
  # Reads :readformat ('fastq' selects q1/q2 keys, anything else f1/f2),
  # :insertsize, :l and :r from +params+.
  def setup_soap(params)
    rf = params[:readformat] == 'fastq' ? 'q' : 'f'
    File.open("soapdt.config", "w") do |conf|
      conf.puts "max_rd_len=20000"
      conf.puts "[LIB]"
      conf.puts "avg_ins=#{params[:insertsize]}"
      conf.puts "reverse_seq=0"
      conf.puts "asm_flags=3"
      conf.puts "rank=2"
      conf.puts "#{rf}1=#{params[:l]}"
      conf.puts "#{rf}2=#{params[:r]}"
    end
  end

  # Return +params+ merged over the default assembler settings
  # (values in +params+ win).
  def include_defaults(params)
    defaults = {
      :K => 23,
      :p => 8,
      :d => 0,
      :e => 2,
      :M => 1,
      :F => true,
      :L => 100,
      :t => 5,
      :G => 50
    }
    defaults.merge params
  end

  # Build the shell command for one assembly and return it as a String.
  #
  # The thread count comes from :threads, falling back to the :p default
  # so that "-p" always receives a value. "-a" and "-o" are only emitted
  # when :memory / :out are supplied, rather than being passed without
  # a value.
  def construct_command(params)
    params = include_defaults(params)
    threads = params[:threads] || params[:p]
    cmd = "#{params[:path]} all"
    # generic
    cmd << " -s soapdt.config"                          # config file
    cmd << " -a #{params[:memory]}" if params[:memory]  # memory assumption
    cmd << " -o #{params[:out]}" if params[:out]        # output directory
    cmd << " -p #{threads}"                             # number of threads
    # specific
    cmd << " -K #{params[:K]}" # kmer size
    cmd << " -d #{params[:d]}" # minimum kmer frequency
    cmd << " -F" if params[:F] # fill gaps in scaffold
    cmd << " -M #{params[:M]}" # strength of contig flattening
    cmd << " -L #{params[:L]}" # minimum contig length
    cmd << " -e #{params[:e]}" # delete contigs with coverage no greater than
    cmd << " -t #{params[:t]}" # maximum number of transcripts from one locus
    cmd << " -G #{params[:G]}" # allowed length difference between estimated and filled gap
    cmd
  end

  # Run the assembler through the shell, redirecting its standard
  # output to "<run index>.log". Returns the backtick result.
  def run_soap(params)
    cmd = construct_command(params)
    `#{cmd} > #{@count}.log`
  end

end # SoapDenovoTrans
|
@@ -0,0 +1,120 @@
|
|
1
|
+
require 'transrate'
|
2
|
+
|
3
|
+
class SoapDenovoTrans
|
4
|
+
|
5
|
+
include Which
|
6
|
+
|
7
|
+
# Return a new SoapDenovoTrans object
|
8
|
+
# Locate the SOAPdenovo-Trans-127mer executable via +which+ and keep
# the first match in @path. Raises a RuntimeError when +which+ returns
# no match.
def initialize
  @count = 0
  @path = which('SOAPdenovo-Trans-127mer')
  if @path.empty?
    raise "SOAPdenovo-Trans-127mer was not in the PATH"
  end
  @path = @path.first
end
|
14
|
+
|
15
|
+
# Run the assembler with the provided parameters,
|
16
|
+
# returning a Transrate::ComparativeMetrics object
|
17
|
+
# containing a score for the generated assembly
|
18
|
+
# compared to the reference.
|
19
|
+
# Assemble with +params+, then score the result.
#
# Returns nil when no +*.scafSeq+ file exists in the current directory
# after the run or the first one found is empty; otherwise returns a
# Transrate::ComparativeMetrics built from that scaffold file and
# params[:reference].
def run(params)
  # run the assembly
  run_soap(params)
  # retrieve output
  candidates = Dir['*.scafSeq']
  return nil if candidates.empty?
  scaffold_file = candidates.first
  return nil if File.size(scaffold_file) == 0
  # return a ComparativeMetrics object
  Transrate::ComparativeMetrics.new(
    Transrate::Assembly.new(scaffold_file),
    params[:reference]
  )
end
|
31
|
+
|
32
|
+
# Perform any necessary setup for the assembler
|
33
|
+
# prior to running the parameter optimisation.
|
34
|
+
# This includes modifying the config file to point
|
35
|
+
# to the subsetted reads rather than the full set.
|
36
|
+
# Prepare for optimisation runs: generate a config file that points at
# the subsetted reads (:left_subset / :right_subset) and store its path
# in assembler_opts[:config]. +global_opts+ is not used.
def setup_optim(global_opts, assembler_opts)
  # setup config file for subsetted reads
  assembler_opts[:config] = create_config(
    assembler_opts[:left_subset],
    assembler_opts[:right_subset],
    assembler_opts
  )
end
|
43
|
+
|
44
|
+
# Perform any necessary setup for the assembler
|
45
|
+
# prior to running the full optimal assembly.
|
46
|
+
# This includes resetting the config to refer
|
47
|
+
# to the full set of reads.
|
48
|
+
# Prepare for the final assembly: generate a config file that points at
# the full read set (:left / :right) and store its path in
# assembler_opts[:config]. Returns that path. +global_opts+ is not used.
#
# Assemblotron::Controller#run invokes this hook as +setup_final+, so
# that is the primary name.
def setup_final(global_opts, assembler_opts)
  # set config file for full read set
  left = assembler_opts[:left]
  right = assembler_opts[:right]
  f = create_config(left, right, assembler_opts)
  assembler_opts[:config] = f
end
# Original name of the hook, kept so existing callers continue to work.
alias setup_full setup_final
|
55
|
+
|
56
|
+
# Generate a config file with the specified left and right
|
57
|
+
# read input files, returning the full path to the config file.
|
58
|
+
# Generate a SOAPdenovo-Trans config file in the current directory for
# the given left and right read files, and return its absolute path.
#
# left::           path written as q1
# right::          path written as q2
# assembler_opts:: Hash; :insertsize is written as avg_ins
#
# The file name is a compact timestamp with no whitespace, because the
# path is later interpolated unquoted into a shell command. The file is
# opened for writing and each setting is written on its own line.
def create_config(left, right, assembler_opts)
  # create the config file
  filename = "#{Time.now.strftime('%Y%m%d%H%M%S%L')}.full.config"
  File.open(filename, 'w') do |f|
    f.puts 'max_rd_len=5000'
    f.puts '[LIB]'
    f.puts "avg_ins=#{assembler_opts[:insertsize]}"
    f.puts "reverse_seq=0" # don't reverse complement the reads
    f.puts "asm_flags=3" # use the reads for assembly and scaffolding
    f.puts "q1=#{left}"
    f.puts "q2=#{right}"
  end
  File.expand_path filename
end
|
72
|
+
|
73
|
+
# Merge the default parameters with the hash provided
|
74
|
+
# Return a new Hash holding the default assembler settings overlaid
# with +params+ (entries in +params+ take precedence).
def include_defaults(params)
  {
    K: 23,
    threads: 8,
    out: 'sdt',
    d: 0,
    e: 2,
    M: 1,
    F: true,
    L: 100,
    t: 5,
    G: 50
  }.merge(params)
end
|
89
|
+
|
90
|
+
# Given a set of parameters, fill in any missing
|
91
|
+
# parameters with defaults and construct a command
|
92
|
+
# to run the target assembler. Return the command
|
93
|
+
# as a string.
|
94
|
+
# Fill in any missing parameters with defaults and assemble the
# command line for the assembler found at @path. Returns the command
# as a single String.
#
# "-a" and "-o" are included only when the merged parameters contain
# the :memory / :out keys; "-F" only when :F is truthy.
def construct_command(params)
  opts = include_defaults(params)
  parts = ["#{@path} all"]
  # generic
  parts << "-s #{opts[:config]}"                          # config file
  parts << "-a #{opts[:memory]}" if opts.key?(:memory)    # memory assumption
  parts << "-o #{opts[:out]}" if opts.key?(:out)          # output directory
  parts << "-p #{opts[:threads]}"                         # number of threads
  # specific
  parts << "-K #{opts[:K]}"  # kmer size
  parts << "-d #{opts[:d]}"  # minimum kmer frequency
  parts << "-F" if opts[:F]  # fill gaps in scaffold
  parts << "-M #{opts[:M]}"  # strength of contig flattening
  parts << "-L #{opts[:L]}"  # minimum contig length
  parts << "-e #{opts[:e]}"  # delete contigs with coverage no greater than
  parts << "-t #{opts[:t]}"  # maximum number of transcripts from one locus
  parts << "-G #{opts[:G]}"  # allowed length difference between estimated and filled gap
  parts.join(' ')
end
|
112
|
+
|
113
|
+
# Run the SOAPdenovo-trans assembler with the provided
|
114
|
+
# parameters. Return the output generated.
|
115
|
+
# Build the command for +params+ and execute it through the shell with
# standard output redirected to "<@count>.log". Returns the result of
# the shell invocation.
def run_soap(params)
  command = construct_command(params)
  %x(#{command} > #{@count}.log)
end
|
119
|
+
|
120
|
+
end # SoapDenovoTrans
|