assemblotron 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 714db5f952949fb16921c707434b7ba07797c3aa
4
+ data.tar.gz: 407ce9019b37b5492a41d4b72e0101ea57f52c0e
5
+ SHA512:
6
+ metadata.gz: e586502ee078aaee565064d21adc443ffff78bed298ed4a40b3b01208077129e738d3d126115597324551fe6767a99f70760e12fc4352214705373d5ac8d9150
7
+ data.tar.gz: b6e575f209ca04870fe1589dc3cdba1ce061664f859130889c5893c826560435dd7038c09e9d06272d14ea100ff73d220605b0cdccb8bcc55022cc048ce6b717
@@ -0,0 +1,39 @@
1
+ assemblotron
2
+ ============
3
+
4
+ Automated optimal *de-novo* assembly.
5
+
6
+ Transcriptome assembly takes a *long* time and a *lot* of computational resources. The software is complex, and the best settings to use depend heavily on the organism being studied.
7
+
8
+ Assemblotron solves this problem by rapidly discovering the optimal settings for an assembler or assembly pipeline and then performing the best possible assembly using the tools available.
9
+
10
+ A typical Assemblotron run takes only 3-4 hours on 8 cores of a desktop PC with an i7 processor, and greatly improves the accuracy of expression quantification and gene reconstruction in de-novo transcriptome analysis (blog posts/paper with evidence to follow shortly).
11
+
12
+ ## Explanation
13
+
14
+ Assemblotron takes a small subsample of the available reads and runs an assembly. The assembly is thoroughly analysed and scored using [transrate](https://github.com/Blahah/transrate). Then the optimisation system [biopsy](https://github.com/Blahah/biopsy) is used to select new assembler settings to test, and another assembly is performed. This process is repeated until an estimate for the best possible assembly is found.
15
+
16
+ Further documentation will be provided when the software enters beta.
17
+
18
+ ## Development status
19
+
20
+ [![Gem Version](https://badge.fury.io/rb/assemblotron.png)][gem]
21
+ [![Build Status](https://secure.travis-ci.org/Blahah/assemblotron.png?branch=master)][travis]
22
+ [![Dependency Status](https://gemnasium.com/Blahah/assemblotron.png?travis)][gemnasium]
23
+ [![Code Climate](https://codeclimate.com/github/Blahah/assemblotron.png)][codeclimate]
24
+ [![Coverage Status](https://coveralls.io/repos/Blahah/assemblotron/badge.png?branch=master)][coveralls]
25
+
26
+ [gem]: https://badge.fury.io/rb/assemblotron
27
+ [travis]: https://travis-ci.org/Blahah/assemblotron
28
+ [gemnasium]: https://gemnasium.com/Blahah/assemblotron
29
+ [codeclimate]: https://codeclimate.com/github/Blahah/assemblotron
30
+ [coveralls]: https://coveralls.io/r/Blahah/assemblotron
31
+
32
+ This software is in pre-alpha development and is not yet ready for deployment.
33
+ Please don't report issues or request documentation until we are ready for beta release (see below for estimated timeframe).
34
+
35
+ ## Expectations for imminent versions
36
+
37
+ * **v0.1.0**: allow optimisation of SoapDenovoTrans, VelvetOases (due 10th November 2013)
38
+ * **v0.2.0**: add Abyss, SGA and ReadJoiner
39
+ * **v0.3.0**: allow optimising pipelines
@@ -0,0 +1,8 @@
1
require 'rake/testtask'

# Define the test task, adding the `test` directory to its load path.
Rake::TestTask.new { |test_task| test_task.libs << 'test' }

# Running `rake` with no task name runs the test suite.
desc "Run tests"
task :default => :test
@@ -0,0 +1,210 @@
1
require 'json'
require 'logger'
require 'pp'
require 'yaml'

require 'biopsy'
require 'transrate'

require 'assemblotron/version'
require 'assemblotron/sample'
8
+
9
+ module Assemblotron
10
+
11
+ include Transrate
12
+
13
+
14
+ class Controller
15
+
16
+ attr_accessor :global_opts
17
+ attr_accessor :assembler_opts
18
+
19
+ # Return a new Assemblotron
20
# Creates an INFO-level logger on STDOUT, then loads the user config,
# initialises the Biopsy settings and loads the available assemblers.
def initialize
  @log = Logger.new(STDOUT).tap { |logger| logger.level = Logger::INFO }
  load_config
  init_settings
  @assemblers = []
  load_assemblers
end # initialize
28
+
29
# Return the banner line naming the program and its version string.
def self.header
  version = VERSION::STRING.dup
  "Assemblotron v" + version
end
32
+
33
+ # Initialise the Biopsy settings
34
# Resets the Biopsy settings singleton to its defaults and points its
# target and objectives directories at the ones shipped beside this file.
def init_settings
  here = File.dirname(__FILE__)
  settings = Biopsy::Settings.instance
  settings.set_defaults
  settings.target_dir = [File.join(here, 'assemblotron/assemblers/')]
  settings.objectives_dir = [File.join(here, 'assemblotron/objectives/')]
  @log.debug "initialised Biopsy settings"
end # init_settings
42
+
43
+ # Load global configuration from the config file at
44
+ # +~/.assemblotron+, if it exists.
45
# Reads +~/.assemblotron+ as YAML and stores it, with symbolised keys, in
# +@config+. Does nothing when the file is absent. Logs a warning and
# leaves +@config+ untouched when the file does not contain a mapping.
def load_config
  config_file = File.join(Dir.home, ".assemblotron")
  # File.exists? was removed in Ruby 3.2; File.exist? works on all versions.
  return unless File.exist? config_file
  @log.debug "config file found at #{config_file}"
  config = YAML::load_file(config_file)
  # An empty file loads as nil or false depending on the YAML library
  # version, and a scalar-only file loads as a String or number; none of
  # those can be symbolised, so treat anything but a Hash as malformed.
  unless config.is_a? Hash
    @log.warn 'config file malformed or empty'
    return
  end
  # NOTE(review): deep_symbolize is not a core Hash method; presumably it is
  # provided by one of the required gems - confirm.
  @config = config.deep_symbolize
end # load_config
57
+
58
+ # Discover and load available assemblers.
59
+ #
60
+ # Loads all assemblers provided by the program, and
61
+ # then searches any directories listed in the config
62
+ # file (+~/.assemblotron+) setting +assembler_dirs+.
63
+ #
64
+ # Directories listed in +assembler_dirs+ must contain:
65
+ #
66
+ # +definitions+:: Directory with one +.yml+ definition per assembler.
67
+ # See the documentation for Definition.
68
+ # +constructors+:: Directory with one +.rb+ file per assembler.
69
+ # See the documentation for Constructor.
70
# For every +.yml+ file in each Biopsy target directory, create a
# Biopsy::Target, load it by the file's base name and append it to
# +@assemblers+.
def load_assemblers
  Biopsy::Settings.instance.target_dir.each do |target_dir|
    Dir.chdir(target_dir) do
      Dir.glob('*.yml').each do |definition|
        assembler = Biopsy::Target.new
        assembler.load_by_name File.basename(definition, '.yml')
        @assemblers << assembler
      end
    end
  end
end # load_assemblers
82
+
83
+ # Return an array of the names of available assemblers
84
# Each loaded assembler contributes its name, followed by its shortname
# when it has one.
def assemblers
  @assemblers.flat_map do |target|
    names = [target.name]
    short = target.shortname
    names << short if short
    names
  end
end # assemblers
92
+
93
# Print the program header, a usage summary and one line per loaded
# assembler. An assembler's shortname is appended in brackets only when it
# actually has one.
def list_assemblers
  puts Controller.header
  puts <<-EOS

Available assemblers are listed below.
Shortnames are shown in brackets if available.

Usage:
atron [global options] <assembler> [assembler options]

Assemblers:
  EOS
  @assemblers.each do |a|
    line = " - #{a.name}"
    # respond_to? alone is true even when the shortname is nil, which used
    # to print empty brackets; require a real value, as #assemblers does.
    shortname = a.shortname if a.respond_to? :shortname
    line += " (#{shortname})" if shortname
    puts line
  end
end # list_assemblers
111
+
112
# Build a Trollop parser for the named assembler: a required +:reference+
# option plus one option per entry in the assembler's +options+, then parse
# ARGV with it. An empty ARGV triggers the help screen.
def options_for_assembler assembler
  target = self.get_assembler assembler
  parser = Trollop::Parser.new do
    banner <<-EOS
#{Controller.header}

Options for assembler #{assembler}
    EOS
    opt :reference, "Path to reference proteome file in FASTA format",
        :type => String,
        :required => true
    # one command-line option per parameter in the assembler's options
    target.options.each do |param, spec|
      opt param, spec[:desc], :type => Controller.class_from_type(spec[:type])
    end
  end
  Trollop::with_standard_exception_handling parser do
    raise Trollop::HelpNeeded if ARGV.empty? # show help screen
    parser.parse ARGV
  end
end # options_for_assembler
134
+
135
# Return the first loaded assembler whose name or shortname equals
# +assembler+. Raises a RuntimeError when none matches.
def get_assembler assembler
  @assemblers.each do |candidate|
    return candidate if candidate.name == assembler || candidate.shortname == assembler
  end
  raise "couldn't find assembler #{assembler}"
end
143
+
144
# Map a type name ('string', 'int' or 'float') to the matching Ruby class.
# Returns nil for any other value.
def self.class_from_type type
  known = { 'string' => String, 'int' => Integer, 'float' => Float }
  # compare with === so matching behaves exactly like a case/when
  known.each { |label, klass| return klass if label === type }
  nil
end
154
+
155
# Subsample the +:left+ and +:right+ inputs to +:subsample_size+ using
# Sample, storing the two results under +:left_subset+ and +:right_subset+
# in +@assembler_opts+.
def subsample_input
  sample = Sample.new(@assembler_opts[:left], @assembler_opts[:right])
  left_subset, right_subset = sample.subsample(@global_opts[:subsample_size])
  @assembler_opts[:left_subset] = left_subset
  @assembler_opts[:right_subset] = right_subset
end
166
+
167
# Run +assembler+ with +result+ inside the +final_assembly+ directory,
# creating the directory first if needed. Returns whatever
# +assembler.run+ returns.
def final_assembly assembler, result
  outdir = 'final_assembly'
  # Dir.mkdir raises Errno::EEXIST when the directory is left over from an
  # earlier run, so only create it when it is missing.
  Dir.mkdir(outdir) unless Dir.exist?(outdir)
  Dir.chdir(outdir) do
    assembler.run result
  end
end
173
+
174
# Run the whole pipeline for the named assembler: choose the reads used for
# optimisation, prepare the reference, optimise the assembler settings,
# write the result as JSON to +@global_opts[:output_parameters]+ and,
# unless +:skip_final+ is set, run the final assembly.
def run assembler
  # use the full read set as the "subset" when subsampling is skipped
  if @global_opts[:skip_subsample]
    @assembler_opts[:left_subset] = assembler_opts[:left]
    @assembler_opts[:right_subset] = assembler_opts[:right]
  else
    subsample_input
  end

  # load reference and create ublast DB
  reference = Transrate::Assembly.new(@assembler_opts[:reference])
  @assembler_opts[:reference] = reference
  Transrate::ReciprocalAnnotation.new(reference, reference).make_reference_db

  # setup the assembler
  target = self.get_assembler assembler
  target.setup_optim(@global_opts, @assembler_opts)

  # run the optimisation
  experiment = Biopsy::Experiment.new(target,
                                      options: @assembler_opts,
                                      threads: @global_opts[:threads])
  best = experiment.run

  # write out the result
  File.open(@global_opts[:output_parameters], 'wb') do |out|
    out.write(JSON.pretty_generate(best))
  end

  # run the final assembly
  target.setup_final(@global_opts, @assembler_opts)
  final_assembly(target, best) unless @global_opts[:skip_final]
end # run
207
+
208
+ end # Controller
209
+
210
+ end # Assemblotron
@@ -0,0 +1,7 @@
1
# Placeholder for the Abyss assembler; nothing is implemented yet.
class Abyss

  # Accepts a parameter set, does nothing and returns nil.
  def run(params)
    nil
  end

end # Abyss
@@ -0,0 +1,7 @@
1
# Placeholder for the ReadJoiner assembler; nothing is implemented yet.
class ReadJoiner

  # Accepts a parameter set, does nothing and returns nil.
  def run(params)
    nil
  end

end # ReadJoiner
@@ -0,0 +1,7 @@
1
# Placeholder for the SGA assembler; nothing is implemented yet.
class SGA

  # Accepts a parameter set, does nothing and returns nil.
  def run(params)
    nil
  end

end # SGA
@@ -0,0 +1,69 @@
1
# Builds and runs SOAPdenovo-Trans command lines. The +soapdt.config+
# library file is written on the first run only.
class SoapDenovoTrans

  def initialize
    @count = 0 # completed runs so far; also names each run's log file
  end

  # Run one assembly with +params+, writing the config file beforehand on
  # the first run. Returns the number of runs completed so far.
  def run params
    self.setup_soap(params) if @count == 0
    self.run_soap params
    @count += 1
  end

  # Write +soapdt.config+ in the current directory.
  # Reads +:readformat+, +:insertsize+, +:l+ and +:r+ from +params+.
  # soapdt.config file only generated on first run
  def setup_soap params
    # 'fastq' inputs are written as q1/q2, anything else as f1/f2
    rf = params[:readformat] == 'fastq' ? 'q' : 'f'
    File.open("soapdt.config", "w") do |conf|
      conf.puts "max_rd_len=20000"
      conf.puts "[LIB]"
      conf.puts "avg_ins=#{params[:insertsize]}"
      conf.puts "reverse_seq=0"
      conf.puts "asm_flags=3"
      conf.puts "rank=2"
      conf.puts "#{rf}1=#{params[:l]}"
      conf.puts "#{rf}2=#{params[:r]}"
    end
  end

  # Return +params+ merged over the default settings; values in +params+
  # win over the defaults.
  def include_defaults params
    defaults = {
      :K => 23,
      :p => 8,
      :d => 0,
      :e => 2,
      :M => 1,
      :F => true,
      :L => 100,
      :t => 5,
      :G => 50
    }
    defaults.merge params
  end

  # Fill in missing parameters with defaults and return the command line,
  # as a String, that runs the assembler.
  def construct_command(params)
    params = self.include_defaults params
    # The default thread count lives under :p but callers pass :threads;
    # accept either so "-p" is never emitted without a value.
    threads = params[:threads] || params[:p]
    cmd = "#{params[:path]} all"
    # generic
    cmd += " -s soapdt.config" # config file
    # omit -a entirely when no memory assumption was given, rather than
    # emitting the flag with an empty value
    cmd += " -a #{params[:memory]}" if params[:memory] # memory assumption
    cmd += " -o #{params[:out]}" # output directory
    cmd += " -p #{threads}" # number of threads
    # specific
    cmd += " -K #{params[:K]}" # kmer size
    cmd += " -d #{params[:d]}" # minimum kmer frequency
    cmd += " -F" if params[:F] # fill gaps in scaffold
    cmd += " -M #{params[:M]}" # strength of contig flattening
    cmd += " -L #{params[:L]}" # minimum contig length
    cmd += " -e #{params[:e]}" # delete contigs with coverage no greater than
    cmd += " -t #{params[:t]}" # maximum number of transcripts from one locus
    cmd += " -G #{params[:G]}" # allowed length difference between estimated and filled gap
  end

  # Run the constructed command in a shell, redirecting its standard
  # output to +<run count>.log+.
  def run_soap(params)
    cmd = self.construct_command(params)
    `#{cmd} > #{@count}.log`
  end

end # SoapDenovoTrans
@@ -0,0 +1,7 @@
1
# Placeholder for the VelvetOases assembler; nothing is implemented yet.
class VelvetOases

  # Accepts a parameter set, does nothing and returns nil.
  def run(params)
    nil
  end

end # VelvetOases
@@ -0,0 +1,120 @@
1
+ require 'transrate'
2
+
3
+ class SoapDenovoTrans
4
+
5
+ include Which
6
+
7
+ # Return a new SoapDenovoTrans object
8
# Starts the run counter at zero and records the first
# +SOAPdenovo-Trans-127mer+ executable that +which+ finds.
# Raises a RuntimeError when none is found.
def initialize
  @count = 0
  found = which('SOAPdenovo-Trans-127mer')
  raise "SOAPdenovo-Trans-127mer was not in the PATH" if found.empty?
  @path = found.first
end
14
+
15
+ # Run the assembler with the provided parameters,
16
+ # returning a Transrate::ComparativeMetrics object
17
+ # containing a score for the generated assembly
18
+ # compared to the reference.
19
# Returns nil when no +*.scafSeq+ file exists in the current directory or
# when the first one found is empty.
def run params
  # run the assembly
  run_soap params
  # run_soap names its log after @count; advance it so the next run does
  # not overwrite this run's log
  @count += 1
  # retrieve output
  scaffold = Dir['*.scafSeq'].first
  return nil if scaffold.nil? || File.size(scaffold) == 0
  # return a ComparativeMetrics object
  assembly = Transrate::Assembly.new(scaffold)
  Transrate::ComparativeMetrics.new(assembly, params[:reference])
end
31
+
32
+ # Perform any necessary setup for the assembler
33
+ # prior to running the parameter optimisation.
34
+ # This includes modifying the config file to point
35
+ # to the subsetted reads rather than the full set.
36
# Stores in +assembler_opts[:config]+ the path of a config file built from
# the +:left_subset+ and +:right_subset+ reads, and returns that path.
# +global_opts+ is not used.
def setup_optim(global_opts, assembler_opts)
  assembler_opts[:config] = create_config(
    assembler_opts[:left_subset],
    assembler_opts[:right_subset],
    assembler_opts
  )
end
43
+
44
+ # Perform any necessary setup for the assembler
45
+ # prior to running the full optimal assembly.
46
+ # This includes resetting the config to refer
47
+ # to the full set of reads.
48
# Stores in +assembler_opts[:config]+ the path of a config file built from
# the full +:left+ and +:right+ read sets, and returns that path.
# +global_opts+ is not used.
def setup_full(global_opts, assembler_opts)
  # set config file for full read set
  left = assembler_opts[:left]
  right = assembler_opts[:right]
  assembler_opts[:config] = create_config(left, right, assembler_opts)
end

# Assemblotron::Controller#run invokes this hook as +setup_final+, a name
# that was not defined anywhere; expose both names.
alias setup_final setup_full
55
+
56
+ # Generate a config file with the specified left and right
57
+ # read input files, returning the full path to the config file.
58
# The file is created in the current directory, one setting per line.
# Reads +:insertsize+ from +assembler_opts+.
def create_config left, right, assembler_opts
  # Time#to_s contains spaces and colons, which would split the unquoted
  # "-s <config>" argument on the command line; use a compact timestamp.
  filename = "#{Time.now.strftime('%Y%m%d-%H%M%S-%N')}.full.config"
  # Open for writing (the default mode is read-only) and use puts so each
  # setting ends with a newline.
  File.open(filename, 'w') do |f|
    f.puts 'max_rd_len=5000'
    f.puts '[LIB]'
    f.puts "avg_ins=#{assembler_opts[:insertsize]}"
    f.puts "reverse_seq=0" # don't reverse complement the reads
    f.puts "asm_flags=3" # use the reads for assembly and scaffolding
    f.puts "q1=#{left}"
    f.puts "q2=#{right}"
  end
  File.expand_path filename
end
72
+
73
+ # Merge the default parameters with the hash provided
74
# Values supplied in +params+ override the defaults; the result is a new
# Hash and +params+ itself is not modified.
def include_defaults params
  {
    :K => 23,
    :threads => 8,
    :out => 'sdt',
    :d => 0,
    :e => 2,
    :M => 1,
    :F => true,
    :L => 100,
    :t => 5,
    :G => 50
  }.merge(params)
end
89
+
90
+ # Given a set of parameters, fill in any missing
91
+ # parameters with defaults and construct a command
92
+ # to run the target assembler. Return the command
93
+ # as a string.
94
# The executable comes from +@path+. The +-a+ and +-o+ flags are emitted
# only when their keys are present, and +-F+ only when +:F+ is truthy.
def construct_command params
  params = include_defaults params
  parts = ["#{@path} all"]
  # generic
  parts << "-s #{params[:config]}" # config file
  parts << "-a #{params[:memory]}" if params.has_key? :memory # memory assumption
  parts << "-o #{params[:out]}" if params.has_key? :out # output directory
  parts << "-p #{params[:threads]}" # number of threads
  # specific
  parts << "-K #{params[:K]}" # kmer size
  parts << "-d #{params[:d]}" # minimum kmer frequency
  parts << "-F" if params[:F] # fill gaps in scaffold
  parts << "-M #{params[:M]}" # strength of contig flattening
  parts << "-L #{params[:L]}" # minimum contig length
  parts << "-e #{params[:e]}" # delete contigs with coverage no greater than
  parts << "-t #{params[:t]}" # maximum number of transcripts from one locus
  parts << "-G #{params[:G]}" # allowed length difference between estimated and filled gap
  parts.join(' ')
end
112
+
113
+ # Run the SOAPdenovo-trans assembler with the provided
114
+ # parameters. Return the output generated.
115
# Executes the command built by construct_command in a shell, redirecting
# its standard output to +<@count>.log+, and returns what the backticks
# capture.
def run_soap(params)
  command = construct_command(params)
  logfile = "#{@count}.log"
  %x(#{command} > #{logfile})
end
119
+
120
+ end # SoapDenovoTrans