finishm 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,55 +0,0 @@
#!/usr/bin/env ruby
#
# Skeleton script: parses a velvet PreGraph file and logs the number of nodes
# found. The remaining analysis steps exist only as TODO comments at the end.

require 'optparse'
require 'bio-logger'
# Bio::Velvet::Graph is used below; without this require the script fails with
# a NameError as soon as it reaches the graph parsing step.
require 'bio-velvet'

SCRIPT_NAME = File.basename(__FILE__)
LOG_NAME = SCRIPT_NAME.gsub('.rb', '')

# Parse command line options into the options hash
options = {
  :logger => 'stderr',
  :log_level => 'info',
}
o = OptionParser.new do |opts|
  opts.banner = "\nUsage: #{SCRIPT_NAME} <arguments>\n\n" \
    "Description of what this program does...\n\n"

  opts.on("--velvet-pregraph GRAPH_FILE", "PreGraph file output from velveth [required]") do |arg|
    options[:velvet_pregraph_file] = arg
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") { options[:log_level] = 'error' }
  opts.on("--logger filename", String, "Log to file [default #{options[:logger]}]") { |name| options[:logger] = name }
  opts.on("--trace options", String, "Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG") { |s| options[:log_level] = s }
end
o.parse!

# No positional arguments are accepted and the PreGraph file is mandatory;
# anything else prints the usage and exits with a failure status.
if !ARGV.empty? || options[:velvet_pregraph_file].nil?
  $stderr.puts o
  exit 1
end

# Setup logging
Bio::Log::CLI.logger(options[:logger])
Bio::Log::CLI.trace(options[:log_level])
log = Bio::Log::LoggerPlus.new(LOG_NAME)
Bio::Log::CLI.configure(LOG_NAME)

# Read in the velvet graph
log.info "Parsing graph from #{options[:velvet_pregraph_file]}"
graph = Bio::Velvet::Graph.parse_from_file(options[:velvet_pregraph_file])
log.info "Finished parsing graph, found #{graph.number_of_nodes} nodes"

# Not yet implemented (planned steps):

# Log the number of nodes and arcs in the current graph

# Read in the fasta file of immutable nodes, and extract the two most immutable
# Log that they are found

# Determine that the graph is connected or not between the two most immutable nodes, using some graph theoretic algorithm
# If the graph is not connected, then there is no hope, exit

# Go through the graph to get a list of the cap nodes
# Log the number of cap nodes found

# Trim off all the cap nodes back to cross nodes, keeping track of the lengths

# Print the graph in graphviz dot format
@@ -1,241 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'optparse'
4
- require 'bio-logger'
5
- require 'bio-velvet'
6
- require 'tempfile'
7
- require 'pp'
8
-
SCRIPT_NAME = File.basename(__FILE__)
LOG_NAME = 'finishm'
$:.unshift File.join(File.dirname(__FILE__),'..','lib')
require 'priner'

# Parse command line options into the options hash
#
# NOTE(review): the rest of this script also reads options[:previous_assembly],
# options[:previously_serialized_parsed_graph_file] and
# options[:serialize_parsed_graph_file], but no flag below sets them (the
# "Debug-related options" section is empty), so they are always nil here.
options = {
  :logger => 'stderr',
  :log_level => 'info',
  :velvet_kmer_size => 43,#TODO: these options should be exposed to the user, and perhaps not guessed at
  :contig_end_length => 200,
  :output_assembly_path => '/tmp/velvetAssembly',
  :graph_search_leash_length => 3000,
  :assembly_coverage_cutoff => 1.5,
}
o = OptionParser.new do |opts|
  opts.banner = "\nUsage: #{SCRIPT_NAME} --reads <read_file> --contig <contig_file> --output-trails-fasta <output_fasta>\n\n" \
    "Takes a set of reads and a contig that contains gap characters. Then it tries to fill in\n" \
    "these N characters. It is possible that there is multiple ways to close the gap - in that case\n" \
    "each is reported. \n\n"

  opts.on("--reads FILE", "gzipped fastq file of reads to perform the gap closing with [required]") do |arg|
    options[:reads_file] = arg
  end
  opts.on("--contig FILE", "fasta file of single contig containing Ns that are to be closed [required]") do |arg|
    options[:contig_file] = arg
  end
  # Documented as required because the argument check below refuses to run
  # without it.
  opts.on("--output-trails-fasta PATH", "Output found paths to this file in fasta format [required]") do |arg|
    options[:overall_trail_output_fasta_file] = arg
  end

  opts.separator "\nOptional arguments:\n\n"
  opts.on("--overhang NUM", "Start assembling this far from the gap [default: #{options[:contig_end_length]}]") do |arg|
    options[:contig_end_length] = arg.to_i
  end
  # --start/--stop are given 1-based on the command line and stored 0-based.
  opts.on("--start OFFSET", "Start trying to fill from this position in the contig, requires --stop [default: found from position of Ns]") do |arg|
    options[:start_offset] = arg.to_i-1
  end
  opts.on("--stop OFFSET", "Stop trying to fill at this position in the contig, requires --start [default: found from position of Ns]") do |arg|
    options[:end_offset] = arg.to_i-1
  end
  opts.on("--assembly-png PATH", "Output assembly as a PNG file [default: off]") do |arg|
    options[:output_graph_png] = arg
  end
  opts.on("--assembly-svg PATH", "Output assembly as an SVG file [default: off]") do |arg|
    options[:output_graph_svg] = arg
  end
  opts.on("--assembly-dot PATH", "Output assembly as an DOT file [default: off]") do |arg|
    options[:output_graph_dot] = arg
  end
  opts.on("--velvet-kmer KMER", "kmer size to use with velvet [default: #{options[:velvet_kmer_size]}]") do |arg|
    options[:velvet_kmer_size] = arg.to_i
  end

  opts.separator "\nDebug-related options:\n\n"

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") { options[:log_level] = 'error' }
  opts.on("--logger filename", String, "Log to file [default #{options[:logger]}]") { |name| options[:logger] = name }
  opts.on("--trace options", String, "Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG") { |s| options[:log_level] = s }
end
o.parse!

# No positional arguments are accepted; reads, contig and output file are all
# mandatory.
if !ARGV.empty? || options[:reads_file].nil? || options[:contig_file].nil? || options[:overall_trail_output_fasta_file].nil?
  $stderr.puts o
  exit 1
end

# The help text says --start and --stop require each other. Enforce that here:
# otherwise a lone flag would be silently ignored and the gap would be guessed
# from the Ns instead.
if options[:start_offset].nil? != options[:end_offset].nil?
  $stderr.puts "--start and --stop must be specified together"
  $stderr.puts o
  exit 1
end

# Setup logging
Bio::Log::CLI.logger(options[:logger])
Bio::Log::CLI.trace(options[:log_level])
log = Bio::Log::LoggerPlus.new(LOG_NAME)
Bio::Log::CLI.configure(LOG_NAME)
Bio::Log::LoggerPlus.new 'bio-velvet'
Bio::Log::CLI.configure 'bio-velvet'
log.outputters[0].formatter = Log4r::PatternFormatter.new(:pattern => "%5l %c %d: %m", :date_pattern => '%d/%m %T')

log.debug "Running finishm with options: #{PP.pp(options, "").gsub(/\n$/,'')}" if log.debug?
85
-
# Find where the Ns are
#
# Reads the contig file (which must hold exactly one sequence) and sets:
#   sequence        - the contig sequence as returned by seq.seq
#   n_region_start  - 0-based offset where the region to fill begins
#   n_region_end    - when detected from Ns, the 0-based offset just past the
#                     first run of Ns; when given by the user, --stop minus 1
# Raises a RuntimeError when the file holds zero or several sequences, when no
# gap can be found, or when the gap is too close to either end of the contig.
n_region_start = nil
n_region_end = nil
sequence = nil
Bio::FlatFile.foreach(options[:contig_file]) do |seq|
  if sequence
    # A plain raise (RuntimeError) is used rather than Exception so that
    # ordinary `rescue` clauses can still see this failure.
    raise "Sorry, this script can only handle single sequences to be gap filled at the moment"
  end

  sequence = seq.seq

  if options[:start_offset] && options[:end_offset]
    log.info "Trying to gap fill from #{options[:start_offset]+1} to #{options[:end_offset]+1}"
    n_region_start = options[:start_offset]
    n_region_end = options[:end_offset]
  else
    log.info "Determining where to fill from the presence of Ns"

    # Only the first run of Ns (case-insensitive) is considered.
    matches = sequence.match(/(N+)/i)
    unless matches
      raise "Unable to find any gaps in the input sequence. That was a bit too easy.."
    end
    n_region_start = matches.offset(0)[0]
    n_region_end = n_region_start + matches[1].length
    log.info "Detected a gap between #{n_region_start} and #{n_region_end}"
  end

  # Check to make sure we are sufficiently distant from the ends
  if n_region_start < options[:contig_end_length] ||
      sequence.length - n_region_end < options[:contig_end_length]
    raise "The gap is too close to the end of the contig, sorry"
  end
end

# An empty contig file never enters the loop above; fail here with a clear
# message instead of crashing later on a nil sequence.
if sequence.nil?
  raise "No sequence found in the contig file #{options[:contig_file]}"
end
119
-
# Do the assembly
#
# Produces `graph` in one of three ways:
#   * restore a previously marshalled graph
#     (options[:previously_serialized_parsed_graph_file]),
#   * parse the LastGraph of an earlier velvet run (options[:previous_assembly]),
#   * or run velvet now on the reads plus two anchor sequences cut from either
#     side of the gap.
# In the latter two cases the parsed graph is optionally marshalled to disk and
# low-coverage nodes are removed.
graph = nil
if options[:previously_serialized_parsed_graph_file].nil?
  velvet_result = nil
  if options[:previous_assembly].nil? #If assembly has not already been carried out
    Tempfile.open('anchors.fa') do |tempfile|
      overhang = options[:contig_end_length]

      tempfile.puts ">anchor1"
      # Clamp the start at 0. When the gap begins exactly `overhang` bases into
      # the contig the unclamped start would be -1, which Ruby interprets as
      # "last character" and the anchor would come out empty.
      anchor1_start = [n_region_start - overhang - 1, 0].max
      tempfile.puts sequence[anchor1_start...n_region_start]
      tempfile.puts ">anchor2"
      #Have to be in reverse, because the node finder finds the node at the start of the read, not the end
      fwd2 = Bio::Sequence::NA.new(sequence[n_region_end..(n_region_end+overhang)])
      tempfile.puts fwd2.reverse_complement.to_s
      # Close so the content is flushed to disk before velvet reads the file.
      tempfile.close
      log.debug "Inputting anchors into the assembly: #{File.read(tempfile.path)}" if log.debug?

      log.info "Assembling sampled reads with velvet"
      # Bit of a hack, but have to use -short1 as the anchors because then start and end anchors will have node IDs 1,2,... etc.
      velvet_result = Bio::Velvet::Runner.new.velvet(
        options[:velvet_kmer_size],
        "-short #{tempfile.path} -short2 -fastq.gz #{options[:reads_file]}",
        "-read_trkg yes -cov_cutoff #{options[:assembly_coverage_cutoff]}",
        :output_assembly_path => options[:output_assembly_path]
      )
      if log.debug?
        log.debug "velveth stdout: #{velvet_result.velveth_stdout}"
        log.debug "velveth stderr: #{velvet_result.velveth_stderr}"
        log.debug "velvetg stdout: #{velvet_result.velvetg_stdout}"
        log.debug "velvetg stderr: #{velvet_result.velvetg_stderr}"
      end
      log.info "Finished running assembly"
    end
  else
    log.info "Using previous assembly stored at #{options[:previous_assembly]}"
    velvet_result = Bio::Velvet::Result.new
    velvet_result.result_directory = options[:previous_assembly]
  end

  log.info "Parsing the graph output from velvet"
  graph = Bio::Velvet::Graph.parse_from_file(File.join velvet_result.result_directory, 'LastGraph')
  log.info "Finished parsing graph: found #{graph.nodes.length} nodes and #{graph.arcs.length} arcs"

  if options[:serialize_parsed_graph_file]
    log.info "Storing a binary version of the graph file for later use at #{options[:serialize_parsed_graph_file]}"
    File.open(options[:serialize_parsed_graph_file],'wb') do |f|
      Marshal.dump(graph, f)
    end
    log.info "Stored a binary representation of the velvet graph at #{options[:serialize_parsed_graph_file]}"
  end

  if options[:assembly_coverage_cutoff]
    log.info "Removing low-coverage nodes from the graph (less than #{options[:assembly_coverage_cutoff]})"
    cutoffer = Bio::AssemblyGraphAlgorithms::CoverageBasedGraphFilter.new
    # Sequences 1 and 2 are the two anchors written above; they are whitelisted
    # in the call below.
    deleted_nodes, deleted_arcs = cutoffer.remove_low_coverage_nodes(graph, options[:assembly_coverage_cutoff], :whitelisted_sequences => [1,2])

    log.info "Removed #{deleted_nodes.length} nodes and #{deleted_arcs.length} arcs from the graph due to low coverage"
    log.info "Now there is #{graph.nodes.length} nodes and #{graph.arcs.length} arcs remaining"
  end
else
  log.info "Restoring graph file from #{options[:previously_serialized_parsed_graph_file]}.."
  # SECURITY: Marshal.load can instantiate arbitrary objects; only restore
  # files this script wrote itself. Opened in binary mode (matching the 'wb'
  # used when storing) and closed as soon as loading finishes.
  graph = File.open(options[:previously_serialized_parsed_graph_file], 'rb') do |f|
    Marshal.load(f)
  end
  log.info "Restoration complete"
end
182
-
183
-
# Find the anchor nodes again
#
# Looks up the graph nodes for sequence IDs 1 and 2 (the start and end
# anchors). Each lookup yields the node plus a second value kept as
# *_node_forward. Raises a RuntimeError when either anchor is missing.
finder = Bio::AssemblyGraphAlgorithms::NodeFinder.new
log.info "Finding node representing the end of the each contig"
start_node, start_node_forward = finder.find_unique_node_with_sequence_id(graph, 1)
end_node, end_node_forward = finder.find_unique_node_with_sequence_id(graph, 2)
if start_node && end_node
  log.info "Found both anchoring nodes in the graph: #{start_node.node_id}/#{start_node_forward} and #{end_node.node_id}/#{end_node_forward}"
else
  # Report each missing anchor separately before giving up.
  log.error "start node not found" if start_node.nil?
  log.error "end node not found" if end_node.nil?
  raise "Unable to find both anchor reads from the assembly, cannot continue. This is probably an error with this script, not you."
end
198
-
# Drop every node that is not connected to one of the two anchor nodes and
# report how much the graph shrank.
log.info "Removing nodes unconnected to either the start or the end from the graph.."
nodes_before = graph.nodes.length
arcs_before = graph.arcs.length
Bio::AssemblyGraphAlgorithms::ConnectivityBasedGraphFilter.new.remove_unconnected_nodes(graph, [start_node, end_node])
log.info "Removed #{nodes_before-graph.nodes.length} nodes and #{arcs_before-graph.arcs.length} arcs"

# Optional graphviz renderings. Each row is: output format, extra options for
# building the drawing, extra options for writing it. PNG and SVG are laid out
# with neato; DOT is built with :digraph => false and written without a layout.
[
  [:png, {}, {:use => :neato}],
  [:svg, {}, {:use => :neato}],
  [:dot, {:digraph => false}, {}],
].each do |format, drawing_extras, output_extras|
  destination = options[:"output_graph_#{format}"]
  next unless destination

  log.info "Converting assembly to a graphviz #{format.to_s.upcase}"
  anchors = {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id}
  drawing = Bio::Assembly::ABVisualiser.new.graphviz(graph, anchors.merge(drawing_extras))
  drawing.output({format => destination}.merge(output_extras))
end
225
-
226
-
227
-
# Search the graph for trails joining the two anchor nodes, bounded by the
# configured leash length, then write every trail to the output fasta file as
# >trail1, >trail2, ...
log.info "Searching for trails between the nodes within the assembly graph"
trail_finder = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
trails = trail_finder.find_trails_between_nodes(
  graph,
  start_node,
  end_node,
  options[:graph_search_leash_length],
  start_node_forward
)
log.info "Found #{trails.length} trail(s) in total"

log.debug "Outputing trail sequences"
File.open(options[:overall_trail_output_fasta_file],'w') do |fasta|
  trails.each_with_index do |trail, index|
    # Trail numbering in the headers is 1-based.
    fasta.puts ">trail#{index+1}", trail.sequence
  end
end
241
-
@@ -1,49 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'optparse'
4
- require 'bio-logger'
5
- require 'csv'
6
-
SCRIPT_NAME = File.basename(__FILE__)
LOG_NAME = SCRIPT_NAME.gsub('.rb', '')

# Parse command line options into the options hash
options = {
  :logger => 'stderr',
  :log_level => 'info',
  :min => 0,
}
o = OptionParser.new do |opts|
  opts.banner = "\nUsage: #{SCRIPT_NAME} <arguments>\n\n" \
    "grep a multiple kmer abundance file according to specified criteria\n\n"

  # The comparison below is strict (count > min), so the help text says
  # "more than" rather than "at least".
  opts.on("--min NUMBER", "At least 1 column has more than this many observations [default: #{options[:min]}]") do |arg|
    options[:min] = arg.to_f
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") { options[:log_level] = 'error' }
  opts.on("--logger filename", String, "Log to file [default #{options[:logger]}]") { |name| options[:logger] = name }
  opts.on("--trace options", String, "Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG") { |s| options[:log_level] = s }
end
o.parse!

# Exactly one positional argument is expected: the kmer abundance file.
if ARGV.length != 1
  $stderr.puts o
  exit 1
end

# Setup logging
Bio::Log::CLI.logger(options[:logger])
Bio::Log::CLI.trace(options[:log_level])
log = Bio::Log::LoggerPlus.new(LOG_NAME)
Bio::Log::CLI.configure(LOG_NAME)

# Each row is space separated: the kmer first, then one count per column.
# A row is echoed unchanged when any of its counts exceeds the --min value.
CSV.foreach(ARGV[0], :col_sep => ' ') do |row|
  # drop(1) yields [] for a blank line, so such rows are skipped rather than
  # crashing on a nil slice.
  counts = row.drop(1)
  puts row.join(' ') if counts.any? { |count| count.to_f > options[:min] }
end
49
-
@@ -1,377 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'optparse'
4
- require 'bio-logger'
5
- require 'csv'
6
- require 'tempfile'
7
- require 'pp'
8
- require 'systemu'
9
- require 'bio-velvet'
10
- require 'set'
11
-
SCRIPT_NAME = File.basename(__FILE__)
LOG_NAME = 'finishm'

# Make the project's own lib directory (../lib relative to this script)
# loadable before requiring it.
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
require 'priner'

# Default settings; command line flags parsed further down override them.
options = {
  logger: 'stderr',
  log_level: 'info',
  min_leftover_length: false,
  kmer_coverage_target: 1,
  velvet_kmer_size: 155,
  contig_end_length: 300,
  graph_search_leash_length: 20000,
  reads_to_assemble: nil,
  assembly_coverage_cutoff: 1.5,
  kmer_path_filter_min_coverage: 1,
  kmer_path_end_exclusion_length: 50,
  trail_kmer_coverage_file: 'trail_coverages.csv'
}
31
-
# TODO: make a better interface for this. Maybe specify an entire genome, and then "Contig_1 end, Contig_3 start" or something
#
# Reads a fasta file that must contain exactly one sequence.
#
# fasta_file_path - path of the fasta file to read
#
# Returns the sequence wrapped in a Bio::Sequence::NA.
# Raises RuntimeError when the file holds no sequence or more than one.
extract_exactly_one_contig_from_file = lambda do |fasta_file_path|
  contig = nil
  Bio::FlatFile.foreach(Bio::FastaFormat, fasta_file_path) do |entry|
    unless contig.nil?
      raise "Multiple sequences found in a contig file! I need exactly one"
    end
    contig = entry.seq
  end
  # This lambda serves both --start-contig and --end-contig, so name the
  # offending file rather than always blaming the start contig.
  if contig.nil?
    raise "I need a contig to be in the contig file #{fasta_file_path}"
  end
  Bio::Sequence::NA.new(contig.to_s)
end
46
-
# Command line interface. Every flag marked [required] is checked after
# parsing; the script takes no positional arguments.
o = OptionParser.new do |opts|
  opts.banner = "\nUsage: #{SCRIPT_NAME} --pattern PATTERN --kmer-abundances FILE --upper-threshold NUM --lower-threshold NUM --reads FILES --start-contig FASTA --end-contig FASTA\n\n" \
    "Selects reads containing kmers whose abundances match the given pattern, assembles them with velvet together with the ends of the two contigs, and prints the trails found between the contigs in fasta format\n\n"

  opts.on("--pattern PATTERN", "kmer abundance pattern e.g. '0111001110' [required]") do |arg|
    options[:pattern] = arg
  end
  opts.on("--kmer-abundances FILE", "kmer multiple abundance file [required]") do |arg|
    options[:kmer_multiple_abundance_file] = arg
  end
  opts.on("--upper-threshold NUM", "kmer frequency cutoff to saying 'present' [required]") do |arg|
    options[:upper_threshold] = arg.to_i
  end
  opts.on("--lower-threshold NUM", "kmer frequency cutoff to saying 'not present' [required]") do |arg|
    options[:lower_threshold] = arg.to_i
  end
  opts.on("--reads FILES", "comma-separated list of sequence reads files in the same order as the pattern was supplied [required]") do |arg|
    # Stored as absolute paths.
    options[:reads_files] = arg.split(',').collect{|r| File.absolute_path r}
  end
  # The two contig options read and validate the fasta file while parsing, so
  # options[:start_contig]/options[:end_contig] hold sequences, not paths.
  opts.on("--start-contig FASTA", "path to a fasta file with the starting contig in it (only). Assumes we are building off the end of this contig [required]") do |arg|
    options[:start_contig] = extract_exactly_one_contig_from_file.call arg
  end
  opts.on("--end-contig FASTA", "path to a fasta file with the ending contig in it (only). Assumes we are building onto the start of this contig [required]") do |arg|
    options[:end_contig] = extract_exactly_one_contig_from_file.call arg
  end

  opts.separator "\nOptional arguments:\n\n"
  opts.on("--min-leftover-read-length NUMBER", "when searching for reads with kmers, require the kmer to be at the beginning or end of the selected read [default: #{options[:min_leftover_length]}]") do |arg|
    options[:min_leftover_length] = arg.to_i
  end
  opts.on("--kmer-coverage-target NUMBER", "when searching for reads with kmers, require this many copies per kmer [default: #{options[:kmer_coverage_target]}]") do |arg|
    options[:kmer_coverage_target] = arg.to_i
  end
  opts.on("--already-patterned-reads FILE", "Attempt to assemble the reads in the specified file, useful for re-assembly [default: off]") do |arg|
    options[:already_patterned_reads] = arg
  end
  opts.on("--output-assembly PATH", "Output assembly intermediate files to this directory [default: off]") do |arg|
    options[:output_assembly_path] = arg
  end
  opts.on("--assembly-png PATH", "Output assembly as a PNG file [default: off]") do |arg|
    options[:output_graph_png] = arg
  end
  opts.on("--assembly-svg PATH", "Output assembly as an SVG file [default: off]") do |arg|
    options[:output_graph_svg] = arg
  end
  opts.on("--assembly-dot PATH", "Output assembly as an DOT file [default: off]") do |arg|
    options[:output_graph_dot] = arg
  end
  # opts.on("--output-begin-kmers PATH", "Output kmers found at the beginning point to this file [default: off]") do |arg|
  #   options[:output_begin_kmers] = arg
  # end
  # opts.on("--output-end-kmers PATH", "Output kmers found at the ending point to this file [default: off]") do |arg|
  #   options[:output_end_kmers] = arg
  # end
  opts.on("--assembly-coverage-cutoff NUMBER", "Require this much coverage in each node, all other nodes are removed [default: #{options[:assembly_coverage_cutoff]}]") do |arg|
    options[:assembly_coverage_cutoff] = arg.to_f
  end
  opts.on("--contig-end-length LENGTH", "Number of base pairs to start into the ends of the contigs [default: #{options[:contig_end_length]}]") do |arg|
    options[:contig_end_length] = arg.to_i
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") { options[:log_level] = 'error' }
  opts.on("--logger filename", String, "Log to file [default #{options[:logger]}]") { |name| options[:logger] = name }
  opts.on("--trace options", String, "Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG") { |s| options[:log_level] = s }
end
o.parse!

# Both contigs are included in the required set: the script reads their length
# unconditionally later on, so a missing one would otherwise surface as a
# NoMethodError on nil instead of a usage message.
required_options = [
  :upper_threshold,
  :lower_threshold,
  :pattern,
  :kmer_multiple_abundance_file,
  :reads_files,
  :start_contig,
  :end_contig,
]
missing_options = required_options.select { |key| options[key].nil? }
if !ARGV.empty? || !missing_options.empty?
  $stderr.puts o
  exit 1
end

# Setup logging
Bio::Log::CLI.logger(options[:logger])
Bio::Log::CLI.trace(options[:log_level])
log = Bio::Log::LoggerPlus.new(LOG_NAME)
Bio::Log::CLI.configure(LOG_NAME)
Bio::Log::LoggerPlus.new 'bio-velvet'
Bio::Log::CLI.configure 'bio-velvet'
125
-
# Work out which reads file velvet should assemble.
#
# Either reuse a reads file supplied with --already-patterned-reads, or:
#   1. classify every kmer in the abundance file against the desired pattern,
#   2. write the matching / non-matching kmers to the files 'whitelist' and
#      'black' in the current directory,
#   3. run the external read_selection_by_kmer tool once per reads file whose
#      pattern entry is truthy (one thread per file),
#   4. concatenate the sampled reads into pooled_reads_filename.
pooled_reads_filename = 'pooled_sampled_reads.fasta'
if options[:already_patterned_reads] #If skipping read extraction
  pooled_reads_filename = options[:already_patterned_reads]

else
  # Parse pattern from cmdline
  desired_pattern = KmerAbundancePattern.new
  desired_pattern.parse_from_human(options[:pattern])
  if options[:reads_files].length != desired_pattern.length
    raise "Number of entries in the pattern #{desired_pattern.length} and number of reads files #{options[:reads_files].length} not equivalent!"
  end

  # Collect the kmers that will be used to find trusted reads i.e.
  # Go through each line of the kmer abundance file, looking for kmers that suit the pattern
  whitelist_kmers = []
  blacklist_kmers = []
  File.open(options[:kmer_multiple_abundance_file]) do |input_file|
    CSV.new(input_file, :col_sep => ' ').each do |row|
      kmer = row[0]
      counts = row[1...row.length].collect{|s| s.to_i}

      # One entry per column: true (present), false (absent) or '-' when the
      # count sits between the two thresholds.
      this_pattern = counts.collect do |count|
        if count > options[:upper_threshold]
          true
        elsif count < options[:lower_threshold]
          false
        else
          # coverage was in no man's land between thresholds.
          # Ignore this kmer as noise.
          '-'
        end
      end
      #log.debug "Found pattern #{this_pattern} from kmer #{kmer}, which has abundances #{counts}" if log.debug?

      if desired_pattern.consistent_with? this_pattern
        whitelist_kmers.push kmer
      else
        # kmer is not present when it should be
        blacklist_kmers.push kmer
      end
    end
  end
  log.info "After parsing the kmer multiple abundance file, found #{whitelist_kmers.length} kmers that matched the pattern, and #{blacklist_kmers.length} that didn't"
  unless whitelist_kmers.length > 0
    log.error "No kmers found that satisfy the given pattern, exiting.."
    exit 1
  end

  # grep the pattern out from the raw reads, subsampling so as to not overwhelm the assembler
  # Both kmer lists are fully written and closed before any external command
  # reads them.
  whitelist_path = 'whitelist'
  blacklist_path = 'black'
  File.open(whitelist_path, 'w') { |white| white.puts whitelist_kmers.join("\n") }
  File.open(blacklist_path, 'w') { |black| black.puts blacklist_kmers.join("\n") }

  threadpool = []
  sampled_read_files = []
  log.info "Extracting reads that contain suitable kmers"
  options[:reads_files].each_with_index do |file, i|
    next unless desired_pattern[i] #Don't extract reads from reads where those reads should not have been amplified

    sampled = File.basename(file)+'.sampled_reads.fasta'
    sampled_read_files.push sampled

    # NOTE(review): the tool location is hard-coded relative to $HOME; confirm
    # this is where read_selection_by_kmer lives on the target machine.
    grep_path = "#{ENV['HOME']}/git/priner/bin/read_selection_by_kmer "
    if options[:min_leftover_length]
      grep_path += "--min-leftover-length #{options[:min_leftover_length]} "
    end
    thr = Thread.new do
      # NOTE(review): paths are interpolated into the command line unescaped.
      grep_cmd = "#{grep_path} --whitelist #{whitelist_path} --blacklist #{blacklist_path} --reads #{file} --kmer-coverage-target #{options[:kmer_coverage_target]} > #{sampled}"
      log.debug "Running cmd: #{grep_cmd}"
      status, _stdout, stderr = systemu grep_cmd
      log.debug stderr

      unless status.exitstatus == 0
        raise "Extracting reads from #{file} failed with exit status #{status.exitstatus}"
      end
      log.debug "Finished extracting reads from #{file}"
    end
    threadpool.push thr
  end
  # wait until everything is finito; join re-raises a failure from any thread
  threadpool.each { |thread| thread.join }

  log.info "Finished extracting reads for sampling. Now pooling sampled reads"
  pool_cmd = "cat #{sampled_read_files.join ' '} >#{pooled_reads_filename}"
  log.debug "Running cmd: #{pool_cmd}"
  status, _stdout, stderr = systemu pool_cmd
  raise stderr if stderr != ''
  unless status.exitstatus == 0
    raise "Pooling sampled reads failed with exit status #{status.exitstatus}"
  end
end
228
-
# Build the two anchor sequences from the contig ends and run velvet on them
# together with the pooled reads.
log.info "Extracting dummy reads from the ends of contigs to use as anchors"
start_contig = options[:start_contig]
end_contig = options[:end_contig]
anchor_length = options[:contig_end_length]
shortest_contig_length = [start_contig.length, end_contig.length].min

# Contigs shorter than twice the anchor length only earn a warning; contigs
# shorter than the anchor length itself stop the run.
if shortest_contig_length < 2*anchor_length
  log.warn "Choice of initial/terminal nodes to perform graph search with may not be optimal due to the small contig size"
end
if shortest_contig_length < anchor_length
  log.error "At least one contig too small to proceed with current code base, need to fix the code to allow such a small contig"
  exit 1
end

# Use the last bit of the first contig and the first bit of the second contig as the anchors
velvet_result = nil
Tempfile.open('anchors.fa') do |anchors_file|
  start_anchor = start_contig[start_contig.length-anchor_length...start_contig.length]
  #Have to be in reverse, because the node finder finds the node at the start of the read, not the end
  end_anchor = Bio::Sequence::NA.new(end_contig[0...anchor_length]).reverse_complement.to_s

  # Putting these same sequences in many times seems to better the
  # chances velvet won't throw them out
  50.times do
    anchors_file.puts ">start_contig", start_anchor, ">end_contig", end_anchor
  end
  # Closed before velvet is started so the content is on disk.
  anchors_file.close

  log.info "Assembling sampled reads with velvet"
  # Bit of a hack, but have to use -short1 as the anchors because then start and end anchors will have node IDs 1 and 2, respectively.
  velveth_arguments = "-short #{anchors_file.path} -short2 #{pooled_reads_filename}"
  velvetg_arguments = "-cov_cutoff #{options[:assembly_coverage_cutoff]} -read_trkg yes"
  velvet_result = Bio::Velvet::Runner.new.velvet(
    options[:velvet_kmer_size],
    velveth_arguments,
    velvetg_arguments,
    :output_assembly_path => options[:output_assembly_path]
  )
  if log.debug?
    log.debug "velveth stdout: #{velvet_result.velveth_stdout}"
    log.debug "velveth stderr: #{velvet_result.velveth_stderr}"
    log.debug "velvetg stdout: #{velvet_result.velvetg_stdout}"
    log.debug "velvetg stderr: #{velvet_result.velvetg_stderr}"
  end
  log.info "Finished running assembly"
end
271
-
# Load velvet's LastGraph from the result directory, then (when a coverage
# cutoff is configured) strip low-coverage nodes, whitelisting sequences 1
# and 2.
log.info "Parsing the graph output from velvet"
last_graph_path = File.join(velvet_result.result_directory, 'LastGraph')
graph = Bio::Velvet::Graph.parse_from_file(last_graph_path)
log.info "Finished parsing graph: found #{graph.nodes.length} nodes and #{graph.arcs.length} arcs"

coverage_cutoff = options[:assembly_coverage_cutoff]
if coverage_cutoff
  log.info "Removing low-coverage nodes from the graph (less than #{coverage_cutoff})"
  coverage_filter = Bio::AssemblyGraphAlgorithms::CoverageBasedGraphFilter.new
  deleted_nodes, deleted_arcs = coverage_filter.remove_low_coverage_nodes(
    graph,
    coverage_cutoff,
    :whitelisted_sequences => [1,2]
  )

  log.info "Removed #{deleted_nodes.length} nodes and #{deleted_arcs.length} arcs from the graph due to low coverage"
  log.info "Now there is #{graph.nodes.length} nodes and #{graph.arcs.length} arcs remaining"
end
284
-
# Locate the graph nodes for sequence 1 (end of the first contig) and
# sequence 2 (start of the second contig). If either is missing, log which,
# still write any requested graph drawings, and exit with status 1.
finder = Bio::AssemblyGraphAlgorithms::NodeFinder.new
log.info "Finding node representing the end of the first contig"
start_node, start_node_forward = finder.find_unique_node_with_sequence_id(graph, 1)
log.info "Finding node representing the start of the second contig"
end_node, end_node_forward = finder.find_unique_node_with_sequence_id(graph, 2)#TODO: find the node nearest the end of this, not the start

# Labels of the anchors that could not be found, start first.
missing_points = [['_start_', start_node], ['_end_', end_node]].select { |_label, node| node.nil? }
unless missing_points.empty?
  missing_points.each do |label, _node|
    log.error "Unable to find any nodes in the graph that have kmers corresponding to the #{label} point in them, sorry. Maybe fix the node finding code?"
  end

  # Draw the whole graph without start/end markers, once, and write it in
  # every requested format.
  requested_formats = [:png, :svg, :dot].select { |format| options[:"output_graph_#{format}"] }
  unless requested_formats.empty?
    log.info "Converting assembly to a graphviz PNG/SVG/DOT, even if start/end node was not be found properly"
    drawing = Bio::Assembly::ABVisualiser.new.graphviz(graph)
    requested_formats.each do |format|
      destination = options[:"output_graph_#{format}"]
      log.info "Writing #{format.to_s.upcase} of graph to #{destination}"
      drawing.output format => destination
    end
  end

  log.error "Unknown start or end points, giving up, sorry."
  exit 1
end
log.info "Node(s) found that are suitable as initial and terminal nodes in the graph search, respectively: #{start_node.node_id} and #{end_node.node_id}"
319
-
# Remove everything not connected to the start or end node and log how many
# nodes and arcs disappeared.
log.info "Removing nodes unconnected to either the start or the end from the graph.."
nodes_before = graph.nodes.length
arcs_before = graph.arcs.length
Bio::AssemblyGraphAlgorithms::ConnectivityBasedGraphFilter.new.remove_unconnected_nodes(graph, [start_node, end_node])
log.info "Removed #{nodes_before-graph.nodes.length} nodes and #{arcs_before-graph.arcs.length} arcs"

# Optional graphviz renderings with the start and end nodes marked. PNG and
# SVG are written using neato; DOT is written without a layout option.
[
  [:png, {:use => :neato}],
  [:svg, {:use => :neato}],
  [:dot, {}],
].each do |format, output_extras|
  destination = options[:"output_graph_#{format}"]
  next unless destination

  log.info "Converting assembly to a graphviz #{format.to_s.upcase}"
  drawing = Bio::Assembly::ABVisualiser.new.graphviz(
    graph,
    {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id}
  )
  drawing.output({format => destination}.merge(output_extras))
end
345
-
# Search for trails between the two anchor nodes, record their kmer
# coverages, drop trails that fail the kmer coverage filter, and print the
# survivors to stdout in fasta format.
log.info "Searching for trails between the initial and terminal nodes, within the assembly graph"
cartographer = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
#raise "Untested connection finder below"
#trails = cartographer.find_all_trails_between_nodes(graph, start_node, end_node, options[:graph_search_leash_length], start_node_forward)
trails = cartographer.find_trails_between_nodes(graph, start_node, end_node, options[:graph_search_leash_length], start_node_forward)
log.info "Found #{trails.length} trail(s) between the initial and terminal nodes"

log.info "Reading kmer abundances from #{options[:kmer_multiple_abundance_file]}.."
kmer_hash = Bio::KmerMultipleAbundanceHash.parse_from_file options[:kmer_multiple_abundance_file]
log.info "Finished reading the kmer abundances"

if options[:trail_kmer_coverage_file]
  log.info "Writing out kmer coverages to #{options[:trail_kmer_coverage_file]}.."
  writer = Bio::AssemblyGraphAlgorithms::KmerCoverageWriter.new
  # Block form so the file is flushed and closed as soon as writing is done.
  File.open(options[:trail_kmer_coverage_file],'w') do |io|
    writer.write(io, trails, kmer_hash)
  end
  log.info "Finished writing"
end

# When --already-patterned-reads was given, the read extraction step is skipped
# and desired_pattern was never built; parse it here so the filter below does
# not fail on nil.
desired_pattern ||= KmerAbundancePattern.new.tap do |pattern|
  pattern.parse_from_human(options[:pattern])
end

log.info "Filtering trail(s) based on kmer coverage, requiring each kmer in the path to have a minimum of #{options[:kmer_path_filter_min_coverage]} coverage in patterned reads, except for the #{options[:kmer_path_end_exclusion_length]}bp at the ends"
kmer_path_filter = Bio::AssemblyGraphAlgorithms::KmerCoverageBasedPathFilter.new
# One threshold per pattern entry: 1 where the entry is true, 0 otherwise.
thresholds = desired_pattern.collect{|c| c == true ? 1 : 0}
log.info "Using thresholds for filtering: #{thresholds}"
trails = kmer_path_filter.filter(trails, kmer_hash, thresholds, :exclude_ending_length => options[:kmer_path_end_exclusion_length])
log.info "After filtering remained #{trails.length} trails"

log.debug "Found trails: #{trails.collect{|t| t.to_s}.join("\n")}"

# Fasta output on stdout, trails numbered from 1.
trails.each_with_index do |trail, i|
  puts ">trail#{i+1}"
  puts trail.sequence
end