finishm 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,55 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # Work-in-progress script: parses a velvet PreGraph file and logs its node count. The later steps exist only as comments at the bottom.
3
- require 'optparse'
4
- require 'bio-logger'
5
- require 'bio-velvet' # Bio::Velvet::Graph is used further down, so the gem has to be loaded here
6
- SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
7
-
8
- # Parse command line options into the options hash
9
- options = {
10
- :logger => 'stderr',
11
- :log_level => 'info',
12
- }
13
- o = OptionParser.new do |opts|
14
- opts.banner = "
15
- Usage: #{SCRIPT_NAME} <arguments>
16
-
17
- Parse a velvet PreGraph file and report the number of nodes found in it\n\n"
18
-
19
- opts.on("--velvet-pregraph GRAPH_FILE", "PreGraph file output from velveth [required]") do |arg|
20
- options[:velvet_pregraph_file] = arg
21
- end
22
-
23
- # logger options
24
- opts.separator "\nVerbosity:\n\n"
25
- opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
26
- opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
27
- opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
28
- end; o.parse!
29
- if ARGV.length != 0 or options[:velvet_pregraph_file].nil? # no positional arguments are accepted and the graph file is mandatory
30
- $stderr.puts o
31
- exit 1
32
- end
33
- # Setup logging
34
- Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
35
-
36
-
37
- # Read in the velvet graph
38
- log.info "Parsing graph from #{options[:velvet_pregraph_file]}"
39
- graph = Bio::Velvet::Graph.parse_from_file(options[:velvet_pregraph_file])
40
- log.info "Finished parsing graph, found #{graph.number_of_nodes} nodes"
41
- # Everything below is an outline of steps that have not been implemented yet
42
- # Log the number of nodes and arcs in the current graph
43
-
44
- # Read in the fasta file of immutable nodes, and extract the two most immutable
45
- # Log that they are found
46
-
47
- # Determine that the graph is connected or not between the two most immutable nodes, using some graph theoretic algorithm
48
- # If the graph is not connected, then there is no hope, exit
49
-
50
- # Go through the graph to get a list of the cap nodes
51
- # Log the number of cap nodes found
52
-
53
- # Trim off all the cap nodes back to cross nodes, keeping track of the lengths
54
-
55
- # Print the graph in graphviz dot format
@@ -1,241 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'optparse'
4
- require 'bio-logger'
5
- require 'bio-velvet'
6
- require 'tempfile'
7
- require 'pp'
8
-
9
- SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = 'finishm'
10
- $:.unshift File.join(File.dirname(__FILE__),'..','lib')
11
- require 'priner'
12
-
13
- # Parse command line options into the options hash. The values below are the defaults.
14
- options = {
15
- :logger => 'stderr',
16
- :log_level => 'info',
17
- :velvet_kmer_size => 43,#TODO: these options should be exposed to the user, and perhaps not guessed at
18
- :contig_end_length => 200,
19
- :output_assembly_path => '/tmp/velvetAssembly', # NOTE(review): fixed path, so concurrent runs would share this directory
20
- :graph_search_leash_length => 3000,
21
- :assembly_coverage_cutoff => 1.5,
22
- }
23
- o = OptionParser.new do |opts|
24
- opts.banner = "
25
- Usage: #{SCRIPT_NAME} --reads <read_file> --contig <contig_file> --output-trails-fasta <fasta_file>
26
-
27
- Takes a set of reads and a contig that contains gap characters. Then it tries to fill in
28
- these N characters. It is possible that there is multiple ways to close the gap - in that case
29
- each is reported. \n\n"
30
-
31
-
32
- opts.on("--reads FILE", "gzipped fastq file of reads to perform the gap closing with [required]") do |arg|
33
- options[:reads_file] = arg
34
- end
35
- opts.on("--contig FILE", "fasta file of single contig containing Ns that are to be closed [required]") do |arg|
36
- options[:contig_file] = arg
37
- end
38
- opts.on("--output-trails-fasta PATH", "Output found paths to this file in fasta format [required]") do |arg|
39
- options[:overall_trail_output_fasta_file] = arg
40
- end
41
-
42
- opts.separator "\nOptional arguments:\n\n"
43
- opts.on("--overhang NUM", "Start assembling this far from the gap [default: #{options[:contig_end_length]}]") do |arg|
44
- options[:contig_end_length] = arg.to_i
45
- end
46
- opts.on("--start OFFSET", "Start trying to fill from this position in the contig, requires --stop [default: found from position of Ns]") do |arg|
47
- options[:start_offset] = arg.to_i-1 # stored one lower than the number given on the command line
48
- end
49
- opts.on("--stop OFFSET", "Stop trying to fill at this position in the contig, requires --start [default: found from position of Ns]") do |arg|
50
- options[:end_offset] = arg.to_i-1 # stored one lower than the number given on the command line
51
- end
52
- opts.on("--assembly-png PATH", "Output assembly as a PNG file [default: off]") do |arg|
53
- options[:output_graph_png] = arg
54
- end
55
- opts.on("--assembly-svg PATH", "Output assembly as an SVG file [default: off]") do |arg|
56
- options[:output_graph_svg] = arg
57
- end
58
- opts.on("--assembly-dot PATH", "Output assembly as an DOT file [default: off]") do |arg|
59
- options[:output_graph_dot] = arg
60
- end
61
- opts.on("--velvet-kmer KMER", "kmer size to use with velvet [default: #{options[:velvet_kmer_size]}]") do |arg|
62
- options[:velvet_kmer_size] = arg.to_i
63
- end
64
-
65
- opts.separator "\nDebug-related options:\n\n"
66
- # No debug options are defined under this heading at the moment
67
-
68
-
69
- # logger options
70
- opts.separator "\nVerbosity:\n\n"
71
- opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
72
- opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
73
- opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
74
- end; o.parse!
75
- if ARGV.length != 0 or options[:reads_file].nil? or options[:contig_file].nil? or options[:overall_trail_output_fasta_file].nil? or options[:start_offset].nil? != options[:end_offset].nil? # the last test rejects --start without --stop and vice versa
76
- $stderr.puts o
77
- exit 1
78
- end
79
- # Setup logging for this script and for the bio-velvet library
80
- Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
81
- Bio::Log::LoggerPlus.new 'bio-velvet'; Bio::Log::CLI.configure 'bio-velvet'
82
- log.outputters[0].formatter = Log4r::PatternFormatter.new(:pattern => "%5l %c %d: %m", :date_pattern => '%d/%m %T')
83
- # Pretty-print the effective options, with the trailing newline removed, when debugging
84
- log.debug "Running finishm with options: #{PP.pp(options, "").gsub(/\n$/,'')}" if log.debug?
85
-
86
- # Find where the Ns are
87
- n_region_start = nil
88
- n_region_end = nil
89
- sequence = nil
90
- Bio::FlatFile.foreach(options[:contig_file]) do |seq|
91
- if sequence
92
- raise Exception, "Sorry, this script can only handle single sequences to be gap filled at the moment"
93
- end
94
-
95
- sequence = seq.seq
96
-
97
- if options[:start_offset] and options[:end_offset]
98
- log.info "Trying to gap fill from #{options[:start_offset]+1} to #{options[:end_offset]+1}"
99
- n_region_start = options[:start_offset]
100
- n_region_end = options[:end_offset]
101
- else
102
- log.info "Determining where to fill from the presence of Ns"
103
-
104
- matches = sequence.match(/(N+)/i)
105
- if !matches
106
- raise "Unable to find any gaps in the input sequence. That was a bit too easy.."
107
- end
108
- n_region_start = matches.offset(0)[0]
109
- n_region_end = n_region_start + matches[1].length
110
- log.info "Detected a gap between #{n_region_start} and #{n_region_end}"
111
- end
112
-
113
- # Check to make sure we are sufficiently distant from the ends
114
- if n_region_start < options[:contig_end_length] or
115
- sequence.length - n_region_end < options[:contig_end_length]
116
- raise "The gap is too close to the end of the contig, sorry"
117
- end
118
- end
119
-
120
- # Do the assembly
121
- graph = nil
122
- if options[:previously_serialized_parsed_graph_file].nil?
123
- velvet_result = nil
124
- if options[:previous_assembly].nil? #If assembly has not already been carried out
125
- Tempfile.open('anchors.fa') do |tempfile|
126
- tempfile.puts ">anchor1"
127
- tempfile.puts sequence[n_region_start-options[:contig_end_length]-1...n_region_start]
128
- tempfile.puts ">anchor2"
129
- #Have to be in reverse, because the node finder finds the node at the start of the read, not the end
130
- fwd2 = Bio::Sequence::NA.new(sequence[n_region_end..(n_region_end+options[:contig_end_length])])
131
- tempfile.puts fwd2.reverse_complement.to_s
132
- tempfile.close
133
- log.debug "Inputting anchors into the assembly: #{File.open(tempfile.path).read}" if log.debug?
134
-
135
- log.info "Assembling sampled reads with velvet"
136
- # Bit of a hack, but have to use -short1 as the anchors because then start and end anchors will have node IDs 1,2,... etc.
137
- velvet_result = Bio::Velvet::Runner.new.velvet(
138
- options[:velvet_kmer_size],
139
- "-short #{tempfile.path} -short2 -fastq.gz #{options[:reads_file]}",
140
- "-read_trkg yes -cov_cutoff #{options[:assembly_coverage_cutoff]}",
141
- :output_assembly_path => options[:output_assembly_path]
142
- )
143
- if log.debug?
144
- log.debug "velveth stdout: #{velvet_result.velveth_stdout}"
145
- log.debug "velveth stderr: #{velvet_result.velveth_stderr}"
146
- log.debug "velvetg stdout: #{velvet_result.velvetg_stdout}"
147
- log.debug "velvetg stderr: #{velvet_result.velvetg_stderr}"
148
- end
149
- log.info "Finished running assembly"
150
- end
151
- else
152
- log.info "Using previous assembly stored at #{options[:previous_assembly]}"
153
- velvet_result = Bio::Velvet::Result.new
154
- velvet_result.result_directory = options[:previous_assembly]
155
- end
156
-
157
- log.info "Parsing the graph output from velvet"
158
- graph = Bio::Velvet::Graph.parse_from_file(File.join velvet_result.result_directory, 'LastGraph')
159
- log.info "Finished parsing graph: found #{graph.nodes.length} nodes and #{graph.arcs.length} arcs"
160
-
161
- if options[:serialize_parsed_graph_file]
162
- log.info "Storing a binary version of the graph file for later use at #{options[:serialize_parsed_graph_file]}"
163
- File.open(options[:serialize_parsed_graph_file],'wb') do |f|
164
- f.print Marshal.dump(graph)
165
- end
166
- log.info "Stored a binary representation of the velvet graph at #{options[:serialize_parsed_graph_file]}"
167
- end
168
-
169
- if options[:assembly_coverage_cutoff]
170
- log.info "Removing low-coverage nodes from the graph (less than #{options[:assembly_coverage_cutoff]})"
171
- cutoffer = Bio::AssemblyGraphAlgorithms::CoverageBasedGraphFilter.new
172
- deleted_nodes, deleted_arcs = cutoffer.remove_low_coverage_nodes(graph, options[:assembly_coverage_cutoff], :whitelisted_sequences => [1,2])
173
-
174
- log.info "Removed #{deleted_nodes.length} nodes and #{deleted_arcs.length} arcs from the graph due to low coverage"
175
- log.info "Now there is #{graph.nodes.length} nodes and #{graph.arcs.length} arcs remaining"
176
- end
177
- else
178
- log.info "Restoring graph file from #{options[:previously_serialized_parsed_graph_file]}.."
179
- graph = Marshal.load(File.open(options[:previously_serialized_parsed_graph_file]))
180
- log.info "Restoration complete"
181
- end
182
-
183
-
184
- # Find the anchor nodes again
185
- finder = Bio::AssemblyGraphAlgorithms::NodeFinder.new
186
- log.info "Finding node representing the end of the each contig"
187
- # The anchors are looked up by sequence ID: 1 for the anchor before the gap, 2 for the anchor after it
188
-
189
- start_node, start_node_forward = finder.find_unique_node_with_sequence_id(graph, 1)
190
- end_node, end_node_forward = finder.find_unique_node_with_sequence_id(graph, 2)
191
- if start_node and end_node
192
- log.info "Found both anchoring nodes in the graph: #{start_node.node_id}/#{start_node_forward} and #{end_node.node_id}/#{end_node_forward}"
193
- else
194
- log.error "start node not found" if start_node.nil?
195
- log.error "end node not found" if end_node.nil?
196
- raise "Unable to find both anchor reads from the assembly, cannot continue. This is probably an error with this script, not you."
197
- end
198
-
199
- log.info "Removing nodes unconnected to either the start or the end from the graph.."
200
- original_num_nodes = graph.nodes.length # sizes before filtering, so the number removed can be reported
201
- original_num_arcs = graph.arcs.length
202
- filter = Bio::AssemblyGraphAlgorithms::ConnectivityBasedGraphFilter.new
203
- filter.remove_unconnected_nodes(graph, [start_node, end_node]) # presumably prunes graph in place, since the sizes are re-read below; confirm against the filter class
204
- log.info "Removed #{original_num_nodes-graph.nodes.length} nodes and #{original_num_arcs-graph.arcs.length} arcs"
205
-
206
-
207
- if options[:output_graph_png] # set by --assembly-png
208
- log.info "Converting assembly to a graphviz PNG"
209
- viser = Bio::Assembly::ABVisualiser.new
210
- gv = viser.graphviz(graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id})
211
- gv.output :png => options[:output_graph_png], :use => :neato
212
- end
213
- if options[:output_graph_svg] # set by --assembly-svg
214
- log.info "Converting assembly to a graphviz SVG"
215
- viser = Bio::Assembly::ABVisualiser.new
216
- gv = viser.graphviz(graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id})
217
- gv.output :svg => options[:output_graph_svg], :use => :neato
218
- end
219
- if options[:output_graph_dot] # set by --assembly-dot
220
- log.info "Converting assembly to a graphviz DOT"
221
- viser = Bio::Assembly::ABVisualiser.new
222
- gv = viser.graphviz(graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id, :digraph => false}) # unlike PNG/SVG this passes :digraph => false
223
- gv.output :dot => options[:output_graph_dot] # and is written without the :use => :neato setting
224
- end
225
-
226
-
227
-
228
- log.info "Searching for trails between the nodes within the assembly graph"
229
- cartographer = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
230
- trails = cartographer.find_trails_between_nodes(graph, start_node, end_node, options[:graph_search_leash_length], start_node_forward)
231
- log.info "Found #{trails.length} trail(s) in total"
232
-
233
- # Write every trail as one fasta record named trail1, trail2, ... to the --output-trails-fasta file
234
- log.debug "Outputing trail sequences"
235
- File.open(options[:overall_trail_output_fasta_file],'w') do |f|
236
- trails.each_with_index do |trail, i|
237
- f.puts ">trail#{i+1}"
238
- f.puts trail.sequence
239
- end
240
- end
241
-
@@ -1,49 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'optparse'
4
- require 'bio-logger'
5
- require 'csv'
6
-
7
- SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
8
-
9
- # Parse command line options into the options hash. The values below are the defaults.
10
- options = {
11
- :logger => 'stderr',
12
- :log_level => 'info',
13
- :min => 0,
14
- }
15
- o = OptionParser.new do |opts|
16
- opts.banner = "
17
- Usage: #{SCRIPT_NAME} <arguments>
18
-
19
- grep a multiple kmer abundance file according to specified criteria\n\n"
20
-
21
- opts.on("--min NUMBER", "At least 1 column has at least this many observations [default: #{options[:min]}]") do |arg|
22
- options[:min] = arg.to_f
23
- end
24
-
25
- # logger options
26
- opts.separator "\nVerbosity:\n\n"
27
- opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
28
- opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
29
- opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
30
- end; o.parse!
31
- if ARGV.length != 1 # exactly one positional argument is expected: the abundance file read further down
32
- $stderr.puts o
33
- exit 1
34
- end
35
- # Setup logging
36
- Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
37
-
38
- CSV.foreach(ARGV[0], :col_sep => ' ') do |row|
39
- kmer = row[0]
40
- passable = false
41
- row[1...row.length].each do |count|
42
- if count.to_f > options[:min]
43
- passable = true
44
- break
45
- end
46
- end
47
- puts row.join(' ') if passable
48
- end
49
-
@@ -1,377 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'optparse'
4
- require 'bio-logger'
5
- require 'csv'
6
- require 'tempfile'
7
- require 'pp'
8
- require 'systemu'
9
- require 'bio-velvet'
10
- require 'set'
11
-
12
- SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = 'finishm'
13
- $:.unshift File.join(File.dirname(__FILE__),'..','lib')
14
- require 'priner'
15
-
16
- # Parse command line options into the options hash. The values below are the defaults.
17
- options = {
18
- :logger => 'stderr',
19
- :log_level => 'info',
20
- :min_leftover_length => false, # while false, --min-leftover-length is not passed to the read selection command
21
- :kmer_coverage_target => 1, # passed to the read selection command as --kmer-coverage-target
22
- :velvet_kmer_size => 155, # first argument given to the velvet runner
23
- :contig_end_length => 300, # number of bases taken from each contig end as an anchor
24
- :graph_search_leash_length => 20000, # passed to the trail search between the anchor nodes
25
- :reads_to_assemble => nil, # not read anywhere else in this script
26
- :assembly_coverage_cutoff => 1.5, # used for velvet -cov_cutoff and for the low-coverage node filter
27
- :kmer_path_filter_min_coverage => 1, # only appears in a log message; the filter thresholds come from the pattern
28
- :kmer_path_end_exclusion_length => 50, # passed to the path filter as :exclude_ending_length
29
- :trail_kmer_coverage_file => 'trail_coverages.csv' # file the per-trail kmer coverages are written to
30
- }
31
-
32
- # TODO: make a better interface for this. Maybe specify an entire genome, and then "Contig_1 end, Contig_3 start" or something
33
- # Look at the last 300bp of the first contig.
34
- extract_exactly_one_contig_from_file = lambda do |fasta_file_path|
35
- contig = nil
36
- Bio::FlatFile.foreach(Bio::FastaFormat, fasta_file_path) do |e|
37
- if contig.nil?
38
- contig = e.seq
39
- else
40
- raise "Multiple sequences found in a contig file! I need exactly one"
41
- end
42
- end
43
- raise "I need a contig to be in the start contig file" if contig.nil?
44
- Bio::Sequence::NA.new(contig.to_s)
45
- end
46
-
47
- o = OptionParser.new do |opts|
48
- opts.banner = "
49
- Usage: #{SCRIPT_NAME} <arguments>
50
-
51
- Given a space separated file of kmers and their abundances, select the reads carrying kmers that match the given abundance pattern, assemble them with velvet and print the trails that connect the end of the start contig to the start of the end contig\n\n"
52
-
53
- opts.on("--pattern PATTERN", "kmer abundance pattern e.g. '0111001110' [required]") do |arg|
54
- options[:pattern] = arg
55
- end
56
- opts.on("--kmer-abundances FILE", "kmer multiple abundance file [required]") do |arg|
57
- options[:kmer_multiple_abundance_file] = arg
58
- end
59
- opts.on("--upper-threshold NUM", "kmer frequency cutoff to saying 'present' [required]") do |arg|
60
- options[:upper_threshold] = arg.to_i
61
- end
62
- opts.on("--lower-threshold NUM", "kmer frequency cutoff to saying 'not present' [required]") do |arg|
63
- options[:lower_threshold] = arg.to_i
64
- end
65
- opts.on("--reads FILES", "comma-separated list of sequence reads files in the same order as the pattern was supplied [required]") do |arg|
66
- options[:reads_files] = arg.split(',').collect{|r| File.absolute_path r}
67
- end
68
- opts.on("--start-contig FASTA", "path to a fasta file with the starting contig in it (only). Assumes we are building off the end of this contig [required]") do |arg|
69
- options[:start_contig] = extract_exactly_one_contig_from_file.call arg # the contig is read while the options are being parsed
70
- end
71
- opts.on("--end-contig FASTA", "path to a fasta file with the ending contig in it (only). Assumes we are building onto the start of this contig [required]") do |arg|
72
- options[:end_contig] = extract_exactly_one_contig_from_file.call arg
73
- end
74
-
75
- opts.separator "\nOptional arguments:\n\n"
76
- opts.on("--min-leftover-read-length NUMBER", "when searching for reads with kmers, require the kmer to be at the beginning or end of the selected read [default: #{options[:min_leftover_length]}]") do |arg|
77
- options[:min_leftover_length] = arg.to_i
78
- end
79
- opts.on("--kmer-coverage-target NUMBER", "when searching for reads with kmers, require this many copies per kmer [default: #{options[:kmer_coverage_target]}]") do |arg|
80
- options[:kmer_coverage_target] = arg.to_i
81
- end
82
- opts.on("--already-patterned-reads FILE", "Attempt to assemble the reads in the specified file, useful for re-assembly [default: off]") do |arg|
83
- options[:already_patterned_reads] = arg
84
- end
85
- opts.on("--output-assembly PATH", "Output assembly intermediate files to this directory [default: off]") do |arg|
86
- options[:output_assembly_path] = arg
87
- end
88
- opts.on("--assembly-png PATH", "Output assembly as a PNG file [default: off]") do |arg|
89
- options[:output_graph_png] = arg
90
- end
91
- opts.on("--assembly-svg PATH", "Output assembly as an SVG file [default: off]") do |arg|
92
- options[:output_graph_svg] = arg
93
- end
94
- opts.on("--assembly-dot PATH", "Output assembly as an DOT file [default: off]") do |arg|
95
- options[:output_graph_dot] = arg
96
- end
97
- # opts.on("--output-begin-kmers PATH", "Output kmers found at the beginning point to this file [default: off]") do |arg|
98
- # options[:output_begin_kmers] = arg
99
- # end
100
- # opts.on("--output-end-kmers PATH", "Output kmers found at the ending point to this file [default: off]") do |arg|
101
- # options[:output_end_kmers] = arg
102
- # end
103
- opts.on("--assembly-coverage-cutoff NUMBER", "Require this much coverage in each node, all other nodes are removed [default: #{options[:assembly_coverage_cutoff]}]") do |arg|
104
- options[:assembly_coverage_cutoff] = arg.to_f
105
- end
106
- opts.on("--contig-end-length LENGTH", "Number of base pairs to start into the ends of the contigs [default: #{options[:contig_end_length]}]") do |arg|
107
- options[:contig_end_length] = arg.to_i
108
- end
109
-
110
- # logger options
111
- opts.separator "\nVerbosity:\n\n"
112
- opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
113
- opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
114
- opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
115
- end; o.parse!
116
- if ARGV.length != 0 or options[:upper_threshold].nil? or options[:lower_threshold].nil? or options[:pattern].nil? or options[:kmer_multiple_abundance_file].nil? or options[:reads_files].nil? or options[:start_contig].nil? or options[:end_contig].nil? # both contigs are needed later for the anchors
117
- PP.pp(options, $stderr) # dump what was parsed next to the usage text; stdout is kept for the trail sequences
118
- $stderr.puts o
119
- exit 1
120
- end
121
- # Setup logging for this script and for the bio-velvet library
122
- Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
123
- Bio::Log::LoggerPlus.new 'bio-velvet'
124
- Bio::Log::CLI.configure 'bio-velvet'
125
-
126
- # Parse pattern from cmdline. Done before branching because the trail filtering at the end of the script needs it even when read extraction is skipped
127
- desired_pattern = KmerAbundancePattern.new
128
- desired_pattern.parse_from_human(options[:pattern])
129
-
130
- pooled_reads_filename = 'pooled_sampled_reads.fasta'
131
- if options[:already_patterned_reads] #If skipping read extraction
132
- pooled_reads_filename = options[:already_patterned_reads]
133
- else
134
- if options[:reads_files].length != desired_pattern.length
135
- raise "Number of entries in the pattern #{desired_pattern.length} and number of reads files #{options[:reads_files].length} not equivalent!"
136
- end
137
-
138
- # Collect the kmers that will be used to find trusted reads i.e.
139
- # Go through each line of the kmer abundance file, looking for kmers that suit the pattern
140
- # CSV.foreach opens the file itself and closes it again once all rows are read
141
-
142
-
143
- whitelist_kmers = []
144
- blacklist_kmers = []
145
- CSV.foreach(options[:kmer_multiple_abundance_file], :col_sep => ' ') do |row|
146
- # First column is the kmer, the remaining columns are its abundances
147
-
148
- kmer = row[0]
149
- counts = row[1...row.length].collect{|s| s.to_i}
150
- # Classify every abundance as present (true), absent (false) or undecided ('-')
151
- this_pattern = []
152
- counts.each_with_index do |count, i|
153
- if count > options[:upper_threshold]
154
- this_pattern[i] = true
155
- elsif count < options[:lower_threshold]
156
- this_pattern[i] = false
157
- else
158
- # coverage was in no man's land between thresholds.
159
- # Ignore this kmer as noise.
160
- this_pattern[i] = '-'
161
- end
162
- end
163
- #log.debug "Found pattern #{this_pattern} from kmer #{kmer}, which has abundances #{counts}" if log.debug?
164
-
165
- if desired_pattern.consistent_with? this_pattern
166
- whitelist_kmers.push kmer
167
- else
168
- # kmer is not present when it should be
169
- blacklist_kmers.push kmer
170
- end
171
- end
172
- log.info "After parsing the kmer multiple abundance file, found #{whitelist_kmers.length} kmers that matched the pattern, and #{blacklist_kmers.length} that didn't"
173
- unless whitelist_kmers.length > 0
174
- log.error "No kmers found that satisfy the given pattern, exiting.."
175
- exit 1
176
- end
177
-
178
-
179
- #outdir = options[:output_directory]
180
- #Dir.mkdir outdir unless Dir.exist?(outdir)
181
-
182
- # grep the pattern out from the raw reads, subsampling so as to not overwhelm the assembler
183
- #Tempfile.open('whitelist') do |white|
184
- File.open 'whitelist', 'w' do |white|
185
- white.puts whitelist_kmers.join("\n")
186
- white.close
187
- # The kmer lists are written to fixed file names in the current directory and handed to the external command by path
188
- #Tempfile.open('blacklist') do |black|
189
- File.open('black','w') do |black|
190
- black.puts blacklist_kmers.join("\n")
191
- black.close
192
-
193
- threadpool = []
194
- sampled_read_files = []
195
- log.info "Extracting reads that contain suitable kmers"
196
- options[:reads_files].each_with_index do |file, i|
197
- next unless desired_pattern[i] #Don't extract reads from reads where those reads should not have been amplified
198
-
199
- sampled = File.basename(file)+'.sampled_reads.fasta'
200
- sampled_read_files.push sampled
201
- # NOTE(review): the helper path below is tied to one checkout under $HOME; confirm where read_selection_by_kmer is installed
202
- grep_path = "#{ENV['HOME']}/git/priner/bin/read_selection_by_kmer "
203
- if options[:min_leftover_length]
204
- grep_path += "--min-leftover-length #{options[:min_leftover_length]} "
205
- end
206
- thr = Thread.new do
207
- grep_cmd = "#{grep_path} --whitelist #{white.path} --blacklist #{black.path} --reads #{file} --kmer-coverage-target #{options[:kmer_coverage_target]} > #{sampled}"
208
- log.debug "Running cmd: #{grep_cmd}"
209
- status, stdout, stderr = systemu grep_cmd
210
- log.debug stderr
211
-
212
- raise "Extracting reads from #{file} failed with exit status #{status.exitstatus}" unless status.exitstatus == 0
213
- log.debug "Finished extracting reads from #{file}"
214
- end
215
- threadpool.push thr
216
- end
217
- threadpool.each do |thread| thread.join; end #wait until everything is finito
218
-
219
- log.info "Finished extracting reads for sampling. Now pooling sampled reads"
220
- pool_cmd = "cat #{sampled_read_files.join ' '} >#{pooled_reads_filename}"
221
- log.debug "Running cmd: #{pool_cmd}"
222
- status, stdout, stderr = systemu pool_cmd
223
- raise stderr if stderr != ''
224
- raise "Pooling the sampled reads failed with exit status #{status.exitstatus}" unless status.exitstatus == 0
225
- end
226
- end
227
- end
228
-
229
- log.info "Extracting dummy reads from the ends of contigs to use as anchors"
230
- start_contig = options[:start_contig]
231
- end_contig = options[:end_contig]
232
- if [start_contig.length, end_contig.length].min < 2*options[:contig_end_length] # shorter contig is under twice the anchor length: warn only
233
- log.warn "Choice of initial/terminal nodes to perform graph search with may not be optimal due to the small contig size"
234
- end
235
- if [start_contig.length, end_contig.length].min < options[:contig_end_length] # shorter contig cannot supply a full anchor: give up
236
- log.error "At least one contig too small to proceed with current code base, need to fix the code to allow such a small contig"
237
- exit 1
238
- end
239
- # Use the last bit of the first contig and the first bit of the second contig as the anchors
240
- velvet_result = nil
241
- Tempfile.open('anchors.fa') do |tempfile|
242
- # Putting these same sequences in many times seems to better the
243
- # chances velvet won't throw them out
244
- 50.times do
245
- tempfile.puts ">start_contig"
246
- tempfile.puts start_contig[start_contig.length-options[:contig_end_length]...start_contig.length] # last contig_end_length bases
247
- tempfile.puts ">end_contig"
248
- #Have to be in reverse, because the node finder finds the node at the start of the read, not the end
249
- fwd2 = Bio::Sequence::NA.new(end_contig[0...options[:contig_end_length]]) # first contig_end_length bases
250
- tempfile.puts fwd2.reverse_complement.to_s
251
- end
252
- tempfile.close
253
- #puts `cat #{tempfile.path}`
254
- # The tempfile is closed above but its path is still used for the velvet run inside this block
255
- log.info "Assembling sampled reads with velvet"
256
- # Bit of a hack, but have to use -short1 as the anchors because then start and end anchors will have node IDs 1 and 2, respectively.
257
- velvet_result = Bio::Velvet::Runner.new.velvet(
258
- options[:velvet_kmer_size],
259
- "-short #{tempfile.path} -short2 #{pooled_reads_filename}",
260
- "-cov_cutoff #{options[:assembly_coverage_cutoff]} -read_trkg yes",
261
- :output_assembly_path => options[:output_assembly_path]
262
- )
263
- if log.debug?
264
- log.debug "velveth stdout: #{velvet_result.velveth_stdout}"
265
- log.debug "velveth stderr: #{velvet_result.velveth_stderr}"
266
- log.debug "velvetg stdout: #{velvet_result.velvetg_stdout}"
267
- log.debug "velvetg stderr: #{velvet_result.velvetg_stderr}"
268
- end
269
- log.info "Finished running assembly"
270
- end
271
-
272
- log.info "Parsing the graph output from velvet"
273
- graph = Bio::Velvet::Graph.parse_from_file(File.join velvet_result.result_directory, 'LastGraph') # LastGraph inside the velvet result directory
274
- log.info "Finished parsing graph: found #{graph.nodes.length} nodes and #{graph.arcs.length} arcs"
275
-
276
- if options[:assembly_coverage_cutoff]
277
- log.info "Removing low-coverage nodes from the graph (less than #{options[:assembly_coverage_cutoff]})"
278
- cutoffer = Bio::AssemblyGraphAlgorithms::CoverageBasedGraphFilter.new
279
- deleted_nodes, deleted_arcs = cutoffer.remove_low_coverage_nodes(graph, options[:assembly_coverage_cutoff], :whitelisted_sequences => [1,2]) # 1 and 2 are the IDs the anchors are looked up by later
280
-
281
- log.info "Removed #{deleted_nodes.length} nodes and #{deleted_arcs.length} arcs from the graph due to low coverage"
282
- log.info "Now there is #{graph.nodes.length} nodes and #{graph.arcs.length} arcs remaining"
283
- end
284
-
285
- finder = Bio::AssemblyGraphAlgorithms::NodeFinder.new
286
- log.info "Finding node representing the end of the first contig"
287
- start_node, start_node_forward = finder.find_unique_node_with_sequence_id(graph, 1)
288
- log.info "Finding node representing the start of the second contig"
289
- end_node, end_node_forward = finder.find_unique_node_with_sequence_id(graph, 2)#TODO: find the node nearest the end of this, not the start
290
- if start_node.nil? or end_node.nil? # without both anchors: report, optionally draw the graph for inspection, then exit
291
- if start_node.nil?
292
- log.error "Unable to find any nodes in the graph that have kmers corresponding to the _start_ point in them, sorry. Maybe fix the node finding code?"
293
- end
294
- if end_node.nil?
295
- log.error "Unable to find any nodes in the graph that have kmers corresponding to the _end_ point in them, sorry. Maybe fix the node finding code?"
296
- end
297
-
298
- if options[:output_graph_png] or options[:output_graph_svg] or options[:output_graph_dot]
299
- log.info "Converting assembly to a graphviz PNG/SVG/DOT, even if start/end node was not be found properly"
300
- viser = Bio::Assembly::ABVisualiser.new
301
- gv = viser.graphviz(graph) # no start/end node IDs are passed here, as at least one of them is missing
302
- if options[:output_graph_png]
303
- log.info "Writing PNG of graph to #{options[:output_graph_png]}"
304
- gv.output :png => options[:output_graph_png]
305
- end
306
- if options[:output_graph_svg]
307
- log.info "Writing SVG of graph to #{options[:output_graph_svg]}"
308
- gv.output :svg => options[:output_graph_svg]
309
- end
310
- if options[:output_graph_dot]
311
- log.info "Writing DOT of graph to #{options[:output_graph_dot]}"
312
- gv.output :dot => options[:output_graph_dot]
313
- end
314
- end
315
- log.error "Unknown start or end points, giving up, sorry."
316
- exit 1
317
- end
318
- log.info "Node(s) found that are suitable as initial and terminal nodes in the graph search, respectively: #{start_node.node_id} and #{end_node.node_id}"
319
-
320
- log.info "Removing nodes unconnected to either the start or the end from the graph.."
321
- original_num_nodes = graph.nodes.length # sizes before filtering, so the number removed can be reported
322
- original_num_arcs = graph.arcs.length
323
- filter = Bio::AssemblyGraphAlgorithms::ConnectivityBasedGraphFilter.new
324
- filter.remove_unconnected_nodes(graph, [start_node, end_node]) # presumably prunes graph in place, since the sizes are re-read below; confirm against the filter class
325
- log.info "Removed #{original_num_nodes-graph.nodes.length} nodes and #{original_num_arcs-graph.arcs.length} arcs"
326
- # Optional pictures of the pruned graph, one section per requested output format
327
- if options[:output_graph_png]
328
- log.info "Converting assembly to a graphviz PNG"
329
- viser = Bio::Assembly::ABVisualiser.new
330
- gv = viser.graphviz(graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id})
331
- gv.output :png => options[:output_graph_png], :use => :neato
332
- end
333
- if options[:output_graph_svg]
334
- log.info "Converting assembly to a graphviz SVG"
335
- viser = Bio::Assembly::ABVisualiser.new
336
- gv = viser.graphviz(graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id})
337
- gv.output :svg => options[:output_graph_svg], :use => :neato
338
- end
339
- if options[:output_graph_dot]
340
- log.info "Converting assembly to a graphviz DOT"
341
- viser = Bio::Assembly::ABVisualiser.new
342
- gv = viser.graphviz(graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id})
343
- gv.output :dot => options[:output_graph_dot] # written without the :use => :neato setting used for PNG and SVG
344
- end
345
-
346
- log.info "Searching for trails between the initial and terminal nodes, within the assembly graph"
347
- cartographer = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
348
- #raise "Untested connection finder below"
349
- #trails = cartographer.find_all_trails_between_nodes(graph, start_node, end_node, options[:graph_search_leash_length], start_node_forward)
350
- trails = cartographer.find_trails_between_nodes(graph, start_node, end_node, options[:graph_search_leash_length], start_node_forward)
351
- log.info "Found #{trails.length} trail(s) between the initial and terminal nodes"
352
-
353
- log.info "Reading kmer abundances from #{options[:kmer_multiple_abundance_file]}.."
354
- kmer_hash = Bio::KmerMultipleAbundanceHash.parse_from_file options[:kmer_multiple_abundance_file]
355
- log.info "Finished reading the kmer abundances"
356
-
357
- if options[:trail_kmer_coverage_file]
358
- log.info "Writing out kmer coverages to #{options[:trail_kmer_coverage_file]}.."
359
- writer = Bio::AssemblyGraphAlgorithms::KmerCoverageWriter.new
360
- io = File.open(options[:trail_kmer_coverage_file],'w')
361
- writer.write(io, trails, kmer_hash)
362
- log.info "Finished writing"
363
- end
364
-
365
- log.info "Filtering trail(s) based on kmer coverage, requiring each kmer in the path to have a minimum of #{options[:kmer_path_filter_min_coverage]} coverage in patterned reads, except for the #{options[:kmer_path_end_exclusion_length]}bp at the ends"
366
- kmer_path_filter = Bio::AssemblyGraphAlgorithms::KmerCoverageBasedPathFilter.new
367
- thresholds = desired_pattern.collect{|c| c == true ? 1 : 0} # one threshold per pattern entry: 1 where the entry is true, otherwise 0. NOTE(review): confirm desired_pattern is assigned on every path that reaches this line
368
- log.info "Using thresholds for filtering: #{thresholds}"
369
- trails = kmer_path_filter.filter(trails, kmer_hash, thresholds, :exclude_ending_length => options[:kmer_path_end_exclusion_length])
370
- log.info "After filtering remained #{trails.length} trails"
371
-
372
- log.debug "Found trails: #{trails.collect{|t| t.to_s}.join("\n")}"
373
- # Print the surviving trails to standard output as fasta records named trail1, trail2, ...
374
- trails.each_with_index do |trail, i|
375
- puts ">trail#{i+1}"
376
- puts trail.sequence
377
- end