finishm 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +19 -19
- data/VERSION +1 -1
- data/finishm.gemspec +631 -0
- data/lib/assembly/graph_generator.rb +0 -1
- data/lib/assembly/probed_graph.rb +0 -1
- metadata +99 -96
- data/bin/assembly_visualiser +0 -106
- data/bin/check_primer_combinations.rb +0 -73
- data/bin/contig_joiner.rb +0 -244
- data/bin/contigs_against_assembly.rb +0 -153
- data/bin/finishm_assembler +0 -55
- data/bin/finishm_gap_closer.rb +0 -241
- data/bin/kmer_abundance_file_tool.rb +0 -49
- data/bin/kmer_pattern_to_assembly.rb +0 -377
- data/bin/kmer_profile_finder.rb +0 -92
- data/bin/kmers_count_parse.d +0 -52
- data/bin/kmers_count_tabulate.d +0 -123
- data/bin/kmers_count_tabulate.rb +0 -84
- data/bin/pcr_result_parser.rb +0 -108
- data/bin/primer_finder.rb +0 -119
- data/bin/read_selection_by_kmer.d +0 -174
- data/bin/scaffold_by_pattern.rb +0 -119
- data/bin/scaffold_connection_possibilities_to_knowns.rb +0 -193
- data/bin/scaffold_end_coverages.rb +0 -69
- data/bin/trail_validator.rb +0 -84
data/bin/finishm_assembler
DELETED
@@ -1,55 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
require 'bio-logger'
|
5
|
-
|
6
|
-
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
|
7
|
-
|
8
|
-
# Parse command line options into the options hash
|
9
|
-
options = {
|
10
|
-
:logger => 'stderr',
|
11
|
-
:log_level => 'info',
|
12
|
-
}
|
13
|
-
o = OptionParser.new do |opts|
|
14
|
-
opts.banner = "
|
15
|
-
Usage: #{SCRIPT_NAME} <arguments>
|
16
|
-
|
17
|
-
Description of what this program does...\n\n"
|
18
|
-
|
19
|
-
opts.on("--velvet-pregraph GRAPH_FILE", "PreGraph file output from velveth [required]") do |arg|
|
20
|
-
options[:velvet_pregraph_file] = arg
|
21
|
-
end
|
22
|
-
|
23
|
-
# logger options
|
24
|
-
opts.separator "\nVerbosity:\n\n"
|
25
|
-
opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
|
26
|
-
opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
|
27
|
-
opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
|
28
|
-
end; o.parse!
|
29
|
-
if ARGV.length != 0 or options[:velvet_pregraph_file].nil?
|
30
|
-
$stderr.puts o
|
31
|
-
exit 1
|
32
|
-
end
|
33
|
-
# Setup logging
|
34
|
-
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
|
35
|
-
|
36
|
-
|
37
|
-
# Read in the velvet graph
|
38
|
-
log.info "Parsing graph from #{options[:velvet_pregraph_file]}"
|
39
|
-
graph = Bio::Velvet::Graph.parse_from_file(options[:velvet_pregraph_file])
|
40
|
-
log.info "Finished parsing graph, found #{graph.number_of_nodes} nodes"
|
41
|
-
|
42
|
-
# Log the number of nodes and arcs in the current graph
|
43
|
-
|
44
|
-
# Read in the fasta file of immutable nodes, and extract the two most immutable
|
45
|
-
# Log that they are found
|
46
|
-
|
47
|
-
# Determine that the graph is connected or not between the two most immutable nodes, using some graph theoretic algorithm
|
48
|
-
# If the graph is not connected, then there is no hope, exit
|
49
|
-
|
50
|
-
# Go through the graph to get a list of the cap nodes
|
51
|
-
# Log the number of cap nodes found
|
52
|
-
|
53
|
-
# Trim off all the cap nodes back to cross nodes, keeping track of the lengths
|
54
|
-
|
55
|
-
# Print the graph in graphviz dot format
|
data/bin/finishm_gap_closer.rb
DELETED
@@ -1,241 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
require 'bio-logger'
|
5
|
-
require 'bio-velvet'
|
6
|
-
require 'tempfile'
|
7
|
-
require 'pp'
|
8
|
-
|
9
|
-
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = 'finishm'
|
10
|
-
$:.unshift File.join(File.dirname(__FILE__),'..','lib')
|
11
|
-
require 'priner'
|
12
|
-
|
13
|
-
# Parse command line options into the options hash
|
14
|
-
options = {
|
15
|
-
:logger => 'stderr',
|
16
|
-
:log_level => 'info',
|
17
|
-
:velvet_kmer_size => 43,#TODO: these options should be exposed to the user, and perhaps not guessed at
|
18
|
-
:contig_end_length => 200,
|
19
|
-
:output_assembly_path => '/tmp/velvetAssembly',
|
20
|
-
:graph_search_leash_length => 3000,
|
21
|
-
:assembly_coverage_cutoff => 1.5,
|
22
|
-
}
|
23
|
-
o = OptionParser.new do |opts|
|
24
|
-
opts.banner = "
|
25
|
-
Usage: #{SCRIPT_NAME} --reads <read_file> --contig <contig_file>
|
26
|
-
|
27
|
-
Takes a set of reads and a contig that contains gap characters. Then it tries to fill in
|
28
|
-
these N characters. It is possible that there is multiple ways to close the gap - in that case
|
29
|
-
each is reported. \n\n"
|
30
|
-
|
31
|
-
|
32
|
-
opts.on("--reads FILE", "gzipped fastq file of reads to perform the gap closing with [required]") do |arg|
|
33
|
-
options[:reads_file] = arg
|
34
|
-
end
|
35
|
-
opts.on("--contig FILE", "fasta file of single contig containing Ns that are to be closed [required]") do |arg|
|
36
|
-
options[:contig_file] = arg
|
37
|
-
end
|
38
|
-
opts.on("--output-trails-fasta PATH", "Output found paths to this file in fasta format [default: off]") do |arg|
|
39
|
-
options[:overall_trail_output_fasta_file] = arg
|
40
|
-
end
|
41
|
-
|
42
|
-
opts.separator "\nOptional arguments:\n\n"
|
43
|
-
opts.on("--overhang NUM", "Start assembling this far from the gap [default: #{options[:contig_end_length]}]") do |arg|
|
44
|
-
options[:contig_end_length] = arg.to_i
|
45
|
-
end
|
46
|
-
opts.on("--start OFFSET", "Start trying to fill from this position in the contig, requires --stop [default: found from position of Ns}]") do |arg|
|
47
|
-
options[:start_offset] = arg.to_i-1
|
48
|
-
end
|
49
|
-
opts.on("--stop OFFSET", "Start trying to fill to this position in the contig, requires --start [default: found from position of Ns}]") do |arg|
|
50
|
-
options[:end_offset] = arg.to_i-1
|
51
|
-
end
|
52
|
-
opts.on("--assembly-png PATH", "Output assembly as a PNG file [default: off]") do |arg|
|
53
|
-
options[:output_graph_png] = arg
|
54
|
-
end
|
55
|
-
opts.on("--assembly-svg PATH", "Output assembly as an SVG file [default: off]") do |arg|
|
56
|
-
options[:output_graph_svg] = arg
|
57
|
-
end
|
58
|
-
opts.on("--assembly-dot PATH", "Output assembly as an DOT file [default: off]") do |arg|
|
59
|
-
options[:output_graph_dot] = arg
|
60
|
-
end
|
61
|
-
opts.on("--velvet-kmer KMER", "kmer size to use with velvet [default: #{options[:velvet_kmer_size]}]") do |arg|
|
62
|
-
options[:velvet_kmer_size] = arg.to_i
|
63
|
-
end
|
64
|
-
|
65
|
-
opts.separator "\nDebug-related options:\n\n"
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
# logger options
|
70
|
-
opts.separator "\nVerbosity:\n\n"
|
71
|
-
opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
|
72
|
-
opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
|
73
|
-
opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
|
74
|
-
end; o.parse!
|
75
|
-
if ARGV.length != 0 or options[:reads_file].nil? or options[:contig_file].nil? or options[:overall_trail_output_fasta_file].nil?
|
76
|
-
$stderr.puts o
|
77
|
-
exit 1
|
78
|
-
end
|
79
|
-
# Setup logging
|
80
|
-
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
|
81
|
-
Bio::Log::LoggerPlus.new 'bio-velvet'; Bio::Log::CLI.configure 'bio-velvet'
|
82
|
-
log.outputters[0].formatter = Log4r::PatternFormatter.new(:pattern => "%5l %c %d: %m", :date_pattern => '%d/%m %T')
|
83
|
-
|
84
|
-
log.debug "Running finishm with options: #{PP.pp(options, "").gsub(/\n$/,'')}" if log.debug?
|
85
|
-
|
86
|
-
# Find where the Ns are
|
87
|
-
n_region_start = nil
|
88
|
-
n_region_end = nil
|
89
|
-
sequence = nil
|
90
|
-
Bio::FlatFile.foreach(options[:contig_file]) do |seq|
|
91
|
-
if sequence
|
92
|
-
raise "Sorry, this script can only handle single sequences to be gap filled at the moment"
|
93
|
-
end
|
94
|
-
|
95
|
-
sequence = seq.seq
|
96
|
-
|
97
|
-
if options[:start_offset] and options[:end_offset]
|
98
|
-
log.info "Trying to gap fill from #{options[:start_offset]+1} to #{options[:end_offset]+1}"
|
99
|
-
n_region_start = options[:start_offset]
|
100
|
-
n_region_end = options[:end_offset]
|
101
|
-
else
|
102
|
-
log.info "Determining where to fill from the presence of Ns"
|
103
|
-
|
104
|
-
matches = sequence.match(/(N+)/i)
|
105
|
-
if !matches
|
106
|
-
raise "Unable to find any gaps in the input sequence. That was a bit too easy.."
|
107
|
-
end
|
108
|
-
n_region_start = matches.offset(0)[0]
|
109
|
-
n_region_end = n_region_start + matches[1].length
|
110
|
-
log.info "Detected a gap between #{n_region_start} and #{n_region_end}"
|
111
|
-
end
|
112
|
-
|
113
|
-
# Check to make sure we are sufficiently distant from the ends
|
114
|
-
if n_region_start < options[:contig_end_length] or
|
115
|
-
sequence.length - n_region_end < options[:contig_end_length]
|
116
|
-
raise "The gap is too close to the end of the contig, sorry"
|
117
|
-
end
|
118
|
-
end
|
119
|
-
|
120
|
-
# Do the assembly
|
121
|
-
graph = nil
|
122
|
-
if options[:previously_serialized_parsed_graph_file].nil?
|
123
|
-
velvet_result = nil
|
124
|
-
if options[:previous_assembly].nil? #If assembly has not already been carried out
|
125
|
-
Tempfile.open('anchors.fa') do |tempfile|
|
126
|
-
tempfile.puts ">anchor1"
|
127
|
-
tempfile.puts sequence[n_region_start-options[:contig_end_length]-1...n_region_start]
|
128
|
-
tempfile.puts ">anchor2"
|
129
|
-
#Have to be in reverse, because the node finder finds the node at the start of the read, not the end
|
130
|
-
fwd2 = Bio::Sequence::NA.new(sequence[n_region_end..(n_region_end+options[:contig_end_length])])
|
131
|
-
tempfile.puts fwd2.reverse_complement.to_s
|
132
|
-
tempfile.close
|
133
|
-
log.debug "Inputting anchors into the assembly: #{File.read(tempfile.path)}" if log.debug?
|
134
|
-
|
135
|
-
log.info "Assembling sampled reads with velvet"
|
136
|
-
# Bit of a hack, but have to use -short1 as the anchors because then start and end anchors will have node IDs 1,2,... etc.
|
137
|
-
velvet_result = Bio::Velvet::Runner.new.velvet(
|
138
|
-
options[:velvet_kmer_size],
|
139
|
-
"-short #{tempfile.path} -short2 -fastq.gz #{options[:reads_file]}",
|
140
|
-
"-read_trkg yes -cov_cutoff #{options[:assembly_coverage_cutoff]}",
|
141
|
-
:output_assembly_path => options[:output_assembly_path]
|
142
|
-
)
|
143
|
-
if log.debug?
|
144
|
-
log.debug "velveth stdout: #{velvet_result.velveth_stdout}"
|
145
|
-
log.debug "velveth stderr: #{velvet_result.velveth_stderr}"
|
146
|
-
log.debug "velvetg stdout: #{velvet_result.velvetg_stdout}"
|
147
|
-
log.debug "velvetg stderr: #{velvet_result.velvetg_stderr}"
|
148
|
-
end
|
149
|
-
log.info "Finished running assembly"
|
150
|
-
end
|
151
|
-
else
|
152
|
-
log.info "Using previous assembly stored at #{options[:previous_assembly]}"
|
153
|
-
velvet_result = Bio::Velvet::Result.new
|
154
|
-
velvet_result.result_directory = options[:previous_assembly]
|
155
|
-
end
|
156
|
-
|
157
|
-
log.info "Parsing the graph output from velvet"
|
158
|
-
graph = Bio::Velvet::Graph.parse_from_file(File.join velvet_result.result_directory, 'LastGraph')
|
159
|
-
log.info "Finished parsing graph: found #{graph.nodes.length} nodes and #{graph.arcs.length} arcs"
|
160
|
-
|
161
|
-
if options[:serialize_parsed_graph_file]
|
162
|
-
log.info "Storing a binary version of the graph file for later use at #{options[:serialize_parsed_graph_file]}"
|
163
|
-
File.open(options[:serialize_parsed_graph_file],'wb') do |f|
|
164
|
-
f.print Marshal.dump(graph)
|
165
|
-
end
|
166
|
-
log.info "Stored a binary representation of the velvet graph at #{options[:serialize_parsed_graph_file]}"
|
167
|
-
end
|
168
|
-
|
169
|
-
if options[:assembly_coverage_cutoff]
|
170
|
-
log.info "Removing low-coverage nodes from the graph (less than #{options[:assembly_coverage_cutoff]})"
|
171
|
-
cutoffer = Bio::AssemblyGraphAlgorithms::CoverageBasedGraphFilter.new
|
172
|
-
deleted_nodes, deleted_arcs = cutoffer.remove_low_coverage_nodes(graph, options[:assembly_coverage_cutoff], :whitelisted_sequences => [1,2])
|
173
|
-
|
174
|
-
log.info "Removed #{deleted_nodes.length} nodes and #{deleted_arcs.length} arcs from the graph due to low coverage"
|
175
|
-
log.info "Now there is #{graph.nodes.length} nodes and #{graph.arcs.length} arcs remaining"
|
176
|
-
end
|
177
|
-
else
|
178
|
-
log.info "Restoring graph file from #{options[:previously_serialized_parsed_graph_file]}.."
|
179
|
-
graph = File.open(options[:previously_serialized_parsed_graph_file], 'rb') { |f| Marshal.load(f) }
|
180
|
-
log.info "Restoration complete"
|
181
|
-
end
|
182
|
-
|
183
|
-
|
184
|
-
# Find the anchor nodes again
|
185
|
-
finder = Bio::AssemblyGraphAlgorithms::NodeFinder.new
|
186
|
-
log.info "Finding node representing the end of the each contig"
|
187
|
-
i = 1
|
188
|
-
anchor_sequence_ids = [1,2]
|
189
|
-
start_node, start_node_forward = finder.find_unique_node_with_sequence_id(graph, 1)
|
190
|
-
end_node, end_node_forward = finder.find_unique_node_with_sequence_id(graph, 2)
|
191
|
-
if start_node and end_node
|
192
|
-
log.info "Found both anchoring nodes in the graph: #{start_node.node_id}/#{start_node_forward} and #{end_node.node_id}/#{end_node_forward}"
|
193
|
-
else
|
194
|
-
log.error "start node not found" if start_node.nil?
|
195
|
-
log.error "end node not found" if end_node.nil?
|
196
|
-
raise "Unable to find both anchor reads from the assembly, cannot continue. This is probably an error with this script, not you."
|
197
|
-
end
|
198
|
-
|
199
|
-
log.info "Removing nodes unconnected to either the start or the end from the graph.."
|
200
|
-
original_num_nodes = graph.nodes.length
|
201
|
-
original_num_arcs = graph.arcs.length
|
202
|
-
filter = Bio::AssemblyGraphAlgorithms::ConnectivityBasedGraphFilter.new
|
203
|
-
filter.remove_unconnected_nodes(graph, [start_node, end_node])
|
204
|
-
log.info "Removed #{original_num_nodes-graph.nodes.length} nodes and #{original_num_arcs-graph.arcs.length} arcs"
|
205
|
-
|
206
|
-
|
207
|
-
if options[:output_graph_png]
|
208
|
-
log.info "Converting assembly to a graphviz PNG"
|
209
|
-
viser = Bio::Assembly::ABVisualiser.new
|
210
|
-
gv = viser.graphviz(graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id})
|
211
|
-
gv.output :png => options[:output_graph_png], :use => :neato
|
212
|
-
end
|
213
|
-
if options[:output_graph_svg]
|
214
|
-
log.info "Converting assembly to a graphviz SVG"
|
215
|
-
viser = Bio::Assembly::ABVisualiser.new
|
216
|
-
gv = viser.graphviz(graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id})
|
217
|
-
gv.output :svg => options[:output_graph_svg], :use => :neato
|
218
|
-
end
|
219
|
-
if options[:output_graph_dot]
|
220
|
-
log.info "Converting assembly to a graphviz DOT"
|
221
|
-
viser = Bio::Assembly::ABVisualiser.new
|
222
|
-
gv = viser.graphviz(graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id, :digraph => false})
|
223
|
-
gv.output :dot => options[:output_graph_dot]
|
224
|
-
end
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
log.info "Searching for trails between the nodes within the assembly graph"
|
229
|
-
cartographer = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
|
230
|
-
trails = cartographer.find_trails_between_nodes(graph, start_node, end_node, options[:graph_search_leash_length], start_node_forward)
|
231
|
-
log.info "Found #{trails.length} trail(s) in total"
|
232
|
-
|
233
|
-
|
234
|
-
log.debug "Outputing trail sequences"
|
235
|
-
File.open(options[:overall_trail_output_fasta_file],'w') do |f|
|
236
|
-
trails.each_with_index do |trail, i|
|
237
|
-
f.puts ">trail#{i+1}"
|
238
|
-
f.puts trail.sequence
|
239
|
-
end
|
240
|
-
end
|
241
|
-
|
data/bin/kmer_abundance_file_tool.rb
DELETED
@@ -1,49 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
require 'bio-logger'
|
5
|
-
require 'csv'
|
6
|
-
|
7
|
-
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
|
8
|
-
|
9
|
-
# Parse command line options into the options hash
|
10
|
-
options = {
|
11
|
-
:logger => 'stderr',
|
12
|
-
:log_level => 'info',
|
13
|
-
:min => 0,
|
14
|
-
}
|
15
|
-
o = OptionParser.new do |opts|
|
16
|
-
opts.banner = "
|
17
|
-
Usage: #{SCRIPT_NAME} <arguments>
|
18
|
-
|
19
|
-
grep a multiple kmer abundance file according to specified criteria\n\n"
|
20
|
-
|
21
|
-
opts.on("--min NUMBER", "At least 1 column has at least this many observations [default: #{options[:min]}]") do |arg|
|
22
|
-
options[:min] = arg.to_f
|
23
|
-
end
|
24
|
-
|
25
|
-
# logger options
|
26
|
-
opts.separator "\nVerbosity:\n\n"
|
27
|
-
opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
|
28
|
-
opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
|
29
|
-
opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
|
30
|
-
end; o.parse!
|
31
|
-
if ARGV.length != 1
|
32
|
-
$stderr.puts o
|
33
|
-
exit 1
|
34
|
-
end
|
35
|
-
# Setup logging
|
36
|
-
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
|
37
|
-
|
38
|
-
CSV.foreach(ARGV[0], :col_sep => ' ') do |row|
|
39
|
-
kmer = row[0]
|
40
|
-
passable = false
|
41
|
-
row[1...row.length].each do |count|
|
42
|
-
if count.to_f > options[:min]
|
43
|
-
passable = true
|
44
|
-
break
|
45
|
-
end
|
46
|
-
end
|
47
|
-
puts row.join(' ') if passable
|
48
|
-
end
|
49
|
-
|
data/bin/kmer_pattern_to_assembly.rb
DELETED
@@ -1,377 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
require 'bio-logger'
|
5
|
-
require 'csv'
|
6
|
-
require 'tempfile'
|
7
|
-
require 'pp'
|
8
|
-
require 'systemu'
|
9
|
-
require 'bio-velvet'
|
10
|
-
require 'set'
|
11
|
-
|
12
|
-
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = 'finishm'
|
13
|
-
$:.unshift File.join(File.dirname(__FILE__),'..','lib')
|
14
|
-
require 'priner'
|
15
|
-
|
16
|
-
# Parse command line options into the options hash
|
17
|
-
options = {
|
18
|
-
:logger => 'stderr',
|
19
|
-
:log_level => 'info',
|
20
|
-
:min_leftover_length => false,
|
21
|
-
:kmer_coverage_target => 1,
|
22
|
-
:velvet_kmer_size => 155,
|
23
|
-
:contig_end_length => 300,
|
24
|
-
:graph_search_leash_length => 20000,
|
25
|
-
:reads_to_assemble => nil,
|
26
|
-
:assembly_coverage_cutoff => 1.5,
|
27
|
-
:kmer_path_filter_min_coverage => 1,
|
28
|
-
:kmer_path_end_exclusion_length => 50,
|
29
|
-
:trail_kmer_coverage_file => 'trail_coverages.csv'
|
30
|
-
}
|
31
|
-
|
32
|
-
# TODO: make a better interface for this. Maybe specify an entire genome, and then "Contig_1 end, Contig_3 start" or something
|
33
|
-
# Look at the last 300bp of the first contig.
|
34
|
-
extract_exactly_one_contig_from_file = lambda do |fasta_file_path|
|
35
|
-
contig = nil
|
36
|
-
Bio::FlatFile.foreach(Bio::FastaFormat, fasta_file_path) do |e|
|
37
|
-
if contig.nil?
|
38
|
-
contig = e.seq
|
39
|
-
else
|
40
|
-
raise "Multiple sequences found in a contig file! I need exactly one"
|
41
|
-
end
|
42
|
-
end
|
43
|
-
raise "I need a contig to be in the start contig file" if contig.nil?
|
44
|
-
Bio::Sequence::NA.new(contig.to_s)
|
45
|
-
end
|
46
|
-
|
47
|
-
o = OptionParser.new do |opts|
|
48
|
-
opts.banner = "
|
49
|
-
Usage: #{SCRIPT_NAME} <kmer_multiple_abundance_file>
|
50
|
-
|
51
|
-
Given an input kmer then abundances space separated file, and a threshold, print out how many kmers are unique to different subsets of columns\n\n"
|
52
|
-
|
53
|
-
opts.on("--pattern PATTERN", "kmer abundance pattern e.g. '0111001110' [required]") do |arg|
|
54
|
-
options[:pattern] = arg
|
55
|
-
end
|
56
|
-
opts.on("--kmer-abundances FILE", "kmer multiple abundance file [required]") do |arg|
|
57
|
-
options[:kmer_multiple_abundance_file] = arg
|
58
|
-
end
|
59
|
-
opts.on("--upper-threshold NUM", "kmer frequency cutoff to saying 'present' [required]") do |arg|
|
60
|
-
options[:upper_threshold] = arg.to_i
|
61
|
-
end
|
62
|
-
opts.on("--lower-threshold NUM", "kmer frequency cutoff to saying 'not present' [required]") do |arg|
|
63
|
-
options[:lower_threshold] = arg.to_i
|
64
|
-
end
|
65
|
-
opts.on("--reads FILES", "comma-separated list of sequence reads files in the same order as the pattern was supplied [required]") do |arg|
|
66
|
-
options[:reads_files] = arg.split(',').collect{|r| File.absolute_path r}
|
67
|
-
end
|
68
|
-
opts.on("--start-contig FASTA", "path to a fasta file with the starting contig in it (only). Assumes we are building off the end of this contig [required]") do |arg|
|
69
|
-
options[:start_contig] = extract_exactly_one_contig_from_file.call arg
|
70
|
-
end
|
71
|
-
opts.on("--end-contig FASTA", "path to a fasta file with the ending contig in it (only). Assumes we are building onto the start of this contig [required]") do |arg|
|
72
|
-
options[:end_contig] = extract_exactly_one_contig_from_file.call arg
|
73
|
-
end
|
74
|
-
|
75
|
-
opts.separator "\nOptional arguments:\n\n"
|
76
|
-
opts.on("--min-leftover-read-length NUMBER", "when searching for reads with kmers, require the kmer to be at the beginning or end of the selected read [default: #{options[:min_leftover_length]}]") do |arg|
|
77
|
-
options[:min_leftover_length] = arg.to_i
|
78
|
-
end
|
79
|
-
opts.on("--kmer-coverage-target NUMBER", "when searching for reads with kmers, require this many copies per kmer [default: #{options[:kmer_coverage_target]}]") do |arg|
|
80
|
-
options[:kmer_coverage_target] = arg.to_i
|
81
|
-
end
|
82
|
-
opts.on("--already-patterned-reads FILE", "Attempt to assemble the reads in the specified file, useful for re-assembly [default: off]") do |arg|
|
83
|
-
options[:already_patterned_reads] = arg
|
84
|
-
end
|
85
|
-
opts.on("--output-assembly PATH", "Output assembly intermediate files to this directory [default: off]") do |arg|
|
86
|
-
options[:output_assembly_path] = arg
|
87
|
-
end
|
88
|
-
opts.on("--assembly-png PATH", "Output assembly as a PNG file [default: off]") do |arg|
|
89
|
-
options[:output_graph_png] = arg
|
90
|
-
end
|
91
|
-
opts.on("--assembly-svg PATH", "Output assembly as an SVG file [default: off]") do |arg|
|
92
|
-
options[:output_graph_svg] = arg
|
93
|
-
end
|
94
|
-
opts.on("--assembly-dot PATH", "Output assembly as an DOT file [default: off]") do |arg|
|
95
|
-
options[:output_graph_dot] = arg
|
96
|
-
end
|
97
|
-
# opts.on("--output-begin-kmers PATH", "Output kmers found at the beginning point to this file [default: off]") do |arg|
|
98
|
-
# options[:output_begin_kmers] = arg
|
99
|
-
# end
|
100
|
-
# opts.on("--output-end-kmers PATH", "Output kmers found at the ending point to this file [default: off]") do |arg|
|
101
|
-
# options[:output_end_kmers] = arg
|
102
|
-
# end
|
103
|
-
opts.on("--assembly-coverage-cutoff NUMBER", "Require this much coverage in each node, all other nodes are removed [default: #{options[:assembly_coverage_cutoff]}]") do |arg|
|
104
|
-
options[:assembly_coverage_cutoff] = arg.to_f
|
105
|
-
end
|
106
|
-
opts.on("--contig-end-length LENGTH", "Number of base pairs to start into the ends of the contigs [default: #{options[:contig_end_length]}]") do |arg|
|
107
|
-
options[:contig_end_length] = arg.to_i
|
108
|
-
end
|
109
|
-
|
110
|
-
# logger options
|
111
|
-
opts.separator "\nVerbosity:\n\n"
|
112
|
-
opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
|
113
|
-
opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
|
114
|
-
opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
|
115
|
-
end; o.parse!
|
116
|
-
if ARGV.length != 0 or options[:upper_threshold].nil? or options[:lower_threshold].nil? or options[:pattern].nil? or options[:kmer_multiple_abundance_file].nil? or options[:reads_files].nil?
|
117
|
-
pp options
|
118
|
-
$stderr.puts o
|
119
|
-
exit 1
|
120
|
-
end
|
121
|
-
# Setup logging
|
122
|
-
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
|
123
|
-
Bio::Log::LoggerPlus.new 'bio-velvet'
|
124
|
-
Bio::Log::CLI.configure 'bio-velvet'
|
125
|
-
|
126
|
-
pooled_reads_filename = 'pooled_sampled_reads.fasta'
|
127
|
-
if options[:already_patterned_reads] #If skipping read extraction
|
128
|
-
pooled_reads_filename = options[:already_patterned_reads]
|
129
|
-
|
130
|
-
else
|
131
|
-
# Parse pattern from cmdline
|
132
|
-
desired_pattern = KmerAbundancePattern.new
|
133
|
-
desired_pattern.parse_from_human(options[:pattern])
|
134
|
-
if options[:reads_files].length != desired_pattern.length
|
135
|
-
raise "Number of entries in the pattern #{desired_pattern.length} and number of reads files #{options[:reads_files].length} not equivalent!"
|
136
|
-
end
|
137
|
-
|
138
|
-
# Collect the kmers that will be used to find trusted reads i.e.
|
139
|
-
# Go through each line of the kmer abundance file, looking for kmers that suit the pattern
|
140
|
-
input_file = File.open options[:kmer_multiple_abundance_file]
|
141
|
-
csv = CSV.new(input_file, :col_sep => ' ')
|
142
|
-
|
143
|
-
whitelist_kmers = []
|
144
|
-
blacklist_kmers = []
|
145
|
-
csv.each do |row|
|
146
|
-
max_i = row.length - 2 if max_i.nil?
|
147
|
-
|
148
|
-
kmer = row[0]
|
149
|
-
counts = row[1...row.length].collect{|s| s.to_i}
|
150
|
-
probe = 'TTACATCTTATCTACAATAAACCTTCTGCCTTAGTTTTAGAGCCTATCCGAAAAGTCCTGCTGCTCTGAATGTTATCCAAGCACATGCAAAATGAATTAGT'
|
151
|
-
this_pattern = []
|
152
|
-
counts.each_with_index do |count, i|
|
153
|
-
if count > options[:upper_threshold]
|
154
|
-
this_pattern[i] = true
|
155
|
-
elsif count < options[:lower_threshold]
|
156
|
-
this_pattern[i] = false
|
157
|
-
else
|
158
|
-
# coverage was in no man's land between thresholds.
|
159
|
-
# Ignore this kmer as noise.
|
160
|
-
this_pattern[i] = '-'
|
161
|
-
end
|
162
|
-
end
|
163
|
-
#log.debug "Found pattern #{this_pattern} from kmer #{kmer}, which has abundances #{counts}" if log.debug?
|
164
|
-
|
165
|
-
if desired_pattern.consistent_with? this_pattern
|
166
|
-
whitelist_kmers.push row[0]
|
167
|
-
else
|
168
|
-
# kmer is not present when it should be
|
169
|
-
blacklist_kmers.push row[0]
|
170
|
-
end
|
171
|
-
end
|
172
|
-
log.info "After parsing the kmer multiple abundance file, found #{whitelist_kmers.length} kmers that matched the pattern, and #{blacklist_kmers.length} that didn't"
|
173
|
-
unless whitelist_kmers.length > 0
|
174
|
-
log.error "No kmers found that satisfy the given pattern, exiting.."
|
175
|
-
exit 1
|
176
|
-
end
|
177
|
-
|
178
|
-
|
179
|
-
#outdir = options[:output_directory]
|
180
|
-
#Dir.mkdir outdir unless Dir.exist?(outdir)
|
181
|
-
|
182
|
-
# grep the pattern out from the raw reads, subsampling so as to not overwhelm the assembler
|
183
|
-
#Tempfile.open('whitelist') do |white|
|
184
|
-
File.open 'whitelist', 'w' do |white|
|
185
|
-
white.puts whitelist_kmers.join("\n")
|
186
|
-
white.close
|
187
|
-
|
188
|
-
#Tempfile.open('blacklist') do |black|
|
189
|
-
File.open('black','w') do |black|
|
190
|
-
black.puts blacklist_kmers.join("\n")
|
191
|
-
black.close
|
192
|
-
|
193
|
-
threadpool = []
|
194
|
-
sampled_read_files = []
|
195
|
-
log.info "Extracting reads that contain suitable kmers"
|
196
|
-
options[:reads_files].each_with_index do |file, i|
|
197
|
-
next unless desired_pattern[i] #Don't extract reads from reads where those reads should not have been amplified
|
198
|
-
|
199
|
-
sampled = File.basename(file)+'.sampled_reads.fasta'
|
200
|
-
sampled_read_files.push sampled
|
201
|
-
|
202
|
-
grep_path = "#{ENV['HOME']}/git/priner/bin/read_selection_by_kmer "
|
203
|
-
if options[:min_leftover_length]
|
204
|
-
grep_path += "--min-leftover-length #{options[:min_leftover_length]} "
|
205
|
-
end
|
206
|
-
thr = Thread.new do
|
207
|
-
grep_cmd = "#{grep_path} --whitelist #{white.path} --blacklist #{black.path} --reads #{file} --kmer-coverage-target #{options[:kmer_coverage_target]} > #{sampled}"
|
208
|
-
log.debug "Running cmd: #{grep_cmd}"
|
209
|
-
status, stdout, stderr = systemu grep_cmd
|
210
|
-
log.debug stderr
|
211
|
-
|
212
|
-
raise unless status.exitstatus == 0
|
213
|
-
log.debug "Finished extracting reads from #{file}"
|
214
|
-
end
|
215
|
-
threadpool.push thr
|
216
|
-
end
|
217
|
-
threadpool.each do |thread| thread.join; end #wait until everything is finito
|
218
|
-
|
219
|
-
log.info "Finished extracting reads for sampling. Now pooling sampled reads"
|
220
|
-
pool_cmd = "cat #{sampled_read_files.join ' '} >#{pooled_reads_filename}"
|
221
|
-
log.debug "Running cmd: #{pool_cmd}"
|
222
|
-
status, stdout, stderr = systemu pool_cmd
|
223
|
-
raise stderr if stderr != ''
|
224
|
-
raise unless status.exitstatus == 0
|
225
|
-
end
|
226
|
-
end
|
227
|
-
end
|
228
|
-
|
229
|
-
log.info "Extracting dummy reads from the ends of contigs to use as anchors"
|
230
|
-
start_contig = options[:start_contig]
|
231
|
-
end_contig = options[:end_contig]
|
232
|
-
if [start_contig.length, end_contig.length].min < 2*options[:contig_end_length]
|
233
|
-
log.warn "Choice of initial/terminal nodes to perform graph search with may not be optimal due to the small contig size"
|
234
|
-
end
|
235
|
-
if [start_contig.length, end_contig.length].min < options[:contig_end_length]
|
236
|
-
log.error "At least one contig too small to proceed with current code base, need to fix the code to allow such a small contig"
|
237
|
-
exit 1
|
238
|
-
end
|
239
|
-
# Write the anchor reads (the last bit of the first contig and the first bit of
# the second contig) to a temporary FASTA file, then assemble them together
# with the pooled reads using velvet.
velvet_result = nil
Tempfile.open('anchors.fa') do |anchors_file|
  anchor_length = options[:contig_end_length]
  start_anchor = start_contig[start_contig.length-anchor_length...start_contig.length]
  # The end anchor has to be reverse-complemented, because the node finder
  # finds the node at the start of the read, not the end.
  end_anchor = Bio::Sequence::NA.new(end_contig[0...anchor_length]).reverse_complement.to_s

  # Putting these same sequences in many times seems to better the chances
  # velvet won't throw them out.
  50.times do
    anchors_file.puts ">start_contig"
    anchors_file.puts start_anchor
    anchors_file.puts ">end_contig"
    anchors_file.puts end_anchor
  end
  # Close the handle before the file's path is handed to velvet below.
  anchors_file.close

  log.info "Assembling sampled reads with velvet"
  # Bit of a hack, but the anchors have to be the -short (first) read set,
  # because then the start and end anchors will have node IDs 1 and 2,
  # respectively.
  read_set_arguments = "-short #{anchors_file.path} -short2 #{pooled_reads_filename}"
  assembly_arguments = "-cov_cutoff #{options[:assembly_coverage_cutoff]} -read_trkg yes"
  velvet_result = Bio::Velvet::Runner.new.velvet(
    options[:velvet_kmer_size],
    read_set_arguments,
    assembly_arguments,
    :output_assembly_path => options[:output_assembly_path]
  )

  # Only interpolate the (possibly large) velvet output when debugging.
  if log.debug?
    log.debug "velveth stdout: #{velvet_result.velveth_stdout}"
    log.debug "velveth stderr: #{velvet_result.velveth_stderr}"
    log.debug "velvetg stdout: #{velvet_result.velvetg_stdout}"
    log.debug "velvetg stderr: #{velvet_result.velvetg_stderr}"
  end
  log.info "Finished running assembly"
end
|
271
|
-
|
272
|
-
# Parse the LastGraph file found in velvet's result directory.
log.info "Parsing the graph output from velvet"
last_graph_path = File.join(velvet_result.result_directory, 'LastGraph')
graph = Bio::Velvet::Graph.parse_from_file(last_graph_path)
log.info "Finished parsing graph: found #{graph.nodes.length} nodes and #{graph.arcs.length} arcs"

# When a coverage cutoff was given, remove low-coverage nodes from the graph.
# Sequence IDs 1 and 2 are passed as whitelisted sequences.
coverage_cutoff = options[:assembly_coverage_cutoff]
if coverage_cutoff
  log.info "Removing low-coverage nodes from the graph (less than #{coverage_cutoff})"
  coverage_filter = Bio::AssemblyGraphAlgorithms::CoverageBasedGraphFilter.new
  deleted_nodes, deleted_arcs = coverage_filter.remove_low_coverage_nodes(
    graph, coverage_cutoff, :whitelisted_sequences => [1,2]
  )

  log.info "Removed #{deleted_nodes.length} nodes and #{deleted_arcs.length} arcs from the graph due to low coverage"
  log.info "Now there is #{graph.nodes.length} nodes and #{graph.arcs.length} arcs remaining"
end
|
284
|
-
|
285
|
-
# Look up the graph nodes for sequence IDs 1 and 2 (the start and end anchors).
node_finder = Bio::AssemblyGraphAlgorithms::NodeFinder.new
log.info "Finding node representing the end of the first contig"
start_node, start_node_forward = node_finder.find_unique_node_with_sequence_id(graph, 1)
log.info "Finding node representing the start of the second contig"
# TODO: find the node nearest the end of this, not the start
end_node, end_node_forward = node_finder.find_unique_node_with_sequence_id(graph, 2)

# Without both nodes the search cannot proceed: report which one is missing,
# write any requested graph images anyway, and exit with a failure status.
if start_node.nil? || end_node.nil?
  log.error "Unable to find any nodes in the graph that have kmers corresponding to the _start_ point in them, sorry. Maybe fix the node finding code?" if start_node.nil?
  log.error "Unable to find any nodes in the graph that have kmers corresponding to the _end_ point in them, sorry. Maybe fix the node finding code?" if end_node.nil?

  if options[:output_graph_png] || options[:output_graph_svg] || options[:output_graph_dot]
    log.info "Converting assembly to a graphviz PNG/SVG/DOT, even if start/end node was not be found properly"
    # No start/end node IDs are passed to the visualiser on this path.
    unanchored_gv = Bio::Assembly::ABVisualiser.new.graphviz(graph)
    if options[:output_graph_png]
      log.info "Writing PNG of graph to #{options[:output_graph_png]}"
      unanchored_gv.output :png => options[:output_graph_png]
    end
    if options[:output_graph_svg]
      log.info "Writing SVG of graph to #{options[:output_graph_svg]}"
      unanchored_gv.output :svg => options[:output_graph_svg]
    end
    if options[:output_graph_dot]
      log.info "Writing DOT of graph to #{options[:output_graph_dot]}"
      unanchored_gv.output :dot => options[:output_graph_dot]
    end
  end

  log.error "Unknown start or end points, giving up, sorry."
  exit 1
end
log.info "Node(s) found that are suitable as initial and terminal nodes in the graph search, respectively: #{start_node.node_id} and #{end_node.node_id}"
|
319
|
-
|
320
|
-
# Remove nodes unconnected to the start or end node, and report how many
# nodes and arcs the graph lost in the process.
log.info "Removing nodes unconnected to either the start or the end from the graph.."
nodes_before = graph.nodes.length
arcs_before = graph.arcs.length
connectivity_filter = Bio::AssemblyGraphAlgorithms::ConnectivityBasedGraphFilter.new
connectivity_filter.remove_unconnected_nodes(graph, [start_node, end_node])
log.info "Removed #{nodes_before-graph.nodes.length} nodes and #{arcs_before-graph.arcs.length} arcs"
|
326
|
-
|
327
|
-
# Write the graph in each requested format, passing the start and end node IDs
# to the visualiser. A fresh visualiser and graphviz object are built for every
# format that is written.
draw_anchored_graph = lambda do
  Bio::Assembly::ABVisualiser.new.graphviz(
    graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id}
  )
end

if options[:output_graph_png]
  log.info "Converting assembly to a graphviz PNG"
  draw_anchored_graph.call.output :png => options[:output_graph_png], :use => :neato
end

if options[:output_graph_svg]
  log.info "Converting assembly to a graphviz SVG"
  draw_anchored_graph.call.output :svg => options[:output_graph_svg], :use => :neato
end

# Unlike PNG and SVG, the DOT output is written without the :use => :neato option.
if options[:output_graph_dot]
  log.info "Converting assembly to a graphviz DOT"
  draw_anchored_graph.call.output :dot => options[:output_graph_dot]
end
|
345
|
-
|
346
|
-
# Search the graph for trails from the start node to the end node, passing
# options[:graph_search_leash_length] and the start node's direction.
log.info "Searching for trails between the initial and terminal nodes, within the assembly graph"
connection_finder = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
# An untested alternative, find_all_trails_between_nodes, takes the same arguments.
trails = connection_finder.find_trails_between_nodes(
  graph, start_node, end_node, options[:graph_search_leash_length], start_node_forward
)
log.info "Found #{trails.length} trail(s) between the initial and terminal nodes"
|
352
|
-
|
353
|
-
# Load the per-kmer abundances from options[:kmer_multiple_abundance_file].
log.info "Reading kmer abundances from #{options[:kmer_multiple_abundance_file]}.."
kmer_hash = Bio::KmerMultipleAbundanceHash.parse_from_file options[:kmer_multiple_abundance_file]
log.info "Finished reading the kmer abundances"

# Optionally write the kmer coverages of the trails found so far to
# options[:trail_kmer_coverage_file].
if options[:trail_kmer_coverage_file]
  log.info "Writing out kmer coverages to #{options[:trail_kmer_coverage_file]}.."
  writer = Bio::AssemblyGraphAlgorithms::KmerCoverageWriter.new
  # The block form of File.open closes the file when the block exits, even if
  # the writer raises. Previously the handle was never closed, so it leaked
  # and its contents were only flushed when the interpreter exited.
  File.open(options[:trail_kmer_coverage_file],'w') do |io|
    writer.write(io, trails, kmer_hash)
  end
  log.info "Finished writing"
end
|
364
|
-
|
365
|
-
# Filter the trails by kmer coverage, using one threshold per entry of
# desired_pattern, then print the remaining trails to stdout.
log.info "Filtering trail(s) based on kmer coverage, requiring each kmer in the path to have a minimum of #{options[:kmer_path_filter_min_coverage]} coverage in patterned reads, except for the #{options[:kmer_path_end_exclusion_length]}bp at the ends"

# Threshold is 1 where the pattern entry is exactly true, 0 everywhere else.
thresholds = desired_pattern.map do |wanted|
  if wanted == true then 1 else 0 end
end
log.info "Using thresholds for filtering: #{thresholds}"

path_filter = Bio::AssemblyGraphAlgorithms::KmerCoverageBasedPathFilter.new
trails = path_filter.filter(
  trails, kmer_hash, thresholds,
  :exclude_ending_length => options[:kmer_path_end_exclusion_length]
)
log.info "After filtering remained #{trails.length} trails"

log.debug "Found trails: #{trails.map { |trail| trail.to_s }.join("\n")}"

# One FASTA-style record per trail, named trail1, trail2, ...
trails.each_with_index do |trail, index|
  puts ">trail#{index+1}"
  puts trail.sequence
end
|