finishm 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +19 -19
- data/VERSION +1 -1
- data/finishm.gemspec +631 -0
- data/lib/assembly/graph_generator.rb +0 -1
- data/lib/assembly/probed_graph.rb +0 -1
- metadata +99 -96
- data/bin/assembly_visualiser +0 -106
- data/bin/check_primer_combinations.rb +0 -73
- data/bin/contig_joiner.rb +0 -244
- data/bin/contigs_against_assembly.rb +0 -153
- data/bin/finishm_assembler +0 -55
- data/bin/finishm_gap_closer.rb +0 -241
- data/bin/kmer_abundance_file_tool.rb +0 -49
- data/bin/kmer_pattern_to_assembly.rb +0 -377
- data/bin/kmer_profile_finder.rb +0 -92
- data/bin/kmers_count_parse.d +0 -52
- data/bin/kmers_count_tabulate.d +0 -123
- data/bin/kmers_count_tabulate.rb +0 -84
- data/bin/pcr_result_parser.rb +0 -108
- data/bin/primer_finder.rb +0 -119
- data/bin/read_selection_by_kmer.d +0 -174
- data/bin/scaffold_by_pattern.rb +0 -119
- data/bin/scaffold_connection_possibilities_to_knowns.rb +0 -193
- data/bin/scaffold_end_coverages.rb +0 -69
- data/bin/trail_validator.rb +0 -84
data/bin/finishm_assembler
DELETED
@@ -1,55 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
require 'bio-logger'
|
5
|
-
|
6
|
-
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
|
7
|
-
|
8
|
-
# Parse command line options into the options hash
|
9
|
-
options = {
|
10
|
-
:logger => 'stderr',
|
11
|
-
:log_level => 'info',
|
12
|
-
}
|
13
|
-
o = OptionParser.new do |opts|
|
14
|
-
opts.banner = "
|
15
|
-
Usage: #{SCRIPT_NAME} <arguments>
|
16
|
-
|
17
|
-
Description of what this program does...\n\n"
|
18
|
-
|
19
|
-
opts.on("--velvet-pregraph GRAPH_FILE", "PreGraph file output from velveth [required]") do |arg|
|
20
|
-
options[:velvet_pregraph_file] = arg
|
21
|
-
end
|
22
|
-
|
23
|
-
# logger options
|
24
|
-
opts.separator "\nVerbosity:\n\n"
|
25
|
-
opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
|
26
|
-
opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
|
27
|
-
opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
|
28
|
-
end; o.parse!
|
29
|
-
if ARGV.length != 0 or options[:velvet_pregraph_file].nil?
|
30
|
-
$stderr.puts o
|
31
|
-
exit 1
|
32
|
-
end
|
33
|
-
# Setup logging
|
34
|
-
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
|
35
|
-
|
36
|
-
|
37
|
-
# Read in the velvet graph
|
38
|
-
log.info "Parsing graph from #{options[:velvet_pregraph_file]}"
|
39
|
-
graph = Bio::Velvet::Graph.parse_from_file(options[:velvet_pregraph_file])
|
40
|
-
log.info "Finished parsing graph, found #{graph.number_of_nodes} nodes"
|
41
|
-
|
42
|
-
# Log the number of nodes and arcs in the current graph
|
43
|
-
|
44
|
-
# Read in the fasta file of immutable nodes, and extract the two most immutable
|
45
|
-
# Log that they are found
|
46
|
-
|
47
|
-
# Determine that the graph is connected or not between the two most immutable nodes, using some graph theoretic algorithm
|
48
|
-
# If the graph is not connected, then there is no hope, exit
|
49
|
-
|
50
|
-
# Go through the graph to get a list of the cap nodes
|
51
|
-
# Log the number of cap nodes found
|
52
|
-
|
53
|
-
# Trim off all the cap nodes back to cross nodes, keeping track of the lengths
|
54
|
-
|
55
|
-
# Print the graph in graphviz dot format
|
data/bin/finishm_gap_closer.rb
DELETED
@@ -1,241 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
require 'bio-logger'
|
5
|
-
require 'bio-velvet'
|
6
|
-
require 'tempfile'
|
7
|
-
require 'pp'
|
8
|
-
|
9
|
-
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = 'finishm'
|
10
|
-
$:.unshift File.join(File.dirname(__FILE__),'..','lib')
|
11
|
-
require 'priner'
|
12
|
-
|
13
|
-
# Parse command line options into the options hash
|
14
|
-
options = {
|
15
|
-
:logger => 'stderr',
|
16
|
-
:log_level => 'info',
|
17
|
-
:velvet_kmer_size => 43,#TODO: these options should be exposed to the user, and perhaps not guessed at
|
18
|
-
:contig_end_length => 200,
|
19
|
-
:output_assembly_path => '/tmp/velvetAssembly',
|
20
|
-
:graph_search_leash_length => 3000,
|
21
|
-
:assembly_coverage_cutoff => 1.5,
|
22
|
-
}
|
23
|
-
o = OptionParser.new do |opts|
|
24
|
-
opts.banner = "
|
25
|
-
Usage: #{SCRIPT_NAME} --reads <read_file> --contig <contig_file>
|
26
|
-
|
27
|
-
Takes a set of reads and a contig that contains gap characters. Then it tries to fill in
|
28
|
-
these N characters. It is possible that there is multiple ways to close the gap - in that case
|
29
|
-
each is reported. \n\n"
|
30
|
-
|
31
|
-
|
32
|
-
opts.on("--reads FILE", "gzipped fastq file of reads to perform the gap closing with [required]") do |arg|
|
33
|
-
options[:reads_file] = arg
|
34
|
-
end
|
35
|
-
opts.on("--contig FILE", "fasta file of single contig containing Ns that are to be closed [required]") do |arg|
|
36
|
-
options[:contig_file] = arg
|
37
|
-
end
|
38
|
-
opts.on("--output-trails-fasta PATH", "Output found paths to this file in fasta format [default: off]") do |arg|
|
39
|
-
options[:overall_trail_output_fasta_file] = arg
|
40
|
-
end
|
41
|
-
|
42
|
-
opts.separator "\nOptional arguments:\n\n"
|
43
|
-
opts.on("--overhang NUM", "Start assembling this far from the gap [default: #{options[:contig_end_length]}]") do |arg|
|
44
|
-
options[:contig_end_length] = arg.to_i
|
45
|
-
end
|
46
|
-
opts.on("--start OFFSET", "Start trying to fill from this position in the contig, requires --stop [default: found from position of Ns}]") do |arg|
|
47
|
-
options[:start_offset] = arg.to_i-1
|
48
|
-
end
|
49
|
-
opts.on("--stop OFFSET", "Start trying to fill to this position in the contig, requires --start [default: found from position of Ns}]") do |arg|
|
50
|
-
options[:end_offset] = arg.to_i-1
|
51
|
-
end
|
52
|
-
opts.on("--assembly-png PATH", "Output assembly as a PNG file [default: off]") do |arg|
|
53
|
-
options[:output_graph_png] = arg
|
54
|
-
end
|
55
|
-
opts.on("--assembly-svg PATH", "Output assembly as an SVG file [default: off]") do |arg|
|
56
|
-
options[:output_graph_svg] = arg
|
57
|
-
end
|
58
|
-
opts.on("--assembly-dot PATH", "Output assembly as an DOT file [default: off]") do |arg|
|
59
|
-
options[:output_graph_dot] = arg
|
60
|
-
end
|
61
|
-
opts.on("--velvet-kmer KMER", "kmer size to use with velvet [default: #{options[:velvet_kmer_size]}]") do |arg|
|
62
|
-
options[:velvet_kmer_size] = arg.to_i
|
63
|
-
end
|
64
|
-
|
65
|
-
opts.separator "\nDebug-related options:\n\n"
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
# logger options
|
70
|
-
opts.separator "\nVerbosity:\n\n"
|
71
|
-
opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
|
72
|
-
opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
|
73
|
-
opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
|
74
|
-
end; o.parse!
|
75
|
-
if ARGV.length != 0 or options[:reads_file].nil? or options[:contig_file].nil? or options[:overall_trail_output_fasta_file].nil?
|
76
|
-
$stderr.puts o
|
77
|
-
exit 1
|
78
|
-
end
|
79
|
-
# Setup logging
|
80
|
-
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
|
81
|
-
Bio::Log::LoggerPlus.new 'bio-velvet'; Bio::Log::CLI.configure 'bio-velvet'
|
82
|
-
log.outputters[0].formatter = Log4r::PatternFormatter.new(:pattern => "%5l %c %d: %m", :date_pattern => '%d/%m %T')
|
83
|
-
|
84
|
-
log.debug "Running finishm with options: #{PP.pp(options, "").gsub(/\n$/,'')}" if log.debug?
|
85
|
-
|
86
|
-
# Find where the Ns are
|
87
|
-
n_region_start = nil
|
88
|
-
n_region_end = nil
|
89
|
-
sequence = nil
|
90
|
-
Bio::FlatFile.foreach(options[:contig_file]) do |seq|
|
91
|
-
if sequence
|
92
|
-
# Raise a RuntimeError (as the rest of this script does) rather than a bare Exception, which escapes an ordinary `rescue`
raise "Sorry, this script can only handle single sequences to be gap filled at the moment"
|
93
|
-
end
|
94
|
-
|
95
|
-
sequence = seq.seq
|
96
|
-
|
97
|
-
if options[:start_offset] and options[:end_offset]
|
98
|
-
log.info "Trying to gap fill from #{options[:start_offset]+1} to #{options[:end_offset]+1}"
|
99
|
-
n_region_start = options[:start_offset]
|
100
|
-
n_region_end = options[:end_offset]
|
101
|
-
else
|
102
|
-
log.info "Determining where to fill from the presence of Ns"
|
103
|
-
|
104
|
-
matches = sequence.match(/(N+)/i)
|
105
|
-
if !matches
|
106
|
-
raise "Unable to find any gaps in the input sequence. That was a bit too easy.."
|
107
|
-
end
|
108
|
-
n_region_start = matches.offset(0)[0]
|
109
|
-
n_region_end = n_region_start + matches[1].length
|
110
|
-
log.info "Detected a gap between #{n_region_start} and #{n_region_end}"
|
111
|
-
end
|
112
|
-
|
113
|
-
# Check to make sure we are sufficiently distant from the ends
|
114
|
-
if n_region_start < options[:contig_end_length] or
|
115
|
-
sequence.length - n_region_end < options[:contig_end_length]
|
116
|
-
raise "The gap is too close to the end of the contig, sorry"
|
117
|
-
end
|
118
|
-
end
|
119
|
-
|
120
|
-
# Do the assembly
|
121
|
-
graph = nil
|
122
|
-
if options[:previously_serialized_parsed_graph_file].nil?
|
123
|
-
velvet_result = nil
|
124
|
-
if options[:previous_assembly].nil? #If assembly has not already been carried out
|
125
|
-
Tempfile.open('anchors.fa') do |tempfile|
|
126
|
-
tempfile.puts ">anchor1"
|
127
|
-
# Clamp the slice start at 0: a gap starting exactly contig_end_length bases in would otherwise give index -1, which Ruby treats as end-relative and yields an empty anchor
tempfile.puts sequence[[n_region_start-options[:contig_end_length]-1, 0].max...n_region_start]
|
128
|
-
tempfile.puts ">anchor2"
|
129
|
-
#Have to be in reverse, because the node finder finds the node at the start of the read, not the end
|
130
|
-
fwd2 = Bio::Sequence::NA.new(sequence[n_region_end..(n_region_end+options[:contig_end_length])])
|
131
|
-
tempfile.puts fwd2.reverse_complement.to_s
|
132
|
-
tempfile.close
|
133
|
-
log.debug "Inputting anchors into the assembly: #{File.open(tempfile.path).read}" if log.debug?
|
134
|
-
|
135
|
-
log.info "Assembling sampled reads with velvet"
|
136
|
-
# Bit of a hack, but have to use -short1 as the anchors because then start and end anchors will have node IDs 1,2,... etc.
|
137
|
-
velvet_result = Bio::Velvet::Runner.new.velvet(
|
138
|
-
options[:velvet_kmer_size],
|
139
|
-
"-short #{tempfile.path} -short2 -fastq.gz #{options[:reads_file]}",
|
140
|
-
"-read_trkg yes -cov_cutoff #{options[:assembly_coverage_cutoff]}",
|
141
|
-
:output_assembly_path => options[:output_assembly_path]
|
142
|
-
)
|
143
|
-
if log.debug?
|
144
|
-
log.debug "velveth stdout: #{velvet_result.velveth_stdout}"
|
145
|
-
log.debug "velveth stderr: #{velvet_result.velveth_stderr}"
|
146
|
-
log.debug "velvetg stdout: #{velvet_result.velvetg_stdout}"
|
147
|
-
log.debug "velvetg stderr: #{velvet_result.velvetg_stderr}"
|
148
|
-
end
|
149
|
-
log.info "Finished running assembly"
|
150
|
-
end
|
151
|
-
else
|
152
|
-
log.info "Using previous assembly stored at #{options[:previous_assembly]}"
|
153
|
-
velvet_result = Bio::Velvet::Result.new
|
154
|
-
velvet_result.result_directory = options[:previous_assembly]
|
155
|
-
end
|
156
|
-
|
157
|
-
log.info "Parsing the graph output from velvet"
|
158
|
-
graph = Bio::Velvet::Graph.parse_from_file(File.join velvet_result.result_directory, 'LastGraph')
|
159
|
-
log.info "Finished parsing graph: found #{graph.nodes.length} nodes and #{graph.arcs.length} arcs"
|
160
|
-
|
161
|
-
if options[:serialize_parsed_graph_file]
|
162
|
-
log.info "Storing a binary version of the graph file for later use at #{options[:serialize_parsed_graph_file]}"
|
163
|
-
File.open(options[:serialize_parsed_graph_file],'wb') do |f|
|
164
|
-
f.print Marshal.dump(graph)
|
165
|
-
end
|
166
|
-
log.info "Stored a binary representation of the velvet graph at #{options[:serialize_parsed_graph_file]}"
|
167
|
-
end
|
168
|
-
|
169
|
-
if options[:assembly_coverage_cutoff]
|
170
|
-
log.info "Removing low-coverage nodes from the graph (less than #{options[:assembly_coverage_cutoff]})"
|
171
|
-
cutoffer = Bio::AssemblyGraphAlgorithms::CoverageBasedGraphFilter.new
|
172
|
-
deleted_nodes, deleted_arcs = cutoffer.remove_low_coverage_nodes(graph, options[:assembly_coverage_cutoff], :whitelisted_sequences => [1,2])
|
173
|
-
|
174
|
-
log.info "Removed #{deleted_nodes.length} nodes and #{deleted_arcs.length} arcs from the graph due to low coverage"
|
175
|
-
log.info "Now there is #{graph.nodes.length} nodes and #{graph.arcs.length} arcs remaining"
|
176
|
-
end
|
177
|
-
else
|
178
|
-
log.info "Restoring graph file from #{options[:previously_serialized_parsed_graph_file]}.."
|
179
|
-
# Read in binary mode (the dump is written with 'wb') and close the handle via the block form. NOTE: Marshal.load must only be given trusted files.
graph = File.open(options[:previously_serialized_parsed_graph_file], 'rb') { |f| Marshal.load(f) }
|
180
|
-
log.info "Restoration complete"
|
181
|
-
end
|
182
|
-
|
183
|
-
|
184
|
-
# Find the anchor nodes again
|
185
|
-
finder = Bio::AssemblyGraphAlgorithms::NodeFinder.new
|
186
|
-
log.info "Finding node representing the end of the each contig"
|
187
|
-
i = 1
|
188
|
-
anchor_sequence_ids = [1,2]
|
189
|
-
start_node, start_node_forward = finder.find_unique_node_with_sequence_id(graph, 1)
|
190
|
-
end_node, end_node_forward = finder.find_unique_node_with_sequence_id(graph, 2)
|
191
|
-
if start_node and end_node
|
192
|
-
log.info "Found both anchoring nodes in the graph: #{start_node.node_id}/#{start_node_forward} and #{end_node.node_id}/#{end_node_forward}"
|
193
|
-
else
|
194
|
-
log.error "start node not found" if start_node.nil?
|
195
|
-
log.error "end node not found" if end_node.nil?
|
196
|
-
raise "Unable to find both anchor reads from the assembly, cannot continue. This is probably an error with this script, not you."
|
197
|
-
end
|
198
|
-
|
199
|
-
log.info "Removing nodes unconnected to either the start or the end from the graph.."
|
200
|
-
original_num_nodes = graph.nodes.length
|
201
|
-
original_num_arcs = graph.arcs.length
|
202
|
-
filter = Bio::AssemblyGraphAlgorithms::ConnectivityBasedGraphFilter.new
|
203
|
-
filter.remove_unconnected_nodes(graph, [start_node, end_node])
|
204
|
-
log.info "Removed #{original_num_nodes-graph.nodes.length} nodes and #{original_num_arcs-graph.arcs.length} arcs"
|
205
|
-
|
206
|
-
|
207
|
-
if options[:output_graph_png]
|
208
|
-
log.info "Converting assembly to a graphviz PNG"
|
209
|
-
viser = Bio::Assembly::ABVisualiser.new
|
210
|
-
gv = viser.graphviz(graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id})
|
211
|
-
gv.output :png => options[:output_graph_png], :use => :neato
|
212
|
-
end
|
213
|
-
if options[:output_graph_svg]
|
214
|
-
log.info "Converting assembly to a graphviz SVG"
|
215
|
-
viser = Bio::Assembly::ABVisualiser.new
|
216
|
-
gv = viser.graphviz(graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id})
|
217
|
-
gv.output :svg => options[:output_graph_svg], :use => :neato
|
218
|
-
end
|
219
|
-
if options[:output_graph_dot]
|
220
|
-
log.info "Converting assembly to a graphviz DOT"
|
221
|
-
viser = Bio::Assembly::ABVisualiser.new
|
222
|
-
gv = viser.graphviz(graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id, :digraph => false})
|
223
|
-
gv.output :dot => options[:output_graph_dot]
|
224
|
-
end
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
log.info "Searching for trails between the nodes within the assembly graph"
|
229
|
-
cartographer = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
|
230
|
-
trails = cartographer.find_trails_between_nodes(graph, start_node, end_node, options[:graph_search_leash_length], start_node_forward)
|
231
|
-
log.info "Found #{trails.length} trail(s) in total"
|
232
|
-
|
233
|
-
|
234
|
-
log.debug "Outputing trail sequences"
|
235
|
-
File.open(options[:overall_trail_output_fasta_file],'w') do |f|
|
236
|
-
trails.each_with_index do |trail, i|
|
237
|
-
f.puts ">trail#{i+1}"
|
238
|
-
f.puts trail.sequence
|
239
|
-
end
|
240
|
-
end
|
241
|
-
|
@@ -1,49 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
require 'bio-logger'
|
5
|
-
require 'csv'
|
6
|
-
|
7
|
-
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
|
8
|
-
|
9
|
-
# Parse command line options into the options hash
|
10
|
-
options = {
|
11
|
-
:logger => 'stderr',
|
12
|
-
:log_level => 'info',
|
13
|
-
:min => 0,
|
14
|
-
}
|
15
|
-
o = OptionParser.new do |opts|
|
16
|
-
opts.banner = "
|
17
|
-
Usage: #{SCRIPT_NAME} <arguments>
|
18
|
-
|
19
|
-
grep a multiple kmer abundance file according to specified criteria\n\n"
|
20
|
-
|
21
|
-
opts.on("--min NUMBER", "At least 1 column has at least this many observations [default: #{options[:min]}]") do |arg|
|
22
|
-
options[:min] = arg.to_f
|
23
|
-
end
|
24
|
-
|
25
|
-
# logger options
|
26
|
-
opts.separator "\nVerbosity:\n\n"
|
27
|
-
opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
|
28
|
-
opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
|
29
|
-
opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
|
30
|
-
end; o.parse!
|
31
|
-
if ARGV.length != 1
|
32
|
-
$stderr.puts o
|
33
|
-
exit 1
|
34
|
-
end
|
35
|
-
# Setup logging
|
36
|
-
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
|
37
|
-
|
38
|
-
CSV.foreach(ARGV[0], :col_sep => ' ') do |row|
|
39
|
-
kmer = row[0]
|
40
|
-
passable = false
|
41
|
-
row[1...row.length].each do |count|
|
42
|
-
if count.to_f > options[:min]
|
43
|
-
passable = true
|
44
|
-
break
|
45
|
-
end
|
46
|
-
end
|
47
|
-
puts row.join(' ') if passable
|
48
|
-
end
|
49
|
-
|
@@ -1,377 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
require 'bio-logger'
|
5
|
-
require 'csv'
|
6
|
-
require 'tempfile'
|
7
|
-
require 'pp'
|
8
|
-
require 'systemu'
|
9
|
-
require 'bio-velvet'
|
10
|
-
require 'set'
|
11
|
-
|
12
|
-
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = 'finishm'
|
13
|
-
$:.unshift File.join(File.dirname(__FILE__),'..','lib')
|
14
|
-
require 'priner'
|
15
|
-
|
16
|
-
# Parse command line options into the options hash
|
17
|
-
options = {
|
18
|
-
:logger => 'stderr',
|
19
|
-
:log_level => 'info',
|
20
|
-
:min_leftover_length => false,
|
21
|
-
:kmer_coverage_target => 1,
|
22
|
-
:velvet_kmer_size => 155,
|
23
|
-
:contig_end_length => 300,
|
24
|
-
:graph_search_leash_length => 20000,
|
25
|
-
:reads_to_assemble => nil,
|
26
|
-
:assembly_coverage_cutoff => 1.5,
|
27
|
-
:kmer_path_filter_min_coverage => 1,
|
28
|
-
:kmer_path_end_exclusion_length => 50,
|
29
|
-
:trail_kmer_coverage_file => 'trail_coverages.csv'
|
30
|
-
}
|
31
|
-
|
32
|
-
# TODO: make a better interface for this. Maybe specify an entire genome, and then "Contig_1 end, Contig_3 start" or something
|
33
|
-
# Look at the last 300bp of the first contig.
|
34
|
-
extract_exactly_one_contig_from_file = lambda do |fasta_file_path|
|
35
|
-
contig = nil
|
36
|
-
Bio::FlatFile.foreach(Bio::FastaFormat, fasta_file_path) do |e|
|
37
|
-
if contig.nil?
|
38
|
-
contig = e.seq
|
39
|
-
else
|
40
|
-
raise "Multiple sequences found in a contig file! I need exactly one"
|
41
|
-
end
|
42
|
-
end
|
43
|
-
# This lambda serves both --start-contig and --end-contig, so name the offending file instead of assuming the start contig
raise "No sequences found in contig file #{fasta_file_path}! I need exactly one" if contig.nil?
|
44
|
-
Bio::Sequence::NA.new(contig.to_s)
|
45
|
-
end
|
46
|
-
|
47
|
-
o = OptionParser.new do |opts|
|
48
|
-
opts.banner = "
|
49
|
-
Usage: #{SCRIPT_NAME} <kmer_multiple_abundance_file>
|
50
|
-
|
51
|
-
Given an input kmer then abundances space separated file, and a threshold, print out how many kmers are unique to different subsets of columns\n\n"
|
52
|
-
|
53
|
-
opts.on("--pattern PATTERN", "kmer abundance pattern e.g. '0111001110' [required]") do |arg|
|
54
|
-
options[:pattern] = arg
|
55
|
-
end
|
56
|
-
opts.on("--kmer-abundances FILE", "kmer multiple abundance file [required]") do |arg|
|
57
|
-
options[:kmer_multiple_abundance_file] = arg
|
58
|
-
end
|
59
|
-
opts.on("--upper-threshold NUM", "kmer frequency cutoff to saying 'present' [required]") do |arg|
|
60
|
-
options[:upper_threshold] = arg.to_i
|
61
|
-
end
|
62
|
-
opts.on("--lower-threshold NUM", "kmer frequency cutoff to saying 'not present' [required]") do |arg|
|
63
|
-
options[:lower_threshold] = arg.to_i
|
64
|
-
end
|
65
|
-
opts.on("--reads FILES", "comma-separated list of sequence reads files in the same order as the pattern was supplied [required]") do |arg|
|
66
|
-
options[:reads_files] = arg.split(',').collect{|r| File.absolute_path r}
|
67
|
-
end
|
68
|
-
opts.on("--start-contig FASTA", "path to a fasta file with the starting contig in it (only). Assumes we are building off the end of this contig [required]") do |arg|
|
69
|
-
options[:start_contig] = extract_exactly_one_contig_from_file.call arg
|
70
|
-
end
|
71
|
-
opts.on("--end-contig FASTA", "path to a fasta file with the ending contig in it (only). Assumes we are building onto the start of this contig [required]") do |arg|
|
72
|
-
options[:end_contig] = extract_exactly_one_contig_from_file.call arg
|
73
|
-
end
|
74
|
-
|
75
|
-
opts.separator "\nOptional arguments:\n\n"
|
76
|
-
opts.on("--min-leftover-read-length NUMBER", "when searching for reads with kmers, require the kmer to be at the beginning or end of the selected read [default: #{options[:min_leftover_length]}]") do |arg|
|
77
|
-
options[:min_leftover_length] = arg.to_i
|
78
|
-
end
|
79
|
-
opts.on("--kmer-coverage-target NUMBER", "when searching for reads with kmers, require this many copies per kmer [default: #{options[:kmer_coverage_target]}]") do |arg|
|
80
|
-
options[:kmer_coverage_target] = arg.to_i
|
81
|
-
end
|
82
|
-
opts.on("--already-patterned-reads FILE", "Attempt to assemble the reads in the specified file, useful for re-assembly [default: off]") do |arg|
|
83
|
-
options[:already_patterned_reads] = arg
|
84
|
-
end
|
85
|
-
opts.on("--output-assembly PATH", "Output assembly intermediate files to this directory [default: off]") do |arg|
|
86
|
-
options[:output_assembly_path] = arg
|
87
|
-
end
|
88
|
-
opts.on("--assembly-png PATH", "Output assembly as a PNG file [default: off]") do |arg|
|
89
|
-
options[:output_graph_png] = arg
|
90
|
-
end
|
91
|
-
opts.on("--assembly-svg PATH", "Output assembly as an SVG file [default: off]") do |arg|
|
92
|
-
options[:output_graph_svg] = arg
|
93
|
-
end
|
94
|
-
opts.on("--assembly-dot PATH", "Output assembly as an DOT file [default: off]") do |arg|
|
95
|
-
options[:output_graph_dot] = arg
|
96
|
-
end
|
97
|
-
# opts.on("--output-begin-kmers PATH", "Output kmers found at the beginning point to this file [default: off]") do |arg|
|
98
|
-
# options[:output_begin_kmers] = arg
|
99
|
-
# end
|
100
|
-
# opts.on("--output-end-kmers PATH", "Output kmers found at the ending point to this file [default: off]") do |arg|
|
101
|
-
# options[:output_end_kmers] = arg
|
102
|
-
# end
|
103
|
-
opts.on("--assembly-coverage-cutoff NUMBER", "Require this much coverage in each node, all other nodes are removed [default: #{options[:assembly_coverage_cutoff]}]") do |arg|
|
104
|
-
options[:assembly_coverage_cutoff] = arg.to_f
|
105
|
-
end
|
106
|
-
opts.on("--contig-end-length LENGTH", "Number of base pairs to start into the ends of the contigs [default: #{options[:contig_end_length]}]") do |arg|
|
107
|
-
options[:contig_end_length] = arg.to_i
|
108
|
-
end
|
109
|
-
|
110
|
-
# logger options
|
111
|
-
opts.separator "\nVerbosity:\n\n"
|
112
|
-
opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
|
113
|
-
opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
|
114
|
-
opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
|
115
|
-
end; o.parse!
|
116
|
-
if ARGV.length != 0 or options[:upper_threshold].nil? or options[:lower_threshold].nil? or options[:pattern].nil? or options[:kmer_multiple_abundance_file].nil? or options[:reads_files].nil?
|
117
|
-
pp options
|
118
|
-
$stderr.puts o
|
119
|
-
exit 1
|
120
|
-
end
|
121
|
-
# Setup logging
|
122
|
-
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
|
123
|
-
Bio::Log::LoggerPlus.new 'bio-velvet'
|
124
|
-
Bio::Log::CLI.configure 'bio-velvet'
|
125
|
-
|
126
|
-
pooled_reads_filename = 'pooled_sampled_reads.fasta'
|
127
|
-
if options[:already_patterned_reads] #If skipping read extraction
|
128
|
-
pooled_reads_filename = options[:already_patterned_reads]
|
129
|
-
|
130
|
-
else
|
131
|
-
# Parse pattern from cmdline
|
132
|
-
desired_pattern = KmerAbundancePattern.new
|
133
|
-
desired_pattern.parse_from_human(options[:pattern])
|
134
|
-
if options[:reads_files].length != desired_pattern.length
|
135
|
-
# The option parser stores the list under :reads_files; options[:reads] is always nil and would raise NoMethodError while building this message
raise "Number of entries in the pattern #{desired_pattern.length} and number of reads files #{options[:reads_files].length} not equivalent!"
|
136
|
-
end
|
137
|
-
|
138
|
-
# Collect the kmers that will be used to find trusted reads i.e.
|
139
|
-
# Go through each line of the kmer abundance file, looking for kmers that suit the pattern
|
140
|
-
input_file = File.open options[:kmer_multiple_abundance_file]
|
141
|
-
csv = CSV.new(input_file, :col_sep => ' ')
|
142
|
-
|
143
|
-
whitelist_kmers = []
|
144
|
-
blacklist_kmers = []
|
145
|
-
csv.each do |row|
|
146
|
-
max_i = row.length - 2 if max_i.nil?
|
147
|
-
|
148
|
-
kmer = row[0]
|
149
|
-
counts = row[1...row.length].collect{|s| s.to_i}
|
150
|
-
probe = 'TTACATCTTATCTACAATAAACCTTCTGCCTTAGTTTTAGAGCCTATCCGAAAAGTCCTGCTGCTCTGAATGTTATCCAAGCACATGCAAAATGAATTAGT'
|
151
|
-
this_pattern = []
|
152
|
-
counts.each_with_index do |count, i|
|
153
|
-
if count > options[:upper_threshold]
|
154
|
-
this_pattern[i] = true
|
155
|
-
elsif count < options[:lower_threshold]
|
156
|
-
this_pattern[i] = false
|
157
|
-
else
|
158
|
-
# coverage was in no man's land between thresholds.
|
159
|
-
# Ignore this kmer as noise.
|
160
|
-
this_pattern[i] = '-'
|
161
|
-
end
|
162
|
-
end
|
163
|
-
#log.debug "Found pattern #{this_pattern} from kmer #{kmer}, which has abundances #{counts}" if log.debug?
|
164
|
-
|
165
|
-
if desired_pattern.consistent_with? this_pattern
|
166
|
-
whitelist_kmers.push row[0]
|
167
|
-
else
|
168
|
-
# kmer is not present when it should be
|
169
|
-
blacklist_kmers.push row[0]
|
170
|
-
end
|
171
|
-
end
|
172
|
-
log.info "After parsing the kmer multiple abundance file, found #{whitelist_kmers.length} kmers that matched the pattern, and #{blacklist_kmers.length} that didn't"
|
173
|
-
unless whitelist_kmers.length > 0
|
174
|
-
log.error "No kmers found that satisfy the given pattern, exiting.."
|
175
|
-
exit 1
|
176
|
-
end
|
177
|
-
|
178
|
-
|
179
|
-
#outdir = options[:output_directory]
|
180
|
-
#Dir.mkdir outdir unless Dir.exist?(outdir)
|
181
|
-
|
182
|
-
# grep the pattern out from the raw reads, subsampling so as to not overwhelm the assembler
|
183
|
-
#Tempfile.open('whitelist') do |white|
|
184
|
-
File.open 'whitelist', 'w' do |white|
|
185
|
-
white.puts whitelist_kmers.join("\n")
|
186
|
-
white.close
|
187
|
-
|
188
|
-
#Tempfile.open('blacklist') do |black|
|
189
|
-
File.open('black','w') do |black|
|
190
|
-
black.puts blacklist_kmers.join("\n")
|
191
|
-
black.close
|
192
|
-
|
193
|
-
threadpool = []
|
194
|
-
sampled_read_files = []
|
195
|
-
log.info "Extracting reads that contain suitable kmers"
|
196
|
-
options[:reads_files].each_with_index do |file, i|
|
197
|
-
next unless desired_pattern[i] #Don't extract reads from reads where those reads should not have been amplified
|
198
|
-
|
199
|
-
sampled = File.basename(file)+'.sampled_reads.fasta'
|
200
|
-
sampled_read_files.push sampled
|
201
|
-
|
202
|
-
grep_path = "#{ENV['HOME']}/git/priner/bin/read_selection_by_kmer "
|
203
|
-
if options[:min_leftover_length]
|
204
|
-
grep_path += "--min-leftover-length #{options[:min_leftover_length]} "
|
205
|
-
end
|
206
|
-
thr = Thread.new do
|
207
|
-
grep_cmd = "#{grep_path} --whitelist #{white.path} --blacklist #{black.path} --reads #{file} --kmer-coverage-target #{options[:kmer_coverage_target]} > #{sampled}"
|
208
|
-
log.debug "Running cmd: #{grep_cmd}"
|
209
|
-
status, stdout, stderr = systemu grep_cmd
|
210
|
-
log.debug stderr
|
211
|
-
|
212
|
-
# Explain the failure: a bare `raise` with no active exception only reports "unhandled exception"
raise "Read extraction failed for #{file} (exit status #{status.exitstatus}): #{stderr}" unless status.exitstatus == 0
|
213
|
-
log.debug "Finished extracting reads from #{file}"
|
214
|
-
end
|
215
|
-
threadpool.push thr
|
216
|
-
end
|
217
|
-
threadpool.each do |thread| thread.join; end #wait until everything is finito
|
218
|
-
|
219
|
-
log.info "Finished extracting reads for sampling. Now pooling sampled reads"
|
220
|
-
pool_cmd = "cat #{sampled_read_files.join ' '} >#{pooled_reads_filename}"
|
221
|
-
log.debug "Running cmd: #{pool_cmd}"
|
222
|
-
status, stdout, stderr = systemu pool_cmd
|
223
|
-
raise stderr if stderr != ''
|
224
|
-
# Explain the failure: a bare `raise` with no active exception only reports "unhandled exception"
raise "Pooling sampled reads failed (exit status #{status.exitstatus})" unless status.exitstatus == 0
|
225
|
-
end
|
226
|
-
end
|
227
|
-
end
|
228
|
-
|
229
|
-
log.info "Extracting dummy reads from the ends of contigs to use as anchors"
|
230
|
-
start_contig = options[:start_contig]
|
231
|
-
end_contig = options[:end_contig]
|
232
|
-
if [start_contig.length, end_contig.length].min < 2*options[:contig_end_length]
|
233
|
-
log.warn "Choice of initial/terminal nodes to perform graph search with may not be optimal due to the small contig size"
|
234
|
-
end
|
235
|
-
if [start_contig.length, end_contig.length].min < options[:contig_end_length]
|
236
|
-
log.error "At least one contig too small to proceed with current code base, need to fix the code to allow such a small contig"
|
237
|
-
exit 1
|
238
|
-
end
|
# Use the last bit of the first contig and the first bit of the second contig as the anchors.
# The anchors are written to a temporary FASTA file which is then assembled
# together with the pooled reads; velvet_result is assigned inside the block
# and read afterwards, hence the pre-declaration.
velvet_result = nil
Tempfile.open('anchors.fa') do |anchors_file|
  contig_end_length = options[:contig_end_length]
  start_anchor = start_contig[(start_contig.length-contig_end_length)...start_contig.length]
  # The end anchor has to be in reverse, because the node finder finds the
  # node at the start of the read, not the end.
  end_anchor = Bio::Sequence::NA.new(end_contig[0...contig_end_length]).reverse_complement.to_s

  # Putting these same sequences in many times seems to better the
  # chances velvet won't throw them out
  50.times do
    anchors_file.puts ">start_contig"
    anchors_file.puts start_anchor
    anchors_file.puts ">end_contig"
    anchors_file.puts end_anchor
  end
  # Close so the contents are on disk before velvet is run against the path.
  anchors_file.close

  log.info "Assembling sampled reads with velvet"
  # Bit of a hack (original author's note): the anchors have to go in as the
  # first read category (-short, referred to as -short1 in that note) because
  # then start and end anchors will have IDs 1 and 2, respectively.
  velveth_read_arguments = "-short #{anchors_file.path} -short2 #{pooled_reads_filename}"
  velvetg_arguments = "-cov_cutoff #{options[:assembly_coverage_cutoff]} -read_trkg yes"
  velvet_result = Bio::Velvet::Runner.new.velvet(
    options[:velvet_kmer_size],
    velveth_read_arguments,
    velvetg_arguments,
    :output_assembly_path => options[:output_assembly_path]
  )

  # Only dump the (potentially large) tool output when debugging.
  if log.debug?
    log.debug "velveth stdout: #{velvet_result.velveth_stdout}"
    log.debug "velveth stderr: #{velvet_result.velveth_stderr}"
    log.debug "velvetg stdout: #{velvet_result.velvetg_stdout}"
    log.debug "velvetg stderr: #{velvet_result.velvetg_stderr}"
  end
  log.info "Finished running assembly"
end
271
|
-
|
# Load the assembly graph that velvet wrote into its result directory.
log.info "Parsing the graph output from velvet"
last_graph_path = File.join(velvet_result.result_directory, 'LastGraph')
graph = Bio::Velvet::Graph.parse_from_file(last_graph_path)
log.info "Finished parsing graph: found #{graph.nodes.length} nodes and #{graph.arcs.length} arcs"

# Optionally prune low-coverage nodes; sequences 1 and 2 are whitelisted.
coverage_cutoff = options[:assembly_coverage_cutoff]
if coverage_cutoff
  log.info "Removing low-coverage nodes from the graph (less than #{coverage_cutoff})"
  coverage_filter = Bio::AssemblyGraphAlgorithms::CoverageBasedGraphFilter.new
  removed_nodes, removed_arcs = coverage_filter.remove_low_coverage_nodes(
    graph,
    coverage_cutoff,
    :whitelisted_sequences => [1,2]
  )

  log.info "Removed #{removed_nodes.length} nodes and #{removed_arcs.length} arcs from the graph due to low coverage"
  log.info "Now there is #{graph.nodes.length} nodes and #{graph.arcs.length} arcs remaining"
end
284
|
-
|
# Look up the graph nodes that carry sequence IDs 1 and 2 (the start and end
# anchors). Each lookup also yields a second value, kept here as *_forward.
finder = Bio::AssemblyGraphAlgorithms::NodeFinder.new
log.info "Finding node representing the end of the first contig"
start_node, start_node_forward = finder.find_unique_node_with_sequence_id(graph, 1)
log.info "Finding node representing the start of the second contig"
# TODO: find the node nearest the end of this, not the start
end_node, end_node_forward = finder.find_unique_node_with_sequence_id(graph, 2)

if start_node.nil? || end_node.nil?
  log.error "Unable to find any nodes in the graph that have kmers corresponding to the _start_ point in them, sorry. Maybe fix the node finding code?" if start_node.nil?
  log.error "Unable to find any nodes in the graph that have kmers corresponding to the _end_ point in them, sorry. Maybe fix the node finding code?" if end_node.nil?

  # Even on failure, write whichever graph renderings were asked for so the
  # user has something to inspect.
  requested_outputs = [:png, :svg, :dot].collect do |format|
    [format, options["output_graph_#{format}".to_sym]]
  end.select { |_format, path| path }

  unless requested_outputs.empty?
    log.info "Converting assembly to a graphviz PNG/SVG/DOT, even if start/end node was not be found properly"
    gv = Bio::Assembly::ABVisualiser.new.graphviz(graph)
    requested_outputs.each do |format, path|
      log.info "Writing #{format.to_s.upcase} of graph to #{path}"
      gv.output format => path
    end
  end

  log.error "Unknown start or end points, giving up, sorry."
  exit 1
end
log.info "Node(s) found that are suitable as initial and terminal nodes in the graph search, respectively: #{start_node.node_id} and #{end_node.node_id}"
319
|
-
|
# Drop every node that cannot be reached from the start or end node, and
# report how much the graph shrank.
log.info "Removing nodes unconnected to either the start or the end from the graph.."
nodes_before = graph.nodes.length
arcs_before = graph.arcs.length
connectivity_filter = Bio::AssemblyGraphAlgorithms::ConnectivityBasedGraphFilter.new
connectivity_filter.remove_unconnected_nodes(graph, [start_node, end_node])
log.info "Removed #{nodes_before-graph.nodes.length} nodes and #{arcs_before-graph.arcs.length} arcs"

# Render the pruned graph in each requested format, with the start and end
# nodes passed to the visualiser. PNG and SVG are laid out with neato; DOT is
# written without a :use option.
[
  [:png, 'PNG', :neato],
  [:svg, 'SVG', :neato],
  [:dot, 'DOT', nil]
].each do |format, label, layout_program|
  output_path = options["output_graph_#{format}".to_sym]
  next unless output_path

  log.info "Converting assembly to a graphviz #{label}"
  # A fresh visualiser and rendering per format, as before.
  gv = Bio::Assembly::ABVisualiser.new.graphviz(
    graph,
    {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id}
  )
  if layout_program
    gv.output format => output_path, :use => layout_program
  else
    gv.output format => output_path
  end
end
345
|
-
|
# Enumerate trails through the graph that lead from the start node to the end
# node, passing options[:graph_search_leash_length] to the search.
log.info "Searching for trails between the initial and terminal nodes, within the assembly graph"
cartographer = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
leash_length = options[:graph_search_leash_length]
# Disabled alternative (marked untested by the original author):
#trails = cartographer.find_all_trails_between_nodes(graph, start_node, end_node, options[:graph_search_leash_length], start_node_forward)
trails = cartographer.find_trails_between_nodes(
  graph,
  start_node,
  end_node,
  leash_length,
  start_node_forward
)
log.info "Found #{trails.length} trail(s) between the initial and terminal nodes"
352
|
-
|
# Load the per-kmer abundances, which are used below for reporting and for
# filtering the trails.
log.info "Reading kmer abundances from #{options[:kmer_multiple_abundance_file]}.."
kmer_hash = Bio::KmerMultipleAbundanceHash.parse_from_file options[:kmer_multiple_abundance_file]
log.info "Finished reading the kmer abundances"

# Optionally dump the kmer coverages of every trail to a file.
if options[:trail_kmer_coverage_file]
  log.info "Writing out kmer coverages to #{options[:trail_kmer_coverage_file]}.."
  writer = Bio::AssemblyGraphAlgorithms::KmerCoverageWriter.new
  # Block form so the handle is flushed and closed as soon as writing is done
  # (and also if the writer raises), rather than being left open until exit.
  File.open(options[:trail_kmer_coverage_file], 'w') do |io|
    writer.write(io, trails, kmer_hash)
  end
  log.info "Finished writing"
end
364
|
-
|
# Discard trails whose kmers do not meet the coverage thresholds derived from
# the desired pattern, then print the survivors to stdout in FASTA format.
log.info "Filtering trail(s) based on kmer coverage, requiring each kmer in the path to have a minimum of #{options[:kmer_path_filter_min_coverage]} coverage in patterned reads, except for the #{options[:kmer_path_end_exclusion_length]}bp at the ends"
kmer_path_filter = Bio::AssemblyGraphAlgorithms::KmerCoverageBasedPathFilter.new
# One threshold per pattern entry: 1 where the entry is exactly `true`, 0 for
# anything else. The strict comparison is deliberate, not a truthiness test.
thresholds = desired_pattern.map do |expected|
  if expected == true
    1
  else
    0
  end
end
log.info "Using thresholds for filtering: #{thresholds}"
trails = kmer_path_filter.filter(
  trails,
  kmer_hash,
  thresholds,
  :exclude_ending_length => options[:kmer_path_end_exclusion_length]
)
log.info "After filtering remained #{trails.length} trails"

trail_descriptions = trails.collect { |candidate| candidate.to_s }
log.debug "Found trails: #{trail_descriptions.join("\n")}"

# FASTA output: trails are numbered from 1.
trails.each_with_index do |candidate, index|
  puts ">trail#{index+1}"
  puts candidate.sequence
end