finishm 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +19 -19
- data/VERSION +1 -1
- data/finishm.gemspec +631 -0
- data/lib/assembly/graph_generator.rb +0 -1
- data/lib/assembly/probed_graph.rb +0 -1
- metadata +99 -96
- data/bin/assembly_visualiser +0 -106
- data/bin/check_primer_combinations.rb +0 -73
- data/bin/contig_joiner.rb +0 -244
- data/bin/contigs_against_assembly.rb +0 -153
- data/bin/finishm_assembler +0 -55
- data/bin/finishm_gap_closer.rb +0 -241
- data/bin/kmer_abundance_file_tool.rb +0 -49
- data/bin/kmer_pattern_to_assembly.rb +0 -377
- data/bin/kmer_profile_finder.rb +0 -92
- data/bin/kmers_count_parse.d +0 -52
- data/bin/kmers_count_tabulate.d +0 -123
- data/bin/kmers_count_tabulate.rb +0 -84
- data/bin/pcr_result_parser.rb +0 -108
- data/bin/primer_finder.rb +0 -119
- data/bin/read_selection_by_kmer.d +0 -174
- data/bin/scaffold_by_pattern.rb +0 -119
- data/bin/scaffold_connection_possibilities_to_knowns.rb +0 -193
- data/bin/scaffold_end_coverages.rb +0 -69
- data/bin/trail_validator.rb +0 -84
@@ -1,73 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
require 'bio-logger'
|
5
|
-
|
6
|
-
$:.unshift File.join(ENV['HOME'],'git','bioruby-primer3','lib')
|
7
|
-
require 'bio-primer3'
|
8
|
-
|
9
|
-
if __FILE__ == $0 #needs to be removed if this script is distributed as part of a rubygem
  # Command-line driver: tests one "left" primer against every primer listed
  # (one per line) in a file, using Bio::Primer3.test_primer_compatibility.
  # One tab-separated line per tested pair is written to stdout; summary
  # counts are written to the log.
  SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')

  # Parse command line options into the options hash
  options = {
    :logger => 'stderr',
  }
  o = OptionParser.new do |opts|
    # The usage line names the long options that are actually registered below.
    opts.banner = "
Usage: #{SCRIPT_NAME} --primer1 <primer1> --primers2 <primer_list_file>

Uses primer3's \"check primers\" to find whether primers match against each other\n\n"

    opts.on("--primer1 PRIMER", "Primer on one side [required]") do |arg|
      options[:primer1] = arg
    end
    opts.on("--primers2 PRIMER_FILE", "A list of primers in a file, newline separated [required]") do |arg|
      options[:primers2_file] = arg
    end

    # logger options
    opts.separator "\nVerbosity:\n\n"
    opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {Bio::Log::CLI.trace('error')}
    opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
    opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| Bio::Log::CLI.trace(s)}
  end; o.parse!

  # Both inputs are documented as required, so fail with the usage text rather
  # than crashing later on a nil primer or a nil file name.
  if !ARGV.empty? || options[:primer1].nil? || options[:primers2_file].nil?
    $stderr.puts o
    exit 1
  end

  # Setup logging. bio-logger defaults to STDERR not STDOUT, I disagree
  Bio::Log::CLI.logger(options[:logger]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)

  # Read in input data. File.readlines closes the file for us; blank lines are
  # dropped so they are not submitted to primer3 as empty primers.
  primers1 = [options[:primer1]]
  primers2 = File.readlines(options[:primers2_file]).collect{|line| line.strip}.reject{|primer| primer.empty?}
  log.info "Read in #{primers1.length} left primers and #{primers2.length} right primers e.g. #{primers1[0]} and #{primers2[0]}"

  goods = 0
  bads = 0
  failed_to_run = 0
  primers1.each do |primer1|
    primers2.each do |primer2|
      begin
        result, obj = Bio::Primer3.test_primer_compatibility primer1, primer2, 'PRIMER_EXPLAIN_FLAG'=>1

        puts [
          primer1, primer2, result, obj['PRIMER_LEFT_EXPLAIN'], obj['PRIMER_RIGHT_EXPLAIN']
        ].join "\t"

        if result
          goods += 1
        else
          bads += 1
        end
      rescue StandardError => e
        # Only ordinary errors are counted as failed runs, so Ctrl-C and
        # `exit` still stop the script. The reason is kept at debug level.
        failed_to_run += 1
        log.debug "Primer3 check failed for #{primer1} / #{primer2}: #{e.class}: #{e.message}"
      end
    end
  end
  log.info "Found #{goods} OK primer pairs and #{bads} not OK primer pairs"
  log.warn "#{failed_to_run} weren't checked by Primer3 because it failed to run" if failed_to_run > 0
end #end if running as a script
|
data/bin/contig_joiner.rb
DELETED
@@ -1,244 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
require 'bio-logger'
|
5
|
-
require 'bio-velvet'
|
6
|
-
require 'tempfile'
|
7
|
-
require 'pp'
|
8
|
-
|
9
|
-
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = 'finishm'
|
10
|
-
$:.unshift File.join(File.dirname(__FILE__),'..','lib')
|
11
|
-
require 'priner'
|
12
|
-
|
13
|
-
# Parse command line options into the options hash
|
14
|
-
# Default settings; the command line parser below overrides or extends them.
options = {}
options[:logger] = 'stderr'
options[:log_level] = 'info'
options[:velvet_kmer_size] = 73 #TODO: these options should be exposed to the user, and perhaps not guessed at
options[:velvetg_arguments] = '-read_trkg yes' # -exp_cov 41 -cov_cutoff 12.0973243610491', #hack
options[:contig_end_length] = 300
options[:output_assembly_path] = 'velvetAssembly'
options[:graph_search_leash_length] = 3000

# Command line definition: two required inputs, optional shortcuts for
# re-using earlier work, and the verbosity switches.
parser = OptionParser.new
parser.banner = "
Usage: #{SCRIPT_NAME} --reads <read_file> --contigs <contigs_file>

Takes a set of reads and a set of contigs. Then it runs an assembly based on those reads,
and tries to fill in possible gaps between the contigs. There may be multiple ways
to join two contig ends together - in this that multiple cases are reported. \n\n"

parser.on("--reads FILE", "gzipped fastq file of reads to perform the re-assembly with [required]") do |path|
  options[:reads_file] = path
end
parser.on("--contigs FILE", "fasta file of contigs to be joined together [required]") do |path|
  options[:contigs_file] = path
end

parser.separator "\nOptional arguments:\n\n"
parser.on("--output-trails-fasta PATH", "Output found paths to this file in fasta format [default: off]") do |path|
  options[:overall_trail_output_fasta_file] = path
end
parser.on("--already-assembled-velvet-directory PATH", "Skip until after assembly in this process, and start from this assembly directory created during a previous run of this script [default: off]") do |path|
  options[:previous_assembly] = path
end
parser.on("--serialize-velvet-graph FILE", "So that the velvet graph does not have to be reparsed, serialise the parsed object for later use in this file [default: off]") do |path|
  options[:serialize_parsed_graph_file] = path
end
parser.on("--already-serialized-velvet-graph FILE", "Restore the parsed velvet graph from this file [default: off]") do |path|
  options[:previously_serialized_parsed_graph_file] = path
end

# Verbosity switches only record the requested level; it is applied once the
# logger is configured further down.
parser.separator "\nVerbosity:\n\n"
parser.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") do
  options[:log_level] = 'error'
end
parser.on("--logger filename",String,"Log to file [default #{options[:logger]}]") do |name|
  options[:logger] = name
end
parser.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG") do |level|
  options[:log_level] = level
end
parser.parse!

# Stray positional arguments or a missing required file: show usage and stop.
missing_required_input = options[:reads_file].nil? || options[:contigs_file].nil?
if !ARGV.empty? || missing_required_input
  $stderr.puts parser
  exit 1
end

# Configure logging for this script, then for the bio-velvet library.
Bio::Log::CLI.logger(options[:logger])
Bio::Log::CLI.trace(options[:log_level])
log = Bio::Log::LoggerPlus.new(LOG_NAME)
Bio::Log::CLI.configure(LOG_NAME)
Bio::Log::LoggerPlus.new 'bio-velvet'
Bio::Log::CLI.configure 'bio-velvet'

# Containers filled while the contig ends are extracted from the input
# contigs, so that the ends can later be located in the assembly graph.
contig_ends = []
velvet_sequence_id_to_contig_end = {}
contig_lengths = {}
|
73
|
-
# Plain record describing one extremity of an input contig. Every field is a
# read/write attribute and starts out as nil.
class ContigEnd
  # Nucleotide sequence used to anchor this contig end
  attr_accessor :sequence
  # Which side of the contig this is (the script assigns :start or :end)
  attr_accessor :start_or_end
  # Name of the contig this end was taken from
  attr_accessor :contig_name
  # Sequence ID under which this end is handed to velvet
  attr_accessor :velvet_sequence_id
end
|
76
|
-
# Build two ContigEnd records (start and end) for every contig in the input
# fasta file. Ends receive consecutive velvet sequence IDs starting at 1, in
# the order start-of-contig then end-of-contig.
velvet_read_index = 1
anchor_length = options[:contig_end_length]

# Stores one contig end under the next free velvet sequence ID, in both the
# ordered list and the ID lookup table.
register_contig_end = lambda do |contig_name, side, sequence|
  contig_end = ContigEnd.new
  contig_end.start_or_end = side
  contig_end.sequence = sequence
  contig_end.contig_name = contig_name
  contig_end.velvet_sequence_id = velvet_read_index
  velvet_sequence_id_to_contig_end[velvet_read_index] = contig_end
  contig_ends.push contig_end
  velvet_read_index += 1
end

Bio::FlatFile.foreach(options[:contigs_file]) do |seq|
  s = seq.seq
  contig_lengths[seq.definition] = s.length
  if s.length < anchor_length
    log.warn "Contig #{seq.definition} is shorter than the end length used to anchor the contig in the assembly. This is not ideal but may be ok."
    #TODO: fix this - should be counting from the middle. Should I just ignore those ones?
  end

  # Add the start of the contig (stored reverse-complemented)
  start_sequence = Bio::Sequence::NA.new(s[0...anchor_length]).reverse_complement.to_s
  register_contig_end.call(seq.definition, :start, start_sequence)

  # Add the back of the contig. The slice start is clamped at 0: for a contig
  # shorter than the anchor length a negative start lies outside the string
  # and the slice would be nil instead of the whole contig.
  tail_start = [s.length - anchor_length, 0].max
  register_contig_end.call(seq.definition, :end, s[tail_start...s.length])
end
log.info "Parsed in #{contig_ends.length} contig ends from the two sides of each input contig"
|
104
|
-
|
105
|
-
|
106
|
-
# Obtain the velvet assembly graph in one of three ways: restore a previously
# serialised graph, parse the graph of a previous assembly directory, or run
# a fresh velvet assembly and parse its graph.
graph = nil
if options[:previously_serialized_parsed_graph_file].nil?
  velvet_result = nil
  if options[:previous_assembly].nil? #If assembly has not already been carried out
    Tempfile.open('anchors.fa') do |tempfile|
      contig_ends.each do |contig_end|
        tempfile.puts ">anchor#{contig_end.velvet_sequence_id}"
        tempfile.puts contig_end.sequence
      end
      # velvet opens the file by path, so buffered writes must reach the disk
      # first or it may read an empty or truncated anchors file.
      tempfile.flush

      log.info "Assembling sampled reads with velvet"
      # Bit of a hack, but the anchors have to be the first read category
      # (-short) because then start and end anchors will have node IDs 1,2,... etc.
      velvet_result = Bio::Velvet::Runner.new.velvet(
        options[:velvet_kmer_size],
        "-short #{tempfile.path} -short2 -fastq.gz #{options[:reads_file]}",
        options[:velvetg_arguments],
        :output_assembly_path => options[:output_assembly_path]
      )
      if log.debug?
        log.debug "velveth stdout: #{velvet_result.velveth_stdout}"
        log.debug "velveth stderr: #{velvet_result.velveth_stderr}"
        log.debug "velvetg stdout: #{velvet_result.velvetg_stdout}"
        log.debug "velvetg stderr: #{velvet_result.velvetg_stderr}"
      end
      log.info "Finished running assembly"
    end
  else
    log.info "Using previous assembly stored at #{options[:previous_assembly]}"
    velvet_result = Bio::Velvet::Result.new
    velvet_result.result_directory = options[:previous_assembly]
  end

  # Profiling the graph parse is a debugging aid. It only runs at DEBUG level
  # and reports on stderr, so normal runs do not need ruby-prof installed and
  # the tab-separated results on stdout stay clean.
  profiling = log.debug?
  if profiling
    require 'ruby-prof'
    RubyProf.start
  end

  log.info "Parsing the graph output from velvet"
  graph = Bio::Velvet::Graph.parse_from_file(File.join(velvet_result.result_directory, 'Graph2'))
  log.info "Finished parsing graph: found #{graph.nodes.length} nodes and #{graph.arcs.length} arcs"

  if profiling
    RubyProf::FlatPrinter.new(RubyProf.stop).print($stderr)
  end

  if options[:serialize_parsed_graph_file]
    log.info "Storing a binary version of the graph file for later use at #{options[:serialize_parsed_graph_file]}"
    File.open(options[:serialize_parsed_graph_file],'wb') do |f|
      Marshal.dump(graph, f)
    end
    log.info "Stored a binary representation of the velvet graph at #{options[:serialize_parsed_graph_file]}"
  end
else
  log.info "Restoring graph file from #{options[:previously_serialized_parsed_graph_file]}.."
  # NOTE(review): Marshal.load can instantiate arbitrary objects. Only restore
  # files written by this script's --serialize-velvet-graph option.
  # Binary mode and the block form make the read byte-exact and close the file.
  graph = File.open(options[:previously_serialized_parsed_graph_file], 'rb') do |f|
    Marshal.load(f)
  end
  log.info "Restoration complete"
end
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
# Find the anchoring nodes for each of the contig ends
finder = Bio::AssemblyGraphAlgorithms::NodeFinder.new
log.info "Finding node representing the end of the each contig"
anchor_sequence_ids = contig_ends.collect{|c| c.velvet_sequence_id}
anchoring_nodes_and_directions = finder.find_unique_nodes_with_sequence_ids(graph, anchor_sequence_ids)
num_anchors_found = anchoring_nodes_and_directions.reject{|seq_id, node_and_direction| node_and_direction[0].nil?}.length

# Map each anchoring node ID back to the contig end it represents. A pair
# whose first element is nil means no node was found for that contig end.
anchoring_node_id_to_contig_end = {}
anchoring_nodes_and_directions.each do |seq_id, node_and_direction|
  next if node_and_direction[0].nil? #skip when there is no node found in the graph for this contig end
  anchoring_node_id_to_contig_end[node_and_direction[0].node_id] = velvet_sequence_id_to_contig_end[seq_id]
end
log.info "Found anchoring nodes for #{num_anchors_found} out of #{contig_ends.length} contig ends"

log.info "Searching for trails between the nodes within the assembly graph"
cartographer = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
found_anchors = anchoring_nodes_and_directions.values.reject{|v| v[0].nil?}
trail_sets = cartographer.find_trails_between_node_set(graph, found_anchors, options[:graph_search_leash_length])
log.info "Found #{trail_sets.map(&:length).reduce(0, :+)} trail(s) in total"

# The node ID => contig end description needed below is the map built above.
# Re-using it keeps the "no node found" test in one place; the previous
# second pass tested `pair.empty?` and crashed on pairs such as [nil, ...].
node_id_to_contig_description = anchoring_node_id_to_contig_end

contig_end_id_to_partners = {}
# Tabulate all the partners each way (complete the previously triangular matrix)
trail_sets.each do |trail_set|
  trail_set.each do |trail|
    start_id = trail.first.node.node_id
    end_id = trail.last.node.node_id
    contig_end_id_to_partners[start_id] ||= []
    contig_end_id_to_partners[start_id].push node_id_to_contig_description[end_id]
    contig_end_id_to_partners[end_id] ||= []
    contig_end_id_to_partners[end_id].push node_id_to_contig_description[start_id]
  end
end

puts %w(contig_end_id contig_name contig_length connections).join "\t"
# NOTE(review): this assumes trail_sets lines up index-for-index with
# contig_ends and that anchoring node IDs equal velvet sequence IDs. Neither
# is established here (trail sets come from the found anchors only) - confirm.
trail_sets.each_with_index do |trail_set, i|
  contig_end = contig_ends[i]
  partner_contig_ends = contig_end_id_to_partners[contig_end.velvet_sequence_id] || []
  # Each contig has 2 trail sets associated with it - one for the start and one for the end
  puts [
    contig_end.velvet_sequence_id,
    contig_end.contig_name,
    contig_lengths[contig_end.contig_name],
    partner_contig_ends.collect{|c| c.velvet_sequence_id}.sort.join(',')
  ].join("\t")
end
|
214
|
-
|
215
|
-
# When requested, write every found trail to a fasta file. Each record is
# named "<contig>_<side>:<contig>_<side>" after the contig ends the trail
# joins; trails whose sequence cannot be produced are logged and skipped.
trails_fasta_path = options[:overall_trail_output_fasta_file]
if trails_fasta_path
  File.open(trails_fasta_path,'w') do |fasta|
    trail_sets.each do |set_of_trails|
      set_of_trails.each do |path|
        begin
          # Ask for the sequence before writing anything, as this may not be possible.
          path_sequence = path.sequence

          from_end = anchoring_node_id_to_contig_end[path.first.node.node_id]
          to_end = anchoring_node_id_to_contig_end[path.last.node.node_id]

          fasta.print ">#{from_end.contig_name}_#{from_end.start_or_end}:"
          fasta.puts "#{to_end.contig_name}_#{to_end.start_or_end}"
          fasta.puts path_sequence
        rescue Bio::Velvet::NotImplementedException => e
          log.warn "Problem getting sequence of found trail #{path.to_s}, skipping this trail: #{e.to_s}"
        end
      end
    end
  end
end
|
243
|
-
|
244
|
-
|
@@ -1,153 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
require 'bio-logger'
|
5
|
-
require 'systemu'
|
6
|
-
|
7
|
-
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
|
8
|
-
|
9
|
-
# Parse command line options into the options hash
|
10
|
-
# Defaults, overridden by the command line options parsed below.
options = {
  :logger => 'stderr',
  :log_level => 'info',
}
o = OptionParser.new do |opts|
  # The usage line lists the two options that are actually required (the old
  # text advertised a -b option that was never registered).
  opts.banner = "
Usage: #{SCRIPT_NAME} --query <contigs.fasta> --blastdb <assembly_blastdb_basename>

Takes a set of contigs, and an assembly. Works out if there are any contigs where there is a blast hit spanning of the contigs using two of the assembly's contig ends.\n\n"

  opts.on("--query FASTA_FILE", "new contigs fasta file [Required]") do |arg|
    options[:query_file] = arg
  end
  opts.on("--blastdb FASTA_FILE_FORMATTED", "basename of makeblastdb output [Required]") do |arg|
    options[:blastdb] = arg
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
end; o.parse!

# Stray positional arguments or a missing required option: show usage and stop.
if !ARGV.empty? || options[:query_file].nil? || options[:blastdb].nil?
  $stderr.puts o
  exit 1
end

# Setup logging
Bio::Log::CLI.logger(options[:logger])
Bio::Log::CLI.trace(options[:log_level])
log = Bio::Log::LoggerPlus.new(LOG_NAME)
Bio::Log::CLI.configure(LOG_NAME)
|
39
|
-
|
40
|
-
|
41
|
-
# Read in the blast file
|
42
|
-
blast_results = []
|
43
|
-
# One row of blastn tabular output (qseqid sseqid pident length mismatch
# gapopen qstart qend sstart send evalue bitscore qlen slen), plus predicates
# telling whether the hit reaches an end of the query or of the subject.
#
# A hit "reaches" an end when it comes within +cutoff_inwards+ bases of it and
# the alignment is at least +minimum_hit_length+ long.
class BlastResult
  attr_accessor :qseqid, :sseqid, :pident, :length, :mismatch, :gapopen, :qstart, :qend, :sstart, :subject_end, :evalue, :bitscore, :query_length, :subject_length

  # How far from a sequence end a hit may stop and still count as reaching it
  attr_accessor :cutoff_inwards

  # Shortest alignment length that is considered at all
  attr_accessor :minimum_hit_length

  def initialize
    @cutoff_inwards = 500
    @minimum_hit_length = 100
  end

  # True if the hit extends to within cutoff_inwards of the subject's last base.
  def hits_end_of_subject?
    subject_rightmost >= @subject_length-@cutoff_inwards && long_enough?
  end

  # True if the hit starts within cutoff_inwards of the subject's first base.
  def hits_start_of_subject?
    subject_leftmost <= @cutoff_inwards && long_enough?
  end

  # True if the hit extends to within cutoff_inwards of the query's last base.
  def hits_end_of_query?
    @qend >= @query_length-@cutoff_inwards && long_enough?
  end

  # True if the hit starts within cutoff_inwards of the query's first base.
  def hits_start_of_query?
    @qstart <= @cutoff_inwards && long_enough?
  end

  private

  # Whether the alignment is long enough to be trusted.
  def long_enough?
    @length >= @minimum_hit_length
  end

  # blastn reports minus-strand hits with sstart > send, so the lowest and
  # highest subject coordinates are taken from whichever of the two applies.
  def subject_leftmost
    [@sstart, @subject_end].min
  end

  def subject_rightmost
    [@sstart, @subject_end].max
  end
end
|
68
|
-
|
69
|
-
# Run blastn of the query contigs against the assembly database and turn each
# tabular output row into a BlastResult appended to blast_results.
require 'shellwords'

# Column order requested from blast; the attribute names mirror it
# (send/qlen/slen are stored as subject_end/query_length/subject_length).
blast_columns = [:qseqid, :sseqid, :pident, :length, :mismatch, :gapopen, :qstart,
  :qend, :sstart, :subject_end, :evalue, :bitscore,
  :query_length, :subject_length]
integer_columns = [:length, :mismatch, :gapopen, :qstart,
  :qend, :sstart, :subject_end, :query_length, :subject_length]
float_columns = [:pident, :evalue, :bitscore]

# Shellwords escapes every argument for the shell. String#inspect only adds
# double quotes, inside which the shell still expands $VAR and backticks.
blast_command = Shellwords.join([
  'blastn',
  '-query', options[:query_file],
  '-db', options[:blastdb],
  '-outfmt', '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen',
  '-evalue', '1e-5',
])
status, blast_output, stderr = systemu blast_command
# Anything on stderr is treated as a failure, as is a non-zero exit status.
raise stderr unless stderr==""
raise "bad status running blast" unless status.exitstatus == 0
log.debug "Finished running blast, presumably successfully"

blast_output.each_line do |line|
  next if line.strip.empty? # nothing to parse on a blank line
  row = line.chomp.split "\t"
  res = BlastResult.new
  blast_columns.each_with_index do |attr, i|
    value = row[i]
    # Coordinates and lengths become integers, scores become floats, and the
    # two sequence IDs stay strings.
    if integer_columns.include?(attr)
      value = value.to_i
    elsif float_columns.include?(attr)
      value = value.to_f
    end
    res.send "#{attr}=".to_sym, value
  end

  blast_results.push res
end
log.info "Parsed #{blast_results.length} blast results e.g. #{blast_results[0].inspect}"
|
93
|
-
|
94
|
-
|
95
|
-
# Index the blast results by query sequence and by subject (hit) sequence.
query_to_blast_results = blast_results.group_by{|result| result.qseqid}
hit_to_blast_results = blast_results.group_by{|result| result.sseqid}

# For each query sequence, does it map to the ends of both contigs
# The header is printed lazily, just before the first result row.
header = %w(query subject1 subject2 qstart1? qend1? sstart1? send1? qstart2? qend2? sstart2? send2?).join("\t")
query_to_blast_results.each do |query_id, hits|
  keepers = []

  hits.each do |hit|
    # perfect if it hits the start or the end (but not both) of both the query and the subject, unless it is circular
    if (hit.hits_start_of_query? ^ hit.hits_end_of_query?) &&
        (hit.hits_start_of_subject? ^ hit.hits_end_of_subject?)
      keepers.push hit
    elsif hit.hits_start_of_query? || hit.hits_end_of_query? ||
        hit.hits_start_of_subject? || hit.hits_end_of_subject?
      log.info "There's a half-correct hit for #{query_id}: qstart? #{hit.hits_start_of_query?} qend #{hit.hits_end_of_query?} "+
        "sstart #{hit.hits_start_of_subject?} send #{hit.hits_end_of_subject?}, to subject sequence #{hit.sseqid}"
    end
  end

  case keepers.length
  when 0
    log.debug "no latchings found for #{query_id}"
  when 1
    log.info "Query #{query_id} only latches on to a single end, maybe manually inspect"
  when 2
    log.debug "Query #{query_id} has 2 keepers!"
    first_hit, second_hit = keepers
    # The pair is consistent when one hit is at the query start and the other
    # is not, and likewise for the subject start.
    query_sides_differ = first_hit.hits_start_of_query? != second_hit.hits_start_of_query?
    subject_sides_differ = first_hit.hits_start_of_subject? != second_hit.hits_start_of_subject?
    if query_sides_differ && subject_sides_differ
      flags = keepers.collect{|hit|
        [
          hit.hits_start_of_query?,
          hit.hits_end_of_query?,
          hit.hits_start_of_subject?,
          hit.hits_end_of_subject?,
        ]
      }.flatten
      if header
        puts header
        header = nil
      end
      puts [query_id, first_hit.sseqid, second_hit.sseqid, flags].flatten.join("\t")
    else
      log.info "Query #{query_id} has 2 keepers, but they are fighting it seems"
    end
  else
    log.info "More than 2 keepers found for #{query_id}, manual inspection likely required"
  end
end
|
153
|
-
|