crb-blast 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,19 @@
1
+ require 'open3'
2
+
3
module CRB_Blast

  # Wraps a single command line and remembers what it printed and how it
  # exited once it has been run.
  class Cmd

    attr_accessor :cmd
    attr_accessor :stdout, :stderr, :status

    # @param [String] cmd the command line to execute
    def initialize(cmd)
      @cmd = cmd
    end

    # Executes the command with Open3.capture3 and stores the three results
    # in the stdout, stderr and status attributes.
    #
    # @return [Array] the captured stdout, stderr and status, in that order
    def run
      captured = Open3.capture3(@cmd)
      @stdout, @stderr, @status = captured
      captured
    end

  end

end
@@ -0,0 +1,515 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bio'
4
+ require 'which'
5
+ require 'threach'
6
+
7
+ module CRB_Blast
8
+
9
# Reopens BioRuby's FASTA entry class to add two sequence-type predicates.
# Both ask Bio::Sequence.guess to classify the entry's sequence, passing
# 0.9 and 500 as its second and third arguments.
class Bio::FastaFormat
  # @return [Boolean] true when the guessed class is Bio::Sequence::NA
  def isNucl?
    guessed = Bio::Sequence.guess(seq, 0.9, 500)
    guessed == Bio::Sequence::NA
  end

  # @return [Boolean] true when the guessed class is Bio::Sequence::AA
  def isProt?
    guessed = Bio::Sequence.guess(seq, 0.9, 500)
    guessed == Bio::Sequence::AA
  end
end
18
+
19
+ class CRB_Blast
20
+
21
+ include Which
22
+
23
+ attr_accessor :query_name, :target_name, :reciprocals
24
+ attr_accessor :missed
25
+ attr_accessor :target_is_prot, :query_is_prot
26
+ attr_accessor :query_results, :target_results, :working_dir
27
+
28
# Prepares a run: resolves the two input paths, chooses (and if necessary
# creates) the working directory and looks up the BLAST+ executables.
#
# @param [String] query path to the query fasta file
# @param [String] target path to the target fasta file
# @param [String, nil] output directory for databases and results; when
#   nil the working directory is "."
# @raise [IOError] if the query or the target file does not exist
# @raise [RuntimeError] if the output directory cannot be created, or if
#   `which` finds no makeblastdb, blastn, tblastn, blastx or blastp
def initialize query, target, output=nil
  raise IOError.new("File not found #{query}") if !File.exist?(query)
  raise IOError.new("File not found #{target}") if !File.exist?(target)
  @query = File.expand_path(query)
  @target = File.expand_path(target)
  if output.nil?
    @working_dir = "."
  else
    @working_dir = File.expand_path(output)
    if !Dir.exist?(@working_dir)
      puts "mkdir #{@working_dir}"
      begin
        # Create the directory directly instead of through a shell command,
        # so paths containing spaces or shell metacharacters work.
        Dir.mkdir(@working_dir)
      rescue SystemCallError
        raise RuntimeError.new("Unable to create output directory")
      end
    end
  end

  # Look the programs up in the same order as before so the first missing
  # one is the one that gets reported.
  found = {}
  %w[makeblastdb blastn tblastn blastx blastp].each do |program|
    candidates = which(program)
    raise "#{program} was not in the PATH" if candidates.empty?
    found[program] = candidates.first
  end
  @makedb_path = found['makeblastdb']
  @blastn_path = found['blastn']
  @tblastn_path = found['tblastn']
  @blastx_path = found['blastx']
  @blastp_path = found['blastp']
end
64
+
65
# Makes a blast database from the query and from the target.
#
# Each file is classified as protein or nucleotide from its first 100
# entries, and the database names are derived from the file names. A
# database is only built when its .nsq/.psq file is not already present in
# the working directory.
#
# @return [Array<String>] the query and target database names
# @raise [RuntimeError] if makeblastdb exits unsuccessfully
def makedb
  # only scan the first hundred entries
  n = 100
  @query_is_prot = protein_fasta?(@query, n)
  @target_is_prot = protein_fasta?(@target, n)

  # construct the output database names: the file name without its last
  # extension (a name that has no extension is kept whole rather than
  # collapsing to an empty string)
  @query_name = File.basename(@query, ".*")
  @target_name = File.basename(@target, ".*")

  build_blast_db(@query, @query_name, @query_is_prot)
  build_blast_db(@target, @target_name, @target_is_prot)
  @databases = true
  [@query_name, @target_name]
end

# Reports whether more than 90% of the first n entries of a fasta file
# answer true to isProt?. The file handle is closed once the entries have
# been read.
def protein_fasta?(path, n)
  fasta = Bio::FastaFormat.open(path)
  begin
    entries = fasta.take(n)
  ensure
    fasta.close
  end
  proteins = entries.count { |entry| entry.isProt? }
  proteins > entries.size * 0.9
end
private :protein_fasta?

# Runs makeblastdb on one fasta file, unless the database's sequence file
# (.psq for protein, .nsq for nucleotide) already exists in @working_dir.
def build_blast_db(fasta, name, is_prot)
  suffix = is_prot ? "psq" : "nsq"
  return if File.exist?("#{@working_dir}/#{name}.#{suffix}")

  cmd = "#{@makedb_path} -in #{fasta}"
  cmd << (is_prot ? " -dbtype prot " : " -dbtype nucl ")
  cmd << " -title #{name} "
  cmd << " -out #{@working_dir}/#{name}"
  make_db = Cmd.new(cmd)
  make_db.run
  if !make_db.status.success?
    raise RuntimeError.new("BLAST Error creating database")
  end
end
private :build_blast_db
136
+
137
# Works out the two BLAST output file names, picks the BLAST programs that
# match the query/target sequence types and runs the searches, either on
# split input or with BLAST's own threading.
#
# @param [Float] evalue The evalue cutoff to use with BLAST
# @param [Integer] threads The number of threads to run
# @param [Boolean] split If the fasta files should be split into chunks
# @return [Boolean] false when @databases is not set (makedb has not
#   completed), true once the searches have been run
def run_blast(evalue, threads, split)
  return false unless @databases

  @output1 = "#{@working_dir}/#{query_name}_into_#{target_name}.1.blast"
  @output2 = "#{@working_dir}/#{target_name}_into_#{query_name}.2.blast"

  # first program searches query against target, second the reverse
  programs =
    if @query_is_prot && @target_is_prot
      [@blastp_path, @blastp_path]
    elsif @query_is_prot
      [@tblastn_path, @blastx_path]
    elsif @target_is_prot
      [@blastx_path, @tblastn_path]
    else
      [@blastn_path, @blastn_path]
    end
  bin1, bin2 = programs.map { |path| "#{path} " }

  # splitting only makes sense with more than one thread
  if split && threads > 1
    run_blast_with_splitting(evalue, threads, bin1, bin2)
  else
    run_blast_with_threads(evalue, threads, bin1, bin2)
  end
  true
end
174
+
175
# Run BLAST using its own multithreading
#
# The query is searched against the target database and the target against
# the query database. A search is skipped when its output file already
# exists.
#
# @param [Float] evalue The evalue cutoff to use with BLAST
# @param [Integer] threads The number of threads to run
# @param [String] bin1 program used for the query-into-target search
# @param [String] bin2 program used for the target-into-query search
# @raise [RuntimeError] if a BLAST command exits unsuccessfully
def run_blast_with_threads evalue, threads, bin1, bin2
  searches = [
    [bin1, @query, @target_name, @output1],
    [bin2, @target, @query_name, @output2]
  ]
  searches.each do |bin, input, db, out|
    next if File.exist?("#{out}")

    command = [
      "#{bin} -query #{input} -db #{@working_dir}/#{db} ",
      " -out #{out} -evalue #{evalue} ",
      " -outfmt \"6 std qlen slen\" ",
      " -max_target_seqs 50 ",
      " -num_threads #{threads}"
    ].join
    blast = Cmd.new(command)
    blast.run
    unless blast.status.success?
      raise RuntimeError.new("BLAST Error:\n#{blast.stderr}")
    end
  end
  nil
end
210
+
211
# Run BLAST by splitting the input into multiple chunks and using 1 thread
# for each chunk
#
# The query chunks are searched against the target database and written to
# @output1, then the target chunks are searched against the query database
# and written to @output2.
#
# @param [Float] evalue The evalue cutoff to use with BLAST
# @param [Integer] threads The number of threads to run
# @param [String] bin1 program used for the query-into-target search
# @param [String] bin2 program used for the target-into-query search
# @raise [RuntimeError] if a BLAST command or the final `cat` fails
def run_blast_with_splitting evalue, threads, bin1, bin2
  blast_in_chunks(@query, @target_name, bin1, @output1, evalue, threads)
  blast_in_chunks(@target, @query_name, bin2, @output2, evalue, threads)
end

# Splits one fasta file into `threads` chunks, runs `bin` on every chunk
# against the database `db_name`, concatenates the per-chunk results into
# `output` and deletes the chunk and intermediate result files.
def blast_in_chunks(input, db_name, bin, output, evalue, threads)
  files = split_input(input, threads)
  # Derive the result names from the chunk list up front: the worker
  # threads then never append to a shared array, and the results are
  # always concatenated in chunk order.
  blasts = files.map { |chunk| "#{chunk}.blast" }

  files.threach(threads) do |chunk|
    # a result left over from an earlier run is reused
    next if File.exist?("#{chunk}.blast")

    cmd = "#{bin} -query #{chunk} -db #{@working_dir}/#{db_name} "
    cmd << " -out #{chunk}.blast -evalue #{evalue} "
    cmd << " -outfmt \"6 std qlen slen\" "
    cmd << " -max_target_seqs 50 "
    cmd << " -num_threads 1"
    blast = Cmd.new(cmd)
    blast.run
    if !blast.status.success?
      raise RuntimeError.new("BLAST Error:\n#{blast.stderr}")
    end
  end

  cat_cmd = "cat "
  cat_cmd << blasts.join(" ")
  cat_cmd << " > #{output}"
  catting = Cmd.new(cat_cmd)
  catting.run
  if !catting.status.success?
    raise RuntimeError.new("Problem catting files:\n#{catting.stderr}")
  end

  files.each do |file|
    File.delete(file) if File.exist?(file)
  end
  blasts.each do |b|
    File.delete(b) # delete intermediate blast output files
  end
end
private :blast_in_chunks
285
+
286
# Split a fasta file in pieces
#
# The whole file is read into memory and the records are dealt out
# round-robin into `pieces` files named "<filename>_chunk_<n>.fasta". Every
# chunk file is created, even when it receives no record. Each sequence is
# written on a single line.
#
# Text that appears before the first ">" header is ignored, an input
# without any header yields only empty chunk files, and records that share
# a header are all kept.
#
# @param [String] filename
# @param [Integer] pieces
# @return [Array<String>] absolute paths of the chunk files, in chunk order
def split_input filename, pieces
  # collect [name, sequence] pairs in file order
  records = []
  File.foreach(filename) do |line|
    line = line.chomp
    if line.start_with?(">")
      records << [line[1..-1], String.new]
    elsif !records.empty?
      records.last[1] << line
    end
  end

  output_files = Array.new(pieces) do |n|
    File.expand_path("#{filename}_chunk_#{n}.fasta")
  end

  # deal the records out round-robin
  chunks = Array.new(pieces) { [] }
  records.each_with_index do |record, index|
    chunks[index % pieces] << record
  end

  # the block form closes each file even if a write fails
  output_files.each_with_index do |path, n|
    File.open(path, "w") do |out|
      chunks[n].each do |name, seq|
        out.write(">#{name}\n")
        out.write("#{seq}\n")
      end
    end
  end
  output_files
end
328
+
329
# Load the two BLAST output files and store the hits in a hash
#
# Fills @query_results from @output1 and @target_results from @output2;
# each maps a hit's query id to the list of Hit objects for that id, in
# file order. Nothing is loaded when reciprocal_hits.txt already exists in
# the working directory.
#
# @return [Array] the number of lines read from each file, or [nil, nil]
#   when loading was skipped
# @raise [RuntimeError] if either BLAST output file is missing
def load_outputs
  return [nil, nil] if File.exist?("#{@working_dir}/reciprocal_hits.txt")

  @query_results = Hash.new
  @target_results = Hash.new
  [@output1, @output2].each do |path|
    # interpolate so that an unset (nil) path is reported, not a TypeError
    raise RuntimeError.new("can't find #{path}") if !File.exist?("#{path}")
  end

  q_count = read_hits(@output1, @query_results)
  t_count = read_hits(@output2, @target_results)
  [q_count, t_count]
end

# Parses one tab-separated BLAST output file into `results`, grouping the
# hits by their query id, and returns the number of lines read.
def read_hits(path, results)
  count = 0
  File.foreach(path) do |line|
    hit = Hit.new(line.chomp.split("\t"))
    (results[hit.query] ||= []) << hit
    count += 1
  end
  count
end
private :read_hits
366
+
367
# fills @reciprocals with strict reciprocal hits from the blast results
#
# A query hit counts as reciprocal when it is the first hit listed for the
# query, and the first hit listed for its target points back at the query.
# Any other query hit whose target has a hit pointing back at the query is
# stored in @missed. For each reciprocal hit, -log10(evalue) and the
# alignment length are recorded in @evalues (an evalue of 0 is treated as
# 1e-200), and @longest tracks the largest alignment length seen.
#
# @return [Integer, nil] the number of reciprocal hits, or nil when
#   reciprocal_hits.txt already exists and nothing was done
def find_reciprocals
  return nil if File.exist?("#{@working_dir}/reciprocal_hits.txt")

  @reciprocals = {}
  @missed = {}
  @evalues = []
  @longest = 0
  hits = 0
  @query_results.each do |query_id, forward_hits|
    forward_hits.each_with_index do |forward, rank|
      next unless @target_results.has_key?(forward.target)

      @target_results[forward.target].each_with_index do |back, back_rank|
        # only hits that point back at this query are of interest
        next unless query_id == back.target

        if rank == 0 && back_rank == 0
          score = forward.evalue.to_f
          score = 1e-200 if score == 0
          score = -Math.log10(score)
          (@reciprocals[query_id] ||= []) << forward
          hits += 1
          @longest = forward.alnlen if forward.alnlen > @longest
          @evalues << {:e => score, :length => forward.alnlen}
        else
          (@missed[query_id] ||= []) << forward
        end
      end
    end
  end
  return hits
end
407
+
408
# Learns the evalue cutoff based on the length of the sequence
# Finds hits that have a lower evalue than this cutoff
#
# For every alignment length from 10 to @longest, the cutoff is the mean
# -log10(evalue) of the reciprocal hits in @evalues whose length lies
# within 10% of that length (at least 5) on either side. A hit in @missed
# whose -log10(evalue) reaches the cutoff for its own alignment length is
# added to @reciprocals. An evalue of 0 is treated as 1e-200.
#
# @return [Integer, nil] the number of hits added, or nil when
#   reciprocal_hits.txt already exists and nothing was done
def find_secondaries
  return nil if File.exist?("#{@working_dir}/reciprocal_hits.txt")

  # group the reciprocal-hit scores by alignment length
  length_hash = Hash.new
  @evalues.each do |h|
    (length_hash[h[:length]] ||= []) << h
  end

  fitting = Hash.new
  (10..@longest).each do |centre|
    s = (centre*0.1).to_i
    s = 5 if s < 5
    points = (-s..s).flat_map { |side| length_hash.fetch(centre+side, []) }
    next if points.empty?
    total = points.inject(0) { |sum, point| sum + point[:e] }
    fitting[centre] = total/points.size
  end

  hits = 0
  @missed.each_pair do |id, list|
    list.each do |hit|
      l = hit.alnlen.to_i
      # coerce like find_reciprocals does, so a non-Float evalue cannot
      # reach Math.log10
      e = hit.evalue.to_f
      e = 1e-200 if e==0
      e = -Math.log10(e)
      next unless fitting.has_key?(l) && e >= fitting[l]
      # NOTE(review): only ids that have no entry in @reciprocals yet
      # receive a secondary hit, so at most one is added per id. The
      # original duplicate check ran over a list that had just been
      # created empty and could never match; confirm this restriction is
      # intended.
      next if @reciprocals.key?(id)
      @reciprocals[id] = [hit]
      hits += 1
    end
  end
  return hits
end
471
+
472
# Releases the loaded BLAST hits. Running lots of jobs at the same time was
# keeping these big hashes in memory when they might no longer be wanted.
#
# @return [nil]
def clear_memory
  @query_results = @target_results = nil
end
478
+
479
# Runs the whole pipeline in order: makedb, run_blast, load_outputs,
# find_reciprocals and find_secondaries.
#
# @param [Float] evalue The evalue cutoff passed to run_blast
# @param [Integer] threads The number of threads passed to run_blast
# @param [Boolean] split Passed to run_blast; whether to split the input
# @return whatever find_secondaries returns
def run(evalue=1e-5, threads=1, split=true)
  makedb
  run_blast(evalue, threads, split)
  load_outputs
  find_reciprocals
  find_secondaries
end
486
+
487
# Counts the hits stored in @reciprocals across all ids.
#
# @return [Integer] the total number of hits
def size
  @reciprocals.values.inject(0) { |total, list| total + list.size }
end
496
+
497
# Writes every hit in @reciprocals, one per line, to reciprocal_hits.txt in
# the working directory. Does nothing when @reciprocals is nil.
#
# @return [Integer, nil] the value returned by the file write, or nil when
#   nothing was written
def write_output
  return if @reciprocals.nil?

  lines = @reciprocals.values.flat_map do |hits|
    hits.map { |hit| "#{hit}\n" }
  end
  text = lines.join
  File.open("#{@working_dir}/reciprocal_hits.txt", "w") { |f| f.write text }
end
508
+
509
# Tells whether an id has an entry in @reciprocals.
#
# @param contig the id to look up
# @return [Boolean]
def has_reciprocal?(contig)
  @reciprocals.key?(contig)
end
513
+ end
514
+
515
+ end