crb-blast 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ require 'open3'
2
+
3
+ module CRB_Blast
4
+
5
# Runs an external command and keeps what it printed and how it exited.
#
# The command may be given either as a single String, which is handed to
# Open3.capture3 unchanged (so it may go through the shell), or as an Array of
# program name plus arguments, which is splatted into Open3.capture3 so that
# arguments containing spaces or shell metacharacters are passed literally.
class Cmd

  attr_accessor :cmd, :stdout, :stderr, :status

  # @param cmd [String, Array<String>] command line, or program and arguments
  def initialize(cmd)
    @cmd = cmd
  end

  # Runs the command to completion and stores its stdout, stderr and exit
  # status in the matching accessors.
  #
  # @return [Array] the stdout String, stderr String and Process::Status
  def run
    # Array() wraps a String in a one-element array, so the original
    # single-string call is unchanged; an Array becomes separate arguments.
    @stdout, @stderr, @status = Open3.capture3(*Array(@cmd))
  end

end
18
+
19
+ end
@@ -0,0 +1,515 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bio'
4
+ require 'which'
5
+ require 'threach'
6
+
7
+ module CRB_Blast
8
+
9
# Reopens BioRuby's FASTA entry class to add two sequence-type predicates.
# Both hand the entry's sequence to Bio::Sequence.guess together with the
# arguments 0.9 and 500 and compare the class it returns.
# NOTE(review): 0.9 / 500 are presumably guess's threshold and the number of
# residues inspected - confirm against the BioRuby documentation.
class Bio::FastaFormat
  # @return [Boolean] true when the guessed sequence class is Bio::Sequence::NA
  def isNucl?
    Bio::Sequence::NA == Bio::Sequence.guess(seq, 0.9, 500)
  end

  # @return [Boolean] true when the guessed sequence class is Bio::Sequence::AA
  def isProt?
    Bio::Sequence::AA == Bio::Sequence.guess(seq, 0.9, 500)
  end
end
18
+
19
+ class CRB_Blast
20
+
21
+ include Which
22
+
23
+ attr_accessor :query_name, :target_name, :reciprocals
24
+ attr_accessor :missed
25
+ attr_accessor :target_is_prot, :query_is_prot
26
+ attr_accessor :query_results, :target_results, :working_dir
27
+
28
# Checks the two input files, prepares the working directory and locates the
# BLAST+ executables.
#
# @param query [String] path to the query FASTA file
# @param target [String] path to the target FASTA file
# @param output [String, nil] directory for databases and results; when nil
#   the current directory (".") is used
# @raise [IOError] if the query or target file does not exist
# @raise [RuntimeError] if the output directory cannot be created or one of
#   makeblastdb, blastn, tblastn, blastx, blastp is not found by `which`
def initialize(query, target, output = nil)
  raise IOError, "File not found #{query}" unless File.exist?(query)
  raise IOError, "File not found #{target}" unless File.exist?(target)
  @query = File.expand_path(query)
  @target = File.expand_path(target)
  if output.nil?
    @working_dir = "."
  else
    @working_dir = File.expand_path(output)
    unless Dir.exist?(@working_dir)
      begin
        # Create the directory directly rather than through a shell `mkdir`
        # string, which split paths containing spaces into several names.
        Dir.mkdir(@working_dir)
      rescue SystemCallError
        # Another process may have created it in the meantime; only fail if
        # the directory really is not there.
        unless Dir.exist?(@working_dir)
          raise RuntimeError, "Unable to create output directory"
        end
      end
    end
  end
  # Resolve every tool in a fixed order so the first missing one is reported;
  # `which` returns a list of matches and the first match is kept.
  @makedb_path, @blastn_path, @tblastn_path, @blastx_path, @blastp_path =
    %w[makeblastdb blastn tblastn blastx blastp].map do |tool|
      found = which(tool)
      raise "#{tool} was not in the PATH" if found.empty?
      found.first
    end
end
64
+
65
+ #
66
+ # makes a blast database from the query and the target
67
+ #
68
# Works out whether the query and target hold protein or nucleotide
# sequences, derives the database names from the file names and builds each
# BLAST database that is not already present in @working_dir.
#
# Sets @query_is_prot, @target_is_prot, @query_name, @target_name and
# @databases.
#
# @return [Array<String>] the query and target database names, in that order
# @raise [RuntimeError] if makeblastdb exits unsuccessfully
def makedb
  @query_is_prot = protein_fasta?(@query)
  @target_is_prot = protein_fasta?(@target)
  @query_name = fasta_db_name(@query)
  @target_name = fasta_db_name(@target)
  build_blast_db(@query, @query_name, @query_is_prot)
  build_blast_db(@target, @target_name, @target_is_prot)
  @databases = true
  [@query_name, @target_name]
end

# True when more than 90% of the first `sample` entries of the FASTA file at
# `path` answer true to isProt?. An empty file counts as nucleotide.
def protein_fasta?(path, sample = 100)
  fasta = Bio::FastaFormat.open(path)
  begin
    entries = fasta.take(sample)
  ensure
    # the handle was previously left open for the life of the process
    fasta.close
  end
  entries.count { |entry| entry.isProt? } > entries.size * 0.9
end

# File name without its directory and last extension. A name that has no
# extension is kept whole instead of collapsing to an empty string.
def fasta_db_name(path)
  File.basename(path, ".*")
end

# Runs makeblastdb for one FASTA file unless its sequence index (.psq for
# protein, .nsq for nucleotide) already exists in @working_dir.
def build_blast_db(fasta, name, is_prot)
  index = "#{@working_dir}/#{name}.#{is_prot ? 'psq' : 'nsq'}"
  return if File.exist?(index)
  # NOTE(review): paths are interpolated unquoted, so a path containing
  # spaces or shell metacharacters will not reach makeblastdb intact.
  command = "#{@makedb_path} -in #{fasta}"
  command << " -dbtype #{is_prot ? 'prot' : 'nucl'} "
  command << " -title #{name} "
  command << " -out #{@working_dir}/#{name}"
  make_db = Cmd.new(command)
  make_db.run
  unless make_db.status.success?
    raise RuntimeError, "BLAST Error creating database:\n#{make_db.stderr}"
  end
end

private :protein_fasta?, :fasta_db_name, :build_blast_db
136
+
137
+ # Construct BLAST output file name and run blast with multiple chunks or
138
+ # with multiple threads
139
+ #
140
+ # @param [Float] evalue The evalue cutoff to use with BLAST
141
+ # @param [Integer] threads The number of threads to run
142
+ # @param [Boolean] split If the fasta files should be split into chunks
143
# Names the two BLAST output files, picks the BLAST program for each search
# direction from the sequence types of query and target, and hands over to
# the splitting or the threaded runner.
#
# @param evalue [Float] the evalue cutoff passed on to the runner
# @param threads [Integer] the number of threads to run
# @param split [Boolean] if true and threads > 1 the splitting runner is used
# @return [Boolean] false when the databases have not been built (nothing is
#   run in that case), true otherwise
def run_blast(evalue, threads, split)
  return false unless @databases

  @output1 = "#{@working_dir}/#{query_name}_into_#{target_name}.1.blast"
  @output2 = "#{@working_dir}/#{target_name}_into_#{query_name}.2.blast"

  # program for query-vs-target first, target-vs-query second
  forward, backward =
    if @query_is_prot && @target_is_prot
      [@blastp_path, @blastp_path]
    elsif @query_is_prot
      [@tblastn_path, @blastx_path]
    elsif @target_is_prot
      [@blastx_path, @tblastn_path]
    else
      [@blastn_path, @blastn_path]
    end
  bin1 = "#{forward} "
  bin2 = "#{backward} "

  if split && threads > 1
    run_blast_with_splitting(evalue, threads, bin1, bin2)
  else
    run_blast_with_threads(evalue, threads, bin1, bin2)
  end
  true
end
174
+
175
+ # Run BLAST using its own multithreading
176
+ #
177
+ # @param [Float] evalue The evalue cutoff to use with BLAST
178
+ # @param [Integer] threads The number of threads to run
179
+ # @param [String] bin1
180
+ # @param [String] bin2
181
# Runs the two BLAST searches one after the other, letting BLAST use
# `threads` threads itself. A search whose output file already exists is
# skipped.
#
# @param evalue [Float] the evalue cutoff to use with BLAST
# @param threads [Integer] value given to -num_threads
# @param bin1 [String] program for the query-into-target search
# @param bin2 [String] program for the target-into-query search
# @return [nil]
# @raise [RuntimeError] if a BLAST run exits unsuccessfully
def run_blast_with_threads(evalue, threads, bin1, bin2)
  # program, input fasta, database name, output file
  searches = [
    [bin1, @query, @target_name, @output1],
    [bin2, @target, @query_name, @output2]
  ]
  # NOTE(review): paths are placed in the command line unquoted.
  commands = searches.map do |bin, fasta, db, out|
    [
      "#{bin} -query #{fasta} -db #{@working_dir}/#{db} ",
      " -out #{out} -evalue #{evalue} ",
      " -outfmt \"6 std qlen slen\" ",
      " -max_target_seqs 50 ",
      " -num_threads #{threads}"
    ].join
  end

  searches.zip(commands).each do |(_, _, _, out), command|
    next if File.exist?("#{out}")
    blast = Cmd.new(command)
    blast.run
    unless blast.status.success?
      raise RuntimeError.new("BLAST Error:\n#{blast.stderr}")
    end
  end
  nil
end
210
+
211
+ # Run BLAST by splitting the input into multiple chunks and using 1 thread
212
+ # for each chunk
213
+ #
214
+ # @param [Float] evalue The evalue cutoff to use with BLAST
215
+ # @param [Integer] threads The number of threads to run
216
+ # @param [String] bin1
217
+ # @param [String] bin2
218
# Runs both BLAST searches by cutting each input FASTA into `threads` chunks
# and running one single-threaded BLAST per chunk, then merging the per-chunk
# results into @output1 (query into target) and @output2 (target into query).
#
# @param evalue [Float] the evalue cutoff to use with BLAST
# @param threads [Integer] number of chunks, and of workers given to threach
# @param bin1 [String] program for the query-into-target search
# @param bin2 [String] program for the target-into-query search
# @return [nil]
# @raise [RuntimeError] if a BLAST run fails or the results cannot be merged
def run_blast_with_splitting(evalue, threads, bin1, bin2)
  blast_in_chunks(bin1, @query, @target_name, @output1, evalue, threads)
  blast_in_chunks(bin2, @target, @query_name, @output2, evalue, threads)
  nil
end

# Splits `fasta`, BLASTs every chunk against database `db_name`, merges the
# chunk results into `output` and removes the intermediate files.
def blast_in_chunks(bin, fasta, db_name, output, evalue, threads)
  chunks = split_input(fasta, threads)
  # Result names are derived up front instead of being appended to a shared
  # array from the worker threads, so the merge order is always chunk order.
  results = chunks.map { |chunk| "#{chunk}.blast" }

  chunks.threach(threads) do |chunk|
    result = "#{chunk}.blast"
    next if File.exist?(result)
    # NOTE(review): paths are placed in the command line unquoted.
    command = "#{bin} -query #{chunk} -db #{@working_dir}/#{db_name} "
    command << " -out #{result} -evalue #{evalue} "
    command << " -outfmt \"6 std qlen slen\" "
    command << " -max_target_seqs 50 "
    command << " -num_threads 1"
    blast = Cmd.new(command)
    blast.run
    unless blast.status.success?
      raise RuntimeError, "BLAST Error:\n#{blast.stderr}"
    end
  end

  begin
    # Concatenate in Ruby rather than through a shell `cat ... > file`.
    File.open(output, "w") do |merged|
      results.each { |result| IO.copy_stream(result, merged) }
    end
  rescue SystemCallError => e
    # do not leave a truncated result file behind
    File.delete(output) if File.exist?(output)
    raise RuntimeError, "Problem catting files:\n#{e.message}"
  end

  chunks.each { |chunk| File.delete(chunk) if File.exist?(chunk) }
  results.each { |result| File.delete(result) } # intermediate blast output
  nil
end

private :blast_in_chunks
285
+
286
+ # Split a fasta file in pieces
287
+ #
288
+ # @param [String] filename
289
+ # @param [Integer] pieces
290
# Splits a FASTA file into `pieces` chunk files written next to it, named
# "<filename>_chunk_<n>.fasta". Records are dealt out round-robin in file
# order and each sequence is written on a single line.
#
# The input is streamed, so records that share a name are all kept, and an
# input without any records produces empty chunk files. Text before the first
# header line is ignored.
#
# @param filename [String] path of the FASTA file to split
# @param pieces [Integer] number of chunk files to create (at least 1)
# @return [Array<String>] absolute paths of the chunk files, in chunk order
# @raise [ArgumentError] if pieces is smaller than 1
def split_input(filename, pieces)
  raise ArgumentError, "pieces must be at least 1" if pieces < 1

  output_files = Array.new(pieces) do |n|
    File.expand_path("#{filename}_chunk_#{n}.fasta")
  end
  outputs = []
  begin
    output_files.each { |path| outputs << File.open(path, "w") }
    records = 0
    current = nil # handle of the chunk receiving the record being read
    File.foreach(filename) do |line|
      line = line.chomp
      if line.start_with?(">")
        current.write("\n") if current # finish the previous sequence line
        current = outputs[records % pieces]
        records += 1
        current.write("#{line}\n")
      elsif current
        current.write(line)
      end
    end
    current.write("\n") if current
  ensure
    outputs.each(&:close)
  end
  output_files
end
328
+
329
+ # Load the two BLAST output files and store the hits in a hash
330
+ #
331
# Reads the two tab-separated BLAST output files and groups the hits by the
# value of Hit#query: @output1 into @query_results and @output2 into
# @target_results. Nothing is loaded when reciprocal_hits.txt already exists
# in @working_dir.
#
# @return [Array<Integer>] number of lines read from @output1 and @output2
#   ([0, 0] when loading was skipped)
# @raise [RuntimeError] if either BLAST output file is missing
def load_outputs
  q_count = 0
  t_count = 0
  return [q_count, t_count] if File.exist?("#{@working_dir}/reciprocal_hits.txt")

  @query_results = {}
  @target_results = {}
  [@output1, @output2].each do |path|
    raise RuntimeError, "can't find #{path}" unless File.exist?("#{path}")
  end
  q_count = load_hits(@output1, @query_results)
  t_count = load_hits(@output2, @target_results)
  [q_count, t_count]
end

# Appends one Hit per line of `path` to results[hit.query] and returns the
# number of lines read. File.foreach closes the file when it is done.
def load_hits(path, results)
  count = 0
  File.foreach(path) do |line|
    hit = Hit.new(line.chomp.split("\t"))
    (results[hit.query] ||= []) << hit
    count += 1
  end
  count
end

private :load_hits
366
+
367
+ # fills @reciprocals with strict reciprocal hits from the blast results
368
# Fills @reciprocals with the strict reciprocal best hits and @missed with
# the other hits whose target also hits the query back.
#
# A query hit is reciprocal when it is the first hit of its query and the
# first hit of its target points back at that query. Any other pairing where
# the target has a hit pointing back at the query goes to @missed. For every
# reciprocal hit its -log10(evalue) and alignment length are stored in
# @evalues (an evalue of 0 is treated as 1e-200), and @longest tracks the
# largest alignment length seen. Does nothing when reciprocal_hits.txt already
# exists in @working_dir.
#
# @return [Integer, nil] number of reciprocal hits found, nil when skipped
def find_reciprocals
  return if File.exist?("#{@working_dir}/reciprocal_hits.txt")

  @reciprocals = {}
  @missed = {}
  @evalues = []
  @longest = 0
  hits = 0
  @query_results.each_pair do |query_id, forward_hits|
    forward_hits.each_with_index do |forward, forward_rank|
      next unless @target_results.has_key?(forward.target)

      @target_results[forward.target].each_with_index do |backward, backward_rank|
        next unless query_id == backward.target

        if forward_rank == 0 && backward_rank == 0
          score = forward.evalue.to_f
          score = 1e-200 if score == 0 # keeps log10 finite
          score = -Math.log10(score)
          (@reciprocals[query_id] ||= []) << forward
          hits += 1
          @longest = forward.alnlen if forward.alnlen > @longest
          @evalues << {:e => score, :length => forward.alnlen}
        else
          (@missed[query_id] ||= []) << forward
        end
      end
    end
  end
  hits
end
407
+
408
+ # Learns the evalue cutoff based on the length of the sequence
409
+ # Finds hits that have a lower evalue than this cutoff
410
# Learns an evalue cutoff per alignment length from the reciprocal hits and
# promotes missed hits that reach it.
#
# For every length from 10 to @longest the cutoff is the mean -log10(evalue)
# of the reciprocal hits whose alignment length lies within 10% of that
# length (at least 5 either side). A missed hit whose own -log10(evalue) is
# at or above the cutoff for its length is added to @reciprocals, but only
# for ids that have no entry there yet. Does nothing when reciprocal_hits.txt
# already exists in @working_dir.
#
# @return [Integer, nil] number of hits added, nil when skipped
def find_secondaries
  return if File.exist?("#{@working_dir}/reciprocal_hits.txt")

  # reciprocal-hit data points grouped by alignment length
  by_length = {}
  @evalues.each do |point|
    (by_length[point[:length]] ||= []) << point
  end

  cutoff = {}
  (10..@longest).each do |centre|
    half_width = (centre * 0.1).to_i
    half_width = 5 if half_width < 5
    total = 0
    seen = 0
    (-half_width..half_width).each do |offset|
      next unless by_length.has_key?(centre + offset)
      by_length[centre + offset].each do |point|
        total += point[:e]
        seen += 1
      end
    end
    cutoff[centre] = total / seen if seen > 0
  end

  hits = 0
  @missed.each_pair do |id, candidates|
    candidates.each do |hit|
      length = hit.alnlen.to_i
      score = hit.evalue
      score = 1e-200 if score == 0 # keeps log10 finite
      score = -Math.log10(score)
      next unless cutoff.has_key?(length)
      next unless score >= cutoff[length]
      # NOTE(review): the previous code scanned @reciprocals[id] for a
      # duplicate of this hit, but only right after creating that list empty,
      # so the scan could never match. The effect, kept here, is that at most
      # one secondary hit is stored per id and never for an id that already
      # has a reciprocal hit. Confirm that this is the intended rule.
      next if @reciprocals.key?(id)
      @reciprocals[id] = [hit]
      hits += 1
    end
  end
  hits
end
471
+
472
# Drops the two hashes of loaded BLAST hits. They can be large, and keeping
# them around while many jobs run at once holds on to memory that is no
# longer needed.
def clear_memory
  @query_results = @target_results = nil
end
478
+
479
# Runs the whole pipeline in order: build the databases, run BLAST in both
# directions, load the results, find the reciprocal hits and then the
# secondary hits.
#
# @param evalue [Float] the evalue cutoff handed to run_blast
# @param threads [Integer] the thread count handed to run_blast
# @param split [Boolean] the split flag handed to run_blast
# @return whatever find_secondaries returns
def run(evalue = 1e-5, threads = 1, split = true)
  makedb
  run_blast(evalue, threads, split)
  load_outputs
  find_reciprocals
  find_secondaries
end
486
+
487
# Total number of hits stored in @reciprocals, summed over all query ids.
#
# @reciprocals is left unset when an existing reciprocal_hits.txt made the
# search steps skip their work; that case is reported as 0 instead of
# failing on nil, in line with the nil guard in write_output.
#
# @return [Integer]
def size
  return 0 if @reciprocals.nil?
  @reciprocals.each_value.inject(0) { |total, hits| total + hits.size }
end
496
+
497
# Writes every hit in @reciprocals, one per line in its string form, to
# reciprocal_hits.txt in @working_dir. Nothing is written when @reciprocals
# is nil.
#
# @return [Integer, nil] what File#write returned, or nil when nothing was
#   written
def write_output
  return nil if @reciprocals.nil?

  report = @reciprocals.each_value.flat_map do |hits|
    hits.map { |hit| "#{hit}\n" }
  end.join
  File.open("#{@working_dir}/reciprocal_hits.txt", "w") { |out| out.write report }
end
508
+
509
# Tells whether any hit is stored in @reciprocals under the given id.
#
# Answers false instead of failing on nil when @reciprocals has not been
# filled in this process (it stays unset when an existing
# reciprocal_hits.txt made the search steps skip their work).
#
# @param contig [String] sequence id to look up
# @return [Boolean]
def has_reciprocal?(contig)
  !@reciprocals.nil? && @reciprocals.has_key?(contig)
end
513
+ end
514
+
515
+ end