crb-blast 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (5) hide show
  1. checksums.yaml +7 -0
  2. data/bin/crb-blast +65 -0
  3. data/lib/crb-blast.rb +348 -0
  4. data/lib/hit.rb +30 -0
  5. metadata +168 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c3a5d87d0431c8bdc70d1adf34e61d2c853e1364
4
+ data.tar.gz: 07c351d04011f795aab32983c1e69a8daaaa2407
5
+ SHA512:
6
+ metadata.gz: 36faeb5e2d53512661f28192212f70b080c4a98751b88d9239474279fdb5c2d03ecaa7e6f74e990a59afd19f2edebdeda596f0e613b5b2f751f4b882b84a8788
7
+ data.tar.gz: 276e0d706780586b6f13551aa9fa21dd1f9794fa665ae9341593aff0f6d2e12999a8e44e33918e52435726f257636146a954e7d59a11f5ac22692c9dbe644c12
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #
4
+ # run crb-blast from the cli
5
+ #
6
+
7
+ require 'trollop'
8
+ require 'crb-blast'
9
+
10
# Command-line entry point: parse the options, run the CRB-BLAST pipeline
# and write every reciprocal hit to the requested TSV file.
opts = Trollop::options do
  banner <<-EOS

CRB-Blast v0.1 by Chris Boursnell <cmb211@cam.ac.uk>

Conditional Reciprocal Best BLAST

USAGE:
crb-blast <options>

OPTIONS:

  EOS
  opt :query,
      "query fasta file in nucleotide format",
      :required => true,
      :type => String

  opt :target,
      "target fasta file as nucleotide or protein",
      :required => true,
      :type => String

  opt :evalue,
      "e-value cut off for BLAST. Format 1e-5",
      :default => 1e-5,
      :type => :float

  opt :threads,
      "number of threads to run BLAST with",
      :default => 1,
      :type => :int

  opt :output,
      "output file as tsv",
      :required => true,
      :type => String
end

Trollop::die :query, "must exist" unless File.exist?(opts[:query])
Trollop::die :target, "must exist" unless File.exist?(opts[:target])

# Run the pipeline stages in order. Their return values are not needed here
# (the hits are read back from blaster.reciprocals), so they are not stored.
blaster = CRB_Blast.new(opts.query, opts.target)
blaster.makedb
blaster.run_blast(opts.evalue, opts.threads)
blaster.load_outputs
blaster.find_reciprocals
blaster.find_secondaries

# find_reciprocals leaves @reciprocals unset when reciprocal_hits.txt already
# exists in the working directory, so fail with a readable message instead of
# a NoMethodError on nil.
reciprocals = blaster.reciprocals
if reciprocals.nil?
  abort "crb-blast: #{blaster.working_dir}/reciprocal_hits.txt already " \
        "exists so no hits were computed; remove it and run again"
end

File.open(opts.output, 'w') do |out|
  reciprocals.each_value do |hits|
    hits.each { |hit| out.write "#{hit}\n" }
  end
end
@@ -0,0 +1,348 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bio'
4
+ require 'which'
5
+ require 'hit'
6
+ require 'crb-blast'
7
+
8
# Re-opens BioRuby's FASTA entry class to add sequence-type predicates.
class Bio::FastaFormat
  # Returns true when Bio::Sequence.guess classifies this entry's sequence
  # as Bio::Sequence::NA.
  # NOTE(review): 0.9 and 500 are presumably the guess threshold and the
  # number of residues inspected - confirm against the BioRuby docs.
  def isNucl?
    guessed_class = Bio::Sequence.guess(seq, 0.9, 500)
    guessed_class == Bio::Sequence::NA
  end

  # Returns true when Bio::Sequence.guess classifies this entry's sequence
  # as Bio::Sequence::AA (same arguments as isNucl?).
  def isProt?
    guessed_class = Bio::Sequence.guess(seq, 0.9, 500)
    guessed_class == Bio::Sequence::AA
  end
end
17
+
18
+ class CRB_Blast
19
+
20
+ include Which
21
+
22
+ attr_accessor :query_name, :target_name, :reciprocals
23
+ attr_accessor :missed
24
+ attr_accessor :target_is_prot, :query_is_prot
25
+ attr_accessor :query_results, :target_results, :working_dir
26
+
27
# Prepares a CRB-BLAST run.
#
# query  - path to the query FASTA file
# target - path to the target FASTA file
# output - optional working directory; when nil the directory containing
#          the query file is used. A missing directory is created.
#
# Raises RuntimeError when any required BLAST+ executable is not found by
# `which`, and SystemCallError when the working directory cannot be created.
def initialize(query, target, output: nil)
  @query = query
  @target = target
  if output.nil?
    @working_dir = File.expand_path(File.dirname(query)) # no trailing /
  else
    @working_dir = File.expand_path(output)
    unless Dir.exist?(@working_dir)
      puts "mkdir #{@working_dir}"
      # Dir.mkdir instead of a shell `mkdir`, so paths containing spaces or
      # shell metacharacters are handled and failures are not silent.
      Dir.mkdir(@working_dir)
    end
  end
  @makedb_path = locate_blast_tool('makeblastdb')
  @blastn_path = locate_blast_tool('blastn')
  @tblastn_path = locate_blast_tool('tblastn')
  @blastx_path = locate_blast_tool('blastx')
  @blastp_path = locate_blast_tool('blastp')
end

# Returns the first match `which` reports for +name+; raises when there is none.
def locate_blast_tool(name)
  candidates = which(name)
  raise "#{name} was not in the PATH" if candidates.empty?
  candidates.first
end
private :locate_blast_tool
56
+
57
#
# makes a blast database from the query and the target
#
# Detects whether each FASTA file holds protein or nucleotide sequences,
# derives the database names from the file names and runs makeblastdb for
# every database whose index file is not already in @working_dir.
#
# Returns [query_name, target_name].
#
def makedb
  @query_is_prot = protein_fasta?(@query)
  @target_is_prot = protein_fasta?(@target)

  # construct the output database names
  @query_name = database_name(@query)
  @target_name = database_name(@target)

  build_database(@query, @query_name, @query_is_prot)
  build_database(@target, @target_name, @target_is_prot)
  @databases = true
  [@query_name, @target_name]
end

# True when more than 90% of the first +sample_size+ entries of the FASTA
# file at +path+ are classified as protein by Bio::FastaFormat#isProt?.
# The block form of open is used so the file handle is closed afterwards.
def protein_fasta?(path, sample_size = 100)
  protein = 0
  total = 0
  Bio::FastaFormat.open(path) do |fasta|
    fasta.take(sample_size).each do |entry|
      protein += 1 if entry.isProt?
      total += 1
    end
  end
  protein > total * 0.9
end

# File name of +path+ without its last extension. A name that has no
# extension is kept whole instead of collapsing to an empty string.
def database_name(path)
  File.basename(path, '.*')
end

# Runs makeblastdb for +fasta+ unless the database's sequence index
# (.psq for protein, .nsq for nucleotide) already exists in @working_dir.
def build_database(fasta, name, is_prot)
  index_file = "#{@working_dir}/#{name}.#{is_prot ? 'psq' : 'nsq'}"
  return if File.exist?(index_file)
  argv = [@makedb_path, '-in', fasta,
          '-dbtype', (is_prot ? 'prot' : 'nucl'),
          '-title', name,
          '-out', "#{@working_dir}/#{name}"]
  # An argument array bypasses the shell, so paths containing spaces or
  # metacharacters are passed through intact; stdout is read and discarded
  # exactly as the previous backtick call did.
  IO.popen(argv) { |io| io.read }
end
private :protein_fasta?, :database_name, :build_database
120
+
121
# Runs the forward (query into target) and reverse (target into query)
# BLAST searches, skipping a search whose output file already exists.
#
# evalue  - e-value cut-off passed to BLAST
# threads - number of BLAST threads
#
# Returns false when the databases have not been built (makedb not run),
# true otherwise.
def run_blast(evalue, threads)
  return false unless @databases

  @output1 = "#{@working_dir}/#{@query_name}_into_#{@target_name}.1.blast"
  @output2 = "#{@working_dir}/#{@target_name}_into_#{@query_name}.2.blast"
  forward, reverse = blast_programs
  blast_search(forward, @query, @target_name, @output1, evalue, threads)
  blast_search(reverse, @target, @query_name, @output2, evalue, threads)
  true
end

# Picks the [forward, reverse] BLAST executables from the sequence types
# detected for the query and the target.
def blast_programs
  if @query_is_prot
    @target_is_prot ? [@blastp_path, @blastp_path] : [@tblastn_path, @blastx_path]
  else
    @target_is_prot ? [@blastx_path, @tblastn_path] : [@blastn_path, @blastn_path]
  end
end

# Runs one BLAST search of +fasta+ against database +db_name+ unless
# +out_file+ is already present.
def blast_search(program, fasta, db_name, out_file, evalue, threads)
  return if File.exist?(out_file)
  argv = [program,
          '-query', fasta,
          '-db', "#{@working_dir}/#{db_name}",
          '-out', out_file,
          '-evalue', evalue.to_s,
          '-outfmt', '6 std qlen slen',
          '-max_target_seqs', '50',
          '-num_threads', threads.to_s]
  # Argument array: no shell is involved, so file names need no quoting.
  IO.popen(argv) { |io| io.read }
end
private :blast_programs, :blast_search
167
+
168
# Parses both BLAST output files into @query_results and @target_results,
# each a Hash of query id => Array of Hit in file order.
#
# Returns [query_hit_count, target_hit_count], or [nil, nil] without
# touching any state when reciprocal_hits.txt already exists in the
# working directory.
#
# Raises RuntimeError ("need to run blast first") when either BLAST output
# file is missing.
def load_outputs
  return [nil, nil] if File.exist?("#{@working_dir}/reciprocal_hits.txt")

  @query_results = {}
  @target_results = {}
  missing = [@output1, @output2].reject { |path| File.exist?(path.to_s) }
  missing.each { |path| puts "can't find #{path}" }
  raise "need to run blast first" unless missing.empty?

  q_count = read_blast_hits(@output1, @query_results)
  t_count = read_blast_hits(@output2, @target_results)
  [q_count, t_count]
end

# Appends one Hit per line of the tab-separated file at +path+ to +results+,
# keyed by the hit's query id, and returns the number of lines read.
# File.foreach closes the file when the block finishes.
def read_blast_hits(path, results)
  count = 0
  File.foreach(path) do |line|
    hit = Hit.new(line.chomp.split("\t"))
    (results[hit.query] ||= []) << hit
    count += 1
  end
  count
end
private :read_blast_hits
203
+
204
# Fills @reciprocals with the strict reciprocal best hits found in the
# loaded BLAST results.
#
# A forward hit is a reciprocal when it is the first hit of its query and
# the first reverse hit of its target points back at that query. Any other
# forward hit whose target has a reverse hit pointing back at the query is
# stored in @missed (once per such reverse hit). For every reciprocal the
# pair (-log10 e-value, alignment length) is appended to @evalues, and
# @longest tracks the longest reciprocal alignment.
#
# Returns the number of reciprocal hits, or nil (leaving all state
# untouched) when reciprocal_hits.txt already exists in the working
# directory.
def find_reciprocals
  return nil if File.exist?("#{@working_dir}/reciprocal_hits.txt")

  @reciprocals = {}
  @missed = {}
  @evalues = []
  @longest = 0
  hits = 0
  @query_results.each do |query_id, forward_hits|
    forward_hits.each_with_index do |forward, forward_rank|
      next unless @target_results.has_key?(forward.target)

      @target_results[forward.target].each_with_index do |reverse, reverse_rank|
        next unless query_id == reverse.target

        if forward_rank == 0 && reverse_rank == 0
          score = forward.evalue.to_f
          # an e-value of 0 has no logarithm, so clamp it first
          score = 1e-200 if score == 0
          score = -Math.log10(score)
          (@reciprocals[query_id] ||= []) << forward
          hits += 1
          @longest = forward.alnlen if forward.alnlen > @longest
          @evalues << {:e => score, :length => forward.alnlen}
        else
          (@missed[query_id] ||= []) << forward
        end
      end
    end
  end
  hits
end
244
+
245
# Promotes missed hits to reciprocals when their score reaches the
# length-dependent cut-off learned from the strict reciprocal hits.
#
# For every alignment length from 10 to @longest the cut-off is the mean
# -log10(e-value) of all reciprocal hits whose length lies within
# +/- max(5, 10% of the length). A missed hit is accepted when its
# alignment length has a cut-off and its own -log10(e-value) is at least
# that cut-off.
#
# Returns the number of hits added, or nil when reciprocal_hits.txt already
# exists in the working directory.
def find_secondaries
  return nil if File.exist?("#{@working_dir}/reciprocal_hits.txt")

  points_by_length = @evalues.group_by { |point| point[:length] }

  fitting = {}
  (10..@longest).each do |centre|
    half_width = [(centre * 0.1).to_i, 5].max
    window = ((centre - half_width)..(centre + half_width)).flat_map do |length|
      points_by_length.fetch(length, [])
    end
    next if window.empty?

    total = window.inject(0) { |sum, point| sum + point[:e] }
    fitting[centre] = total / window.size
  end

  hits = 0
  @missed.each_pair do |id, list|
    list.each do |hit|
      l = hit.alnlen.to_i
      e = hit.evalue
      e = 1e-200 if e == 0
      e = -Math.log10(e)
      next unless fitting.has_key?(l) && e >= fitting[l]
      # NOTE(review): as before, a hit is only added when the id has no
      # entry in @reciprocals yet, so an id that already has a reciprocal is
      # never extended and only the first qualifying missed hit per id is
      # kept. The original's duplicate check ran on a freshly created empty
      # list and could never fire - confirm this is the intended behaviour.
      next if @reciprocals.key?(id)

      @reciprocals[id] = [hit]
      hits += 1
    end
  end
  hits
end
306
+
307
# Drops the two large per-query result hashes. Running many jobs at once
# kept them all in memory, which callers may not want.
def clear_memory
  @query_results = @target_results = nil
end
313
+
314
# Runs the whole pipeline in order: build the databases, run both BLAST
# searches, load their outputs, then find reciprocal and secondary hits.
#
# evalue  - e-value cut-off handed to run_blast
# threads - thread count handed to run_blast
#
# Returns whatever find_secondaries returns.
def run(evalue, threads)
  makedb
  run_blast(evalue, threads)
  load_outputs
  find_reciprocals
  find_secondaries
end
321
+
322
# Returns the total number of hits stored in @reciprocals, or 0 when no
# reciprocals have been computed yet.
#
# @reciprocals maps a query id to an Array of hits, so only the values are
# counted; iterating the Hash with a single block argument yields
# [key, value] pairs and would count 2 per query id.
def size
  return 0 if @reciprocals.nil?
  @reciprocals.each_value.inject(0) { |total, hits| total + hits.length }
end
331
+
332
# Writes every hit in @reciprocals, one per line, to reciprocal_hits.txt in
# the working directory. Does nothing and returns nil when @reciprocals is
# nil; otherwise returns the value of the file write.
def write_output
  return if @reciprocals.nil?

  lines = @reciprocals.each_value.flat_map do |hits|
    hits.map { |hit| "#{hit}\n" }
  end
  File.open("#{@working_dir}/reciprocal_hits.txt", "w") { |f| f.write lines.join }
end
343
+
344
# Returns true when +contig+ is a key of @reciprocals, false otherwise.
def has_reciprocal?(contig)
  @reciprocals.key?(contig)
end
348
+ end
@@ -0,0 +1,30 @@
1
# One row of tabular BLAST output ("6 std qlen slen").
class Hit
  # Fields: query id, subject id, % identity, alignment length, mismatches,
  # gap opens, q. start, q. end, s. start, s. end, evalue, bit score,
  # followed by the query length and the target length.
  attr_accessor :query, :target, :id, :alnlen, :mismatches, :gaps, :qstart,
                :qend, :tstart, :tend, :evalue, :bitscore, :qlen, :tlen

  # list - the 14 column strings of one BLAST output line.
  #
  # The query and target ids are cut at the first "|" or space. Columns
  # 3-9 are stored as integers and columns 10-13 as floats; the identity
  # column is kept as given.
  #
  # Raises RuntimeError when +list+ does not have exactly 14 entries.
  def initialize(list)
    raise(RuntimeError, "unexpected number of columns") if list.length != 14

    @query, @target = list[0, 2].map { |name| name.split(/[\|\ ]/).first }
    @id = list[2]
    @alnlen, @mismatches, @gaps, @qstart, @qend, @tstart, @tend =
      list[3, 7].map { |col| col.to_i }
    @evalue, @bitscore, @qlen, @tlen = list[10, 4].map { |col| col.to_f }
  end

  # Tab-separated summary: query, target, identity, alignment length,
  # e-value, bit score, then the query and target ranges as "start..end".
  def to_s
    fields = [@query, @target, @id, @alnlen, @evalue, @bitscore,
              "#{@qstart}..#{@qend}", "#{@tstart}..#{@tend}"]
    fields.join("\t")
  end
end
metadata ADDED
@@ -0,0 +1,168 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: crb-blast
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Chris Boursnell
8
+ - Richard Smith-Unna
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-04-28 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: trollop
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ">="
19
+ - !ruby/object:Gem::Version
20
+ version: '0'
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ version: '0'
28
+ - !ruby/object:Gem::Dependency
29
+ name: rake
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: bio
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - "~>"
47
+ - !ruby/object:Gem::Version
48
+ version: '1.4'
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ version: 1.4.3
52
+ type: :runtime
53
+ prerelease: false
54
+ version_requirements: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - "~>"
57
+ - !ruby/object:Gem::Version
58
+ version: '1.4'
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: 1.4.3
62
+ - !ruby/object:Gem::Dependency
63
+ name: which
64
+ requirement: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '='
67
+ - !ruby/object:Gem::Version
68
+ version: 0.0.2
69
+ type: :runtime
70
+ prerelease: false
71
+ version_requirements: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '='
74
+ - !ruby/object:Gem::Version
75
+ version: 0.0.2
76
+ - !ruby/object:Gem::Dependency
77
+ name: turn
78
+ requirement: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ type: :development
84
+ prerelease: false
85
+ version_requirements: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ - !ruby/object:Gem::Dependency
91
+ name: simplecov
92
+ requirement: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ type: :development
98
+ prerelease: false
99
+ version_requirements: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ - !ruby/object:Gem::Dependency
105
+ name: shoulda-context
106
+ requirement: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ type: :development
112
+ prerelease: false
113
+ version_requirements: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ - !ruby/object:Gem::Dependency
119
+ name: coveralls
120
+ requirement: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: 0.6.7
125
+ type: :development
126
+ prerelease: false
127
+ version_requirements: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: 0.6.7
132
+ description: Increased sensitivity reciprocal best BLAST with automated cutoff learning
133
+ based on species divergence
134
+ email: cmb211@cam.ac.uk
135
+ executables:
136
+ - crb-blast
137
+ extensions: []
138
+ extra_rdoc_files: []
139
+ files:
140
+ - bin/crb-blast
141
+ - lib/crb-blast.rb
142
+ - lib/hit.rb
143
+ homepage: http://rubygems.org/gems/crb-blast
144
+ licenses:
145
+ - MIT
146
+ metadata: {}
147
+ post_install_message:
148
+ rdoc_options: []
149
+ require_paths:
150
+ - lib
151
+ required_ruby_version: !ruby/object:Gem::Requirement
152
+ requirements:
153
+ - - ">="
154
+ - !ruby/object:Gem::Version
155
+ version: '0'
156
+ required_rubygems_version: !ruby/object:Gem::Requirement
157
+ requirements:
158
+ - - ">="
159
+ - !ruby/object:Gem::Version
160
+ version: '0'
161
+ requirements: []
162
+ rubyforge_project:
163
+ rubygems_version: 2.2.2
164
+ signing_key:
165
+ specification_version: 4
166
+ summary: Increased sensitivity reciprocal best BLAST
167
+ test_files: []
168
+ has_rdoc: