crb-blast 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (5) hide show
  1. checksums.yaml +7 -0
  2. data/bin/crb-blast +65 -0
  3. data/lib/crb-blast.rb +348 -0
  4. data/lib/hit.rb +30 -0
  5. metadata +168 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c3a5d87d0431c8bdc70d1adf34e61d2c853e1364
4
+ data.tar.gz: 07c351d04011f795aab32983c1e69a8daaaa2407
5
+ SHA512:
6
+ metadata.gz: 36faeb5e2d53512661f28192212f70b080c4a98751b88d9239474279fdb5c2d03ecaa7e6f74e990a59afd19f2edebdeda596f0e613b5b2f751f4b882b84a8788
7
+ data.tar.gz: 276e0d706780586b6f13551aa9fa21dd1f9794fa665ae9341593aff0f6d2e12999a8e44e33918e52435726f257636146a954e7d59a11f5ac22692c9dbe644c12
@@ -0,0 +1,65 @@
1
#!/usr/bin/env ruby

#
# Command-line entry point for crb-blast.
#
# Parses the options, checks that both fasta files exist, runs the whole
# CRB_Blast pipeline and writes every hit kept in CRB_Blast#reciprocals to
# the requested output file, one hit per line.
#

require 'trollop'
require 'crb-blast'

opts = Trollop::options do
  banner <<-EOS

  CRB-Blast v0.1 by Chris Boursnell <cmb211@cam.ac.uk>

  Conditional Reciprocal Best BLAST

  USAGE:
  crb-blast <options>

  OPTIONS:

  EOS
  opt :query,
      "query fasta file in nucleotide format",
      :required => true,
      :type => String

  opt :target,
      "target fasta file as nucleotide or protein",
      :required => true,
      :type => String

  opt :evalue,
      "e-value cut off for BLAST. Format 1e-5",
      :default => 1e-5,
      :type => :float

  opt :threads,
      "number of threads to run BLAST with",
      :default => 1,
      :type => :int

  opt :output,
      "output file as tsv",
      :required => true,
      :type => String
end

Trollop::die :query, "must exist" unless File.exist?(opts[:query])
Trollop::die :target, "must exist" unless File.exist?(opts[:target])

blaster = CRB_Blast.new(opts[:query], opts[:target])
# CRB_Blast#run performs makedb, run_blast, load_outputs, find_reciprocals
# and find_secondaries in that order; none of their return values is needed
# here, so they are no longer kept in throwaway locals (one of which
# shadowed Kernel#load).
blaster.run(opts[:evalue], opts[:threads])

File.open(opts[:output], 'w') do |out|
  blaster.reciprocals.each_value do |hits|
    hits.each do |hit|
      out.write "#{hit}\n"
    end
  end
end
@@ -0,0 +1,348 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bio'
4
+ require 'which'
5
+ require 'hit'
6
+ require 'crb-blast'
7
+
8
# Reopens Bio::FastaFormat to add two sequence-type predicates. Both hand
# the entry's sequence to Bio::Sequence.guess together with the arguments
# 0.9 and 500 and compare the class it returns.
class Bio::FastaFormat
  # @return [Boolean] true when the guess for this entry is Bio::Sequence::NA
  def isNucl?
    guessed = Bio::Sequence.guess(seq, 0.9, 500)
    guessed == Bio::Sequence::NA
  end

  # @return [Boolean] true when the guess for this entry is Bio::Sequence::AA
  def isProt?
    guessed = Bio::Sequence.guess(seq, 0.9, 500)
    guessed == Bio::Sequence::AA
  end
end
17
+
18
+ class CRB_Blast
19
+
20
+ include Which
21
+
22
+ attr_accessor :query_name, :target_name, :reciprocals
23
+ attr_accessor :missed
24
+ attr_accessor :target_is_prot, :query_is_prot
25
+ attr_accessor :query_results, :target_results, :working_dir
26
+
27
# Prepares a CRB-BLAST run: records the input files, chooses (and if needed
# creates) the working directory and locates the BLAST+ executables.
#
# @param query [String] path to the query fasta file
# @param target [String] path to the target fasta file
# @param output [String, nil] directory used as the working directory; when
#   nil the directory containing +query+ is used instead
# @raise [RuntimeError] if makeblastdb, blastn, tblastn, blastx or blastp
#   cannot be found in the PATH
# @raise [SystemCallError] if the working directory cannot be created
def initialize query, target, output:nil
  @query = query
  @target = target
  if output.nil?
    @working_dir = File.expand_path(File.dirname(query)) # no trailing /
  else
    @working_dir = File.expand_path(output)
    unless Dir.exist?(@working_dir)
      puts "mkdir #{@working_dir}"
      # Dir.mkdir does not go through a shell, so directories whose names
      # contain spaces or shell metacharacters work, and a failure raises
      # instead of being silently ignored.
      Dir.mkdir(@working_dir)
    end
  end
  # executable name => instance variable that stores the first match
  {
    'makeblastdb' => :@makedb_path,
    'blastn'      => :@blastn_path,
    'tblastn'     => :@tblastn_path,
    'blastx'      => :@blastx_path,
    'blastp'      => :@blastp_path
  }.each do |tool, ivar|
    found = which(tool)
    raise "#{tool} was not in the PATH" if found.empty?
    instance_variable_set(ivar, found.first)
  end
end
56
+
57
#
# Builds a BLAST database for the query and for the target in @working_dir.
#
# Each fasta file is classified as protein or nucleotide from its first 100
# entries (stored in @query_is_prot / @target_is_prot). A database is only
# built when its .nsq (nucleotide) or .psq (protein) file is not already
# present in @working_dir.
#
# @return [Array<String>] the query and the target database names
#
def makedb
  # classification only inspects the first n entries of each file
  n = 100
  @query_is_prot = protein_fasta?(@query, n)
  @target_is_prot = protein_fasta?(@target, n)
  # database names are the fasta file names without their last extension
  @query_name = database_name(@query)
  @target_name = database_name(@target)
  build_database(@query, @query_name, @query_is_prot)
  build_database(@target, @target_name, @target_is_prot)
  @databases = true
  [@query_name, @target_name]
end

# True when more than 90% of the first +limit+ entries of +fasta+ answer
# true to Bio::FastaFormat#isProt?.
def protein_fasta?(fasta, limit)
  protein = 0
  total = 0
  # block form of open, so the file handle is closed after reading
  Bio::FastaFormat.open(fasta) do |file|
    file.take(limit).each do |entry|
      protein += 1 if entry.isProt?
      total += 1
    end
  end
  protein > total * 0.9
end
private :protein_fasta?

# File name of +path+ with its last extension removed. A name without any
# extension is returned unchanged rather than as an empty string.
def database_name(path)
  File.basename(path, File.extname(path))
end
private :database_name

# Runs makeblastdb for +fasta+ unless the database already exists.
def build_database(fasta, name, is_prot)
  index_file = "#{@working_dir}/#{name}.#{is_prot ? 'psq' : 'nsq'}"
  # File.exists? was removed in Ruby 3.2; File.exist? works everywhere
  return if File.exist?(index_file)

  # An argument array bypasses the shell, so paths containing spaces or
  # shell metacharacters are passed through untouched. Standard output is
  # read and discarded, as it was with backticks.
  IO.popen([@makedb_path.to_s,
            '-in', fasta.to_s,
            '-dbtype', is_prot ? 'prot' : 'nucl',
            '-title', name,
            '-out', "#{@working_dir}/#{name}"], &:read)
end
private :build_database
120
+
121
# Runs the two BLAST searches (query into target, target into query) and
# stores the result paths in @output1 and @output2. A search is skipped
# when its output file already exists.
#
# The program pair depends on the sequence types found by makedb:
# protein/protein uses blastp twice, nucleotide/nucleotide uses blastn
# twice, and mixed inputs use tblastn for the protein file and blastx for
# the nucleotide file.
#
# @param evalue [Float] e-value cut off passed to BLAST
# @param threads [Integer] number of threads passed to BLAST
# @return [Boolean] false when makedb has not been run, true otherwise
def run_blast(evalue, threads)
  return false unless @databases

  @output1 = "#{@working_dir}/#{@query_name}_into_#{@target_name}.1.blast"
  @output2 = "#{@working_dir}/#{@target_name}_into_#{@query_name}.2.blast"
  if @query_is_prot
    forward, backward = @target_is_prot ? [@blastp_path, @blastp_path] : [@tblastn_path, @blastx_path]
  else
    forward, backward = @target_is_prot ? [@blastx_path, @tblastn_path] : [@blastn_path, @blastn_path]
  end

  # File.exists? was removed in Ruby 3.2; File.exist? works everywhere
  unless File.exist?(@output1)
    blast_search(forward, @query, "#{@working_dir}/#{@target_name}",
                 @output1, evalue, threads)
  end
  unless File.exist?(@output2)
    blast_search(backward, @target, "#{@working_dir}/#{@query_name}",
                 @output2, evalue, threads)
  end
  true
end

# Runs one BLAST program with the tabular "6 std qlen slen" output format
# and at most 50 target sequences per query. The argument array bypasses
# the shell, so paths containing spaces or shell metacharacters are safe.
def blast_search(program, query, db, out, evalue, threads)
  IO.popen([program.to_s,
            '-query', query.to_s,
            '-db', db,
            '-out', out,
            '-evalue', evalue.to_s,
            '-outfmt', '6 std qlen slen',
            '-max_target_seqs', '50',
            '-num_threads', threads.to_s], &:read)
end
private :blast_search
167
+
168
# Parses the two BLAST output files into @query_results and
# @target_results, each a Hash mapping a query id to the Array of Hit
# objects read for it, in file order.
#
# Nothing is loaded when "reciprocal_hits.txt" already exists in
# @working_dir.
#
# @return [Array] [hits read from @output1, hits read from @output2], or
#   [nil, nil] when loading was skipped
# @raise [RuntimeError] if either BLAST output file is missing
def load_outputs
  return [nil, nil] if File.exist?("#{@working_dir}/reciprocal_hits.txt")

  @query_results = {}
  @target_results = {}
  # File.exists? was removed in Ruby 3.2; File.exist? works everywhere
  missing = [@output1, @output2].reject { |path| File.exist?(path.to_s) }
  missing.each { |path| puts "can't find #{path}" }
  raise "need to run blast first" unless missing.empty?

  q_count = read_hits(@output1, @query_results)
  t_count = read_hits(@output2, @target_results)
  [q_count, t_count]
end

# Reads tab-separated BLAST lines from +path+, groups the resulting hits by
# Hit#query in +results+ and returns the number of lines read.
def read_hits(path, results)
  count = 0
  # File.foreach closes the file when done; File.open(...).each_line did not
  File.foreach(path) do |line|
    hit = Hit.new(line.chomp.split("\t"))
    (results[hit.query] ||= []) << hit
    count += 1
  end
  count
end
private :read_hits
203
+
204
# Fills @reciprocals with the strict reciprocal best hits found in
# @query_results and @target_results.
#
# A query hit is kept when it is the first hit of its query and the first
# hit of its target points back at that query. Hits whose target points
# back at the query from any other rank combination are collected in
# @missed. For every kept hit, -log10(evalue) and the alignment length are
# appended to @evalues (an e-value of 0 is treated as 1e-200), and @longest
# tracks the largest alignment length seen.
#
# Nothing is done when "reciprocal_hits.txt" already exists in @working_dir.
#
# @return [Integer, nil] number of reciprocal hits, or nil when skipped
def find_reciprocals
  return if File.exist?("#{@working_dir}/reciprocal_hits.txt")

  @reciprocals = {}
  @missed = {}
  @evalues = []
  @longest = 0
  total = 0
  @query_results.each_pair do |query_id, forward_hits|
    forward_hits.each_with_index do |forward, forward_rank|
      next unless @target_results.has_key?(forward.target)

      @target_results[forward.target].each_with_index do |backward, backward_rank|
        # only hits that point back at this query are of interest
        next unless query_id == backward.target

        if forward_rank == 0 && backward_rank == 0
          score = forward.evalue.to_f
          score = 1e-200 if score == 0
          score = -Math.log10(score)
          (@reciprocals[query_id] ||= []) << forward
          total += 1
          @longest = forward.alnlen if forward.alnlen > @longest
          @evalues << {:e => score, :length => forward.alnlen}
        else
          (@missed[query_id] ||= []) << forward
        end
      end
    end
  end
  total
end
244
+
245
# Adds secondary hits from @missed to @reciprocals.
#
# For every alignment length from 10 to @longest, the mean -log10(evalue)
# of the points in @evalues is computed over a window of lengths around it.
# The window reaches 10% of the length to either side, and at least 5. A
# missed hit is added when its own -log10(evalue) is at least the mean
# fitted for its alignment length and its query has no entry in
# @reciprocals yet. As a result at most one secondary hit is added per
# query.
#
# Nothing is done when "reciprocal_hits.txt" already exists in @working_dir.
#
# @return [Integer, nil] number of hits added, or nil when skipped
def find_secondaries
  return if File.exist?("#{@working_dir}/reciprocal_hits.txt")

  # group the reciprocal data points by alignment length
  by_length = {}
  @evalues.each do |point|
    (by_length[point[:length]] ||= []) << point
  end

  # mean score per alignment length, for lengths with data in their window
  fitting = {}
  (10..@longest).each do |centre|
    half = (centre * 0.1).to_i
    half = 5 if half < 5
    sum = 0
    count = 0
    ((centre - half)..(centre + half)).each do |length|
      next unless by_length.has_key?(length)

      by_length[length].each do |point|
        sum += point[:e]
        count += 1
      end
    end
    fitting[centre] = sum / count if count > 0
  end

  added = 0
  @missed.each_pair do |id, candidates|
    candidates.each do |hit|
      length = hit.alnlen.to_i
      score = hit.evalue
      score = 1e-200 if score == 0
      score = -Math.log10(score)
      next unless fitting.has_key?(length)
      next unless score >= fitting[length]
      next if @reciprocals.key?(id)

      @reciprocals[id] = []
      # NOTE(review): the list was created on the line above, so this
      # duplicate check can never match - confirm whether secondaries were
      # also meant to be added to queries that already have reciprocals.
      duplicate = @reciprocals[id].any? do |existing|
        existing.query == hit.query && existing.target == hit.target
      end
      unless duplicate
        @reciprocals[id] << hit
        added += 1
      end
    end
  end
  added
end
306
+
307
# Drops the parsed BLAST results (@query_results and @target_results) so
# the two large hashes can be garbage collected, which matters when many
# jobs are kept alive at the same time.
#
# @return [nil]
def clear_memory
  @query_results = @target_results = nil
end
313
+
314
# Runs the whole pipeline in order: makedb, run_blast, load_outputs,
# find_reciprocals and find_secondaries.
#
# @param evalue [Float] e-value cut off handed to run_blast
# @param threads [Integer] number of threads handed to run_blast
# @return whatever find_secondaries returns
def run(evalue, threads)
  makedb
  run_blast(evalue, threads)
  load_outputs
  find_reciprocals
  find_secondaries
end
321
+
322
# Total number of hits stored in @reciprocals, summed over all queries.
#
# @reciprocals is a Hash of query id => Array of hits, so only its values
# are counted. Iterating the Hash itself yields [key, value] pairs, which
# made the previous implementation return twice the number of queries.
#
# @return [Integer] number of hits; 0 when @reciprocals has not been set
def size
  return 0 if @reciprocals.nil?

  @reciprocals.each_value.inject(0) { |total, hits| total + hits.length }
end
331
+
332
# Writes every hit in @reciprocals to "reciprocal_hits.txt" inside
# @working_dir, one hit per line, using each hit's string form. Nothing is
# written when @reciprocals is nil.
#
# @return [Integer, nil] the value returned by the file write, or nil when
#   @reciprocals is nil
def write_output
  return if @reciprocals.nil?

  text = ""
  @reciprocals.each_value do |hits|
    hits.each { |hit| text << "#{hit}\n" }
  end
  File.open("#{@working_dir}/reciprocal_hits.txt", "w") do |file|
    file.write(text)
  end
end
343
+
344
# Tells whether +contig+ is a key of @reciprocals.
#
# @param contig [String] query id to look up
# @return [Boolean]
def has_reciprocal? contig
  @reciprocals.key?(contig)
end
348
+ end
@@ -0,0 +1,30 @@
1
# One line of tabular BLAST output.
#
# Expects 14 columns in this order: query id, subject id, % identity,
# alignment length, mismatches, gap opens, q. start, q. end, s. start,
# s. end, evalue, bit score, query length, subject length.
class Hit
  attr_accessor :query, :target, :id, :alnlen, :mismatches, :gaps, :qstart,
                :qend, :tstart, :tend, :evalue, :bitscore, :qlen, :tlen

  # @param list [Array<String>] the 14 column values of one BLAST line
  # @raise [RuntimeError] unless exactly 14 columns are given
  def initialize(list)
    unless list.length == 14
      raise RuntimeError, "unexpected number of columns"
    end

    # ids keep only the part before the first "|" or space
    @query = list[0].split(/[\|\ ]/).first
    @target = list[1].split(/[\|\ ]/).first
    @id = list[2]
    # columns 3-9 are stored as integers
    @alnlen, @mismatches, @gaps, @qstart, @qend, @tstart, @tend =
      (3..9).map { |column| list[column].to_i }
    # columns 10-13 are stored as floats
    @evalue, @bitscore, @qlen, @tlen =
      (10..13).map { |column| list[column].to_f }
  end

  # Tab-separated summary: query, target, identity, alignment length,
  # evalue, bit score, query range and target range (each "start..end").
  def to_s
    [@query, @target, @id, @alnlen, @evalue, @bitscore,
     "#{@qstart}..#{@qend}", "#{@tstart}..#{@tend}"].join("\t")
  end
end
metadata ADDED
@@ -0,0 +1,168 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: crb-blast
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Chris Boursnell
8
+ - Richard Smith-Unna
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-04-28 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: trollop
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ">="
19
+ - !ruby/object:Gem::Version
20
+ version: '0'
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ version: '0'
28
+ - !ruby/object:Gem::Dependency
29
+ name: rake
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: bio
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - "~>"
47
+ - !ruby/object:Gem::Version
48
+ version: '1.4'
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ version: 1.4.3
52
+ type: :runtime
53
+ prerelease: false
54
+ version_requirements: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - "~>"
57
+ - !ruby/object:Gem::Version
58
+ version: '1.4'
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: 1.4.3
62
+ - !ruby/object:Gem::Dependency
63
+ name: which
64
+ requirement: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '='
67
+ - !ruby/object:Gem::Version
68
+ version: 0.0.2
69
+ type: :runtime
70
+ prerelease: false
71
+ version_requirements: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '='
74
+ - !ruby/object:Gem::Version
75
+ version: 0.0.2
76
+ - !ruby/object:Gem::Dependency
77
+ name: turn
78
+ requirement: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ type: :development
84
+ prerelease: false
85
+ version_requirements: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ - !ruby/object:Gem::Dependency
91
+ name: simplecov
92
+ requirement: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ type: :development
98
+ prerelease: false
99
+ version_requirements: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ - !ruby/object:Gem::Dependency
105
+ name: shoulda-context
106
+ requirement: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ type: :development
112
+ prerelease: false
113
+ version_requirements: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ - !ruby/object:Gem::Dependency
119
+ name: coveralls
120
+ requirement: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: 0.6.7
125
+ type: :development
126
+ prerelease: false
127
+ version_requirements: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: 0.6.7
132
+ description: Increased sensitivity reciprocal best BLAST with automated cutoff learning
133
+ based on species divergence
134
+ email: cmb211@cam.ac.uk
135
+ executables:
136
+ - crb-blast
137
+ extensions: []
138
+ extra_rdoc_files: []
139
+ files:
140
+ - bin/crb-blast
141
+ - lib/crb-blast.rb
142
+ - lib/hit.rb
143
+ homepage: http://rubygems.org/gems/crb-blast
144
+ licenses:
145
+ - MIT
146
+ metadata: {}
147
+ post_install_message:
148
+ rdoc_options: []
149
+ require_paths:
150
+ - lib
151
+ required_ruby_version: !ruby/object:Gem::Requirement
152
+ requirements:
153
+ - - ">="
154
+ - !ruby/object:Gem::Version
155
+ version: '0'
156
+ required_rubygems_version: !ruby/object:Gem::Requirement
157
+ requirements:
158
+ - - ">="
159
+ - !ruby/object:Gem::Version
160
+ version: '0'
161
+ requirements: []
162
+ rubyforge_project:
163
+ rubygems_version: 2.2.2
164
+ signing_key:
165
+ specification_version: 4
166
+ summary: Increased sensitivity reciprocal best BLAST
167
+ test_files: []
168
+ has_rdoc: