crb-blast 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +13 -0
- data/Gemfile +3 -0
- data/README.md +95 -0
- data/Rakefile +8 -0
- data/bin/crb-blast +3 -3
- data/build +1 -0
- data/crb-blast.gemspec +26 -0
- data/deps/deps.yaml +27 -0
- data/lib/crb-blast.rb +3 -447
- data/lib/crb-blast/cmd.rb +19 -0
- data/lib/crb-blast/crb-blast.rb +515 -0
- data/lib/crb-blast/hit.rb +34 -0
- data/test/helper.rb +16 -0
- data/test/query.fasta +22 -0
- data/test/query2.fasta +30 -0
- data/test/target.fasta +62 -0
- data/test/target2.fasta +76 -0
- data/test/test_bin.rb +17 -0
- data/test/test_test.rb +99 -0
- data/test/test_test2.rb +78 -0
- metadata +20 -3
- data/lib/hit.rb +0 -30
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8ae488cec70a923add9cbe9e6f4a4fe0d69471d8
|
4
|
+
data.tar.gz: 7628e84ef78228bcc268d7deb6eb2a667131d405
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f52df39062d0adf04187fca27635a4822fd2cdfb41944518773985bc08504243c02bd63564fa44656cf910909c9d3d1004c56a6f251d9918d07d2ef88f38eb2f
|
7
|
+
data.tar.gz: 1e46630e56d52f9056ab92a27eb33171aee066885b418855f9c9b61afccba31b5f34f2de2b53a0592d564e710cef3cf7f1f46a6eb148e57ffbb76b1025f24110
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
CRB-BLAST
|
2
|
+
=========
|
3
|
+
|
4
|
+
Conditional Reciprocal Best BLAST - high confidence ortholog assignment.
|
5
|
+
|
6
|
+
### What is Conditional Reciprocal Best BLAST?
|
7
|
+
|
8
|
+
CRB-BLAST is a novel method for finding orthologs between one set of sequences and another. This is particularly useful in genome and transcriptome annotation.
|
9
|
+
|
10
|
+
CRB-BLAST initially performs a standard reciprocal best BLAST. It does this by performing BLAST alignments of query->target and target->query. Reciprocal best BLAST hits are those where the best match for any given query sequence in the query->target alignment is also the best hit of the match in the reverse (target->query) alignment.
|
11
|
+
|
12
|
+
Reciprocal best BLAST is a very conservative way to assign orthologs. The main innovation in CRB-BLAST is to learn an appropriate e-value cutoff to apply to each pairwise alignment by taking into account the overall relatedness of the two datasets being compared. This is done by fitting a function to the distribution of alignment e-values over sequence lengths. The function provides the e-value cutoff for a sequence of given length.
|
13
|
+
|
14
|
+
CRB-BLAST greatly improves the accuracy of ortholog assignment for de-novo transcriptome assembly ([Aubry et al. 2014](http://www.plosgenetics.org/article/info%3Adoi%2F10.1371%2Fjournal.pgen.1004365)).
|
15
|
+
|
16
|
+
The CRB-BLAST algorithm was designed by [Steve Kelly](http://www.stevekellylab.com), and this implementation is by Chris Boursnell and Richard Smith-Unna. The original reference implementation from the paper is available for online use at http://www.stevekellylab.com/software/conditional-orthology-assignment.
|
17
|
+
|
18
|
+
### Installation
|
19
|
+
|
20
|
+
To install CRB-BLAST, simply use rubygems:
|
21
|
+
|
22
|
+
`gem install crb-blast`
|
23
|
+
|
24
|
+
### Prerequisites
|
25
|
+
|
26
|
+
- NCBI BLAST+ (preferably the latest version) should be installed and in your PATH.
|
27
|
+
- Ruby v2.0 or later. If you don't have Ruby, we suggest installing it with [RVM](http://rvm.io).
|
28
|
+
|
29
|
+
`\curl -sSL https://get.rvm.io | bash -s stable --ruby`
|
30
|
+
|
31
|
+
|
32
|
+
### Usage
|
33
|
+
|
34
|
+
CRB-BLAST can be run from the command-line as a standalone program, or used as a library in your own code.
|
35
|
+
|
36
|
+
#### Command-line usage
|
37
|
+
|
38
|
+
CRB-BLAST can be run from the command line with:
|
39
|
+
|
40
|
+
```
|
41
|
+
crb-blast
|
42
|
+
```
|
43
|
+
|
44
|
+
The options are:
|
45
|
+
|
46
|
+
```
|
47
|
+
--query, -q <s>: query fasta file in nucleotide format
|
48
|
+
--target, -t <s>: target fasta file as nucleotide or protein
|
49
|
+
--evalue, -e <f>: e-value cut off for BLAST. Format 1e-5 (default: 1.0e-05)
|
50
|
+
--threads, -h <i>: number of threads to run BLAST with (default: 1)
|
51
|
+
--output, -o <s>: output file as tsv
|
52
|
+
--split, -s: split the fasta files into chunks and run multiple blast
|
53
|
+
jobs and then combine them.
|
54
|
+
--help, -l: Show this message
|
55
|
+
```
|
56
|
+
|
57
|
+
An example command is:
|
58
|
+
|
59
|
+
```bash
|
60
|
+
crb-blast --query assembly.fa --target reference_proteins.fa --threads 8 --output annotation.tsv
|
61
|
+
```
|
62
|
+
|
63
|
+
#### Library usage
|
64
|
+
|
65
|
+
To include the gem in your code just `require 'crb-blast'`
|
66
|
+
|
67
|
+
A quick example:
|
68
|
+
|
69
|
+
```ruby
|
70
|
+
blaster = CRB_Blast::CRB_Blast.new('test/query.fasta', 'test/target.fasta')
|
71
|
+
blaster.run(1e-5, 4, true) # to run with an evalue cutoff of 1e-5 and 4 threads
|
72
|
+
```
|
73
|
+
|
74
|
+
A longer example with each step at a time:
|
75
|
+
|
76
|
+
```ruby
|
77
|
+
blaster = CRB_Blast::CRB_Blast.new('test/query.fasta', 'test/target.fasta')
|
78
|
+
blaster.makedb
|
79
|
+
blaster.run_blast(1e-5, 6, true)
|
80
|
+
blaster.load_outputs
|
81
|
+
blaster.find_reciprocals
|
82
|
+
blaster.find_secondaries
|
83
|
+
```
|
84
|
+
|
85
|
+
### Getting help
|
86
|
+
|
87
|
+
Please use the issue tracker if you find bugs or have trouble running CRB-BLAST.
|
88
|
+
|
89
|
+
Chris Boursnell <cmb211@cam.ac.uk> maintains this software.
|
90
|
+
|
91
|
+
### License
|
92
|
+
|
93
|
+
This is academic software - please cite us if you use it in your work.
|
94
|
+
|
95
|
+
CRB-BLAST is released under the MIT license.
|
data/Rakefile
ADDED
data/bin/crb-blast
CHANGED
@@ -11,7 +11,7 @@ require 'bindeps'
|
|
11
11
|
opts = Trollop::options do
|
12
12
|
banner <<-EOS
|
13
13
|
|
14
|
-
CRB-Blast v0.3 by Chris Boursnell <cmb211@cam.ac.uk>
|
14
|
+
CRB-Blast v0.3.2 by Chris Boursnell <cmb211@cam.ac.uk>
|
15
15
|
|
16
16
|
Conditional Reciprocal Best BLAST
|
17
17
|
|
@@ -48,7 +48,7 @@ EOS
|
|
48
48
|
|
49
49
|
opt :split,
|
50
50
|
"split the fasta files into chunks and run multiple blast jobs and then"+
|
51
|
-
"combine them."
|
51
|
+
" combine them."
|
52
52
|
end
|
53
53
|
|
54
54
|
Trollop::die :query, "must exist" if !File.exist?(opts[:query])
|
@@ -58,7 +58,7 @@ gem_dir = Gem.loaded_specs['crb-blast'].full_gem_path
|
|
58
58
|
gem_deps = File.join(gem_dir, 'deps', 'deps.yaml')
|
59
59
|
Bindeps.require gem_deps
|
60
60
|
|
61
|
-
blaster = CRB_Blast.new(opts.query, opts.target)
|
61
|
+
blaster = CRB_Blast::CRB_Blast.new(opts.query, opts.target)
|
62
62
|
dbs = blaster.makedb
|
63
63
|
run = blaster.run_blast(opts.evalue, opts.threads, opts.split)
|
64
64
|
load = blaster.load_outputs
|
data/build
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
gem build crb-blast.gemspec
|
data/crb-blast.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
Gem::Specification.new do |gem|
|
2
|
+
gem.name = 'crb-blast'
|
3
|
+
gem.version = '0.4.0'
|
4
|
+
gem.date = '2014-07-23'
|
5
|
+
gem.summary = "Run conditional reciprocal best blast"
|
6
|
+
gem.description = "See summary"
|
7
|
+
gem.authors = ["Chris Boursnell", "Richard Smith-Unna"]
|
8
|
+
gem.email = 'cmb211@cam.ac.uk'
|
9
|
+
gem.files = `git ls-files`.split("\n")
|
10
|
+
gem.executables = ["crb-blast"]
|
11
|
+
gem.require_paths = %w( lib )
|
12
|
+
gem.homepage = 'http://rubygems.org/gems/crb-blast'
|
13
|
+
gem.license = 'MIT'
|
14
|
+
|
15
|
+
gem.add_dependency 'trollop', '~> 2.0'
|
16
|
+
gem.add_dependency 'bio', '~> 1.4', '>= 1.4.3'
|
17
|
+
gem.add_dependency 'which', '0.0.2'
|
18
|
+
gem.add_dependency 'threach', '~> 0.2', '>= 0.2.0'
|
19
|
+
gem.add_dependency 'bindeps', '~> 0.0', '>= 0.0.7'
|
20
|
+
|
21
|
+
gem.add_development_dependency 'rake', '~> 10.3', '>= 10.3.2'
|
22
|
+
gem.add_development_dependency 'turn', '~> 0.9', '>= 0.9.7'
|
23
|
+
gem.add_development_dependency 'simplecov', '~> 0.8', '>= 0.8.2'
|
24
|
+
gem.add_development_dependency 'shoulda-context', '~> 1.2', '>= 1.2.1'
|
25
|
+
gem.add_development_dependency 'coveralls', '~> 0.7'
|
26
|
+
end
|
data/deps/deps.yaml
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
blastplus:
|
2
|
+
binaries:
|
3
|
+
- makeblastdb
|
4
|
+
- blastn
|
5
|
+
- tblastn
|
6
|
+
- blastp
|
7
|
+
- blastx
|
8
|
+
- tblastx
|
9
|
+
- makembindex
|
10
|
+
- psiblast
|
11
|
+
- rpsblast
|
12
|
+
- blastdbcmd
|
13
|
+
- segmasker
|
14
|
+
- dustmasker
|
15
|
+
- blast_formatter
|
16
|
+
- windowmasker
|
17
|
+
- blastdb_aliastool
|
18
|
+
- deltablast
|
19
|
+
- rpstblastn
|
20
|
+
- blastdbcheck
|
21
|
+
version:
|
22
|
+
number: '2.2.29'
|
23
|
+
command: 'blastx -version'
|
24
|
+
url:
|
25
|
+
64bit:
|
26
|
+
macosx: ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.29/ncbi-blast-2.2.29+-universal-macosx.tar.gz
|
27
|
+
linux: ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.29/ncbi-blast-2.2.29+-x64-linux.tar.gz
|
data/lib/crb-blast.rb
CHANGED
@@ -1,447 +1,3 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require '
|
4
|
-
require 'which'
|
5
|
-
require 'hit'
|
6
|
-
require 'threach'
|
7
|
-
|
8
|
-
class Bio::FastaFormat
|
9
|
-
def isNucl?
|
10
|
-
Bio::Sequence.guess(self.seq, 0.9, 500) == Bio::Sequence::NA
|
11
|
-
end
|
12
|
-
|
13
|
-
def isProt?
|
14
|
-
Bio::Sequence.guess(self.seq, 0.9, 500) == Bio::Sequence::AA
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
class CRB_Blast
|
19
|
-
|
20
|
-
include Which
|
21
|
-
|
22
|
-
attr_accessor :query_name, :target_name, :reciprocals
|
23
|
-
attr_accessor :missed
|
24
|
-
attr_accessor :target_is_prot, :query_is_prot
|
25
|
-
attr_accessor :query_results, :target_results, :working_dir
|
26
|
-
|
27
|
-
def initialize query, target, output=nil
|
28
|
-
@query = query
|
29
|
-
@target = target
|
30
|
-
if output.nil?
|
31
|
-
#@working_dir = File.expand_path(File.dirname(query)) # no trailing /
|
32
|
-
@working_dir = "."
|
33
|
-
else
|
34
|
-
@working_dir = File.expand_path(output)
|
35
|
-
mkcmd = "mkdir #{@working_dir}"
|
36
|
-
if !Dir.exist?(@working_dir)
|
37
|
-
puts mkcmd
|
38
|
-
`#{mkcmd}`
|
39
|
-
end
|
40
|
-
end
|
41
|
-
@makedb_path = which('makeblastdb')
|
42
|
-
raise 'makeblastdb was not in the PATH' if @makedb_path.empty?
|
43
|
-
@blastn_path = which('blastn')
|
44
|
-
raise 'blastn was not in the PATH' if @blastn_path.empty?
|
45
|
-
@tblastn_path = which('tblastn')
|
46
|
-
raise 'tblastn was not in the PATH' if @tblastn_path.empty?
|
47
|
-
@blastx_path = which('blastx')
|
48
|
-
raise 'blastx was not in the PATH' if @blastx_path.empty?
|
49
|
-
@blastp_path = which('blastp')
|
50
|
-
raise 'blastp was not in the PATH' if @blastp_path.empty?
|
51
|
-
@makedb_path = @makedb_path.first
|
52
|
-
@blastn_path = @blastn_path.first
|
53
|
-
@tblastn_path = @tblastn_path.first
|
54
|
-
@blastx_path = @blastx_path.first
|
55
|
-
@blastp_path = @blastp_path.first
|
56
|
-
end
|
57
|
-
|
58
|
-
#
|
59
|
-
# makes a blast database from the query and the target
|
60
|
-
#
|
61
|
-
def makedb
|
62
|
-
# only scan the first few hundred entries
|
63
|
-
n = 100
|
64
|
-
# check if the query is a nucl or prot seq
|
65
|
-
query_file = Bio::FastaFormat.open(@query)
|
66
|
-
count_p=0
|
67
|
-
count=0
|
68
|
-
query_file.take(n).each do |entry|
|
69
|
-
count_p += 1 if entry.isProt?
|
70
|
-
count += 1
|
71
|
-
end
|
72
|
-
if count_p > count*0.9
|
73
|
-
@query_is_prot = true
|
74
|
-
else
|
75
|
-
@query_is_prot = false
|
76
|
-
end
|
77
|
-
|
78
|
-
# check if the target is a nucl or prot seq
|
79
|
-
target_file = Bio::FastaFormat.open(@target)
|
80
|
-
count_p=0
|
81
|
-
count=0
|
82
|
-
target_file.take(n).each do |entry|
|
83
|
-
count_p += 1 if entry.isProt?
|
84
|
-
count += 1
|
85
|
-
end
|
86
|
-
if count_p > count*0.9
|
87
|
-
@target_is_prot = true
|
88
|
-
else
|
89
|
-
@target_is_prot = false
|
90
|
-
end
|
91
|
-
# construct the output database names
|
92
|
-
@query_name = File.basename(@query).split('.')[0..-2].join('.')
|
93
|
-
@target_name = File.basename(@target).split('.')[0..-2].join('.')
|
94
|
-
|
95
|
-
# check if the databases already exist in @working_dir
|
96
|
-
make_query_db_cmd = "#{@makedb_path} -in #{@query}"
|
97
|
-
make_query_db_cmd << " -dbtype nucl " if !@query_is_prot
|
98
|
-
make_query_db_cmd << " -dbtype prot " if @query_is_prot
|
99
|
-
make_query_db_cmd << " -title #{query_name} "
|
100
|
-
make_query_db_cmd << " -out #{@working_dir}/#{query_name}"
|
101
|
-
db_query = "#{query_name}.nsq" if !@query_is_prot
|
102
|
-
db_query = "#{query_name}.psq" if @query_is_prot
|
103
|
-
if !File.exists?("#{@working_dir}/#{db_query}")
|
104
|
-
`#{make_query_db_cmd}`
|
105
|
-
end
|
106
|
-
|
107
|
-
make_target_db_cmd = "#{@makedb_path} -in #{@target}"
|
108
|
-
make_target_db_cmd << " -dbtype nucl " if !@target_is_prot
|
109
|
-
make_target_db_cmd << " -dbtype prot " if @target_is_prot
|
110
|
-
make_target_db_cmd << " -title #{target_name} "
|
111
|
-
make_target_db_cmd << " -out #{@working_dir}/#{target_name}"
|
112
|
-
|
113
|
-
db_target = "#{target_name}.nsq" if !@target_is_prot
|
114
|
-
db_target = "#{target_name}.psq" if @target_is_prot
|
115
|
-
if !File.exists?("#{@working_dir}/#{db_target}")
|
116
|
-
`#{make_target_db_cmd}`
|
117
|
-
end
|
118
|
-
@databases = true
|
119
|
-
[@query_name, @target_name]
|
120
|
-
end
|
121
|
-
|
122
|
-
def run_blast(evalue, threads, split)
|
123
|
-
if @databases
|
124
|
-
@output1 = "#{@working_dir}/#{query_name}_into_#{target_name}.1.blast"
|
125
|
-
@output2 = "#{@working_dir}/#{target_name}_into_#{query_name}.2.blast"
|
126
|
-
if @query_is_prot
|
127
|
-
if @target_is_prot
|
128
|
-
bin1 = "#{@blastp_path} "
|
129
|
-
bin2 = "#{@blastp_path} "
|
130
|
-
else
|
131
|
-
bin1 = "#{@tblastn_path} "
|
132
|
-
bin2 = "#{@blastx_path} "
|
133
|
-
end
|
134
|
-
else
|
135
|
-
if @target_is_prot
|
136
|
-
bin1 = "#{@blastx_path} "
|
137
|
-
bin2 = "#{@tblastn_path} "
|
138
|
-
else
|
139
|
-
bin1 = "#{@blastn_path} "
|
140
|
-
bin2 = "#{@blastn_path} "
|
141
|
-
end
|
142
|
-
end
|
143
|
-
if split and threads > 1
|
144
|
-
run_blast_with_splitting evalue, threads, bin1, bin2
|
145
|
-
else
|
146
|
-
run_blast_with_threads evalue, threads, bin1, bin2
|
147
|
-
end
|
148
|
-
return true
|
149
|
-
else
|
150
|
-
return false
|
151
|
-
end
|
152
|
-
end
|
153
|
-
|
154
|
-
def run_blast_with_threads evalue, threads, bin1, bin2
|
155
|
-
# puts "running blast with #{threads} threads"
|
156
|
-
cmd1 = "#{bin1} -query #{@query} -db #{@working_dir}/#{@target_name} "
|
157
|
-
cmd1 << " -out #{@output1} -evalue #{evalue} "
|
158
|
-
cmd1 << " -outfmt \"6 std qlen slen\" "
|
159
|
-
cmd1 << " -max_target_seqs 50 "
|
160
|
-
cmd1 << " -num_threads #{threads}"
|
161
|
-
|
162
|
-
cmd2 = "#{bin2} -query #{@target} -db #{@working_dir}/#{@query_name} "
|
163
|
-
cmd2 << " -out #{@output2} -evalue #{evalue} "
|
164
|
-
cmd2 << " -outfmt \"6 std qlen slen\" "
|
165
|
-
cmd2 << " -max_target_seqs 50 "
|
166
|
-
cmd2 << " -num_threads #{threads}"
|
167
|
-
if !File.exist?("#{@output1}")
|
168
|
-
`#{cmd1}`
|
169
|
-
end
|
170
|
-
|
171
|
-
if !File.exist?("#{@output2}")
|
172
|
-
`#{cmd2}`
|
173
|
-
end
|
174
|
-
end
|
175
|
-
|
176
|
-
def run_blast_with_splitting evalue, threads, bin1, bin2
|
177
|
-
# puts "running blast by splitting input into #{threads} pieces"
|
178
|
-
blasts=[]
|
179
|
-
files = split_input(@query, threads)
|
180
|
-
files.threach(threads) do |thread|
|
181
|
-
cmd1 = "#{bin1} -query #{thread} -db #{@working_dir}/#{@target_name} "
|
182
|
-
cmd1 << " -out #{thread}.blast -evalue #{evalue} "
|
183
|
-
cmd1 << " -outfmt \"6 std qlen slen\" "
|
184
|
-
cmd1 << " -max_target_seqs 50 "
|
185
|
-
cmd1 << " -num_threads 1"
|
186
|
-
if !File.exists?("#{thread}.blast")
|
187
|
-
`#{cmd1}`
|
188
|
-
end
|
189
|
-
blasts << "#{thread}.blast"
|
190
|
-
end
|
191
|
-
cat_cmd = "cat "
|
192
|
-
cat_cmd << blasts.join(" ")
|
193
|
-
cat_cmd << " > #{@output1}"
|
194
|
-
`#{cat_cmd}`
|
195
|
-
files.each do |file|
|
196
|
-
File.delete(file) if File.exist?(file)
|
197
|
-
end
|
198
|
-
blasts.each do |b|
|
199
|
-
File.delete(b) # delete intermediate blast output files
|
200
|
-
end
|
201
|
-
|
202
|
-
blasts=[]
|
203
|
-
files = split_input(@target, threads)
|
204
|
-
files.threach(threads) do |thread|
|
205
|
-
cmd2 = "#{bin2} -query #{thread} -db #{@working_dir}/#{@query_name} "
|
206
|
-
cmd2 << " -out #{thread}.blast -evalue #{evalue} "
|
207
|
-
cmd2 << " -outfmt \"6 std qlen slen\" "
|
208
|
-
cmd2 << " -max_target_seqs 50 "
|
209
|
-
cmd2 << " -num_threads 1"
|
210
|
-
if !File.exists?("#{thread}.blast")
|
211
|
-
`#{cmd2}`
|
212
|
-
end
|
213
|
-
blasts << "#{thread}.blast"
|
214
|
-
end
|
215
|
-
cat_cmd = "cat "
|
216
|
-
cat_cmd << blasts.join(" ")
|
217
|
-
cat_cmd << " > #{@output2}"
|
218
|
-
`#{cat_cmd}`
|
219
|
-
files.each do |file|
|
220
|
-
File.delete(file) if File.exist?(file)
|
221
|
-
end
|
222
|
-
blasts.each do |b|
|
223
|
-
File.delete(b) # delete intermediate blast output files
|
224
|
-
end
|
225
|
-
|
226
|
-
end
|
227
|
-
|
228
|
-
def split_input filename, pieces
|
229
|
-
input = {}
|
230
|
-
name = nil
|
231
|
-
seq=""
|
232
|
-
File.open(filename).each_line do |line|
|
233
|
-
if line =~ /^>(.*)$/
|
234
|
-
if name
|
235
|
-
input[name]=seq
|
236
|
-
seq=""
|
237
|
-
end
|
238
|
-
name = $1
|
239
|
-
else
|
240
|
-
seq << line.chomp
|
241
|
-
end
|
242
|
-
end
|
243
|
-
input[name]=seq
|
244
|
-
# construct list of output file handles
|
245
|
-
outputs=[]
|
246
|
-
output_files=[]
|
247
|
-
pieces.times do |n|
|
248
|
-
outfile = "#{filename}_chunk_#{n}.fasta"
|
249
|
-
outfile = File.expand_path(outfile)
|
250
|
-
outputs[n] = File.open("#{outfile}", "w")
|
251
|
-
output_files[n] = "#{outfile}"
|
252
|
-
end
|
253
|
-
# write sequences
|
254
|
-
count=0
|
255
|
-
input.each_pair do |name, seq|
|
256
|
-
outputs[count].write(">#{name}\n")
|
257
|
-
outputs[count].write("#{seq}\n")
|
258
|
-
count += 1
|
259
|
-
count %= pieces
|
260
|
-
end
|
261
|
-
outputs.each do |out|
|
262
|
-
out.close
|
263
|
-
end
|
264
|
-
output_files
|
265
|
-
end
|
266
|
-
|
267
|
-
def load_outputs
|
268
|
-
if File.exist?("#{@working_dir}/reciprocal_hits.txt")
|
269
|
-
# puts "reciprocal output already exists"
|
270
|
-
else
|
271
|
-
@query_results = Hash.new
|
272
|
-
@target_results = Hash.new
|
273
|
-
q_count=0
|
274
|
-
t_count=0
|
275
|
-
if !File.exists?("#{@output1}")
|
276
|
-
raise RuntimeError.new("can't find #{@output1}")
|
277
|
-
end
|
278
|
-
if !File.exists?("#{@output2}")
|
279
|
-
raise RuntimeError.new("can't find #{@output2}")
|
280
|
-
end
|
281
|
-
if File.exists?("#{@output1}") and File.exists?("#{@output2}")
|
282
|
-
File.open("#{@output1}").each_line do |line|
|
283
|
-
cols = line.chomp.split("\t")
|
284
|
-
hit = Hit.new(cols)
|
285
|
-
@query_results[hit.query] = [] if !@query_results.has_key?(hit.query)
|
286
|
-
@query_results[hit.query] << hit
|
287
|
-
q_count += 1
|
288
|
-
end
|
289
|
-
File.open("#{@output2}").each_line do |line|
|
290
|
-
cols = line.chomp.split("\t")
|
291
|
-
hit = Hit.new(cols)
|
292
|
-
@target_results[hit.query] = [] if !@target_results.has_key?(hit.query)
|
293
|
-
@target_results[hit.query] << hit
|
294
|
-
t_count += 1
|
295
|
-
end
|
296
|
-
else
|
297
|
-
raise "need to run blast first"
|
298
|
-
end
|
299
|
-
end
|
300
|
-
[q_count, t_count]
|
301
|
-
end
|
302
|
-
|
303
|
-
# fills @reciprocals with strict reciprocal hits from the blast results
|
304
|
-
def find_reciprocals
|
305
|
-
if File.exist?("#{@working_dir}/reciprocal_hits.txt")
|
306
|
-
# puts "reciprocal output already exists"
|
307
|
-
else
|
308
|
-
@reciprocals = Hash.new
|
309
|
-
@missed = Hash.new
|
310
|
-
@evalues = []
|
311
|
-
@longest = 0
|
312
|
-
hits = 0
|
313
|
-
@query_results.each_pair do |query_id, list_of_hits|
|
314
|
-
list_of_hits.each_with_index do |target_hit, query_index|
|
315
|
-
if @target_results.has_key?(target_hit.target)
|
316
|
-
list_of_hits_2 = @target_results[target_hit.target]
|
317
|
-
list_of_hits_2.each_with_index do |query_hit2, target_index|
|
318
|
-
if query_index == 0 && target_index == 0 &&
|
319
|
-
query_id == query_hit2.target
|
320
|
-
e = target_hit.evalue.to_f
|
321
|
-
e = 1e-200 if e==0
|
322
|
-
e = -Math.log10(e)
|
323
|
-
if !@reciprocals.key?(query_id)
|
324
|
-
@reciprocals[query_id] = []
|
325
|
-
end
|
326
|
-
@reciprocals[query_id] << target_hit
|
327
|
-
hits += 1
|
328
|
-
@longest = target_hit.alnlen if target_hit.alnlen > @longest
|
329
|
-
@evalues << {:e => e, :length => target_hit.alnlen}
|
330
|
-
elsif query_id == query_hit2.target
|
331
|
-
if !@missed.key?(query_id)
|
332
|
-
@missed[query_id] = []
|
333
|
-
end
|
334
|
-
@missed[query_id] << target_hit
|
335
|
-
end
|
336
|
-
end
|
337
|
-
end
|
338
|
-
end
|
339
|
-
end
|
340
|
-
end
|
341
|
-
return hits
|
342
|
-
end
|
343
|
-
|
344
|
-
def find_secondaries
|
345
|
-
|
346
|
-
if File.exist?("#{@working_dir}/reciprocal_hits.txt")
|
347
|
-
# puts "reciprocal output already exists"
|
348
|
-
else
|
349
|
-
length_hash = Hash.new
|
350
|
-
fitting = Hash.new
|
351
|
-
@evalues.each do |h|
|
352
|
-
length_hash[h[:length]] = [] if !length_hash.key?(h[:length])
|
353
|
-
length_hash[h[:length]] << h
|
354
|
-
end
|
355
|
-
|
356
|
-
(10..@longest).each do |centre|
|
357
|
-
e = 0
|
358
|
-
count = 0
|
359
|
-
s = centre*0.1
|
360
|
-
s = s.to_i
|
361
|
-
s = 5 if s < 5
|
362
|
-
(-s..s).each do |side|
|
363
|
-
if length_hash.has_key?(centre+side)
|
364
|
-
length_hash[centre+side].each do |point|
|
365
|
-
e += point[:e]
|
366
|
-
count += 1
|
367
|
-
end
|
368
|
-
end
|
369
|
-
end
|
370
|
-
if count>0
|
371
|
-
mean = e/count
|
372
|
-
fitting[centre] = mean
|
373
|
-
end
|
374
|
-
end
|
375
|
-
hits = 0
|
376
|
-
@missed.each_pair do |id, list|
|
377
|
-
list.each do |hit|
|
378
|
-
l = hit.alnlen.to_i
|
379
|
-
e = hit.evalue
|
380
|
-
e = 1e-200 if e==0
|
381
|
-
e = -Math.log10(e)
|
382
|
-
if fitting.has_key?(l)
|
383
|
-
if e >= fitting[l]
|
384
|
-
if !@reciprocals.key?(id)
|
385
|
-
@reciprocals[id] = []
|
386
|
-
found=false
|
387
|
-
@reciprocals[id].each do |existing_hit|
|
388
|
-
if existing_hit.query == hit.query &&
|
389
|
-
existing_hit.target == hit.target
|
390
|
-
found=true
|
391
|
-
end
|
392
|
-
end
|
393
|
-
if !found
|
394
|
-
@reciprocals[id] << hit
|
395
|
-
hits += 1
|
396
|
-
end
|
397
|
-
end
|
398
|
-
end
|
399
|
-
end
|
400
|
-
end
|
401
|
-
end
|
402
|
-
end
|
403
|
-
return hits
|
404
|
-
end
|
405
|
-
|
406
|
-
def clear_memory
|
407
|
-
# running lots of jobs at the same time was keeping a lot of stuff in
|
408
|
-
# memory that you might not want so this empties out those big hashes.
|
409
|
-
@query_results = nil
|
410
|
-
@target_results = nil
|
411
|
-
end
|
412
|
-
|
413
|
-
def run evalue, threads, split
|
414
|
-
makedb
|
415
|
-
run_blast evalue, threads, split
|
416
|
-
load_outputs
|
417
|
-
find_reciprocals
|
418
|
-
find_secondaries
|
419
|
-
end
|
420
|
-
|
421
|
-
def size
|
422
|
-
hits=0
|
423
|
-
@reciprocals.each_pair do |key, list|
|
424
|
-
list.each do |hit|
|
425
|
-
hits += 1
|
426
|
-
end
|
427
|
-
end
|
428
|
-
hits
|
429
|
-
end
|
430
|
-
|
431
|
-
def write_output
|
432
|
-
s=""
|
433
|
-
unless @reciprocals.nil?
|
434
|
-
@reciprocals.each_pair do |query_id, hits|
|
435
|
-
hits.each do |hit|
|
436
|
-
s << "#{hit}\n"
|
437
|
-
end
|
438
|
-
end
|
439
|
-
File.open("#{@working_dir}/reciprocal_hits.txt", "w") {|f| f.write s }
|
440
|
-
end
|
441
|
-
end
|
442
|
-
|
443
|
-
def has_reciprocal? contig
|
444
|
-
return true if @reciprocals.has_key?(contig)
|
445
|
-
return false
|
446
|
-
end
|
447
|
-
end
|
1
|
+
require 'crb-blast/cmd'
|
2
|
+
require 'crb-blast/hit'
|
3
|
+
require 'crb-blast/crb-blast'
|