scbi_blast 0.0.30 → 0.0.31
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/Manifest.txt +1 -1
- data/README.rdoc +76 -8
- data/lib/scbi_blast/batch_blast.rb +101 -85
- data/lib/scbi_blast/blast_hit.rb +70 -57
- data/lib/scbi_blast/blast_query.rb +44 -17
- data/lib/scbi_blast/blast_simplexml_result.rb +28 -14
- data/lib/scbi_blast/blast_table_result.rb +164 -149
- data/lib/scbi_blast/blast_xml_result.rb +105 -94
- data/lib/scbi_blast/dust_masker.rb +59 -44
- data/lib/scbi_blast.rb +23 -1
- metadata +7 -5
@@ -1,173 +1,186 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# Copyright (c) 2010 Dario Guerrero & Almudena Bocinos
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
# a copy of this software and associated documentation files (the
|
5
|
+
# 'Software'), to deal in the Software without restriction, including
|
6
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
# the following conditions:
|
10
|
+
#
|
11
|
+
# The above copyright notice and this permission notice shall be
|
12
|
+
# included in all copies or substantial portions of the Software.
|
13
|
+
#
|
14
|
+
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
17
|
+
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
18
|
+
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
19
|
+
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
20
|
+
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
3
21
|
|
4
22
|
|
23
|
+
require "blast_query.rb"
|
24
|
+
require "blast_hit.rb"
|
5
25
|
|
6
|
-
######################################
|
7
|
-
# Author:: Almudena Bocinos Rioboo
|
8
|
-
# Extracts results from blast table's file and uses it to create instances of "BlastQuery" and "BlastHit"
|
9
26
|
|
10
|
-
######################################
|
11
27
|
|
28
|
+
# Extracts results from a BLAST table file and uses them to create instances of "BlastQuery" and "BlastHit"
|
12
29
|
class BlastTableResult
|
13
|
-
|
30
|
+
|
31
|
+
# Parser initialization
|
14
32
|
def initialize(input)
|
15
|
-
|
33
|
+
|
16
34
|
@querys = []
|
17
|
-
|
18
|
-
|
35
|
+
|
36
|
+
|
19
37
|
if input.is_a?(Array)
|
20
38
|
lines=input
|
21
|
-
|
39
|
+
|
22
40
|
else
|
23
|
-
|
41
|
+
|
24
42
|
fich = File.open(input,'r')
|
25
43
|
lines = fich.readlines
|
26
44
|
fich.close
|
27
|
-
|
45
|
+
|
28
46
|
end
|
29
|
-
|
30
|
-
# puts "lines length #{lines.length}"
|
47
|
+
|
31
48
|
query_name=''
|
32
|
-
|
49
|
+
|
33
50
|
lines.each do |line|
|
34
|
-
|
51
|
+
|
35
52
|
line.chomp! #delete end of line
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
#
|
55
|
-
#
|
56
|
-
#
|
57
|
-
#
|
58
|
-
#
|
59
|
-
#
|
60
|
-
#
|
61
|
-
#
|
62
|
-
#
|
63
|
-
#
|
64
|
-
#
|
65
|
-
#
|
66
|
-
#
|
67
|
-
#
|
68
|
-
#
|
69
|
-
#
|
70
|
-
#
|
71
|
-
#
|
72
|
-
#
|
73
|
-
#
|
74
|
-
#
|
75
|
-
#
|
76
|
-
#
|
77
|
-
#
|
78
|
-
#
|
79
|
-
#
|
80
|
-
#
|
81
|
-
#
|
82
|
-
#
|
83
|
-
#
|
84
|
-
#
|
85
|
-
|
86
|
-
#
|
87
|
-
#
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
query = BlastQuery.new(qseqid)
|
123
|
-
query.add_hit(hit)
|
124
|
-
@querys.push query
|
125
|
-
end
|
126
|
-
|
127
|
-
|
128
|
-
#Description
|
129
|
-
|
130
|
-
# read_blast_tab read tabular BLAST format created with blast_seq and written to file with write_blast - or with blastall and the -m 8 or -m 9 switch.
|
131
|
-
# Each column in the table corresponds to the following keys:
|
132
|
-
#
|
133
|
-
# 1. Q_ID - Query ID.
|
134
|
-
# 2. S_ID - Subject ID.
|
135
|
-
# 3. IDENT - Identity (%).
|
136
|
-
# 4. ALIGN_LEN - Alignment length.
|
137
|
-
# 5. MISMATCHES - Number of mismatches.
|
138
|
-
# 6. GAPS - Number of gaps.
|
139
|
-
# 7. Q_BEG - Query begin.
|
140
|
-
# 8. Q_END - Query end.
|
141
|
-
# 9. S_BEG - Subject begin.
|
142
|
-
# 10. S_END - Subject end.
|
143
|
-
# 11. E_VAL - Expect value.
|
144
|
-
# 12. BIT_SCORE - Bit score.
|
145
|
-
#
|
146
|
-
# Furthermore, two extra keys are added to the record:
|
147
|
-
#
|
148
|
-
# * STRAND - Strand.
|
149
|
-
# * REC_TYPE - Record type.
|
53
|
+
|
54
|
+
if line =~ /^\s*#/
|
55
|
+
if line =~ /^#\sQuery:\s+(.+)$/
|
56
|
+
query_name = $1
|
57
|
+
elsif line =~ /^#\s0\shits\sfound$/
|
58
|
+
@querys.push BlastQuery.new(query_name)
|
59
|
+
end
|
60
|
+
# 0 hits found
|
61
|
+
|
62
|
+
|
63
|
+
else
|
64
|
+
params = line.split(/\t+/)
|
65
|
+
|
66
|
+
# puts "Extracted #{params[0]} #{params[1]} #{params[2]} #{params[3]} #{params[4]} #{params[5]} #{params[6]} #{params[7]} #{params[8]} #{params[9]} #{params[10]} #{params[11]}"
|
67
|
+
# Options 6, 7, and 10 can be additionally configured to produce
|
68
|
+
# a custom format specified by space delimited format specifiers.
|
69
|
+
# The supported format specifiers are:
|
70
|
+
# qseqid means Query Seq-id
|
71
|
+
# qgi means Query GI
|
72
|
+
# qacc means Query accession
|
73
|
+
# sseqid means Subject Seq-id
|
74
|
+
# sallseqid means All subject Seq-id(s), separated by a ';'
|
75
|
+
# sgi means Subject GI
|
76
|
+
# sallgi means All subject GIs
|
77
|
+
# sacc means Subject accession
|
78
|
+
# sallacc means All subject accessions
|
79
|
+
# qstart means Start of alignment in query
|
80
|
+
# qend means End of alignment in query
|
81
|
+
# sstart means Start of alignment in subject
|
82
|
+
# send means End of alignment in subject
|
83
|
+
# qseq means Aligned part of query sequence
|
84
|
+
# sseq means Aligned part of subject sequence
|
85
|
+
# evalue means Expect value
|
86
|
+
# bitscore means Bit score
|
87
|
+
# score means Raw score
|
88
|
+
# length means Alignment length
|
89
|
+
# pident means Percentage of identical matches
|
90
|
+
# nident means Number of identical matches
|
91
|
+
# mismatch means Number of mismatches
|
92
|
+
# positive means Number of positive-scoring matches
|
93
|
+
# gapopen means Number of gap openings
|
94
|
+
# gaps means Total number of gaps
|
95
|
+
# ppos means Percentage of positive-scoring matches
|
96
|
+
# frames means Query and subject frames separated by a '/'
|
97
|
+
# qframe means Query frame
|
98
|
+
# sframe means Subject frame
|
99
|
+
# When not provided, the default value is:
|
100
|
+
# 'qseqid sseqid pident length mismatch gapopen qstart qend sstart send
|
101
|
+
# evalue bitscore', which is equivalent to the keyword 'std'
|
102
|
+
|
103
|
+
# if the query doesn't exist, then create a new one,
|
104
|
+
# else the hit will be added to the last query
|
105
|
+
|
106
|
+
qseqid,sacc,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,score,qframe,sframe,qseq,sseq = params
|
107
|
+
|
108
|
+
# creates the hit
|
109
|
+
hit = BlastHit.new(qstart,qend,sstart,send)
|
110
|
+
|
111
|
+
hit.align_len=length
|
112
|
+
hit.ident=pident
|
113
|
+
|
114
|
+
hit.gaps=gapopen
|
115
|
+
hit.mismatches=mismatch
|
116
|
+
hit.e_val=evalue
|
117
|
+
hit.bit_score=bitscore
|
118
|
+
|
119
|
+
hit.score = score
|
120
|
+
hit.q_frame = qframe
|
121
|
+
hit.s_frame = sframe
|
122
|
+
|
123
|
+
hit.subject_id = sacc
|
124
|
+
hit.full_subject_length=0
|
125
|
+
hit.definition=sacc
|
126
|
+
hit.acc=sacc
|
127
|
+
hit.q_seq=qseq
|
128
|
+
hit.s_seq=sseq
|
129
|
+
|
130
|
+
query=find_query(@querys,qseqid)
|
131
|
+
|
132
|
+
if (query) # the query already exists, so the hit is added to it
|
133
|
+
query.add_hit(hit)
|
134
|
+
|
135
|
+
else # no query with this id yet: create it, add the hit, and store it
|
136
|
+
query = BlastQuery.new(qseqid)
|
137
|
+
query.add_hit(hit)
|
138
|
+
@querys.push query
|
150
139
|
end
|
140
|
+
|
141
|
+
|
142
|
+
#Description
|
143
|
+
|
144
|
+
# read_blast_tab reads the tabular BLAST format created with blast_seq and written to a file with write_blast, or produced by blastall with the -m 8 or -m 9 switch.
|
145
|
+
# Each column in the table corresponds to the following keys:
|
146
|
+
#
|
147
|
+
# 1. Q_ID - Query ID.
|
148
|
+
# 2. S_ID - Subject ID.
|
149
|
+
# 3. IDENT - Identity (%).
|
150
|
+
# 4. ALIGN_LEN - Alignment length.
|
151
|
+
# 5. MISMATCHES - Number of mismatches.
|
152
|
+
# 6. GAPS - Number of gaps.
|
153
|
+
# 7. Q_BEG - Query begin.
|
154
|
+
# 8. Q_END - Query end.
|
155
|
+
# 9. S_BEG - Subject begin.
|
156
|
+
# 10. S_END - Subject end.
|
157
|
+
# 11. E_VAL - Expect value.
|
158
|
+
# 12. BIT_SCORE - Bit score.
|
159
|
+
#
|
160
|
+
# Furthermore, two extra keys are added to the record:
|
161
|
+
#
|
162
|
+
# * STRAND - Strand.
|
163
|
+
# * REC_TYPE - Record type.
|
151
164
|
end
|
152
|
-
|
153
|
-
|
154
|
-
|
165
|
+
end
|
166
|
+
|
167
|
+
#inspect
|
168
|
+
|
155
169
|
end
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
def inspect
|
160
|
-
|
161
|
-
# puts(@querys.each{|q| q.inspect}).join("\n")
|
162
|
-
res = "Blast results:\n"
|
170
|
+
|
171
|
+
|
172
|
+
# inspect results
|
173
|
+
def inspect
|
174
|
+
res = "Blast results:\n"
|
163
175
|
res+= '-'*20
|
164
176
|
res+= "\nQuerys: #{@querys.count}\n"
|
165
177
|
@querys.each{|q| res+=q.inspect+"\n"}
|
166
|
-
return res
|
178
|
+
return res
|
167
179
|
end
|
168
|
-
|
169
|
-
|
170
|
-
|
180
|
+
|
181
|
+
# find query by name
|
182
|
+
def find_query(querys,name_q)
|
183
|
+
# newq = querys.find{|q| ( q.find{|h| (h.subject_id)})}
|
171
184
|
new_q=nil
|
172
185
|
|
173
186
|
if !querys.empty?
|
@@ -176,12 +189,14 @@ class BlastTableResult
|
|
176
189
|
|
177
190
|
return new_q
|
178
191
|
end
|
179
|
-
|
192
|
+
|
193
|
+
# check if there are querys
|
180
194
|
def empty?
|
181
|
-
|
195
|
+
|
182
196
|
return @querys.empty?
|
183
|
-
end
|
184
|
-
|
197
|
+
end
|
198
|
+
|
199
|
+
# get query count
|
185
200
|
def size
|
186
201
|
@querys.size
|
187
202
|
end
|
@@ -1,27 +1,38 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
#
|
6
|
-
#
|
1
|
+
# Copyright (c) 2010 Dario Guerrero & Almudena Bocinos
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
# a copy of this software and associated documentation files (the
|
5
|
+
# 'Software'), to deal in the Software without restriction, including
|
6
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
# the following conditions:
|
7
10
|
#
|
11
|
+
# The above copyright notice and this permission notice shall be
|
12
|
+
# included in all copies or substantial portions of the Software.
|
8
13
|
#
|
9
|
-
#
|
10
|
-
#
|
14
|
+
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
17
|
+
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
18
|
+
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
19
|
+
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
20
|
+
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
11
21
|
|
12
|
-
######################################
|
13
|
-
# Author:: Almudena Bocinos Rioboo
|
14
|
-
# Extracts results from blast table's file and uses it to create instances of "BlastQuery" and "BlastHit"
|
15
22
|
|
16
|
-
|
23
|
+
require "blast_query.rb"
|
24
|
+
require "blast_hit.rb"
|
25
|
+
|
26
|
+
require 'nokogiri'
|
17
27
|
|
28
|
+
# Another XML parser using nokogiri library
|
18
29
|
class BlastXmlResult
|
19
|
-
|
30
|
+
|
20
31
|
def initialize(input)
|
21
|
-
|
32
|
+
|
22
33
|
@querys = []
|
23
34
|
lines=[]
|
24
|
-
|
35
|
+
|
25
36
|
if input.is_a?(Array)
|
26
37
|
lines=input
|
27
38
|
else
|
@@ -30,106 +41,106 @@ class BlastXmlResult
|
|
30
41
|
lines = fich.readlines
|
31
42
|
fich.close
|
32
43
|
end
|
33
|
-
|
44
|
+
|
34
45
|
end
|
35
|
-
|
36
|
-
# puts "lines length #{lines.length}"
|
46
|
+
|
47
|
+
# puts "lines length #{lines.length}"
|
37
48
|
if !lines.empty?
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
49
|
+
data = Nokogiri::XML(lines.join)
|
50
|
+
data.root.xpath('//Iteration').each do |iteration|
|
51
|
+
|
52
|
+
# puts JSON::pretty_generate(iteration)
|
42
53
|
query_id = iteration.xpath('Iteration_query-ID').text
|
43
54
|
|
44
55
|
full_query_length = iteration.xpath('Iteration_query-len').text
|
45
56
|
query_def = iteration.xpath('Iteration_query-def').text
|
46
|
-
|
57
|
+
|
47
58
|
if query_def =~ /^([^\s]+)/
|
48
|
-
|
59
|
+
query_def=$1
|
49
60
|
end
|
50
|
-
|
51
|
-
|
61
|
+
|
62
|
+
#@query_def = iteration['Iteration_query-def'][0]
|
52
63
|
|
53
64
|
query = BlastQuery.new(query_id)
|
54
65
|
query.query_def = query_def
|
55
66
|
query.full_query_length = full_query_length
|
56
67
|
@querys.push query
|
57
|
-
|
68
|
+
|
58
69
|
|
59
70
|
hits = iteration.xpath('Iteration_hits/Hit')
|
60
71
|
if !hits.nil?
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
end
|
72
|
+
hits.each do |h|
|
73
|
+
#puts JSON::pretty_generate(h)
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
subject_id=h.xpath('Hit_id').text
|
78
|
+
acc =h.xpath('Hit_accession').text
|
79
|
+
full_subject_length = h.xpath('Hit_len').text.to_i
|
80
|
+
hit_def=h.xpath('Hit_def').text
|
81
|
+
if hit_def=='No definition line'
|
82
|
+
hit_def =subject_id
|
83
|
+
end
|
84
|
+
|
85
|
+
hsps = h.xpath('Hit_hsps/Hsp')
|
86
|
+
|
87
|
+
hsps.each do |hsp|
|
88
|
+
|
89
|
+
q_beg=hsp.xpath('Hsp_query-from').text.to_i
|
90
|
+
q_end=hsp.xpath('Hsp_query-to').text.to_i
|
91
|
+
s_beg=hsp.xpath('Hsp_hit-from').text.to_i
|
92
|
+
s_end=hsp.xpath('Hsp_hit-to').text.to_i
|
93
|
+
|
94
|
+
# creates the hit
|
95
|
+
hit = BlastHit.new(q_beg,q_end,s_beg,s_end)
|
96
|
+
|
97
|
+
hit.align_len=hsp.xpath('Hsp_align-len').text.to_i
|
98
|
+
hit.ident=(hsp.xpath('Hsp_identity').text.to_f/hit.align_len)*100
|
99
|
+
hit.gaps=hsp.xpath('Hsp_gaps').text.to_i
|
100
|
+
hit.mismatches=hsp.xpath('Hsp_midline').text.count(' ').to_i - hit.gaps
|
101
|
+
hit.e_val=hsp.xpath('Hsp_evalue').text.to_f
|
102
|
+
hit.e_val = (hit.e_val*1000).round/1000.0
|
103
|
+
hit.bit_score=hsp.xpath('Hsp_bit-score').text.to_f
|
104
|
+
hit.bit_score = (hit.bit_score*100).round/100.0
|
105
|
+
|
106
|
+
hit.score = hsp.xpath('Hsp_score').text.to_f
|
107
|
+
hit.q_frame = hsp.xpath('Hsp_query-frame').text.to_i
|
108
|
+
hit.s_frame =hsp.xpath('Hsp_hit-frame').text.to_i
|
109
|
+
|
110
|
+
hit.q_seq = hsp.xpath('Hsp_qseq').text
|
111
|
+
hit.s_seq = hsp.xpath('Hsp_hseq').text
|
112
|
+
|
113
|
+
|
114
|
+
hit.subject_id= subject_id
|
115
|
+
hit.full_subject_length=full_subject_length
|
116
|
+
# hit.full_query_length = full_query_length
|
117
|
+
hit.definition=hit_def
|
118
|
+
hit.acc=acc
|
119
|
+
|
120
|
+
query.add_hit(hit)
|
121
|
+
|
122
|
+
end
|
123
|
+
end
|
114
124
|
end
|
115
125
|
end
|
116
|
-
|
117
|
-
|
126
|
+
end
|
127
|
+
#inspect
|
128
|
+
|
118
129
|
end
|
119
|
-
|
120
|
-
|
121
|
-
|
130
|
+
|
131
|
+
|
132
|
+
|
122
133
|
def inspect
|
123
|
-
|
134
|
+
|
124
135
|
res = "Blast results:\n"
|
125
136
|
res+= '-'*20
|
126
137
|
res+= "\nQuerys: #{@querys.count}\n"
|
127
138
|
@querys.each{|q| res+=q.inspect+"\n"}
|
128
139
|
return res
|
129
140
|
end
|
130
|
-
|
131
|
-
def find_query(querys,name_q)
|
132
|
-
|
141
|
+
|
142
|
+
def find_query(querys,name_q)
|
143
|
+
# newq = querys.find{|q| ( q.find{|h| (h.subject_id)})}
|
133
144
|
new_q=nil
|
134
145
|
|
135
146
|
if !querys.empty?
|
@@ -138,15 +149,15 @@ class BlastXmlResult
|
|
138
149
|
|
139
150
|
return new_q
|
140
151
|
end
|
141
|
-
|
152
|
+
|
142
153
|
def empty?
|
143
|
-
|
154
|
+
|
144
155
|
return @querys.empty?
|
145
|
-
end
|
146
|
-
|
156
|
+
end
|
157
|
+
|
147
158
|
def size
|
148
159
|
@querys.size
|
149
160
|
end
|
150
|
-
|
161
|
+
|
151
162
|
attr_accessor :querys
|
152
163
|
end
|