scbi_blast 0.0.30 → 0.0.31

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,173 +1,186 @@
1
- require "blast_query.rb"
2
- require "blast_hit.rb"
1
+ # Copyright (c) 2010 Dario Guerrero & Almudena Bocinos
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining
4
+ # a copy of this software and associated documentation files (the
5
+ # 'Software'), to deal in the Software without restriction, including
6
+ # without limitation the rights to use, copy, modify, merge, publish,
7
+ # distribute, sublicense, and/or sell copies of the Software, and to
8
+ # permit persons to whom the Software is furnished to do so, subject to
9
+ # the following conditions:
10
+ #
11
+ # The above copyright notice and this permission notice shall be
12
+ # included in all copies or substantial portions of the Software.
13
+ #
14
+ # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
15
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17
+ # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18
+ # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19
+ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20
+ # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
3
21
 
4
22
 
23
+ require "blast_query.rb"
24
+ require "blast_hit.rb"
5
25
 
6
- ######################################
7
- # Author:: Almudena Bocinos Rioboo
8
- # Extracts results from blast table's file and uses it to create instances of "BlastQuery" and "BlastHit"
9
26
 
10
- ######################################
11
27
 
28
+ # Extracts results from a BLAST tabular output file (or an array of its lines) and uses them to create instances of "BlastQuery" and "BlastHit"
12
29
  class BlastTableResult
13
-
30
+
31
+ # Parser initialization
14
32
  def initialize(input)
15
-
33
+
16
34
  @querys = []
17
-
18
-
35
+
36
+
19
37
  if input.is_a?(Array)
20
38
  lines=input
21
-
39
+
22
40
  else
23
-
41
+
24
42
  fich = File.open(input,'r')
25
43
  lines = fich.readlines
26
44
  fich.close
27
-
45
+
28
46
  end
29
-
30
- # puts "lines length #{lines.length}"
47
+
31
48
  query_name=''
32
-
49
+
33
50
  lines.each do |line|
34
-
51
+
35
52
  line.chomp! #delete end of line
36
-
37
- if line =~ /^\s*#/
38
- if line =~ /^#\sQuery:\s+(.+)$/
39
- query_name = $1
40
- elsif line =~ /^#\s0\shits\sfound$/
41
- @querys.push BlastQuery.new(query_name)
42
- end
43
- # 0 hits found
44
-
45
-
46
- else
47
- params = line.split(/\t+/)
48
-
49
- # creates the hit
50
- #hit = BlastHit.new( params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8], params[9], params[10], params[11])
51
-
52
- # puts "Extracted #{params[0]} #{params[1]} #{params[2]} #{params[3]} #{params[4]} #{params[5]} #{params[6]} #{params[7]} #{params[8]} #{params[9]} #{params[10]} #{params[11]}"
53
- # Options 6, 7, and 10 can be additionally configured to produce
54
- # a custom format specified by space delimited format specifiers.
55
- # The supported format specifiers are:
56
- # qseqid means Query Seq-id
57
- # qgi means Query GI
58
- # qacc means Query accesion
59
- # sseqid means Subject Seq-id
60
- # sallseqid means All subject Seq-id(s), separated by a ';'
61
- # sgi means Subject GI
62
- # sallgi means All subject GIs
63
- # sacc means Subject accession
64
- # sallacc means All subject accessions
65
- # qstart means Start of alignment in query
66
- # qend means End of alignment in query
67
- # sstart means Start of alignment in subject
68
- # send means End of alignment in subject
69
- # qseq means Aligned part of query sequence
70
- # sseq means Aligned part of subject sequence
71
- # evalue means Expect value
72
- # bitscore means Bit score
73
- # score means Raw score
74
- # length means Alignment length
75
- # pident means Percentage of identical matches
76
- # nident means Number of identical matches
77
- # mismatch means Number of mismatches
78
- # positive means Number of positive-scoring matches
79
- # gapopen means Number of gap openings
80
- # gaps means Total number of gaps
81
- # ppos means Percentage of positive-scoring matches
82
- # frames means Query and subject frames separated by a '/'
83
- # qframe means Query frame
84
- # sframe means Subject frame
85
- # When not provided, the default value is:
86
- # 'qseqid sseqid pident length mismatch gapopen qstart qend sstart send
87
- # evalue bitscore', which is equivalent to the keyword 'std'
88
-
89
- # if the query doesn't exist, then create a new one,
90
- # else the hit will be added to the last query
91
-
92
- qseqid,sacc,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,score,qframe,sframe,qseq,sseq = params
93
-
94
- # creates the hit
95
- hit = BlastHit.new(qstart,qend,sstart,send)
96
-
97
- hit.align_len=length
98
- hit.ident=pident
99
-
100
- hit.gaps=gapopen
101
- hit.mismatches=mismatch
102
- hit.e_val=evalue
103
- hit.bit_score=bitscore
104
-
105
- hit.score = score
106
- hit.q_frame = qframe
107
- hit.s_frame = sframe
108
-
109
- hit.subject_id = sacc
110
- hit.full_subject_length=0
111
- hit.definition=sacc
112
- hit.acc=sacc
113
- hit.q_seq=qseq
114
- hit.s_seq=sseq
115
-
116
- query=find_query(@querys,qseqid)
117
-
118
- if (query) #if it is a new query, it is created and added
119
- query.add_hit(hit)
120
-
121
- else # else the hit is added in last query added
122
- query = BlastQuery.new(qseqid)
123
- query.add_hit(hit)
124
- @querys.push query
125
- end
126
-
127
-
128
- #Description
129
-
130
- # read_blast_tab read tabular BLAST format created with blast_seq and written to file with write_blast - or with blastall and the -m 8 or -m 9 switch.
131
- # Each column in the table corresponds to the following keys:
132
- #
133
- # 1. Q_ID - Query ID.
134
- # 2. S_ID - Subject ID.
135
- # 3. IDENT - Identity (%).
136
- # 4. ALIGN_LEN - Alignment length.
137
- # 5. MISMATCHES - Number of mismatches.
138
- # 6. GAPS - Number of gaps.
139
- # 7. Q_BEG - Query begin.
140
- # 8. Q_END - Query end.
141
- # 9. S_BEG - Subject begin.
142
- # 10. S_END - Subject end.
143
- # 11. E_VAL - Expect value.
144
- # 12. BIT_SCORE - Bit score.
145
- #
146
- # Furthermore, two extra keys are added to the record:
147
- #
148
- # * STRAND - Strand.
149
- # * REC_TYPE - Record type.
53
+
54
+ if line =~ /^\s*#/
55
+ if line =~ /^#\sQuery:\s+(.+)$/
56
+ query_name = $1
57
+ elsif line =~ /^#\s0\shits\sfound$/
58
+ @querys.push BlastQuery.new(query_name)
59
+ end
60
+ # 0 hits found
61
+
62
+
63
+ else
64
+ params = line.split(/\t+/)
65
+
66
+ # puts "Extracted #{params[0]} #{params[1]} #{params[2]} #{params[3]} #{params[4]} #{params[5]} #{params[6]} #{params[7]} #{params[8]} #{params[9]} #{params[10]} #{params[11]}"
67
+ # Options 6, 7, and 10 can be additionally configured to produce
68
+ # a custom format specified by space delimited format specifiers.
69
+ # The supported format specifiers are:
70
+ # qseqid means Query Seq-id
71
+ # qgi means Query GI
72
+ # qacc means Query accession
73
+ # sseqid means Subject Seq-id
74
+ # sallseqid means All subject Seq-id(s), separated by a ';'
75
+ # sgi means Subject GI
76
+ # sallgi means All subject GIs
77
+ # sacc means Subject accession
78
+ # sallacc means All subject accessions
79
+ # qstart means Start of alignment in query
80
+ # qend means End of alignment in query
81
+ # sstart means Start of alignment in subject
82
+ # send means End of alignment in subject
83
+ # qseq means Aligned part of query sequence
84
+ # sseq means Aligned part of subject sequence
85
+ # evalue means Expect value
86
+ # bitscore means Bit score
87
+ # score means Raw score
88
+ # length means Alignment length
89
+ # pident means Percentage of identical matches
90
+ # nident means Number of identical matches
91
+ # mismatch means Number of mismatches
92
+ # positive means Number of positive-scoring matches
93
+ # gapopen means Number of gap openings
94
+ # gaps means Total number of gaps
95
+ # ppos means Percentage of positive-scoring matches
96
+ # frames means Query and subject frames separated by a '/'
97
+ # qframe means Query frame
98
+ # sframe means Subject frame
99
+ # When not provided, the default value is:
100
+ # 'qseqid sseqid pident length mismatch gapopen qstart qend sstart send
101
+ # evalue bitscore', which is equivalent to the keyword 'std'
102
+
103
+ # if the query doesn't exist, then create a new one,
104
+ # else the hit will be added to the last query
105
+
106
+ qseqid,sacc,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,score,qframe,sframe,qseq,sseq = params
107
+
108
+ # creates the hit
109
+ hit = BlastHit.new(qstart,qend,sstart,send)
110
+
111
+ hit.align_len=length
112
+ hit.ident=pident
113
+
114
+ hit.gaps=gapopen
115
+ hit.mismatches=mismatch
116
+ hit.e_val=evalue
117
+ hit.bit_score=bitscore
118
+
119
+ hit.score = score
120
+ hit.q_frame = qframe
121
+ hit.s_frame = sframe
122
+
123
+ hit.subject_id = sacc
124
+ hit.full_subject_length=0
125
+ hit.definition=sacc
126
+ hit.acc=sacc
127
+ hit.q_seq=qseq
128
+ hit.s_seq=sseq
129
+
130
+ query=find_query(@querys,qseqid)
131
+
132
+ if (query) # the query already exists, so the hit is added to it
133
+ query.add_hit(hit)
134
+
135
+ else # no query with this name yet: create it, add the hit and store it
136
+ query = BlastQuery.new(qseqid)
137
+ query.add_hit(hit)
138
+ @querys.push query
150
139
  end
140
+
141
+
142
+ #Description
143
+
144
+ # read_blast_tab reads the tabular BLAST format created with blast_seq and written to file with write_blast - or with blastall and the -m 8 or -m 9 switch.
145
+ # Each column in the table corresponds to the following keys:
146
+ #
147
+ # 1. Q_ID - Query ID.
148
+ # 2. S_ID - Subject ID.
149
+ # 3. IDENT - Identity (%).
150
+ # 4. ALIGN_LEN - Alignment length.
151
+ # 5. MISMATCHES - Number of mismatches.
152
+ # 6. GAPS - Number of gaps.
153
+ # 7. Q_BEG - Query begin.
154
+ # 8. Q_END - Query end.
155
+ # 9. S_BEG - Subject begin.
156
+ # 10. S_END - Subject end.
157
+ # 11. E_VAL - Expect value.
158
+ # 12. BIT_SCORE - Bit score.
159
+ #
160
+ # Furthermore, two extra keys are added to the record:
161
+ #
162
+ # * STRAND - Strand.
163
+ # * REC_TYPE - Record type.
151
164
  end
152
-
153
- #inspect
154
-
165
+ end
166
+
167
+ #inspect
168
+
155
169
  end
156
-
157
-
158
-
159
- def inspect
160
- # puts "Table Results:"
161
- # puts(@querys.each{|q| q.inspect}).join("\n")
162
- res = "Blast results:\n"
170
+
171
+
172
+ # inspect results
173
+ def inspect
174
+ res = "Blast results:\n"
163
175
  res+= '-'*20
164
176
  res+= "\nQuerys: #{@querys.count}\n"
165
177
  @querys.each{|q| res+=q.inspect+"\n"}
166
- return res
178
+ return res
167
179
  end
168
-
169
- def find_query(querys,name_q)
170
- # newq = querys.find{|q| ( q.find{|h| (h.subject_id)})}
180
+
181
+ # find query by name
182
+ def find_query(querys,name_q)
183
+ # newq = querys.find{|q| ( q.find{|h| (h.subject_id)})}
171
184
  new_q=nil
172
185
 
173
186
  if !querys.empty?
@@ -176,12 +189,14 @@ class BlastTableResult
176
189
 
177
190
  return new_q
178
191
  end
179
-
192
+
193
+ # returns true when there are no queries stored
180
194
  def empty?
181
-
195
+
182
196
  return @querys.empty?
183
- end
184
-
197
+ end
198
+
199
+ # get query count
185
200
  def size
186
201
  @querys.size
187
202
  end
@@ -1,27 +1,38 @@
1
- require "blast_query.rb"
2
- require "blast_hit.rb"
3
-
4
- require 'nokogiri'
5
- #xml=File.open('orf.1.xml').read
6
- #data = XmlSimple.xml_in(xml)
1
+ # Copyright (c) 2010 Dario Guerrero & Almudena Bocinos
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining
4
+ # a copy of this software and associated documentation files (the
5
+ # 'Software'), to deal in the Software without restriction, including
6
+ # without limitation the rights to use, copy, modify, merge, publish,
7
+ # distribute, sublicense, and/or sell copies of the Software, and to
8
+ # permit persons to whom the Software is furnished to do so, subject to
9
+ # the following conditions:
7
10
  #
11
+ # The above copyright notice and this permission notice shall be
12
+ # included in all copies or substantial portions of the Software.
8
13
  #
9
- #data['best_orf'][0]['start'][0]
10
- #data['best_orf'][0]['content']
14
+ # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
15
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17
+ # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18
+ # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19
+ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20
+ # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
11
21
 
12
- ######################################
13
- # Author:: Almudena Bocinos Rioboo
14
- # Extracts results from blast table's file and uses it to create instances of "BlastQuery" and "BlastHit"
15
22
 
16
- ######################################
23
+ require "blast_query.rb"
24
+ require "blast_hit.rb"
25
+
26
+ require 'nokogiri'
17
27
 
28
+ # Another XML parser using nokogiri library
18
29
  class BlastXmlResult
19
-
30
+
20
31
  def initialize(input)
21
-
32
+
22
33
  @querys = []
23
34
  lines=[]
24
-
35
+
25
36
  if input.is_a?(Array)
26
37
  lines=input
27
38
  else
@@ -30,106 +41,106 @@ class BlastXmlResult
30
41
  lines = fich.readlines
31
42
  fich.close
32
43
  end
33
-
44
+
34
45
  end
35
-
36
- # puts "lines length #{lines.length}"
46
+
47
+ # puts "lines length #{lines.length}"
37
48
  if !lines.empty?
38
- data = Nokogiri::XML(lines.join)
39
- data.root.xpath('//Iteration').each do |iteration|
40
-
41
- # puts JSON::pretty_generate(iteration)
49
+ data = Nokogiri::XML(lines.join)
50
+ data.root.xpath('//Iteration').each do |iteration|
51
+
52
+ # puts JSON::pretty_generate(iteration)
42
53
  query_id = iteration.xpath('Iteration_query-ID').text
43
54
 
44
55
  full_query_length = iteration.xpath('Iteration_query-len').text
45
56
  query_def = iteration.xpath('Iteration_query-def').text
46
-
57
+
47
58
  if query_def =~ /^([^\s]+)/
48
- query_def=$1
59
+ query_def=$1
49
60
  end
50
-
51
- #@query_def = iteration['Iteration_query-def'][0]
61
+
62
+ #@query_def = iteration['Iteration_query-def'][0]
52
63
 
53
64
  query = BlastQuery.new(query_id)
54
65
  query.query_def = query_def
55
66
  query.full_query_length = full_query_length
56
67
  @querys.push query
57
-
68
+
58
69
 
59
70
  hits = iteration.xpath('Iteration_hits/Hit')
60
71
  if !hits.nil?
61
- hits.each do |h|
62
- #puts JSON::pretty_generate(h)
63
-
64
-
65
-
66
- subject_id=h.xpath('Hit_id').text
67
- acc =h.xpath('Hit_accession').text
68
- full_subject_length = h.xpath('Hit_len').text.to_i
69
- hit_def=h.xpath('Hit_def').text
70
- if hit_def=='No definition line'
71
- hit_def =subject_id
72
- end
73
-
74
- hsps = h.xpath('Hit_hsps/Hsp')
75
-
76
- hsps.each do |hsp|
77
-
78
- q_beg=hsp.xpath('Hsp_query-from').text.to_i
79
- q_end=hsp.xpath('Hsp_query-to').text.to_i
80
- s_beg=hsp.xpath('Hsp_hit-from').text.to_i
81
- s_end=hsp.xpath('Hsp_hit-to').text.to_i
82
-
83
- # creates the hit
84
- hit = BlastHit.new(q_beg,q_end,s_beg,s_end)
85
-
86
- hit.align_len=hsp.xpath('Hsp_align-len').text.to_i
87
- hit.ident=(hsp.xpath('Hsp_identity').text.to_f/hit.align_len)*100
88
- hit.gaps=hsp.xpath('Hsp_gaps').text.to_i
89
- hit.mismatches=hsp.xpath('Hsp_midline').text.count(' ').to_i - hit.gaps
90
- hit.e_val=hsp.xpath('Hsp_evalue').text.to_f
91
- hit.e_val = (hit.e_val*1000).round/1000.0
92
- hit.bit_score=hsp.xpath('Hsp_bit-score').text.to_f
93
- hit.bit_score = (hit.bit_score*100).round/100.0
94
-
95
- hit.score = hsp.xpath('Hsp_score').text.to_f
96
- hit.q_frame = hsp.xpath('Hsp_query-frame').text.to_i
97
- hit.s_frame =hsp.xpath('Hsp_hit-frame').text.to_i
98
-
99
- hit.q_seq = hsp.xpath('Hsp_qseq').text
100
- hit.s_seq = hsp.xpath('Hsp_hseq').text
101
-
102
-
103
- hit.subject_id= subject_id
104
- hit.full_subject_length=full_subject_length
105
- # hit.full_query_length = full_query_length
106
- hit.definition=hit_def
107
- hit.acc=acc
108
-
109
- query.add_hit(hit)
110
-
111
- end
112
- end
113
- end
72
+ hits.each do |h|
73
+ #puts JSON::pretty_generate(h)
74
+
75
+
76
+
77
+ subject_id=h.xpath('Hit_id').text
78
+ acc =h.xpath('Hit_accession').text
79
+ full_subject_length = h.xpath('Hit_len').text.to_i
80
+ hit_def=h.xpath('Hit_def').text
81
+ if hit_def=='No definition line'
82
+ hit_def =subject_id
83
+ end
84
+
85
+ hsps = h.xpath('Hit_hsps/Hsp')
86
+
87
+ hsps.each do |hsp|
88
+
89
+ q_beg=hsp.xpath('Hsp_query-from').text.to_i
90
+ q_end=hsp.xpath('Hsp_query-to').text.to_i
91
+ s_beg=hsp.xpath('Hsp_hit-from').text.to_i
92
+ s_end=hsp.xpath('Hsp_hit-to').text.to_i
93
+
94
+ # creates the hit
95
+ hit = BlastHit.new(q_beg,q_end,s_beg,s_end)
96
+
97
+ hit.align_len=hsp.xpath('Hsp_align-len').text.to_i
98
+ hit.ident=(hsp.xpath('Hsp_identity').text.to_f/hit.align_len)*100
99
+ hit.gaps=hsp.xpath('Hsp_gaps').text.to_i
100
+ hit.mismatches=hsp.xpath('Hsp_midline').text.count(' ').to_i - hit.gaps
101
+ hit.e_val=hsp.xpath('Hsp_evalue').text.to_f
102
+ hit.e_val = (hit.e_val*1000).round/1000.0
103
+ hit.bit_score=hsp.xpath('Hsp_bit-score').text.to_f
104
+ hit.bit_score = (hit.bit_score*100).round/100.0
105
+
106
+ hit.score = hsp.xpath('Hsp_score').text.to_f
107
+ hit.q_frame = hsp.xpath('Hsp_query-frame').text.to_i
108
+ hit.s_frame =hsp.xpath('Hsp_hit-frame').text.to_i
109
+
110
+ hit.q_seq = hsp.xpath('Hsp_qseq').text
111
+ hit.s_seq = hsp.xpath('Hsp_hseq').text
112
+
113
+
114
+ hit.subject_id= subject_id
115
+ hit.full_subject_length=full_subject_length
116
+ # hit.full_query_length = full_query_length
117
+ hit.definition=hit_def
118
+ hit.acc=acc
119
+
120
+ query.add_hit(hit)
121
+
122
+ end
123
+ end
114
124
  end
115
125
  end
116
- #inspect
117
-
126
+ end
127
+ #inspect
128
+
118
129
  end
119
-
120
-
121
-
130
+
131
+
132
+
122
133
  def inspect
123
-
134
+
124
135
  res = "Blast results:\n"
125
136
  res+= '-'*20
126
137
  res+= "\nQuerys: #{@querys.count}\n"
127
138
  @querys.each{|q| res+=q.inspect+"\n"}
128
139
  return res
129
140
  end
130
-
131
- def find_query(querys,name_q)
132
- # newq = querys.find{|q| ( q.find{|h| (h.subject_id)})}
141
+
142
+ def find_query(querys,name_q)
143
+ # newq = querys.find{|q| ( q.find{|h| (h.subject_id)})}
133
144
  new_q=nil
134
145
 
135
146
  if !querys.empty?
@@ -138,15 +149,15 @@ class BlastXmlResult
138
149
 
139
150
  return new_q
140
151
  end
141
-
152
+
142
153
  def empty?
143
-
154
+
144
155
  return @querys.empty?
145
- end
146
-
156
+ end
157
+
147
158
  def size
148
159
  @querys.size
149
160
  end
150
-
161
+
151
162
  attr_accessor :querys
152
163
  end