scbi_blast 0.0.30 → 0.0.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,173 +1,186 @@
1
- require "blast_query.rb"
2
- require "blast_hit.rb"
1
+ # Copyright (c) 2010 Dario Guerrero & Almudena Bocinos
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining
4
+ # a copy of this software and associated documentation files (the
5
+ # 'Software'), to deal in the Software without restriction, including
6
+ # without limitation the rights to use, copy, modify, merge, publish,
7
+ # distribute, sublicense, and/or sell copies of the Software, and to
8
+ # permit persons to whom the Software is furnished to do so, subject to
9
+ # the following conditions:
10
+ #
11
+ # The above copyright notice and this permission notice shall be
12
+ # included in all copies or substantial portions of the Software.
13
+ #
14
+ # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
15
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17
+ # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18
+ # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19
+ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20
+ # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
3
21
 
4
22
 
23
+ require "blast_query.rb"
24
+ require "blast_hit.rb"
5
25
 
6
- ######################################
7
- # Author:: Almudena Bocinos Rioboo
8
- # Extracts results from blast table's file and uses it to create instances of "BlastQuery" and "BlastHit"
9
26
 
10
- ######################################
11
27
 
28
+ # Extracts results from blast table's file and uses it to create instances of "BlastQuery" and "BlastHit"
12
29
  class BlastTableResult
13
-
30
+
31
+ # Parser initialization
14
32
  def initialize(input)
15
-
33
+
16
34
  @querys = []
17
-
18
-
35
+
36
+
19
37
  if input.is_a?(Array)
20
38
  lines=input
21
-
39
+
22
40
  else
23
-
41
+
24
42
  fich = File.open(input,'r')
25
43
  lines = fich.readlines
26
44
  fich.close
27
-
45
+
28
46
  end
29
-
30
- # puts "lines length #{lines.length}"
47
+
31
48
  query_name=''
32
-
49
+
33
50
  lines.each do |line|
34
-
51
+
35
52
  line.chomp! #delete end of line
36
-
37
- if line =~ /^\s*#/
38
- if line =~ /^#\sQuery:\s+(.+)$/
39
- query_name = $1
40
- elsif line =~ /^#\s0\shits\sfound$/
41
- @querys.push BlastQuery.new(query_name)
42
- end
43
- # 0 hits found
44
-
45
-
46
- else
47
- params = line.split(/\t+/)
48
-
49
- # creates the hit
50
- #hit = BlastHit.new( params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8], params[9], params[10], params[11])
51
-
52
- # puts "Extracted #{params[0]} #{params[1]} #{params[2]} #{params[3]} #{params[4]} #{params[5]} #{params[6]} #{params[7]} #{params[8]} #{params[9]} #{params[10]} #{params[11]}"
53
- # Options 6, 7, and 10 can be additionally configured to produce
54
- # a custom format specified by space delimited format specifiers.
55
- # The supported format specifiers are:
56
- # qseqid means Query Seq-id
57
- # qgi means Query GI
58
- # qacc means Query accesion
59
- # sseqid means Subject Seq-id
60
- # sallseqid means All subject Seq-id(s), separated by a ';'
61
- # sgi means Subject GI
62
- # sallgi means All subject GIs
63
- # sacc means Subject accession
64
- # sallacc means All subject accessions
65
- # qstart means Start of alignment in query
66
- # qend means End of alignment in query
67
- # sstart means Start of alignment in subject
68
- # send means End of alignment in subject
69
- # qseq means Aligned part of query sequence
70
- # sseq means Aligned part of subject sequence
71
- # evalue means Expect value
72
- # bitscore means Bit score
73
- # score means Raw score
74
- # length means Alignment length
75
- # pident means Percentage of identical matches
76
- # nident means Number of identical matches
77
- # mismatch means Number of mismatches
78
- # positive means Number of positive-scoring matches
79
- # gapopen means Number of gap openings
80
- # gaps means Total number of gaps
81
- # ppos means Percentage of positive-scoring matches
82
- # frames means Query and subject frames separated by a '/'
83
- # qframe means Query frame
84
- # sframe means Subject frame
85
- # When not provided, the default value is:
86
- # 'qseqid sseqid pident length mismatch gapopen qstart qend sstart send
87
- # evalue bitscore', which is equivalent to the keyword 'std'
88
-
89
- # if the query doesn't exist, then create a new one,
90
- # else the hit will be added to the last query
91
-
92
- qseqid,sacc,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,score,qframe,sframe,qseq,sseq = params
93
-
94
- # creates the hit
95
- hit = BlastHit.new(qstart,qend,sstart,send)
96
-
97
- hit.align_len=length
98
- hit.ident=pident
99
-
100
- hit.gaps=gapopen
101
- hit.mismatches=mismatch
102
- hit.e_val=evalue
103
- hit.bit_score=bitscore
104
-
105
- hit.score = score
106
- hit.q_frame = qframe
107
- hit.s_frame = sframe
108
-
109
- hit.subject_id = sacc
110
- hit.full_subject_length=0
111
- hit.definition=sacc
112
- hit.acc=sacc
113
- hit.q_seq=qseq
114
- hit.s_seq=sseq
115
-
116
- query=find_query(@querys,qseqid)
117
-
118
- if (query) #if it is a new query, it is created and added
119
- query.add_hit(hit)
120
-
121
- else # else the hit is added in last query added
122
- query = BlastQuery.new(qseqid)
123
- query.add_hit(hit)
124
- @querys.push query
125
- end
126
-
127
-
128
- #Description
129
-
130
- # read_blast_tab read tabular BLAST format created with blast_seq and written to file with write_blast - or with blastall and the -m 8 or -m 9 switch.
131
- # Each column in the table corresponds to the following keys:
132
- #
133
- # 1. Q_ID - Query ID.
134
- # 2. S_ID - Subject ID.
135
- # 3. IDENT - Identity (%).
136
- # 4. ALIGN_LEN - Alignment length.
137
- # 5. MISMATCHES - Number of mismatches.
138
- # 6. GAPS - Number of gaps.
139
- # 7. Q_BEG - Query begin.
140
- # 8. Q_END - Query end.
141
- # 9. S_BEG - Subject begin.
142
- # 10. S_END - Subject end.
143
- # 11. E_VAL - Expect value.
144
- # 12. BIT_SCORE - Bit score.
145
- #
146
- # Furthermore, two extra keys are added to the record:
147
- #
148
- # * STRAND - Strand.
149
- # * REC_TYPE - Record type.
53
+
54
+ if line =~ /^\s*#/
55
+ if line =~ /^#\sQuery:\s+(.+)$/
56
+ query_name = $1
57
+ elsif line =~ /^#\s0\shits\sfound$/
58
+ @querys.push BlastQuery.new(query_name)
59
+ end
60
+ # 0 hits found
61
+
62
+
63
+ else
64
+ params = line.split(/\t+/)
65
+
66
+ # puts "Extracted #{params[0]} #{params[1]} #{params[2]} #{params[3]} #{params[4]} #{params[5]} #{params[6]} #{params[7]} #{params[8]} #{params[9]} #{params[10]} #{params[11]}"
67
+ # Options 6, 7, and 10 can be additionally configured to produce
68
+ # a custom format specified by space delimited format specifiers.
69
+ # The supported format specifiers are:
70
+ # qseqid means Query Seq-id
71
+ # qgi means Query GI
72
+ # qacc means Query accession
73
+ # sseqid means Subject Seq-id
74
+ # sallseqid means All subject Seq-id(s), separated by a ';'
75
+ # sgi means Subject GI
76
+ # sallgi means All subject GIs
77
+ # sacc means Subject accession
78
+ # sallacc means All subject accessions
79
+ # qstart means Start of alignment in query
80
+ # qend means End of alignment in query
81
+ # sstart means Start of alignment in subject
82
+ # send means End of alignment in subject
83
+ # qseq means Aligned part of query sequence
84
+ # sseq means Aligned part of subject sequence
85
+ # evalue means Expect value
86
+ # bitscore means Bit score
87
+ # score means Raw score
88
+ # length means Alignment length
89
+ # pident means Percentage of identical matches
90
+ # nident means Number of identical matches
91
+ # mismatch means Number of mismatches
92
+ # positive means Number of positive-scoring matches
93
+ # gapopen means Number of gap openings
94
+ # gaps means Total number of gaps
95
+ # ppos means Percentage of positive-scoring matches
96
+ # frames means Query and subject frames separated by a '/'
97
+ # qframe means Query frame
98
+ # sframe means Subject frame
99
+ # When not provided, the default value is:
100
+ # 'qseqid sseqid pident length mismatch gapopen qstart qend sstart send
101
+ # evalue bitscore', which is equivalent to the keyword 'std'
102
+
103
+ # if the query doesn't exist, then create a new one,
104
+ # else the hit will be added to the last query
105
+
106
+ qseqid,sacc,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,score,qframe,sframe,qseq,sseq = params
107
+
108
+ # creates the hit
109
+ hit = BlastHit.new(qstart,qend,sstart,send)
110
+
111
+ hit.align_len=length
112
+ hit.ident=pident
113
+
114
+ hit.gaps=gapopen
115
+ hit.mismatches=mismatch
116
+ hit.e_val=evalue
117
+ hit.bit_score=bitscore
118
+
119
+ hit.score = score
120
+ hit.q_frame = qframe
121
+ hit.s_frame = sframe
122
+
123
+ hit.subject_id = sacc
124
+ hit.full_subject_length=0
125
+ hit.definition=sacc
126
+ hit.acc=sacc
127
+ hit.q_seq=qseq
128
+ hit.s_seq=sseq
129
+
130
+ query=find_query(@querys,qseqid)
131
+
132
+ if (query) # the query already exists, so the hit is added to it
133
+ query.add_hit(hit)
134
+
135
+ else # no query with this id yet: create it, add the hit and store it
136
+ query = BlastQuery.new(qseqid)
137
+ query.add_hit(hit)
138
+ @querys.push query
150
139
  end
140
+
141
+
142
+ #Description
143
+
144
+ # read_blast_tab reads the tabular BLAST format created with blast_seq and written to file with write_blast - or with blastall and the -m 8 or -m 9 switch.
145
+ # Each column in the table corresponds to the following keys:
146
+ #
147
+ # 1. Q_ID - Query ID.
148
+ # 2. S_ID - Subject ID.
149
+ # 3. IDENT - Identity (%).
150
+ # 4. ALIGN_LEN - Alignment length.
151
+ # 5. MISMATCHES - Number of mismatches.
152
+ # 6. GAPS - Number of gaps.
153
+ # 7. Q_BEG - Query begin.
154
+ # 8. Q_END - Query end.
155
+ # 9. S_BEG - Subject begin.
156
+ # 10. S_END - Subject end.
157
+ # 11. E_VAL - Expect value.
158
+ # 12. BIT_SCORE - Bit score.
159
+ #
160
+ # Furthermore, two extra keys are added to the record:
161
+ #
162
+ # * STRAND - Strand.
163
+ # * REC_TYPE - Record type.
151
164
  end
152
-
153
- #inspect
154
-
165
+ end
166
+
167
+ #inspect
168
+
155
169
  end
156
-
157
-
158
-
159
- def inspect
160
- # puts "Table Results:"
161
- # puts(@querys.each{|q| q.inspect}).join("\n")
162
- res = "Blast results:\n"
170
+
171
+
172
+ # inspect results
173
+ def inspect
174
+ res = "Blast results:\n"
163
175
  res+= '-'*20
164
176
  res+= "\nQuerys: #{@querys.count}\n"
165
177
  @querys.each{|q| res+=q.inspect+"\n"}
166
- return res
178
+ return res
167
179
  end
168
-
169
- def find_query(querys,name_q)
170
- # newq = querys.find{|q| ( q.find{|h| (h.subject_id)})}
180
+
181
+ # find query by name
182
+ def find_query(querys,name_q)
183
+ # newq = querys.find{|q| ( q.find{|h| (h.subject_id)})}
171
184
  new_q=nil
172
185
 
173
186
  if !querys.empty?
@@ -176,12 +189,14 @@ class BlastTableResult
176
189
 
177
190
  return new_q
178
191
  end
179
-
192
+
193
+ # check if there are no querys
180
194
  def empty?
181
-
195
+
182
196
  return @querys.empty?
183
- end
184
-
197
+ end
198
+
199
+ # get query count
185
200
  def size
186
201
  @querys.size
187
202
  end
@@ -1,27 +1,38 @@
1
- require "blast_query.rb"
2
- require "blast_hit.rb"
3
-
4
- require 'nokogiri'
5
- #xml=File.open('orf.1.xml').read
6
- #data = XmlSimple.xml_in(xml)
1
+ # Copyright (c) 2010 Dario Guerrero & Almudena Bocinos
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining
4
+ # a copy of this software and associated documentation files (the
5
+ # 'Software'), to deal in the Software without restriction, including
6
+ # without limitation the rights to use, copy, modify, merge, publish,
7
+ # distribute, sublicense, and/or sell copies of the Software, and to
8
+ # permit persons to whom the Software is furnished to do so, subject to
9
+ # the following conditions:
7
10
  #
11
+ # The above copyright notice and this permission notice shall be
12
+ # included in all copies or substantial portions of the Software.
8
13
  #
9
- #data['best_orf'][0]['start'][0]
10
- #data['best_orf'][0]['content']
14
+ # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
15
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17
+ # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18
+ # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19
+ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20
+ # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
11
21
 
12
- ######################################
13
- # Author:: Almudena Bocinos Rioboo
14
- # Extracts results from blast table's file and uses it to create instances of "BlastQuery" and "BlastHit"
15
22
 
16
- ######################################
23
+ require "blast_query.rb"
24
+ require "blast_hit.rb"
25
+
26
+ require 'nokogiri'
17
27
 
28
+ # Another XML parser using nokogiri library
18
29
  class BlastXmlResult
19
-
30
+
20
31
  def initialize(input)
21
-
32
+
22
33
  @querys = []
23
34
  lines=[]
24
-
35
+
25
36
  if input.is_a?(Array)
26
37
  lines=input
27
38
  else
@@ -30,106 +41,106 @@ class BlastXmlResult
30
41
  lines = fich.readlines
31
42
  fich.close
32
43
  end
33
-
44
+
34
45
  end
35
-
36
- # puts "lines length #{lines.length}"
46
+
47
+ # puts "lines length #{lines.length}"
37
48
  if !lines.empty?
38
- data = Nokogiri::XML(lines.join)
39
- data.root.xpath('//Iteration').each do |iteration|
40
-
41
- # puts JSON::pretty_generate(iteration)
49
+ data = Nokogiri::XML(lines.join)
50
+ data.root.xpath('//Iteration').each do |iteration|
51
+
52
+ # puts JSON::pretty_generate(iteration)
42
53
  query_id = iteration.xpath('Iteration_query-ID').text
43
54
 
44
55
  full_query_length = iteration.xpath('Iteration_query-len').text
45
56
  query_def = iteration.xpath('Iteration_query-def').text
46
-
57
+
47
58
  if query_def =~ /^([^\s]+)/
48
- query_def=$1
59
+ query_def=$1
49
60
  end
50
-
51
- #@query_def = iteration['Iteration_query-def'][0]
61
+
62
+ #@query_def = iteration['Iteration_query-def'][0]
52
63
 
53
64
  query = BlastQuery.new(query_id)
54
65
  query.query_def = query_def
55
66
  query.full_query_length = full_query_length
56
67
  @querys.push query
57
-
68
+
58
69
 
59
70
  hits = iteration.xpath('Iteration_hits/Hit')
60
71
  if !hits.nil?
61
- hits.each do |h|
62
- #puts JSON::pretty_generate(h)
63
-
64
-
65
-
66
- subject_id=h.xpath('Hit_id').text
67
- acc =h.xpath('Hit_accession').text
68
- full_subject_length = h.xpath('Hit_len').text.to_i
69
- hit_def=h.xpath('Hit_def').text
70
- if hit_def=='No definition line'
71
- hit_def =subject_id
72
- end
73
-
74
- hsps = h.xpath('Hit_hsps/Hsp')
75
-
76
- hsps.each do |hsp|
77
-
78
- q_beg=hsp.xpath('Hsp_query-from').text.to_i
79
- q_end=hsp.xpath('Hsp_query-to').text.to_i
80
- s_beg=hsp.xpath('Hsp_hit-from').text.to_i
81
- s_end=hsp.xpath('Hsp_hit-to').text.to_i
82
-
83
- # creates the hit
84
- hit = BlastHit.new(q_beg,q_end,s_beg,s_end)
85
-
86
- hit.align_len=hsp.xpath('Hsp_align-len').text.to_i
87
- hit.ident=(hsp.xpath('Hsp_identity').text.to_f/hit.align_len)*100
88
- hit.gaps=hsp.xpath('Hsp_gaps').text.to_i
89
- hit.mismatches=hsp.xpath('Hsp_midline').text.count(' ').to_i - hit.gaps
90
- hit.e_val=hsp.xpath('Hsp_evalue').text.to_f
91
- hit.e_val = (hit.e_val*1000).round/1000.0
92
- hit.bit_score=hsp.xpath('Hsp_bit-score').text.to_f
93
- hit.bit_score = (hit.bit_score*100).round/100.0
94
-
95
- hit.score = hsp.xpath('Hsp_score').text.to_f
96
- hit.q_frame = hsp.xpath('Hsp_query-frame').text.to_i
97
- hit.s_frame =hsp.xpath('Hsp_hit-frame').text.to_i
98
-
99
- hit.q_seq = hsp.xpath('Hsp_qseq').text
100
- hit.s_seq = hsp.xpath('Hsp_hseq').text
101
-
102
-
103
- hit.subject_id= subject_id
104
- hit.full_subject_length=full_subject_length
105
- # hit.full_query_length = full_query_length
106
- hit.definition=hit_def
107
- hit.acc=acc
108
-
109
- query.add_hit(hit)
110
-
111
- end
112
- end
113
- end
72
+ hits.each do |h|
73
+ #puts JSON::pretty_generate(h)
74
+
75
+
76
+
77
+ subject_id=h.xpath('Hit_id').text
78
+ acc =h.xpath('Hit_accession').text
79
+ full_subject_length = h.xpath('Hit_len').text.to_i
80
+ hit_def=h.xpath('Hit_def').text
81
+ if hit_def=='No definition line'
82
+ hit_def =subject_id
83
+ end
84
+
85
+ hsps = h.xpath('Hit_hsps/Hsp')
86
+
87
+ hsps.each do |hsp|
88
+
89
+ q_beg=hsp.xpath('Hsp_query-from').text.to_i
90
+ q_end=hsp.xpath('Hsp_query-to').text.to_i
91
+ s_beg=hsp.xpath('Hsp_hit-from').text.to_i
92
+ s_end=hsp.xpath('Hsp_hit-to').text.to_i
93
+
94
+ # creates the hit
95
+ hit = BlastHit.new(q_beg,q_end,s_beg,s_end)
96
+
97
+ hit.align_len=hsp.xpath('Hsp_align-len').text.to_i
98
+ hit.ident=(hsp.xpath('Hsp_identity').text.to_f/hit.align_len)*100
99
+ hit.gaps=hsp.xpath('Hsp_gaps').text.to_i
100
+ hit.mismatches=hsp.xpath('Hsp_midline').text.count(' ').to_i - hit.gaps
101
+ hit.e_val=hsp.xpath('Hsp_evalue').text.to_f
102
+ hit.e_val = (hit.e_val*1000).round/1000.0
103
+ hit.bit_score=hsp.xpath('Hsp_bit-score').text.to_f
104
+ hit.bit_score = (hit.bit_score*100).round/100.0
105
+
106
+ hit.score = hsp.xpath('Hsp_score').text.to_f
107
+ hit.q_frame = hsp.xpath('Hsp_query-frame').text.to_i
108
+ hit.s_frame =hsp.xpath('Hsp_hit-frame').text.to_i
109
+
110
+ hit.q_seq = hsp.xpath('Hsp_qseq').text
111
+ hit.s_seq = hsp.xpath('Hsp_hseq').text
112
+
113
+
114
+ hit.subject_id= subject_id
115
+ hit.full_subject_length=full_subject_length
116
+ # hit.full_query_length = full_query_length
117
+ hit.definition=hit_def
118
+ hit.acc=acc
119
+
120
+ query.add_hit(hit)
121
+
122
+ end
123
+ end
114
124
  end
115
125
  end
116
- #inspect
117
-
126
+ end
127
+ #inspect
128
+
118
129
  end
119
-
120
-
121
-
130
+
131
+
132
+
122
133
  def inspect
123
-
134
+
124
135
  res = "Blast results:\n"
125
136
  res+= '-'*20
126
137
  res+= "\nQuerys: #{@querys.count}\n"
127
138
  @querys.each{|q| res+=q.inspect+"\n"}
128
139
  return res
129
140
  end
130
-
131
- def find_query(querys,name_q)
132
- # newq = querys.find{|q| ( q.find{|h| (h.subject_id)})}
141
+
142
+ def find_query(querys,name_q)
143
+ # newq = querys.find{|q| ( q.find{|h| (h.subject_id)})}
133
144
  new_q=nil
134
145
 
135
146
  if !querys.empty?
@@ -138,15 +149,15 @@ class BlastXmlResult
138
149
 
139
150
  return new_q
140
151
  end
141
-
152
+
142
153
  def empty?
143
-
154
+
144
155
  return @querys.empty?
145
- end
146
-
156
+ end
157
+
147
158
  def size
148
159
  @querys.size
149
160
  end
150
-
161
+
151
162
  attr_accessor :querys
152
163
  end