scbi_blast 0.0.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +21 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +48 -0
- data/Rakefile +26 -0
- data/lib/scbi_blast.rb +15 -0
- data/lib/scbi_blast/batch_blast.rb +115 -0
- data/lib/scbi_blast/blast_hit.rb +115 -0
- data/lib/scbi_blast/blast_query.rb +37 -0
- data/lib/scbi_blast/blast_simplexml_result.rb +158 -0
- data/lib/scbi_blast/blast_table_result.rb +190 -0
- data/lib/scbi_blast/blast_xml_result.rb +152 -0
- data/lib/scbi_blast/dust_masker.rb +103 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/blast.txt +252 -0
- data/test/blast.xml +173 -0
- data/test/empty_blast.xml +42 -0
- data/test/test_helper.rb +4 -0
- data/test/test_scbi_blast.rb +178 -0
- metadata +89 -0
@@ -0,0 +1,190 @@
|
|
1
|
+
require "blast_query.rb"
|
2
|
+
require "blast_hit.rb"
|
3
|
+
|
4
|
+
|
5
|
+
|
6
|
+
######################################
|
7
|
+
# Author:: Almudena Bocinos Rioboo
|
8
|
+
# Extracts results from a BLAST tabular output file and uses them to create instances of "BlastQuery" and "BlastHit"
|
9
|
+
|
10
|
+
######################################
|
11
|
+
|
12
|
+
# Parses BLAST tabular output into BlastQuery objects, each one holding the
# BlastHit objects built from its data rows.
#
# Comment lines (starting with '#') are only inspected for two markers:
#   "# Query: <name>"  remembers the name of the query being reported
#   "# 0 hits found"   registers an empty BlastQuery for the remembered name
#
# Every other non-blank line is split on tabs and read positionally as:
#
#   qseqid sacc pident length mismatch gapopen qstart qend sstart send
#   evalue bitscore score qframe sframe qseq sseq
#
# Meaning of the columns (BLAST+ format specifiers):
#   qseqid   Query Seq-id                  sacc      Subject accession
#   pident   Percentage of identical matches
#   length   Alignment length              mismatch  Number of mismatches
#   gapopen  Number of gap openings
#   qstart   Start of alignment in query   qend      End of alignment in query
#   sstart   Start of alignment in subject send      End of alignment in subject
#   evalue   Expect value                  bitscore  Bit score
#   score    Raw score
#   qframe   Query frame                   sframe    Subject frame
#   qseq     Aligned part of query seq     sseq      Aligned part of subject seq
#
# The first twelve columns are in the same order as BLAST+'s default ('std')
# tabular layout and the legacy "blastall -m 8 / -m 9" table. Rows with fewer
# columns simply leave the trailing hit attributes set to nil.
class BlastTableResult

  # Array of BlastQuery objects, in order of first appearance in the input.
  attr_accessor :querys

  # input:: either an Array of lines, or the path of a file to read.
  #
  # The lines of an Array input are not modified (each one is copied before
  # its line terminator is removed), so frozen strings are accepted.
  # Raises whatever File.readlines raises when a path cannot be read.
  def initialize(input)
    @querys = []

    # File.readlines opens, reads and closes the file in a single call, so no
    # handle is left open if reading fails half way.
    lines = input.is_a?(Array) ? input : File.readlines(input)

    query_name = ''

    lines.each do |raw_line|
      line = raw_line.chomp

      # Blank lines carry no data; without this guard they would be split
      # into zero columns and registered as a hit whose fields are all nil.
      next if line.strip.empty?

      if line =~ /^\s*#/
        if line =~ /^#\sQuery:\s+(.+)$/
          query_name = $1
        elsif line =~ /^#\s0\shits\sfound$/
          @querys.push BlastQuery.new(query_name)
        end
      else
        add_hit_from_fields(line.split(/\t+/))
      end
    end
  end

  # Returns a multi-line text summary: a header, the number of queries and
  # the +inspect+ output of every query, one per line.
  def inspect
    res = "Blast results:\n"
    res += '-' * 20
    res += "\nQuerys: #{@querys.count}\n"
    @querys.each { |q| res += q.inspect + "\n" }
    res
  end

  # Returns the first element of +querys+ whose query_id equals +name_q+,
  # or nil when there is none (including when +querys+ is empty).
  def find_query(querys, name_q)
    querys.find { |q| q.query_id == name_q }
  end

  # True when no query was registered.
  def empty?
    @querys.empty?
  end

  # Number of registered queries.
  def size
    @querys.size
  end

  private

  # Builds a BlastHit from the columns of one data row and attaches it to the
  # query named in the first column, creating that query on first sight.
  # Column values are passed on exactly as read (Strings, or nil if missing).
  def add_hit_from_fields(fields)
    qseqid, sacc, pident, length, mismatch, gapopen, qstart, qend, sstart,
      s_end, evalue, bitscore, score, qframe, sframe, qseq, sseq = fields

    hit = BlastHit.new(qstart, qend, sstart, s_end)

    hit.align_len = length
    hit.ident = pident

    # The table only provides the number of gap openings; it is stored in the
    # hit's +gaps+ attribute.
    hit.gaps = gapopen
    hit.mismatches = mismatch
    hit.e_val = evalue
    hit.bit_score = bitscore

    hit.score = score
    hit.q_frame = qframe
    hit.s_frame = sframe

    # The table carries no separate subject id, definition or subject length,
    # so the subject accession stands in for the first two and the length is
    # set to 0.
    hit.subject_id = sacc
    hit.full_subject_length = 0
    hit.definition = sacc
    hit.acc = sacc
    hit.q_seq = qseq
    hit.s_seq = sseq

    query = find_query(@querys, qseqid)

    if query
      # The query is already registered: just append the hit to it.
      query.add_hit(hit)
    else
      # First hit seen for this query: create it, add the hit, register it.
      query = BlastQuery.new(qseqid)
      query.add_hit(hit)
      @querys.push query
    end
  end
end
|
@@ -0,0 +1,152 @@
|
|
1
|
+
require "blast_query.rb"
|
2
|
+
require "blast_hit.rb"
|
3
|
+
|
4
|
+
require 'nokogiri'
|
5
|
+
#xml=File.open('orf.1.xml').read
|
6
|
+
#data = XmlSimple.xml_in(xml)
|
7
|
+
#
|
8
|
+
#
|
9
|
+
#data['best_orf'][0]['start'][0]
|
10
|
+
#data['best_orf'][0]['content']
|
11
|
+
|
12
|
+
######################################
|
13
|
+
# Author:: Almudena Bocinos Rioboo
|
14
|
+
# Extracts results from a BLAST XML output file and uses them to create instances of "BlastQuery" and "BlastHit"
|
15
|
+
|
16
|
+
######################################
|
17
|
+
|
18
|
+
# Parses BLAST XML output with Nokogiri into BlastQuery objects (one per
# <Iteration> element), each one holding a BlastHit for every <Hsp> found
# under its <Hit> elements.
class BlastXmlResult

  # Array of BlastQuery objects, in document order.
  attr_accessor :querys

  # input:: either an Array of lines, or the path of an XML file.
  #
  # A path that does not exist, an empty input, or a document without a root
  # element all produce an empty result.
  def initialize(input)
    @querys = []

    lines = read_lines(input)
    return if lines.empty?

    root = Nokogiri::XML(lines.join).root
    # Nokogiri yields no root element for unparsable input; treat it like
    # empty input instead of failing with NoMethodError on nil.
    return if root.nil?

    root.xpath('//Iteration').each do |iteration|
      query = build_query(iteration)
      @querys.push query

      iteration.xpath('Iteration_hits/Hit').each do |hit_node|
        add_hsps(query, hit_node)
      end
    end
  end

  # Returns a multi-line text summary: a header, the number of queries and
  # the +inspect+ output of every query, one per line.
  def inspect
    res = "Blast results:\n"
    res += '-' * 20
    res += "\nQuerys: #{@querys.count}\n"
    @querys.each { |q| res += q.inspect + "\n" }
    res
  end

  # Returns the first element of +querys+ whose query_id equals +name_q+,
  # or nil when there is none (including when +querys+ is empty).
  def find_query(querys, name_q)
    querys.find { |q| q.query_id == name_q }
  end

  # True when no query was registered.
  def empty?
    @querys.empty?
  end

  # Number of registered queries.
  def size
    @querys.size
  end

  private

  # Returns the input lines: the Array itself, the lines of the file at the
  # given path, or an empty Array when that path does not exist.
  def read_lines(input)
    return input if input.is_a?(Array)

    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    # File.readlines opens, reads and closes the file in a single call.
    File.exist?(input) ? File.readlines(input) : []
  end

  # Builds the BlastQuery described by one <Iteration> element.
  def build_query(iteration)
    query_id = iteration.xpath('Iteration_query-ID').text
    full_query_length = iteration.xpath('Iteration_query-len').text
    query_def = iteration.xpath('Iteration_query-def').text

    # Keep only the first whitespace-delimited token of the definition line.
    query_def = $1 if query_def =~ /^([^\s]+)/

    query = BlastQuery.new(query_id)
    query.query_def = query_def
    query.full_query_length = full_query_length
    query
  end

  # Adds to +query+ one BlastHit per <Hsp> found under the given <Hit>.
  def add_hsps(query, hit_node)
    subject_id = hit_node.xpath('Hit_id').text
    acc = hit_node.xpath('Hit_accession').text
    full_subject_length = hit_node.xpath('Hit_len').text.to_i
    hit_def = hit_node.xpath('Hit_def').text
    # Subjects without a definition line are described by their id.
    hit_def = subject_id if hit_def == 'No definition line'

    hit_node.xpath('Hit_hsps/Hsp').each do |hsp|
      q_beg = hsp.xpath('Hsp_query-from').text.to_i
      q_end = hsp.xpath('Hsp_query-to').text.to_i
      s_beg = hsp.xpath('Hsp_hit-from').text.to_i
      s_end = hsp.xpath('Hsp_hit-to').text.to_i

      hit = BlastHit.new(q_beg, q_end, s_beg, s_end)

      hit.align_len = hsp.xpath('Hsp_align-len').text.to_i
      # Identity as a percentage of the alignment length.
      hit.ident = (hsp.xpath('Hsp_identity').text.to_f / hit.align_len) * 100
      hit.gaps = hsp.xpath('Hsp_gaps').text.to_i
      # Mismatches are counted as the blanks of the midline minus the gaps.
      hit.mismatches = hsp.xpath('Hsp_midline').text.count(' ').to_i - hit.gaps
      hit.e_val = hsp.xpath('Hsp_evalue').text.to_f
      # NOTE(review): rounding to three decimals turns any e-value below
      # 0.0005 into 0.0 - confirm this loss of precision is intended.
      hit.e_val = (hit.e_val * 1000).round / 1000.0
      hit.bit_score = hsp.xpath('Hsp_bit-score').text.to_f
      hit.bit_score = (hit.bit_score * 100).round / 100.0

      hit.score = hsp.xpath('Hsp_score').text.to_f
      hit.q_frame = hsp.xpath('Hsp_query-frame').text.to_i
      hit.s_frame = hsp.xpath('Hsp_hit-frame').text.to_i

      hit.q_seq = hsp.xpath('Hsp_qseq').text
      hit.s_seq = hsp.xpath('Hsp_hseq').text

      hit.subject_id = subject_id
      hit.full_subject_length = full_subject_length
      hit.definition = hit_def
      hit.acc = acc

      query.add_hit(hit)
    end
  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
|
2
|
+
# Holds the intervals reported for one query sequence.
class DustQuery

  # query_id:: identifier given at construction time
  # dust::     Array of the intervals pushed so far, in insertion order
  attr_accessor :query_id, :dust

  def initialize(query_id)
    @dust = []
    @query_id = query_id
  end

  # Appends one interval (a two-element [from, to] pair as far as +inspect+
  # is concerned) to the list.
  def push(interval)
    @dust.push interval
  end

  # Returns a one-line description: the query id followed by every interval
  # as " from X to Y".
  def inspect
    res = "Query #{query_id}:"
    @dust.each do |d|
      res += " from #{d[0]} to #{d[1]}"
    end
    # Return the built text; the value of the +each+ call above is the
    # intervals Array, which is not what +inspect+ should hand back.
    res
  end

end
|
23
|
+
|
24
|
+
# Runs the external +dustmasker+ program on FASTA data and collects the
# intervals it reports, grouped per sequence in DustQuery objects.
class DustMasker

  # extra_params:: additional command-line options inserted right after the
  #                program name (for example '-level 20').
  def initialize(extra_params = '')
    @format = 'interval'
    @extra_params = extra_params
  end

  # Returns the shell command used to run dustmasker, with stderr discarded.
  #
  # The +extra_params+ argument is accepted for backward compatibility only:
  # the command is always built from the options given to the constructor.
  # The parts are joined with single spaces, so the options no longer need a
  # trailing blank to stay separated from '-outfmt'.
  def get_cmd(extra_params = '')
    parts = ['dustmasker', @extra_params.to_s.strip, '-outfmt', @format, '2>/dev/null']
    parts.reject { |part| part.empty? }.join(' ')
  end

  # Runs dustmasker on +seq_fasta+ and returns an Array of DustQuery, one per
  # sequence header found in the program output.
  #
  # seq_fasta:: FASTA text as a String, or an Array of lines (joined with
  #             newlines). nil or empty input returns an empty Array without
  #             running anything.
  #
  # Raises RuntimeError when the data contains no '>' character, or when the
  # command finishes with an exit status greater than zero.
  def do_dust(seq_fasta)
    return [] if seq_fasta.nil? || seq_fasta.empty?

    fasta = seq_fasta.is_a?(Array) ? seq_fasta.join("\n") : seq_fasta

    cmd = get_cmd(@extra_params)
    raise "Data passed to dust must be in fasta format" unless fasta.index('>')

    parse_intervals(run_dustmasker(cmd, fasta))
  end

  # Nothing to release; present so the object can be closed like other tools.
  def close
  end

  private

  # Feeds +fasta+ to +cmd+ through a pipe and returns the output lines.
  #
  # Careful: always send the complete sequence. We once spent a long time
  # working out why fewer hits came back here than when running the tool by
  # hand; by hand the whole sequence was used, while the MID code was passing
  # only its first 20 nt.
  def run_dustmasker(cmd, fasta)
    output = IO.popen(cmd, 'w+') do |pipe|
      pipe.sync = true
      pipe.write(fasta)
      # Closing the write end signals end of input to the program.
      pipe.close_write
      pipe.readlines
    end

    # $? holds the status of the process that IO.popen has just reaped.
    status = $?.exitstatus
    raise "Error while doing #{cmd} to seq: #{fasta}" if !status.nil? && status > 0

    output
  end

  # Turns output such as
  #
  #   >seq
  #   3 - 346
  #   354 - 683
  #
  # into DustQuery objects: every '>' line starts a new DustQuery, and every
  # "from - to" line adds an [Integer, Integer] pair to the latest one.
  def parse_intervals(lines)
    lines.each_with_object([]) do |line, intervals|
      if line =~ /^>(.*)$/
        intervals.push DustQuery.new($1)
      elsif line =~ /^(\d+)\s\-\s(\d+)/
        intervals.last.push [$1.to_i, $2.to_i]
      end
    end
  end

end
|
102
|
+
|
103
|
+
|
data/script/console
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
#!/usr/bin/env ruby
# File: script/console
#
# Replaces this process with an IRB session that has tab completion and the
# scbi_blast library already loaded.

# Platforms whose name contains mswin or mingw use the irb.bat launcher.
# (?:...) is a non-capturing group; the original "(:?" was a typo for it.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

libs = " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
libs << " -r #{File.expand_path(File.join(File.dirname(__FILE__),'..','lib','scbi_blast.rb'))}"
puts "Loading scbi_blast gem"
exec "#{irb} #{libs} --simple-prompt"
|