bio-blastxmlparser 0.6.0 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +69 -28
- data/VERSION +1 -1
- data/bin/blastxmlparser +44 -10
- data/bio-blastxmlparser.gemspec +2 -2
- data/lib/bio-blastxmlparser.rb +2 -0
- data/lib/bio/db/blast/xmlsplitter.rb +4 -0
- metadata +4 -4
data/README.rdoc
CHANGED
@@ -2,50 +2,57 @@
|
|
2
2
|
|
3
3
|
blastxmlparser is a fast big-data BLAST XML file parser. Rather than
|
4
4
|
loading everything in memory, XML is parsed by BLAST query
|
5
|
-
(Iteration). Not only has this the advantage of low memory use, it
|
6
|
-
also be faster when IO continues in
|
5
|
+
(Iteration). Not only has this the advantage of low memory use, it
|
6
|
+
also shows results early, and it may be faster when IO continues in
|
7
|
+
parallel (disk read-ahead).
|
7
8
|
|
8
9
|
Next to the API, blastxmlparser comes as a command line utility, which
|
9
10
|
can be used to filter results and requires no understanding of Ruby.
|
10
11
|
|
11
12
|
== Performance
|
12
13
|
|
13
|
-
XML parsing is expensive. blastxmlparser uses the Nokogiri C, or Java, XML
|
14
|
-
|
15
|
-
document
|
14
|
+
XML parsing is expensive. blastxmlparser uses the fast Nokogiri C, or Java, XML
|
15
|
+
parsers, based on libxml2. Basically, a DOM parser is used for subsections of a
|
16
|
+
document. Tests show this is faster than a SAX parser with Ruby callbacks. To
|
16
17
|
see why libxml2 based Nokogiri is fast, see
|
17
18
|
http://www.rubyinside.com/ruby-xml-performance-benchmarks-1641.html and
|
18
19
|
http://www.xml.com/lpt/a/1703.
|
19
20
|
|
20
21
|
The parser is also designed with other optimizations, such as lazy evaluation,
|
21
|
-
only creating objects when required, and (future) parallelization. When parsing
|
22
|
+
only creating objects when required, and (in a future version) parallelization. When parsing
|
22
23
|
a full BLAST result usually only a few fields are used. By using XPath queries
|
23
24
|
only the relevant fields are queried.
|
24
25
|
|
25
26
|
Timings for parsing test/data/nt_example_blastn.m7 (file size 3.4Mb)
|
26
27
|
|
27
|
-
Nokogiri DOM (default)
|
28
|
+
bio-blastxmlparser + Nokogiri DOM (default)
|
28
29
|
|
29
|
-
real 0m1.259s
|
30
|
-
user 0m1.052s
|
31
|
-
sys 0m0.144s
|
30
|
+
real 0m1.259s
|
31
|
+
user 0m1.052s
|
32
|
+
sys 0m0.144s
|
32
33
|
|
33
|
-
Nokogiri split DOM
|
34
|
+
bio-blastxmlparser + Nokogiri split DOM
|
34
35
|
|
35
|
-
real 0m1.713s
|
36
|
-
user 0m1.444s
|
37
|
-
sys 0m0.160s
|
36
|
+
real 0m1.713s
|
37
|
+
user 0m1.444s
|
38
|
+
sys 0m0.160s
|
38
39
|
|
39
|
-
BioRuby ReXML DOM parser
|
40
|
+
BioRuby ReXML DOM parser
|
40
41
|
|
41
|
-
real 1m14.548s
|
42
|
-
user 1m13.065s
|
43
|
-
sys 0m0.472s
|
42
|
+
real 1m14.548s
|
43
|
+
user 1m13.065s
|
44
|
+
sys 0m0.472s
|
44
45
|
|
45
46
|
== Install
|
46
47
|
|
48
|
+
Quick install:
|
49
|
+
|
47
50
|
gem install bio-blastxmlparser
|
48
51
|
|
52
|
+
Important: the parser is written for Ruby >= 1.9. You can check with
|
53
|
+
|
54
|
+
gem env
|
55
|
+
|
49
56
|
Nokogiri XML parser is required. To install it,
|
50
57
|
the libxml2 libraries and headers need to be installed first, for
|
51
58
|
example on Debian:
|
@@ -56,7 +63,7 @@ example on Debian:
|
|
56
63
|
for more installation instructions on other platforms see
|
57
64
|
http://nokogiri.org/tutorials/installing_nokogiri.html.
|
58
65
|
|
59
|
-
== API
|
66
|
+
== API (Ruby library)
|
60
67
|
|
61
68
|
To loop through a BLAST result:
|
62
69
|
|
@@ -72,12 +79,13 @@ To loop through a BLAST result:
|
|
72
79
|
>> end
|
73
80
|
>> end
|
74
81
|
|
75
|
-
The next example parses XML using less memory
|
82
|
+
The next example parses XML using less memory by using a Ruby
|
83
|
+
Iterator
|
76
84
|
|
77
|
-
>> blast = XmlSplitterIterator.new(fn).to_enum
|
85
|
+
>> blast = Bio::Blast::XmlSplitterIterator.new(fn).to_enum
|
78
86
|
>> iter = blast.next
|
79
87
|
>> iter.iter_num
|
80
|
-
|
88
|
+
=> 1
|
81
89
|
>> iter.query_id
|
82
90
|
=> "lcl|1_0"
|
83
91
|
|
@@ -132,14 +140,19 @@ Get the first Hsp
|
|
132
140
|
>> hsp.midline
|
133
141
|
=> "|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||"
|
134
142
|
|
135
|
-
|
143
|
+
Unlike BioRuby, this module uses the actual element names in the XML
|
144
|
+
definition, to avoid confusion (if anyone wants a translation,
|
145
|
+
feel free to contribute an adaptor).
|
146
|
+
|
147
|
+
It is also possible to use the XML element names as Strings, rather
|
148
|
+
than methods. E.g.
|
136
149
|
|
137
150
|
>> hsp.field("Hsp_bit-score")
|
138
151
|
=> "145.205"
|
139
152
|
>> hsp["Hsp_bit-score"]
|
140
153
|
=> "145.205"
|
141
154
|
|
142
|
-
Note that
|
155
|
+
Note that, when using the element names, the results are always String values.
|
143
156
|
|
144
157
|
Fetch the next result (Iteration)
|
145
158
|
|
@@ -153,11 +166,14 @@ etc. etc.
|
|
153
166
|
|
154
167
|
For more examples see the files in ./spec
|
155
168
|
|
156
|
-
==
|
169
|
+
== Command line usage
|
157
170
|
|
171
|
+
|
172
|
+
== Usage
|
158
173
|
blastxmlparser [options] file(s)
|
159
174
|
|
160
175
|
-p, --parser name Use full|split parser (default full)
|
176
|
+
--output-fasta Output FASTA
|
161
177
|
-n, --named fields Set named fields
|
162
178
|
-e, --exec filter Execute filter
|
163
179
|
|
@@ -182,11 +198,23 @@ Print fields where bit_score > 145
|
|
182
198
|
|
183
199
|
blastxmlparser -e 'hsp.bit_score>145' test/data/nt_example_blastn.m7
|
184
200
|
|
185
|
-
|
201
|
+
prints a tab delimited table
|
202
|
+
|
203
|
+
1 1 lcl|1_0 lcl|I_74685 1 5.82208e-34
|
204
|
+
2 1 lcl|1_0 lcl|I_1 1 5.82208e-34
|
205
|
+
3 2 lcl|2_0 lcl|I_2 1 6.05436e-59
|
206
|
+
4 3 lcl|3_0 lcl|I_3 1 2.03876e-56
|
207
|
+
|
208
|
+
The second and third columns show the BLAST iteration, and the others
|
209
|
+
relate to the hits.
|
210
|
+
|
211
|
+
As this is evaluated Ruby, it is also possible to use the XML element
|
212
|
+
names directly
|
186
213
|
|
187
214
|
blastxmlparser -e 'hsp["Hsp_bit-score"].to_i>145' test/data/nt_example_blastn.m7
|
188
215
|
|
189
|
-
|
216
|
+
And it is possible to print (non default) named fields where E-value < 0.01
|
217
|
+
and hit length > 100. E.g.
|
190
218
|
|
191
219
|
blastxmlparser -n 'hsp.evalue,hsp.qseq' -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
|
192
220
|
|
@@ -197,7 +225,20 @@ Print named fields where E-value < 0.001 and hit length > 100
|
|
197
225
|
5 2.76378e-11 GAAGAGTGTACTACCGTTTCTGTAGCTACCATATT
|
198
226
|
etc. etc.
|
199
227
|
|
200
|
-
|
228
|
+
prints the evalue and qseq columns. To output FASTA use --output-fasta
|
229
|
+
|
230
|
+
blastxmlparser --output-fasta -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
|
231
|
+
|
232
|
+
which prints matching sequences, where the first field is the accession, followed
|
233
|
+
by query iteration id, and hit_id. E.g.
|
234
|
+
|
235
|
+
>I_74685 1|lcl|1_0 lcl|I_74685 [57809 - 57666] (REVERSE SENSE)
|
236
|
+
AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCTGCCTGCCAACCTATATGCTCCTGTGTTTAG
|
237
|
+
>I_1 1|lcl|1_0 lcl|I_1 [477 - 884]
|
238
|
+
AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCTGCCTGCCAACCTATATGCTCCTGTGTTTAG
|
239
|
+
etc. etc.
|
240
|
+
|
241
|
+
To use the low-mem (iterated slower) version of the parser use
|
201
242
|
|
202
243
|
blastxmlparser --parser split -n 'hsp.evalue,hsp.qseq' -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
|
203
244
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.6.0
|
1
|
+
0.6.1
|
data/bin/blastxmlparser
CHANGED
@@ -30,11 +30,23 @@ Print fields where bit_score > 145
|
|
30
30
|
|
31
31
|
blastxmlparser -e 'hsp.bit_score>145' test/data/nt_example_blastn.m7
|
32
32
|
|
33
|
-
|
33
|
+
prints a tab delimited table
|
34
|
+
|
35
|
+
1 1 lcl|1_0 lcl|I_74685 1 5.82208e-34
|
36
|
+
2 1 lcl|1_0 lcl|I_1 1 5.82208e-34
|
37
|
+
3 2 lcl|2_0 lcl|I_2 1 6.05436e-59
|
38
|
+
4 3 lcl|3_0 lcl|I_3 1 2.03876e-56
|
39
|
+
|
40
|
+
The second and third columns show the BLAST iteration, and the others
|
41
|
+
relate to the hits.
|
42
|
+
|
43
|
+
As this is evaluated Ruby, it is also possible to use the XML element
|
44
|
+
names directly
|
34
45
|
|
35
46
|
blastxmlparser -e 'hsp["Hsp_bit-score"].to_i>145' test/data/nt_example_blastn.m7
|
36
47
|
|
37
|
-
|
48
|
+
And it is possible to print (non default) named fields where E-value < 0.01
|
49
|
+
and hit length > 100. E.g.
|
38
50
|
|
39
51
|
blastxmlparser -n 'hsp.evalue,hsp.qseq' -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
|
40
52
|
|
@@ -45,7 +57,20 @@ Print named fields where E-value < 0.001 and hit length > 100
|
|
45
57
|
5 2.76378e-11 GAAGAGTGTACTACCGTTTCTGTAGCTACCATATT
|
46
58
|
etc. etc.
|
47
59
|
|
48
|
-
|
60
|
+
prints the evalue and qseq columns. To output FASTA use --output-fasta
|
61
|
+
|
62
|
+
blastxmlparser --output-fasta -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
|
63
|
+
|
64
|
+
which prints matching sequences, where the first field is the accession, followed
|
65
|
+
by query iteration id, and hit_id. E.g.
|
66
|
+
|
67
|
+
>I_74685 1|lcl|1_0 lcl|I_74685 [57809 - 57666] (REVERSE SENSE)
|
68
|
+
AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCTGCCTGCCAACCTATATGCTCCTGTGTTTAG
|
69
|
+
>I_1 1|lcl|1_0 lcl|I_1 [477 - 884]
|
70
|
+
AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCTGCCTGCCAACCTATATGCTCCTGTGTTTAG
|
71
|
+
etc. etc.
|
72
|
+
|
73
|
+
To use the low-mem (iterated slower) version of the parser use
|
49
74
|
|
50
75
|
blastxmlparser --parser split -n 'hsp.evalue,hsp.qseq' -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
|
51
76
|
|
@@ -90,6 +115,10 @@ opts = OptionParser.new do |o|
|
|
90
115
|
options.parser = p.to_sym
|
91
116
|
end
|
92
117
|
|
118
|
+
o.on("--output-fasta","Output FASTA") do |b|
|
119
|
+
options.output_fasta = true
|
120
|
+
end
|
121
|
+
|
93
122
|
o.on("-n fields","--named fields",String, "Set named fields") do |s|
|
94
123
|
options.fields = s.split(/,/)
|
95
124
|
end
|
@@ -145,14 +174,19 @@ begin
|
|
145
174
|
true
|
146
175
|
end
|
147
176
|
if do_print
|
148
|
-
if options.
|
149
|
-
print
|
150
|
-
|
151
|
-
print eval(f),"\t"
|
152
|
-
end
|
153
|
-
print "\n"
|
177
|
+
if options.output_fasta
|
178
|
+
print ">"+hit.accession+' '+iter.iter_num.to_s+'|'+iter.query_id+' '+hit.hit_id+' '+hit.hit_def+"\n"
|
179
|
+
print hsp.qseq+"\n"
|
154
180
|
else
|
155
|
-
|
181
|
+
if options.fields
|
182
|
+
print i,"\t"
|
183
|
+
options.fields.each do | f |
|
184
|
+
print eval(f),"\t"
|
185
|
+
end
|
186
|
+
print "\n"
|
187
|
+
else
|
188
|
+
print [i,iter.iter_num,iter.query_id,hit.hit_id,hsp.hsp_num,hsp.evalue].join("\t"),"\n"
|
189
|
+
end
|
156
190
|
end
|
157
191
|
i += 1
|
158
192
|
end
|
data/bio-blastxmlparser.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{bio-blastxmlparser}
|
8
|
-
s.version = "0.6.0"
|
8
|
+
s.version = "0.6.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Pjotr Prins"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-04-26}
|
13
13
|
s.default_executable = %q{blastxmlparser}
|
14
14
|
s.description = %q{Fast big data XML parser and library, written in Ruby}
|
15
15
|
s.email = %q{pjotr.public01@thebird.nl}
|
data/lib/bio-blastxmlparser.rb
CHANGED
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 6
|
8
|
-
- 0
|
9
|
-
version: 0.6.0
|
8
|
+
- 1
|
9
|
+
version: 0.6.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Pjotr Prins
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-04-26 00:00:00 +02:00
|
18
18
|
default_executable: blastxmlparser
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -156,7 +156,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
156
156
|
requirements:
|
157
157
|
- - ">="
|
158
158
|
- !ruby/object:Gem::Version
|
159
|
-
hash:
|
159
|
+
hash: 169663261
|
160
160
|
segments:
|
161
161
|
- 0
|
162
162
|
version: "0"
|