bio-blastxmlparser 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +69 -28
- data/VERSION +1 -1
- data/bin/blastxmlparser +44 -10
- data/bio-blastxmlparser.gemspec +2 -2
- data/lib/bio-blastxmlparser.rb +2 -0
- data/lib/bio/db/blast/xmlsplitter.rb +4 -0
- metadata +4 -4
data/README.rdoc
CHANGED
@@ -2,50 +2,57 @@
|
|
2
2
|
|
3
3
|
blastxmlparser is a fast big-data BLAST XML file parser. Rather than
|
4
4
|
loading everything in memory, XML is parsed by BLAST query
|
5
|
-
(Iteration). Not only has this the advantage of low memory use, it
|
6
|
-
also be faster when IO continues in
|
5
|
+
(Iteration). Not only does this have the advantage of low memory use, it
|
6
|
+
also shows results early, and it may be faster when IO continues in
|
7
|
+
parallel (disk read-ahead).
|
7
8
|
|
8
9
|
Next to the API, blastxmlparser comes as a command line utility, which
|
9
10
|
can be used to filter results and requires no understanding of Ruby.
|
10
11
|
|
11
12
|
== Performance
|
12
13
|
|
13
|
-
XML parsing is expensive. blastxmlparser uses the Nokogiri C, or Java, XML
|
14
|
-
|
15
|
-
document
|
14
|
+
XML parsing is expensive. blastxmlparser uses the fast Nokogiri C, or Java, XML
|
15
|
+
parsers, based on libxml2. Basically, a DOM parser is used for subsections of a
|
16
|
+
document. Tests show this is faster than a SAX parser with Ruby callbacks. To
|
16
17
|
see why libxml2 based Nokogiri is fast, see
|
17
18
|
http://www.rubyinside.com/ruby-xml-performance-benchmarks-1641.html and
|
18
19
|
http://www.xml.com/lpt/a/1703.
|
19
20
|
|
20
21
|
The parser is also designed with other optimizations, such as lazy evaluation,
|
21
|
-
only creating objects when required, and (future) parallelization. When parsing
|
22
|
+
only creating objects when required, and (in a future version) parallelization. When parsing
|
22
23
|
a full BLAST result usually only a few fields are used. By using XPath queries
|
23
24
|
only the relevant fields are queried.
|
24
25
|
|
25
26
|
Timings for parsing test/data/nt_example_blastn.m7 (file size 3.4Mb)
|
26
27
|
|
27
|
-
Nokogiri DOM (default)
|
28
|
+
bio-blastxmlparser + Nokogiri DOM (default)
|
28
29
|
|
29
|
-
real 0m1.259s
|
30
|
-
user 0m1.052s
|
31
|
-
sys 0m0.144s
|
30
|
+
real 0m1.259s
|
31
|
+
user 0m1.052s
|
32
|
+
sys 0m0.144s
|
32
33
|
|
33
|
-
Nokogiri split DOM
|
34
|
+
bio-blastxmlparser + Nokogiri split DOM
|
34
35
|
|
35
|
-
real 0m1.713s
|
36
|
-
user 0m1.444s
|
37
|
-
sys 0m0.160s
|
36
|
+
real 0m1.713s
|
37
|
+
user 0m1.444s
|
38
|
+
sys 0m0.160s
|
38
39
|
|
39
|
-
BioRuby ReXML DOM parser
|
40
|
+
BioRuby ReXML DOM parser
|
40
41
|
|
41
|
-
real 1m14.548s
|
42
|
-
user 1m13.065s
|
43
|
-
sys 0m0.472s
|
42
|
+
real 1m14.548s
|
43
|
+
user 1m13.065s
|
44
|
+
sys 0m0.472s
|
44
45
|
|
45
46
|
== Install
|
46
47
|
|
48
|
+
Quick install:
|
49
|
+
|
47
50
|
gem install bio-blastxmlparser
|
48
51
|
|
52
|
+
Important: the parser is written for Ruby >= 1.9. You can check with
|
53
|
+
|
54
|
+
gem env
|
55
|
+
|
49
56
|
Nokogiri XML parser is required. To install it,
|
50
57
|
the libxml2 libraries and headers need to be installed first, for
|
51
58
|
example on Debian:
|
@@ -56,7 +63,7 @@ example on Debian:
|
|
56
63
|
for more installation information on other platforms see
|
57
64
|
http://nokogiri.org/tutorials/installing_nokogiri.html.
|
58
65
|
|
59
|
-
== API
|
66
|
+
== API (Ruby library)
|
60
67
|
|
61
68
|
To loop through a BLAST result:
|
62
69
|
|
@@ -72,12 +79,13 @@ To loop through a BLAST result:
|
|
72
79
|
>> end
|
73
80
|
>> end
|
74
81
|
|
75
|
-
The next example parses XML using less memory
|
82
|
+
The next example parses XML using less memory by using a Ruby
|
83
|
+
Iterator
|
76
84
|
|
77
|
-
>> blast = XmlSplitterIterator.new(fn).to_enum
|
85
|
+
>> blast = Bio::Blast::XmlSplitterIterator.new(fn).to_enum
|
78
86
|
>> iter = blast.next
|
79
87
|
>> iter.iter_num
|
80
|
-
|
88
|
+
=> 1
|
81
89
|
>> iter.query_id
|
82
90
|
=> "lcl|1_0"
|
83
91
|
|
@@ -132,14 +140,19 @@ Get the first Hsp
|
|
132
140
|
>> hsp.midline
|
133
141
|
=> "|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||"
|
134
142
|
|
135
|
-
|
143
|
+
Unlike BioRuby, this module uses the actual element names in the XML
|
144
|
+
definition, to avoid confusion (if anyone wants a translation,
|
145
|
+
feel free to contribute an adaptor).
|
146
|
+
|
147
|
+
It is also possible to use the XML element names as Strings, rather
|
148
|
+
than methods. E.g.
|
136
149
|
|
137
150
|
>> hsp.field("Hsp_bit-score")
|
138
151
|
=> "145.205"
|
139
152
|
>> hsp["Hsp_bit-score"]
|
140
153
|
=> "145.205"
|
141
154
|
|
142
|
-
Note that
|
155
|
+
Note that, when using the element names, the results are always String values.
|
143
156
|
|
144
157
|
Fetch the next result (Iteration)
|
145
158
|
|
@@ -153,11 +166,14 @@ etc. etc.
|
|
153
166
|
|
154
167
|
For more examples see the files in ./spec
|
155
168
|
|
156
|
-
==
|
169
|
+
== Command line usage
|
157
170
|
|
171
|
+
|
172
|
+
== Usage
|
158
173
|
blastxmlparser [options] file(s)
|
159
174
|
|
160
175
|
-p, --parser name Use full|split parser (default full)
|
176
|
+
--output-fasta Output FASTA
|
161
177
|
-n, --named fields Set named fields
|
162
178
|
-e, --exec filter Execute filter
|
163
179
|
|
@@ -182,11 +198,23 @@ Print fields where bit_score > 145
|
|
182
198
|
|
183
199
|
blastxmlparser -e 'hsp.bit_score>145' test/data/nt_example_blastn.m7
|
184
200
|
|
185
|
-
|
201
|
+
prints tab-delimited output:
|
202
|
+
|
203
|
+
1 1 lcl|1_0 lcl|I_74685 1 5.82208e-34
|
204
|
+
2 1 lcl|1_0 lcl|I_1 1 5.82208e-34
|
205
|
+
3 2 lcl|2_0 lcl|I_2 1 6.05436e-59
|
206
|
+
4 3 lcl|3_0 lcl|I_3 1 2.03876e-56
|
207
|
+
|
208
|
+
The second and third columns show the BLAST iteration, and the others
|
209
|
+
relate to the hits.
|
210
|
+
|
211
|
+
As this is evaluated Ruby, it is also possible to use the XML element
|
212
|
+
names directly
|
186
213
|
|
187
214
|
blastxmlparser -e 'hsp["Hsp_bit-score"].to_i>145' test/data/nt_example_blastn.m7
|
188
215
|
|
189
|
-
|
216
|
+
And it is possible to print (non default) named fields where E-value < 0.01
|
217
|
+
and hit length > 100. E.g.
|
190
218
|
|
191
219
|
blastxmlparser -n 'hsp.evalue,hsp.qseq' -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
|
192
220
|
|
@@ -197,7 +225,20 @@ Print named fields where E-value < 0.001 and hit length > 100
|
|
197
225
|
5 2.76378e-11 GAAGAGTGTACTACCGTTTCTGTAGCTACCATATT
|
198
226
|
etc. etc.
|
199
227
|
|
200
|
-
|
228
|
+
prints the evalue and qseq columns. To output FASTA use --output-fasta
|
229
|
+
|
230
|
+
blastxmlparser --output-fasta -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
|
231
|
+
|
232
|
+
which prints matching sequences, where the first field is the accession, followed
|
233
|
+
by query iteration id, and hit_id. E.g.
|
234
|
+
|
235
|
+
>I_74685 1|lcl|1_0 lcl|I_74685 [57809 - 57666] (REVERSE SENSE)
|
236
|
+
AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCTGCCTGCCAACCTATATGCTCCTGTGTTTAG
|
237
|
+
>I_1 1|lcl|1_0 lcl|I_1 [477 - 884]
|
238
|
+
AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCTGCCTGCCAACCTATATGCTCCTGTGTTTAG
|
239
|
+
etc. etc.
|
240
|
+
|
241
|
+
To use the low-mem (iterated slower) version of the parser use
|
201
242
|
|
202
243
|
blastxmlparser --parser split -n 'hsp.evalue,hsp.qseq' -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
|
203
244
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.6.
|
1
|
+
0.6.1
|
data/bin/blastxmlparser
CHANGED
@@ -30,11 +30,23 @@ Print fields where bit_score > 145
|
|
30
30
|
|
31
31
|
blastxmlparser -e 'hsp.bit_score>145' test/data/nt_example_blastn.m7
|
32
32
|
|
33
|
-
|
33
|
+
prints tab-delimited output:
|
34
|
+
|
35
|
+
1 1 lcl|1_0 lcl|I_74685 1 5.82208e-34
|
36
|
+
2 1 lcl|1_0 lcl|I_1 1 5.82208e-34
|
37
|
+
3 2 lcl|2_0 lcl|I_2 1 6.05436e-59
|
38
|
+
4 3 lcl|3_0 lcl|I_3 1 2.03876e-56
|
39
|
+
|
40
|
+
The second and third columns show the BLAST iteration, and the others
|
41
|
+
relate to the hits.
|
42
|
+
|
43
|
+
As this is evaluated Ruby, it is also possible to use the XML element
|
44
|
+
names directly
|
34
45
|
|
35
46
|
blastxmlparser -e 'hsp["Hsp_bit-score"].to_i>145' test/data/nt_example_blastn.m7
|
36
47
|
|
37
|
-
|
48
|
+
And it is possible to print (non default) named fields where E-value < 0.01
|
49
|
+
and hit length > 100. E.g.
|
38
50
|
|
39
51
|
blastxmlparser -n 'hsp.evalue,hsp.qseq' -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
|
40
52
|
|
@@ -45,7 +57,20 @@ Print named fields where E-value < 0.001 and hit length > 100
|
|
45
57
|
5 2.76378e-11 GAAGAGTGTACTACCGTTTCTGTAGCTACCATATT
|
46
58
|
etc. etc.
|
47
59
|
|
48
|
-
|
60
|
+
prints the evalue and qseq columns. To output FASTA use --output-fasta
|
61
|
+
|
62
|
+
blastxmlparser --output-fasta -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
|
63
|
+
|
64
|
+
which prints matching sequences, where the first field is the accession, followed
|
65
|
+
by query iteration id, and hit_id. E.g.
|
66
|
+
|
67
|
+
>I_74685 1|lcl|1_0 lcl|I_74685 [57809 - 57666] (REVERSE SENSE)
|
68
|
+
AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCTGCCTGCCAACCTATATGCTCCTGTGTTTAG
|
69
|
+
>I_1 1|lcl|1_0 lcl|I_1 [477 - 884]
|
70
|
+
AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCTGCCTGCCAACCTATATGCTCCTGTGTTTAG
|
71
|
+
etc. etc.
|
72
|
+
|
73
|
+
To use the low-mem (iterated slower) version of the parser use
|
49
74
|
|
50
75
|
blastxmlparser --parser split -n 'hsp.evalue,hsp.qseq' -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
|
51
76
|
|
@@ -90,6 +115,10 @@ opts = OptionParser.new do |o|
|
|
90
115
|
options.parser = p.to_sym
|
91
116
|
end
|
92
117
|
|
118
|
+
o.on("--output-fasta","Output FASTA") do |b|
|
119
|
+
options.output_fasta = true
|
120
|
+
end
|
121
|
+
|
93
122
|
o.on("-n fields","--named fields",String, "Set named fields") do |s|
|
94
123
|
options.fields = s.split(/,/)
|
95
124
|
end
|
@@ -145,14 +174,19 @@ begin
|
|
145
174
|
true
|
146
175
|
end
|
147
176
|
if do_print
|
148
|
-
if options.
|
149
|
-
print
|
150
|
-
|
151
|
-
print eval(f),"\t"
|
152
|
-
end
|
153
|
-
print "\n"
|
177
|
+
if options.output_fasta
|
178
|
+
print ">"+hit.accession+' '+iter.iter_num.to_s+'|'+iter.query_id+' '+hit.hit_id+' '+hit.hit_def+"\n"
|
179
|
+
print hsp.qseq+"\n"
|
154
180
|
else
|
155
|
-
|
181
|
+
if options.fields
|
182
|
+
print i,"\t"
|
183
|
+
options.fields.each do | f |
|
184
|
+
print eval(f),"\t"
|
185
|
+
end
|
186
|
+
print "\n"
|
187
|
+
else
|
188
|
+
print [i,iter.iter_num,iter.query_id,hit.hit_id,hsp.hsp_num,hsp.evalue].join("\t"),"\n"
|
189
|
+
end
|
156
190
|
end
|
157
191
|
i += 1
|
158
192
|
end
|
data/bio-blastxmlparser.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{bio-blastxmlparser}
|
8
|
-
s.version = "0.6.
|
8
|
+
s.version = "0.6.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Pjotr Prins"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-04-26}
|
13
13
|
s.default_executable = %q{blastxmlparser}
|
14
14
|
s.description = %q{Fast big data XML parser and library, written in Ruby}
|
15
15
|
s.email = %q{pjotr.public01@thebird.nl}
|
data/lib/bio-blastxmlparser.rb
CHANGED
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 6
|
8
|
-
-
|
9
|
-
version: 0.6.
|
8
|
+
- 1
|
9
|
+
version: 0.6.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Pjotr Prins
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-04-26 00:00:00 +02:00
|
18
18
|
default_executable: blastxmlparser
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -156,7 +156,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
156
156
|
requirements:
|
157
157
|
- - ">="
|
158
158
|
- !ruby/object:Gem::Version
|
159
|
-
hash:
|
159
|
+
hash: 169663261
|
160
160
|
segments:
|
161
161
|
- 0
|
162
162
|
version: "0"
|