bio-blastxmlparser 1.1.2 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/Gemfile.lock +1 -1
- data/README.md +21 -37
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/bin/blastxmlparser +70 -18
- data/bio-blastxmlparser.gemspec +9 -8
- data/lib/bio/writers/rdf.rb +1 -1
- data/template/{json.erb → blast2json.erb} +0 -0
- data/template/blast2rdf-minimal.erb +14 -0
- data/template/{rdf.erb → blast2rdf.erb} +1 -2
- metadata +8 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b96fa7141abe77c13e1e34eb1d920a624c22f83d
|
4
|
+
data.tar.gz: 1b038243195b478d18b2a2c72b3fb0b4538a6701
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 87ea99f1ac87b528e8a08c5490cb500038fdd06ea18def8cda1276cf5e7ba7976ca4f0a49934023be16ea122ae1d91e514ad5a5fbdf282245a105ce22131dc6f
|
7
|
+
data.tar.gz: 2237ce97c067f42123c9aeba9ae8cb77cf97d11ef0ad7211ceb8a7e6110f6cc6c71d2a7a2e1e1dcf31b5f6c67d55af8a992b82eb43e377caf858fff6cac3e4d9
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -2,8 +2,9 @@
|
|
2
2
|
|
3
3
|
# bio-blastxmlparser
|
4
4
|
|
5
|
-
blastxmlparser is a very fast big-data BLAST XML file
|
6
|
-
as command line utility. Use blastxmlparser
|
5
|
+
blastxmlparser is a very fast parallelised big-data BLAST XML file
|
6
|
+
parser, which can be used as a command line utility. Use blastxmlparser
|
7
|
+
to:
|
7
8
|
|
8
9
|
* Parse BLAST XML
|
9
10
|
* Filter output
|
@@ -24,12 +25,10 @@ can be used to filter results and requires no understanding of Ruby.
|
|
24
25
|
blastxmlparser --help
|
25
26
|
```
|
26
27
|
|
27
|
-
(see Installation, below, if it does not work)
|
28
|
-
|
29
28
|
## Performance
|
30
29
|
|
31
30
|
XML parsing is expensive. blastxmlparser can use the fast Nokogiri C, or
|
32
|
-
Java XML parsers, based on libxml2.
|
31
|
+
Java XML parsers, based on libxml2, in parallel. A DOM parser is used
|
33
32
|
after splitting the BLAST XML document into subsections.
|
34
33
|
Tests show this is faster than a SAX
|
35
34
|
parser with Ruby callbacks. To see why libxml2 based Nokogiri is
|
@@ -38,33 +37,21 @@ fast, see this
|
|
38
37
|
and [xml.com](http://www.xml.com/lpt/a/1703).
|
39
38
|
|
40
39
|
Blastxmlparser is designed with other optimizations, such as lazy
|
41
|
-
evaluation, i.e., only creating objects when required, and
|
42
|
-
|
43
|
-
|
44
|
-
|
40
|
+
evaluation, i.e., only creating objects when required, and
|
41
|
+
parallelism. When parsing a full BLAST result usually only a few
|
42
|
+
fields are used. By using XPath queries the parser makes sure only the
|
43
|
+
relevant fields are queried.
|
45
44
|
|
46
|
-
Timings for parsing
|
45
|
+
Timings for parsing a 128 MB BLAST XML file on a 4x1.2GHz laptop
|
47
46
|
|
48
47
|
```
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
user 0m1.052s
|
53
|
-
sys 0m0.144s
|
54
|
-
|
55
|
-
bio-blastxmlparser + Nokogiri split DOM
|
56
|
-
|
57
|
-
real 0m1.713s
|
58
|
-
user 0m1.444s
|
59
|
-
sys 0m0.160s
|
60
|
-
|
61
|
-
BioRuby ReXML DOM parser (old style)
|
62
|
-
|
63
|
-
real 1m14.548s
|
64
|
-
user 1m13.065s
|
65
|
-
sys 0m0.472s
|
48
|
+
real 0m13.985s
|
49
|
+
user 0m44.951s
|
50
|
+
sys 0m3.676s
|
66
51
|
```
|
67
52
|
|
53
|
+
which makes for pretty good core utilisation.
|
54
|
+
|
68
55
|
## Install
|
69
56
|
|
70
57
|
```sh
|
@@ -99,9 +86,11 @@ provide build paths, as described [here](http://nokogiri.org/tutorials/installin
|
|
99
86
|
blastxmlparser [options] file(s)
|
100
87
|
|
101
88
|
-p, --parser name Use full|split parser (default full)
|
89
|
+
-e, --exec filter Evaluate filter
|
90
|
+
|
91
|
+
-n, --named fields Print named fields
|
102
92
|
--output-fasta Output FASTA
|
103
|
-
-
|
104
|
-
-e, --exec filter Execute filter
|
93
|
+
-t, --template erb Use ERB template for output
|
105
94
|
|
106
95
|
--logger filename Log to file (default stderr)
|
107
96
|
--trace options Set log level (default INFO, see bio-logger)
|
@@ -109,10 +98,6 @@ provide build paths, as described [here](http://nokogiri.org/tutorials/installin
|
|
109
98
|
-v, --verbose Run verbosely
|
110
99
|
--debug Show debug messages
|
111
100
|
-h, --help Show help and examples
|
112
|
-
|
113
|
-
bioblastxmlparser filename(s)
|
114
|
-
|
115
|
-
Use --help switch for more information
|
116
101
|
```
|
117
102
|
|
118
103
|
### Examples
|
@@ -204,7 +189,7 @@ template could be
|
|
204
189
|
To get JSON, run it with
|
205
190
|
|
206
191
|
```sh
|
207
|
-
blastxmlparser --template template/
|
192
|
+
blastxmlparser --template template/blast2json.erb -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
|
208
193
|
```
|
209
194
|
|
210
195
|
```Javascript
|
@@ -223,7 +208,7 @@ To get JSON, run it with
|
|
223
208
|
Likewise, using the RDF template
|
224
209
|
|
225
210
|
```sh
|
226
|
-
blastxmlparser --template template/
|
211
|
+
blastxmlparser --template template/blast2rdf.erb -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
|
227
212
|
```
|
228
213
|
|
229
214
|
```ruby
|
@@ -235,14 +220,13 @@ Likewise, using the RDF template
|
|
235
220
|
:accession "Minc02032",
|
236
221
|
:id "lcl|Minc02032",
|
237
222
|
:len 147,
|
238
|
-
:E-value 8.1089e-12,
|
239
223
|
:identity 60,
|
240
224
|
:align_len 69,
|
241
225
|
:bitscore 69.8753,
|
242
226
|
:qseq "ATGGGAGATGGAATTGAACCGTCATGGAAAGGGCCCAAACCGAAGCACAACCGACTGTGCCACCATCCA",
|
243
227
|
:midline "|||||||||||||||||||| |||||||| | |||||||||||||||||||||||||||||||",
|
244
228
|
:hseq "ATGGGAGATGGAATTGAACCATCATGGAATG-------ACCGAAGCACAACCGACTGTGCCACCATCCA",
|
245
|
-
:evalue
|
229
|
+
:evalue 8.1089e-12 .
|
246
230
|
```
|
247
231
|
|
248
232
|
## Additional options
|
data/Rakefile
CHANGED
@@ -15,7 +15,7 @@ Jeweler::Tasks.new do |gem|
|
|
15
15
|
gem.name = "bio-blastxmlparser"
|
16
16
|
gem.homepage = "http://github.com/pjotrp/blastxmlparser"
|
17
17
|
gem.license = "MIT"
|
18
|
-
gem.summary = %Q{Very fast BLAST XML to RDF/HTML/JSON/YAML/csv transformer}
|
18
|
+
gem.summary = %Q{Very fast parallel BLAST XML to RDF/HTML/JSON/YAML/csv transformer}
|
19
19
|
gem.description = %Q{Fast big data BLAST XML parser and library; this libxml2 based version is 50x faster than BioRuby and comes with a nice CLI}
|
20
20
|
gem.email = "pjotr.public01@thebird.nl"
|
21
21
|
gem.authors = ["Pjotr Prins"]
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
2.0.0
|
data/bin/blastxmlparser
CHANGED
@@ -52,8 +52,17 @@ opts = OptionParser.new do |o|
|
|
52
52
|
options.parser = p.to_sym
|
53
53
|
end
|
54
54
|
|
55
|
-
o.on("
|
56
|
-
options.
|
55
|
+
o.on("--filter filter",String, "Filtering expression") do |s|
|
56
|
+
options.filter = s
|
57
|
+
end
|
58
|
+
|
59
|
+
o.on("-t num", "--threads num",String, "Use parallel threads") do |num|
|
60
|
+
options.threads = num.to_i
|
61
|
+
end
|
62
|
+
|
63
|
+
o.on("-e filter","--exec filter",String, "Evaluate filter (deprecated)") do |s|
|
64
|
+
$stderr.print "WARNING: -e,--exec switch is deprecated, use --filter instead!\n"
|
65
|
+
options.filter = s
|
57
66
|
end
|
58
67
|
|
59
68
|
o.separator ""
|
@@ -61,7 +70,7 @@ opts = OptionParser.new do |o|
|
|
61
70
|
o.on("-n fields","--named fields",String, "Print named fields") do |s|
|
62
71
|
options.fields = s.split(/,/)
|
63
72
|
end
|
64
|
-
|
73
|
+
o.on("--output-fasta","Output FASTA") do |b|
|
65
74
|
options.output_fasta = true
|
66
75
|
end
|
67
76
|
|
@@ -100,7 +109,16 @@ begin
|
|
100
109
|
Bio::Log::CLI.configure('bio-blastxmlparser')
|
101
110
|
logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
|
102
111
|
|
103
|
-
if options
|
112
|
+
if options.threads != 1
|
113
|
+
begin
|
114
|
+
require 'parallel'
|
115
|
+
rescue LoadError
|
116
|
+
$stderr.print "Error: Missing 'parallel' module. Install with command 'gem install parallel' if you want multiple threads\n"
|
117
|
+
options.threads = 1
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
if options.template
|
104
122
|
include BioRdf
|
105
123
|
fn = options.template
|
106
124
|
raise "No template #{fn}!" if not File.exist?(fn)
|
@@ -114,39 +132,73 @@ begin
|
|
114
132
|
else
|
115
133
|
Bio::BlastXMLParser::XmlIterator.new(fn).to_enum
|
116
134
|
end
|
117
|
-
|
118
|
-
|
135
|
+
chunks = []
|
136
|
+
chunks_count = 0
|
137
|
+
NUM_CHUNKS=10_000
|
138
|
+
|
139
|
+
process = lambda { |iter,i| # Process one BLAST iter block
|
140
|
+
res = []
|
141
|
+
line_count = 0
|
142
|
+
hit_count = 0
|
119
143
|
iter.each do | hit |
|
144
|
+
hit_count += 1
|
120
145
|
hit.each do | hsp |
|
121
|
-
do_print = if options.
|
122
|
-
eval(options.
|
146
|
+
do_print = if options.filter
|
147
|
+
eval(options.filter)
|
123
148
|
else
|
124
149
|
true
|
125
150
|
end
|
126
151
|
if do_print
|
152
|
+
line_count += 1
|
127
153
|
if template
|
128
|
-
|
154
|
+
res << template.result(binding)
|
129
155
|
elsif options.output_fasta
|
130
|
-
|
131
|
-
|
156
|
+
res << ">"+hit.accession+' '+iter.iter_num.to_s+'|'+iter.query_id+' '+hit.hit_id+' '+hit.hit_def+"\n"
|
157
|
+
res << hsp.qseq+"\n"
|
132
158
|
else
|
133
159
|
# Default output
|
134
160
|
if options.fields
|
135
|
-
|
161
|
+
out = [iter.iter_num,hit_count,hsp.hsp_num]
|
136
162
|
options.fields.each do | f |
|
137
|
-
|
163
|
+
out << eval(f)
|
138
164
|
end
|
139
|
-
|
140
|
-
|
141
|
-
|
165
|
+
res << out.join("\t")+"\n"
|
166
|
+
else
|
167
|
+
res << [hit_count,iter.iter_num,iter.query_id,hit.hit_id,hsp.hsp_num,hsp.evalue].join("\t")+"\n"
|
142
168
|
end
|
143
169
|
end
|
144
|
-
i += 1
|
145
170
|
end
|
146
171
|
end
|
147
172
|
end
|
173
|
+
res
|
174
|
+
} # end process
|
175
|
+
|
176
|
+
output = lambda { |collection|
|
177
|
+
collection.each do | result |
|
178
|
+
result.each { |line| print line }
|
179
|
+
end
|
180
|
+
} # end output
|
181
|
+
|
182
|
+
if options.threads == 1
|
183
|
+
n.each do | iter |
|
184
|
+
process.call(iter,0).each { | line | print line }
|
185
|
+
end
|
186
|
+
else
|
187
|
+
n.each do | iter |
|
188
|
+
chunks << iter
|
189
|
+
chunks_count += 1
|
190
|
+
if chunks.size > NUM_CHUNKS
|
191
|
+
output.call Parallel.map_with_index(chunks, :in_processes => options.threads) { | iter,i |
|
192
|
+
process.call(iter,i)
|
193
|
+
}
|
194
|
+
chunks = []
|
195
|
+
end
|
196
|
+
end
|
197
|
+
output.call Parallel.map_with_index(chunks, :in_processes => options.threads) { | iter,i |
|
198
|
+
process.call(iter,i)
|
199
|
+
}
|
148
200
|
end
|
149
201
|
end
|
150
202
|
rescue OptionParser::InvalidOption => e
|
151
|
-
|
203
|
+
$stderr.print e.message
|
152
204
|
end
|
data/bio-blastxmlparser.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "bio-blastxmlparser"
|
8
|
-
s.version = "
|
8
|
+
s.version = "2.0.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Pjotr Prins"]
|
12
|
-
s.date = "2014-09-
|
12
|
+
s.date = "2014-09-06"
|
13
13
|
s.description = "Fast big data BLAST XML parser and library; this libxml2 based version is 50x faster than BioRuby and comes with a nice CLI"
|
14
14
|
s.email = "pjotr.public01@thebird.nl"
|
15
15
|
s.executables = ["blastxmlparser"]
|
@@ -42,8 +42,9 @@ Gem::Specification.new do |s|
|
|
42
42
|
"sample/nokogiri_split_dom.rb",
|
43
43
|
"spec/bio-blastxmlparser_spec.rb",
|
44
44
|
"spec/spec_helper.rb",
|
45
|
-
"template/
|
46
|
-
"template/
|
45
|
+
"template/blast2json.erb",
|
46
|
+
"template/blast2rdf-minimal.erb",
|
47
|
+
"template/blast2rdf.erb",
|
47
48
|
"test/data/aa_example.fasta",
|
48
49
|
"test/data/aa_example_blastp.m7",
|
49
50
|
"test/data/nt_example.fasta",
|
@@ -54,14 +55,14 @@ Gem::Specification.new do |s|
|
|
54
55
|
s.licenses = ["MIT"]
|
55
56
|
s.require_paths = ["lib"]
|
56
57
|
s.rubygems_version = "2.0.3"
|
57
|
-
s.summary = "Very fast BLAST XML to RDF/HTML/JSON/YAML/csv transformer"
|
58
|
+
s.summary = "Very fast parallel BLAST XML to RDF/HTML/JSON/YAML/csv transformer"
|
58
59
|
|
59
60
|
if s.respond_to? :specification_version then
|
60
61
|
s.specification_version = 4
|
61
62
|
|
62
63
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
63
64
|
s.add_runtime_dependency(%q<bio-logger>, [">= 0"])
|
64
|
-
s.add_runtime_dependency(%q<nokogiri>, ["~> 1.6.
|
65
|
+
s.add_runtime_dependency(%q<nokogiri>, ["~> 1.6.3"])
|
65
66
|
s.add_development_dependency(%q<rake>, [">= 0"])
|
66
67
|
s.add_development_dependency(%q<bundler>, [">= 0"])
|
67
68
|
s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
|
@@ -69,7 +70,7 @@ Gem::Specification.new do |s|
|
|
69
70
|
s.add_development_dependency(%q<rdoc>, [">= 0"])
|
70
71
|
else
|
71
72
|
s.add_dependency(%q<bio-logger>, [">= 0"])
|
72
|
-
s.add_dependency(%q<nokogiri>, ["~> 1.6.
|
73
|
+
s.add_dependency(%q<nokogiri>, ["~> 1.6.3"])
|
73
74
|
s.add_dependency(%q<rake>, [">= 0"])
|
74
75
|
s.add_dependency(%q<bundler>, [">= 0"])
|
75
76
|
s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
|
@@ -78,7 +79,7 @@ Gem::Specification.new do |s|
|
|
78
79
|
end
|
79
80
|
else
|
80
81
|
s.add_dependency(%q<bio-logger>, [">= 0"])
|
81
|
-
s.add_dependency(%q<nokogiri>, ["~> 1.6.
|
82
|
+
s.add_dependency(%q<nokogiri>, ["~> 1.6.3"])
|
82
83
|
s.add_dependency(%q<rake>, [">= 0"])
|
83
84
|
s.add_dependency(%q<bundler>, [">= 0"])
|
84
85
|
s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
|
data/lib/bio/writers/rdf.rb
CHANGED
@@ -63,7 +63,7 @@ module BioRdf
|
|
63
63
|
# Don't want Bio dependency in templates!
|
64
64
|
# logger = Bio::Log::LoggerPlus.new 'bio-rdf'
|
65
65
|
# logger.warn "\nWARNING: Changed identifier <#{s}> to <#{id}>"
|
66
|
-
$stderr.print "\nWARNING: Changed identifier <#{s}> to <#{id}>"
|
66
|
+
# $stderr.print "\nWARNING: Changed identifier <#{s}> to <#{id}>"
|
67
67
|
end
|
68
68
|
if not RDF::valid_uri?(id)
|
69
69
|
raise "Invalid URI after mangling <#{s}> to <#{id}>!"
|
File without changes
|
@@ -0,0 +1,14 @@
|
|
1
|
+
<%
|
2
|
+
blastid = Turtle::mangle_identifier(hit.parent.query_def)
|
3
|
+
id = blastid+'_'+hit.hit_num.to_s
|
4
|
+
%>
|
5
|
+
:<%= blastid %> :query :<%= id %>
|
6
|
+
:<%= id %>
|
7
|
+
:query_def "<%= hit.parent.query_def %>",
|
8
|
+
:num <%= hit.hit_num %>,
|
9
|
+
:accession "<%= hit.accession %>",
|
10
|
+
:len <%= hit.len %>,
|
11
|
+
:identity <%= hsp.identity %>,
|
12
|
+
:align_len <%= hsp.align_len %>,
|
13
|
+
:bitscore <%= hsp.bit_score %>,
|
14
|
+
:evalue <%= hsp.evalue %> .
|
@@ -10,12 +10,11 @@
|
|
10
10
|
:accession "<%= hit.accession %>",
|
11
11
|
:id "<%= hit.hit_id %>",
|
12
12
|
:len <%= hit.len %>,
|
13
|
-
:E-value <%= hsp.evalue %>,
|
14
13
|
:identity <%= hsp.identity %>,
|
15
14
|
:align_len <%= hsp.align_len %>,
|
16
15
|
:bitscore <%= hsp.bit_score %>,
|
17
16
|
:qseq "<%= hsp.qseq %>",
|
18
17
|
:midline "<%= hsp.midline %>",
|
19
18
|
:hseq "<%= hsp.hseq %>",
|
20
|
-
:evalue
|
19
|
+
:evalue <%= hsp.evalue %> .
|
21
20
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-blastxmlparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Pjotr Prins
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-logger
|
@@ -30,14 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.6.
|
33
|
+
version: 1.6.3
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.6.
|
40
|
+
version: 1.6.3
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rake
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -142,8 +142,9 @@ files:
|
|
142
142
|
- sample/nokogiri_split_dom.rb
|
143
143
|
- spec/bio-blastxmlparser_spec.rb
|
144
144
|
- spec/spec_helper.rb
|
145
|
-
- template/
|
146
|
-
- template/
|
145
|
+
- template/blast2json.erb
|
146
|
+
- template/blast2rdf-minimal.erb
|
147
|
+
- template/blast2rdf.erb
|
147
148
|
- test/data/aa_example.fasta
|
148
149
|
- test/data/aa_example_blastp.m7
|
149
150
|
- test/data/nt_example.fasta
|
@@ -172,5 +173,5 @@ rubyforge_project:
|
|
172
173
|
rubygems_version: 2.0.3
|
173
174
|
signing_key:
|
174
175
|
specification_version: 4
|
175
|
-
summary: Very fast BLAST XML to RDF/HTML/JSON/YAML/csv transformer
|
176
|
+
summary: Very fast parallel BLAST XML to RDF/HTML/JSON/YAML/csv transformer
|
176
177
|
test_files: []
|