bio-blastxmlparser 1.1.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/Gemfile.lock +1 -1
- data/README.md +21 -37
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/bin/blastxmlparser +70 -18
- data/bio-blastxmlparser.gemspec +9 -8
- data/lib/bio/writers/rdf.rb +1 -1
- data/template/{json.erb → blast2json.erb} +0 -0
- data/template/blast2rdf-minimal.erb +14 -0
- data/template/{rdf.erb → blast2rdf.erb} +1 -2
- metadata +8 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b96fa7141abe77c13e1e34eb1d920a624c22f83d
|
4
|
+
data.tar.gz: 1b038243195b478d18b2a2c72b3fb0b4538a6701
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 87ea99f1ac87b528e8a08c5490cb500038fdd06ea18def8cda1276cf5e7ba7976ca4f0a49934023be16ea122ae1d91e514ad5a5fbdf282245a105ce22131dc6f
|
7
|
+
data.tar.gz: 2237ce97c067f42123c9aeba9ae8cb77cf97d11ef0ad7211ceb8a7e6110f6cc6c71d2a7a2e1e1dcf31b5f6c67d55af8a992b82eb43e377caf858fff6cac3e4d9
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -2,8 +2,9 @@
|
|
2
2
|
|
3
3
|
# bio-blastxmlparser
|
4
4
|
|
5
|
-
blastxmlparser is a very fast big-data BLAST XML file
|
6
|
-
as command line utility. Use blastxmlparser
|
5
|
+
blastxmlparser is a very fast parallelised big-data BLAST XML file
|
6
|
+
parser, which can be used as a command line utility. Use blastxmlparser
|
7
|
+
to:
|
7
8
|
|
8
9
|
* Parse BLAST XML
|
9
10
|
* Filter output
|
@@ -24,12 +25,10 @@ can be used to filter results and requires no understanding of Ruby.
|
|
24
25
|
blastxmlparser --help
|
25
26
|
```
|
26
27
|
|
27
|
-
(see Installation, below, if it does not work)
|
28
|
-
|
29
28
|
## Performance
|
30
29
|
|
31
30
|
XML parsing is expensive. blastxmlparser can use the fast Nokogiri C, or
|
32
|
-
Java XML parsers, based on libxml2.
|
31
|
+
Java XML parsers, based on libxml2, in parallel. A DOM parser is used
|
33
32
|
after splitting the BLAST XML document into subsections.
|
34
33
|
Tests show this is faster than a SAX
|
35
34
|
parser with Ruby callbacks. To see why libxml2 based Nokogiri is
|
@@ -38,33 +37,21 @@ fast, see this
|
|
38
37
|
and [xml.com](http://www.xml.com/lpt/a/1703).
|
39
38
|
|
40
39
|
Blastxmlparser is designed with other optimizations, such as lazy
|
41
|
-
evaluation, i.e., only creating objects when required, and
|
42
|
-
|
43
|
-
|
44
|
-
|
40
|
+
evaluation, i.e., only creating objects when required, and
|
41
|
+
parallelism. When parsing a full BLAST result usually only a few
|
42
|
+
fields are used. By using XPath queries the parser makes sure only the
|
43
|
+
relevant fields are queried.
|
45
44
|
|
46
|
-
Timings for parsing
|
45
|
+
Timings for parsing a 128 MB BLAST XML file on a 4x1.2GHz laptop
|
47
46
|
|
48
47
|
```
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
user 0m1.052s
|
53
|
-
sys 0m0.144s
|
54
|
-
|
55
|
-
bio-blastxmlparser + Nokogiri split DOM
|
56
|
-
|
57
|
-
real 0m1.713s
|
58
|
-
user 0m1.444s
|
59
|
-
sys 0m0.160s
|
60
|
-
|
61
|
-
BioRuby ReXML DOM parser (old style)
|
62
|
-
|
63
|
-
real 1m14.548s
|
64
|
-
user 1m13.065s
|
65
|
-
sys 0m0.472s
|
48
|
+
real 0m13.985s
|
49
|
+
user 0m44.951s
|
50
|
+
sys 0m3.676s
|
66
51
|
```
|
67
52
|
|
53
|
+
which makes for pretty good core utilisation.
|
54
|
+
|
68
55
|
## Install
|
69
56
|
|
70
57
|
```sh
|
@@ -99,9 +86,11 @@ provide build paths, as described [here](http://nokogiri.org/tutorials/installin
|
|
99
86
|
blastxmlparser [options] file(s)
|
100
87
|
|
101
88
|
-p, --parser name Use full|split parser (default full)
|
89
|
+
-e, --exec filter Evaluate filter
|
90
|
+
|
91
|
+
-n, --named fields Print named fields
|
102
92
|
--output-fasta Output FASTA
|
103
|
-
-
|
104
|
-
-e, --exec filter Execute filter
|
93
|
+
-t, --template erb Use ERB template for output
|
105
94
|
|
106
95
|
--logger filename Log to file (default stderr)
|
107
96
|
--trace options Set log level (default INFO, see bio-logger)
|
@@ -109,10 +98,6 @@ provide build paths, as described [here](http://nokogiri.org/tutorials/installin
|
|
109
98
|
-v, --verbose Run verbosely
|
110
99
|
--debug Show debug messages
|
111
100
|
-h, --help Show help and examples
|
112
|
-
|
113
|
-
bioblastxmlparser filename(s)
|
114
|
-
|
115
|
-
Use --help switch for more information
|
116
101
|
```
|
117
102
|
|
118
103
|
### Examples
|
@@ -204,7 +189,7 @@ template could be
|
|
204
189
|
To get JSON, run it with
|
205
190
|
|
206
191
|
```sh
|
207
|
-
blastxmlparser --template template/
|
192
|
+
blastxmlparser --template template/blast2json.erb -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
|
208
193
|
```
|
209
194
|
|
210
195
|
```Javascript
|
@@ -223,7 +208,7 @@ To get JSON, run it with
|
|
223
208
|
Likewise, using the RDF template
|
224
209
|
|
225
210
|
```sh
|
226
|
-
blastxmlparser --template template/
|
211
|
+
blastxmlparser --template template/blast2rdf.erb -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
|
227
212
|
```
|
228
213
|
|
229
214
|
```ruby
|
@@ -235,14 +220,13 @@ Likewise, using the RDF template
|
|
235
220
|
:accession "Minc02032",
|
236
221
|
:id "lcl|Minc02032",
|
237
222
|
:len 147,
|
238
|
-
:E-value 8.1089e-12,
|
239
223
|
:identity 60,
|
240
224
|
:align_len 69,
|
241
225
|
:bitscore 69.8753,
|
242
226
|
:qseq "ATGGGAGATGGAATTGAACCGTCATGGAAAGGGCCCAAACCGAAGCACAACCGACTGTGCCACCATCCA",
|
243
227
|
:midline "|||||||||||||||||||| |||||||| | |||||||||||||||||||||||||||||||",
|
244
228
|
:hseq "ATGGGAGATGGAATTGAACCATCATGGAATG-------ACCGAAGCACAACCGACTGTGCCACCATCCA",
|
245
|
-
:evalue
|
229
|
+
:evalue 8.1089e-12 .
|
246
230
|
```
|
247
231
|
|
248
232
|
## Additional options
|
data/Rakefile
CHANGED
@@ -15,7 +15,7 @@ Jeweler::Tasks.new do |gem|
|
|
15
15
|
gem.name = "bio-blastxmlparser"
|
16
16
|
gem.homepage = "http://github.com/pjotrp/blastxmlparser"
|
17
17
|
gem.license = "MIT"
|
18
|
-
gem.summary = %Q{Very fast BLAST XML to RDF/HTML/JSON/YAML/csv transformer}
|
18
|
+
gem.summary = %Q{Very fast parallel BLAST XML to RDF/HTML/JSON/YAML/csv transformer}
|
19
19
|
gem.description = %Q{Fast big data BLAST XML parser and library; this libxml2 based version is 50x faster than BioRuby and comes with a nice CLI}
|
20
20
|
gem.email = "pjotr.public01@thebird.nl"
|
21
21
|
gem.authors = ["Pjotr Prins"]
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
2.0.0
|
data/bin/blastxmlparser
CHANGED
@@ -52,8 +52,17 @@ opts = OptionParser.new do |o|
|
|
52
52
|
options.parser = p.to_sym
|
53
53
|
end
|
54
54
|
|
55
|
-
o.on("
|
56
|
-
options.
|
55
|
+
o.on("--filter filter",String, "Filtering expression") do |s|
|
56
|
+
options.filter = s
|
57
|
+
end
|
58
|
+
|
59
|
+
o.on("-t num", "--threads num",String, "Use parallel threads") do |num|
|
60
|
+
options.threads = num.to_i
|
61
|
+
end
|
62
|
+
|
63
|
+
o.on("-e filter","--exec filter",String, "Evaluate filter (deprecated)") do |s|
|
64
|
+
$stderr.print "WARNING: -e,--exec switch is deprecated, use --filter instead!\n"
|
65
|
+
options.filter = s
|
57
66
|
end
|
58
67
|
|
59
68
|
o.separator ""
|
@@ -61,7 +70,7 @@ opts = OptionParser.new do |o|
|
|
61
70
|
o.on("-n fields","--named fields",String, "Print named fields") do |s|
|
62
71
|
options.fields = s.split(/,/)
|
63
72
|
end
|
64
|
-
|
73
|
+
o.on("--output-fasta","Output FASTA") do |b|
|
65
74
|
options.output_fasta = true
|
66
75
|
end
|
67
76
|
|
@@ -100,7 +109,16 @@ begin
|
|
100
109
|
Bio::Log::CLI.configure('bio-blastxmlparser')
|
101
110
|
logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
|
102
111
|
|
103
|
-
if options
|
112
|
+
if options.threads != 1
|
113
|
+
begin
|
114
|
+
require 'parallel'
|
115
|
+
rescue LoadError
|
116
|
+
$stderr.print "Error: Missing 'parallel' module. Install with command 'gem install parallel' if you want multiple threads\n"
|
117
|
+
options.threads = 1
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
if options.template
|
104
122
|
include BioRdf
|
105
123
|
fn = options.template
|
106
124
|
raise "No template #{fn}!" if not File.exist?(fn)
|
@@ -114,39 +132,73 @@ begin
|
|
114
132
|
else
|
115
133
|
Bio::BlastXMLParser::XmlIterator.new(fn).to_enum
|
116
134
|
end
|
117
|
-
|
118
|
-
|
135
|
+
chunks = []
|
136
|
+
chunks_count = 0
|
137
|
+
NUM_CHUNKS=10_000
|
138
|
+
|
139
|
+
process = lambda { |iter,i| # Process one BLAST iter block
|
140
|
+
res = []
|
141
|
+
line_count = 0
|
142
|
+
hit_count = 0
|
119
143
|
iter.each do | hit |
|
144
|
+
hit_count += 1
|
120
145
|
hit.each do | hsp |
|
121
|
-
do_print = if options.
|
122
|
-
eval(options.
|
146
|
+
do_print = if options.filter
|
147
|
+
eval(options.filter)
|
123
148
|
else
|
124
149
|
true
|
125
150
|
end
|
126
151
|
if do_print
|
152
|
+
line_count += 1
|
127
153
|
if template
|
128
|
-
|
154
|
+
res << template.result(binding)
|
129
155
|
elsif options.output_fasta
|
130
|
-
|
131
|
-
|
156
|
+
res << ">"+hit.accession+' '+iter.iter_num.to_s+'|'+iter.query_id+' '+hit.hit_id+' '+hit.hit_def+"\n"
|
157
|
+
res << hsp.qseq+"\n"
|
132
158
|
else
|
133
159
|
# Default output
|
134
160
|
if options.fields
|
135
|
-
|
161
|
+
out = [iter.iter_num,hit_count,hsp.hsp_num]
|
136
162
|
options.fields.each do | f |
|
137
|
-
|
163
|
+
out << eval(f)
|
138
164
|
end
|
139
|
-
|
140
|
-
|
141
|
-
|
165
|
+
res << out.join("\t")+"\n"
|
166
|
+
else
|
167
|
+
res << [hit_count,iter.iter_num,iter.query_id,hit.hit_id,hsp.hsp_num,hsp.evalue].join("\t")+"\n"
|
142
168
|
end
|
143
169
|
end
|
144
|
-
i += 1
|
145
170
|
end
|
146
171
|
end
|
147
172
|
end
|
173
|
+
res
|
174
|
+
} # end process
|
175
|
+
|
176
|
+
output = lambda { |collection|
|
177
|
+
collection.each do | result |
|
178
|
+
result.each { |line| print line }
|
179
|
+
end
|
180
|
+
} # end output
|
181
|
+
|
182
|
+
if options.threads == 1
|
183
|
+
n.each do | iter |
|
184
|
+
process.call(iter,0).each { | line | print line }
|
185
|
+
end
|
186
|
+
else
|
187
|
+
n.each do | iter |
|
188
|
+
chunks << iter
|
189
|
+
chunks_count += 1
|
190
|
+
if chunks.size > NUM_CHUNKS
|
191
|
+
output.call Parallel.map_with_index(chunks, :in_processes => options.threads) { | iter,i |
|
192
|
+
process.call(iter,i)
|
193
|
+
}
|
194
|
+
chunks = []
|
195
|
+
end
|
196
|
+
end
|
197
|
+
output.call Parallel.map_with_index(chunks, :in_processes => options.threads) { | iter,i |
|
198
|
+
process.call(iter,i)
|
199
|
+
}
|
148
200
|
end
|
149
201
|
end
|
150
202
|
rescue OptionParser::InvalidOption => e
|
151
|
-
|
203
|
+
$stderr.print e.message
|
152
204
|
end
|
data/bio-blastxmlparser.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "bio-blastxmlparser"
|
8
|
-
s.version = "
|
8
|
+
s.version = "2.0.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Pjotr Prins"]
|
12
|
-
s.date = "2014-09-
|
12
|
+
s.date = "2014-09-06"
|
13
13
|
s.description = "Fast big data BLAST XML parser and library; this libxml2 based version is 50x faster than BioRuby and comes with a nice CLI"
|
14
14
|
s.email = "pjotr.public01@thebird.nl"
|
15
15
|
s.executables = ["blastxmlparser"]
|
@@ -42,8 +42,9 @@ Gem::Specification.new do |s|
|
|
42
42
|
"sample/nokogiri_split_dom.rb",
|
43
43
|
"spec/bio-blastxmlparser_spec.rb",
|
44
44
|
"spec/spec_helper.rb",
|
45
|
-
"template/
|
46
|
-
"template/
|
45
|
+
"template/blast2json.erb",
|
46
|
+
"template/blast2rdf-minimal.erb",
|
47
|
+
"template/blast2rdf.erb",
|
47
48
|
"test/data/aa_example.fasta",
|
48
49
|
"test/data/aa_example_blastp.m7",
|
49
50
|
"test/data/nt_example.fasta",
|
@@ -54,14 +55,14 @@ Gem::Specification.new do |s|
|
|
54
55
|
s.licenses = ["MIT"]
|
55
56
|
s.require_paths = ["lib"]
|
56
57
|
s.rubygems_version = "2.0.3"
|
57
|
-
s.summary = "Very fast BLAST XML to RDF/HTML/JSON/YAML/csv transformer"
|
58
|
+
s.summary = "Very fast parallel BLAST XML to RDF/HTML/JSON/YAML/csv transformer"
|
58
59
|
|
59
60
|
if s.respond_to? :specification_version then
|
60
61
|
s.specification_version = 4
|
61
62
|
|
62
63
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
63
64
|
s.add_runtime_dependency(%q<bio-logger>, [">= 0"])
|
64
|
-
s.add_runtime_dependency(%q<nokogiri>, ["~> 1.6.
|
65
|
+
s.add_runtime_dependency(%q<nokogiri>, ["~> 1.6.3"])
|
65
66
|
s.add_development_dependency(%q<rake>, [">= 0"])
|
66
67
|
s.add_development_dependency(%q<bundler>, [">= 0"])
|
67
68
|
s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
|
@@ -69,7 +70,7 @@ Gem::Specification.new do |s|
|
|
69
70
|
s.add_development_dependency(%q<rdoc>, [">= 0"])
|
70
71
|
else
|
71
72
|
s.add_dependency(%q<bio-logger>, [">= 0"])
|
72
|
-
s.add_dependency(%q<nokogiri>, ["~> 1.6.
|
73
|
+
s.add_dependency(%q<nokogiri>, ["~> 1.6.3"])
|
73
74
|
s.add_dependency(%q<rake>, [">= 0"])
|
74
75
|
s.add_dependency(%q<bundler>, [">= 0"])
|
75
76
|
s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
|
@@ -78,7 +79,7 @@ Gem::Specification.new do |s|
|
|
78
79
|
end
|
79
80
|
else
|
80
81
|
s.add_dependency(%q<bio-logger>, [">= 0"])
|
81
|
-
s.add_dependency(%q<nokogiri>, ["~> 1.6.
|
82
|
+
s.add_dependency(%q<nokogiri>, ["~> 1.6.3"])
|
82
83
|
s.add_dependency(%q<rake>, [">= 0"])
|
83
84
|
s.add_dependency(%q<bundler>, [">= 0"])
|
84
85
|
s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
|
data/lib/bio/writers/rdf.rb
CHANGED
@@ -63,7 +63,7 @@ module BioRdf
|
|
63
63
|
# Don't want Bio dependency in templates!
|
64
64
|
# logger = Bio::Log::LoggerPlus.new 'bio-rdf'
|
65
65
|
# logger.warn "\nWARNING: Changed identifier <#{s}> to <#{id}>"
|
66
|
-
$stderr.print "\nWARNING: Changed identifier <#{s}> to <#{id}>"
|
66
|
+
# $stderr.print "\nWARNING: Changed identifier <#{s}> to <#{id}>"
|
67
67
|
end
|
68
68
|
if not RDF::valid_uri?(id)
|
69
69
|
raise "Invalid URI after mangling <#{s}> to <#{id}>!"
|
File without changes
|
@@ -0,0 +1,14 @@
|
|
1
|
+
<%
|
2
|
+
blastid = Turtle::mangle_identifier(hit.parent.query_def)
|
3
|
+
id = blastid+'_'+hit.hit_num.to_s
|
4
|
+
%>
|
5
|
+
:<%= blastid %> :query :<%= id %>
|
6
|
+
:<%= id %>
|
7
|
+
:query_def "<%= hit.parent.query_def %>",
|
8
|
+
:num <%= hit.hit_num %>,
|
9
|
+
:accession "<%= hit.accession %>",
|
10
|
+
:len <%= hit.len %>,
|
11
|
+
:identity <%= hsp.identity %>,
|
12
|
+
:align_len <%= hsp.align_len %>,
|
13
|
+
:bitscore <%= hsp.bit_score %>,
|
14
|
+
:evalue <%= hsp.evalue %> .
|
@@ -10,12 +10,11 @@
|
|
10
10
|
:accession "<%= hit.accession %>",
|
11
11
|
:id "<%= hit.hit_id %>",
|
12
12
|
:len <%= hit.len %>,
|
13
|
-
:E-value <%= hsp.evalue %>,
|
14
13
|
:identity <%= hsp.identity %>,
|
15
14
|
:align_len <%= hsp.align_len %>,
|
16
15
|
:bitscore <%= hsp.bit_score %>,
|
17
16
|
:qseq "<%= hsp.qseq %>",
|
18
17
|
:midline "<%= hsp.midline %>",
|
19
18
|
:hseq "<%= hsp.hseq %>",
|
20
|
-
:evalue
|
19
|
+
:evalue <%= hsp.evalue %> .
|
21
20
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-blastxmlparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Pjotr Prins
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-logger
|
@@ -30,14 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.6.
|
33
|
+
version: 1.6.3
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.6.
|
40
|
+
version: 1.6.3
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rake
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -142,8 +142,9 @@ files:
|
|
142
142
|
- sample/nokogiri_split_dom.rb
|
143
143
|
- spec/bio-blastxmlparser_spec.rb
|
144
144
|
- spec/spec_helper.rb
|
145
|
-
- template/
|
146
|
-
- template/
|
145
|
+
- template/blast2json.erb
|
146
|
+
- template/blast2rdf-minimal.erb
|
147
|
+
- template/blast2rdf.erb
|
147
148
|
- test/data/aa_example.fasta
|
148
149
|
- test/data/aa_example_blastp.m7
|
149
150
|
- test/data/nt_example.fasta
|
@@ -172,5 +173,5 @@ rubyforge_project:
|
|
172
173
|
rubygems_version: 2.0.3
|
173
174
|
signing_key:
|
174
175
|
specification_version: 4
|
175
|
-
summary: Very fast BLAST XML to RDF/HTML/JSON/YAML/csv transformer
|
176
|
+
summary: Very fast parallel BLAST XML to RDF/HTML/JSON/YAML/csv transformer
|
176
177
|
test_files: []
|