bio-blastxmlparser 2.0.1 → 2.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +54 -1
- data/VERSION +1 -1
- data/bin/blastxmlparser +13 -6
- data/bio-blastxmlparser.gemspec +4 -2
- data/lib/bio/writers/template.rb +75 -0
- data/template/blast2json2.erb +28 -0
- data/template/blast2rdf-minimal.erb +8 -8
- data/template/blast2rdf.erb +11 -11
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 76df7cd1f6e1bc4f2b04fdf3f0fc830110f2e376
|
4
|
+
data.tar.gz: 68f44a797aa5357690e6c4a10a1fe241a7b8fe37
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f424b8cfedf921840dbf2fee412c191f1c9951d289b80db047744a56883b171b50d4b1161e86415f0e873be4bd43935031f966d32d394efea9208e65b2529903
|
7
|
+
data.tar.gz: 6214606ac08afa7306503c78e969046ab2a54d9c45e9dc42c597d8c033db2095fc5a25f15f56090a0337bd1049081699a81ceefa90d2acaf17cb782340074fc6
|
data/README.md
CHANGED
@@ -107,7 +107,7 @@ Print result fields of iterations containing 'lcl', using a regex
|
|
107
107
|
blastxmlparser --filter 'iter.query_id=~/lcl/' test/data/nt_example_blastn.m7
|
108
108
|
```
|
109
109
|
|
110
|
-
prints a tab delimited
|
110
|
+
prints (by default) tab-delimited output to stdout
|
111
111
|
|
112
112
|
```sh
|
113
113
|
1 1 lcl|1_0 lcl|I_74685 1 5.82208e-34
|
@@ -228,6 +228,59 @@ Likewise, using the RDF template
|
|
228
228
|
:evalue 8.1089e-12 .
|
229
229
|
```
|
230
230
|
|
231
|
+
### Metadata
|
232
|
+
|
233
|
+
Templates can also print data as a header of the JSON/YAML/RDF output. For this
|
234
|
+
use the '=' prefix with HEADER, BODY, FOOTER keywords in the template. A small example
|
235
|
+
can be
|
236
|
+
|
237
|
+
```Javascript
|
238
|
+
=HEADER
|
239
|
+
<% require 'json' %>
|
240
|
+
[
|
241
|
+
{ "HEADER": {
|
242
|
+
"options": <%= options.to_h.to_json %>,
|
243
|
+
"files": <%= ARGV %>,
|
244
|
+
"version": "<%= BLASTXML_VERSION %>"
|
245
|
+
},
|
246
|
+
=BODY
|
247
|
+
{ "<%= hit.parent.query_def %>": {
|
248
|
+
"num": <%= hit.hit_num %>,
|
249
|
+
"id": "<%= hit.hit_id %>",
|
250
|
+
"len": <%= hit.len %>,
|
251
|
+
"E-value": <%= hsp.evalue %>,
|
252
|
+
},
|
253
|
+
=FOOTER
|
254
|
+
]
|
255
|
+
```
|
256
|
+
|
257
|
+
may generate something like
|
258
|
+
|
259
|
+
```Javascript
|
260
|
+
[
|
261
|
+
{ "HEADER": {
|
262
|
+
"options": {"template":"template/blast2json2.erb","filter":"hsp.evalue>0.01"},
|
263
|
+
"files": ["test/data/nt_example_blastn.m7"],
|
264
|
+
"version": "2.0.2-pre1"
|
265
|
+
},
|
266
|
+
{ "I_1 [477 - 884] ": {
|
267
|
+
"num": 41,
|
268
|
+
"id": "lcl|X_42251",
|
269
|
+
"len": 153,
|
270
|
+
"E-value": 0.0247015,
|
271
|
+
},
|
272
|
+
{ "I_1 [477 - 884] ": {
|
273
|
+
"num": 43,
|
274
|
+
"id": "lcl|V_105720",
|
275
|
+
"len": 180,
|
276
|
+
"E-value": 0.0247015,
|
277
|
+
},
|
278
|
+
]
|
279
|
+
```
|
280
|
+
|
281
|
+
Note that the template is not smart enough to remove the final comma
|
282
|
+
from the last BODY element. To make the output valid JSON, that comma needs to be removed.
|
283
|
+
|
231
284
|
## Additional options
|
232
285
|
|
233
286
|
To use the high-mem version of the parser (slightly faster on single core) use
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.0.
|
1
|
+
2.0.2
|
data/bin/blastxmlparser
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
#
|
3
|
-
#
|
3
|
+
# blastxmlparser
|
4
4
|
# Author:: Pjotr Prins
|
5
5
|
# License:: MIT License
|
6
6
|
#
|
@@ -75,7 +75,6 @@ opts = OptionParser.new do |o|
|
|
75
75
|
end
|
76
76
|
|
77
77
|
o.on("-t erb","--template erb",String, "Use ERB template for output") do |s|
|
78
|
-
require 'erb'
|
79
78
|
options.template = s
|
80
79
|
end
|
81
80
|
|
@@ -91,6 +90,7 @@ opts = OptionParser.new do |o|
|
|
91
90
|
|
92
91
|
o.on("-q", "--quiet", "Run quietly") do |q|
|
93
92
|
Bio::Log::CLI.trace('error')
|
93
|
+
options.quiet = true
|
94
94
|
end
|
95
95
|
|
96
96
|
o.on("-v", "--verbose", "Run verbosely") do |v|
|
@@ -118,13 +118,18 @@ begin
|
|
118
118
|
end
|
119
119
|
end
|
120
120
|
|
121
|
+
# Prepare the ERB template
|
121
122
|
if options.template
|
122
123
|
include BioRdf
|
123
|
-
|
124
|
-
|
125
|
-
template = ERB.new(File.read(fn))
|
124
|
+
require 'bio/writers/template'
|
125
|
+
template = Bio::Template.new(options.template)
|
126
126
|
end
|
127
127
|
|
128
|
+
$stderr.print options,"\n" if not options.quiet
|
129
|
+
|
130
|
+
raise "No input file(s) defined" if ARGV.size == 0
|
131
|
+
|
132
|
+
|
128
133
|
ARGV.each do | fn |
|
129
134
|
logger.info("XML parsing #{fn}")
|
130
135
|
parser_type = options.parser
|
@@ -145,6 +150,7 @@ begin
|
|
145
150
|
chunks = []
|
146
151
|
chunks_count = 0
|
147
152
|
NUM_CHUNKS=10_000
|
153
|
+
print template.header(binding) if template
|
148
154
|
|
149
155
|
process = lambda { |iter2,i| # Process one BLAST iter block
|
150
156
|
if parser_type == :nosplit
|
@@ -167,7 +173,7 @@ begin
|
|
167
173
|
if do_print
|
168
174
|
line_count += 1
|
169
175
|
if template
|
170
|
-
res << template.
|
176
|
+
res << template.body(binding)
|
171
177
|
elsif options.output_fasta
|
172
178
|
res << ">"+hit.accession+' '+iter.iter_num.to_s+'|'+iter.query_id+' '+hit.hit_id+' '+hit.hit_def+"\n"
|
173
179
|
res << hsp.qseq+"\n"
|
@@ -221,6 +227,7 @@ begin
|
|
221
227
|
process.call(iter,i)
|
222
228
|
}
|
223
229
|
end
|
230
|
+
print template.footer(binding) if template
|
224
231
|
end
|
225
232
|
rescue OptionParser::InvalidOption => e
|
226
233
|
$stderr.print e.message
|
data/bio-blastxmlparser.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "bio-blastxmlparser"
|
8
|
-
s.version = "2.0.
|
8
|
+
s.version = "2.0.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Pjotr Prins"]
|
12
|
-
s.date = "2014-
|
12
|
+
s.date = "2014-11-07"
|
13
13
|
s.description = "Fast big data BLAST XML parser and library; this libxml2 based version is 50x faster than BioRuby and comes with a nice CLI"
|
14
14
|
s.email = "pjotr.public01@thebird.nl"
|
15
15
|
s.executables = ["blastxmlparser"]
|
@@ -34,6 +34,7 @@ Gem::Specification.new do |s|
|
|
34
34
|
"lib/bio/db/blast/xmliterator.rb",
|
35
35
|
"lib/bio/db/blast/xmlsplitter.rb",
|
36
36
|
"lib/bio/writers/rdf.rb",
|
37
|
+
"lib/bio/writers/template.rb",
|
37
38
|
"sample/bioruby.rb",
|
38
39
|
"sample/blastxmlparserdemo.rb",
|
39
40
|
"sample/libxml_sax.rb",
|
@@ -43,6 +44,7 @@ Gem::Specification.new do |s|
|
|
43
44
|
"spec/bio-blastxmlparser_spec.rb",
|
44
45
|
"spec/spec_helper.rb",
|
45
46
|
"template/blast2json.erb",
|
47
|
+
"template/blast2json2.erb",
|
46
48
|
"template/blast2rdf-minimal.erb",
|
47
49
|
"template/blast2rdf.erb",
|
48
50
|
"test/data/aa_example.fasta",
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'erb'
|
2
|
+
|
3
|
+
module Bio

  # ERB template that may be split into a header, a body and a footer.
  #
  # A template file is read line by line. Lines containing the markers
  # =HEADER, =BODY and =FOOTER switch the section that the following lines
  # belong to; the marker lines themselves are dropped. When no =BODY marker
  # is present the whole template is treated as the body, so plain
  # (marker-less) ERB templates keep working.
  #
  # Each section is rendered against a Binding supplied by the caller.
  class Template

    # Load and parse the template file +fn+.
    #
    # Raises RuntimeError when the file does not exist.
    def initialize(fn)
      raise "Can not find template #{fn}!" if not File.exist?(fn)
      parse(File.read(fn))
    end

    # Split +buf+ (the template text) into sections and compile one ERB
    # object per non-empty section.
    #
    # Lines are re-joined with "\n", so a trailing newline of a section is
    # only kept when the section ends with an empty line.
    def parse(buf)
      header = []
      body = []
      footer = []
      where = :header
      # Track the marker explicitly: an empty body after a =BODY marker is
      # not the same thing as a template without sections.
      saw_body_marker = false
      buf.split("\n").each do |line|
        case where
        when :header
          next if line =~ /=HEADER/
          if line =~ /=BODY/
            saw_body_marker = true
            where = :body
            next
          end
          header << line
        when :body
          if line =~ /=FOOTER/
            where = :footer
            next
          end
          body << line
        else
          footer << line
        end
      end
      unless saw_body_marker
        # No sections declared: everything collected so far is the body.
        body = header
        header = []
      end
      @erb_header = compile(header)
      @erb_body   = compile(body)
      @erb_footer = compile(footer)
    end

    # Render the body section; kept so a Template can be used like a plain
    # ERB object (template.result(binding)).
    def result(env)
      body(env)
    end

    # Render the header section against the binding +env+ ("" when empty).
    def header(env)
      render(@erb_header, env)
    end

    # Render the body section against the binding +env+ ("" when empty).
    def body(env)
      render(@erb_body, env)
    end

    # Render the footer section against the binding +env+ ("" when empty).
    def footer(env)
      render(@erb_footer, env)
    end

    private

    # Compile the collected +lines+ into an ERB object, or nil when the
    # section has no lines (0 is truthy in Ruby, so test with empty?).
    def compile(lines)
      lines.empty? ? nil : ERB.new(lines.join("\n"))
    end

    # Evaluate +erb+ against +env+, returning "" for a missing section.
    def render(erb, env)
      erb ? erb.result(env) : ""
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
=HEADER
|
2
|
+
<% require 'json' %>
|
3
|
+
[
|
4
|
+
{ "HEADER": {
|
5
|
+
"options": <%= options.to_h.to_json %>,
|
6
|
+
"files": <%= ARGV %>,
|
7
|
+
"version": "<%= BLASTXML_VERSION %>"
|
8
|
+
},
|
9
|
+
|
10
|
+
=BODY
|
11
|
+
{ "<%= hit.parent.query_def %>": {
|
12
|
+
"query_id": "<%= hit.parent.query_id %>",
|
13
|
+
"num": <%= hit.hit_num %>,
|
14
|
+
"accession": "<%= hit.accession %>",
|
15
|
+
"id": "<%= hit.hit_id %>",
|
16
|
+
"len": <%= hit.len %>,
|
17
|
+
"E-value": <%= hsp.evalue %>,
|
18
|
+
"identity": <%= hsp.identity %>,
|
19
|
+
"align_len": <%= hsp.align_len %>,
|
20
|
+
"bitscore": <%= hsp.bit_score %>,
|
21
|
+
"qseq": "<%= hsp.qseq %>",
|
22
|
+
"midline": "<%= hsp.midline %>",
|
23
|
+
"hseq": "<%= hsp.hseq %>",
|
24
|
+
},
|
25
|
+
|
26
|
+
=FOOTER
|
27
|
+
]
|
28
|
+
|
@@ -4,11 +4,11 @@
|
|
4
4
|
%>
|
5
5
|
:<%= blastid %> :query :<%= id %>
|
6
6
|
:<%= id %>
|
7
|
-
:query_def "<%= hit.parent.query_def %>"
|
8
|
-
:num <%= hit.hit_num
|
9
|
-
:accession "<%= hit.accession %>"
|
10
|
-
:len <%= hit.len
|
11
|
-
:identity <%= hsp.identity
|
12
|
-
:align_len <%= hsp.align_len
|
13
|
-
:bitscore <%= hsp.bit_score
|
14
|
-
:evalue
|
7
|
+
:query_def "<%= hit.parent.query_def %>";
|
8
|
+
:num <%= hit.hit_num %>;
|
9
|
+
:accession "<%= hit.accession %>";
|
10
|
+
:len <%= hit.len %>;
|
11
|
+
:identity <%= hsp.identity %>;
|
12
|
+
:align_len <%= hsp.align_len %>;
|
13
|
+
:bitscore <%= hsp.bit_score %>;
|
14
|
+
:evalue <%= hsp.evalue %> .
|
data/template/blast2rdf.erb
CHANGED
@@ -4,17 +4,17 @@
|
|
4
4
|
%>
|
5
5
|
:<%= blastid %> :query :<%= id %>
|
6
6
|
:<%= id %>
|
7
|
-
:query_id "<%= hit.parent.query_id %>"
|
8
|
-
:query_def "<%= hit.parent.query_def %>"
|
9
|
-
:num <%= hit.hit_num
|
10
|
-
:accession "<%= hit.accession %>"
|
11
|
-
:id "<%= hit.hit_id %>"
|
12
|
-
:len <%= hit.len
|
13
|
-
:identity <%= hsp.identity
|
14
|
-
:align_len <%= hsp.align_len
|
15
|
-
:bitscore <%= hsp.bit_score
|
16
|
-
:qseq "<%= hsp.qseq %>"
|
7
|
+
:query_id "<%= hit.parent.query_id %>";
|
8
|
+
:query_def "<%= hit.parent.query_def %>";
|
9
|
+
:num <%= hit.hit_num %>;
|
10
|
+
:accession "<%= hit.accession %>";
|
11
|
+
:id "<%= hit.hit_id %>";
|
12
|
+
:len <%= hit.len %>;
|
13
|
+
:identity <%= hsp.identity %>;
|
14
|
+
:align_len <%= hsp.align_len %>;
|
15
|
+
:bitscore <%= hsp.bit_score %>;
|
16
|
+
:qseq "<%= hsp.qseq %>";
|
17
17
|
:midline "<%= hsp.midline %>",
|
18
|
-
:hseq "<%= hsp.hseq %>"
|
18
|
+
:hseq "<%= hsp.hseq %>";
|
19
19
|
:evalue <%= hsp.evalue %> .
|
20
20
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-blastxmlparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.
|
4
|
+
version: 2.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Pjotr Prins
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-11-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-logger
|
@@ -134,6 +134,7 @@ files:
|
|
134
134
|
- lib/bio/db/blast/xmliterator.rb
|
135
135
|
- lib/bio/db/blast/xmlsplitter.rb
|
136
136
|
- lib/bio/writers/rdf.rb
|
137
|
+
- lib/bio/writers/template.rb
|
137
138
|
- sample/bioruby.rb
|
138
139
|
- sample/blastxmlparserdemo.rb
|
139
140
|
- sample/libxml_sax.rb
|
@@ -143,6 +144,7 @@ files:
|
|
143
144
|
- spec/bio-blastxmlparser_spec.rb
|
144
145
|
- spec/spec_helper.rb
|
145
146
|
- template/blast2json.erb
|
147
|
+
- template/blast2json2.erb
|
146
148
|
- template/blast2rdf-minimal.erb
|
147
149
|
- template/blast2rdf.erb
|
148
150
|
- test/data/aa_example.fasta
|