bio-blastxmlparser 2.0.1 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +54 -1
- data/VERSION +1 -1
- data/bin/blastxmlparser +13 -6
- data/bio-blastxmlparser.gemspec +4 -2
- data/lib/bio/writers/template.rb +75 -0
- data/template/blast2json2.erb +28 -0
- data/template/blast2rdf-minimal.erb +8 -8
- data/template/blast2rdf.erb +11 -11
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 76df7cd1f6e1bc4f2b04fdf3f0fc830110f2e376
|
4
|
+
data.tar.gz: 68f44a797aa5357690e6c4a10a1fe241a7b8fe37
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f424b8cfedf921840dbf2fee412c191f1c9951d289b80db047744a56883b171b50d4b1161e86415f0e873be4bd43935031f966d32d394efea9208e65b2529903
|
7
|
+
data.tar.gz: 6214606ac08afa7306503c78e969046ab2a54d9c45e9dc42c597d8c033db2095fc5a25f15f56090a0337bd1049081699a81ceefa90d2acaf17cb782340074fc6
|
data/README.md
CHANGED
@@ -107,7 +107,7 @@ Print result fields of iterations containing 'lcl', using a regex
|
|
107
107
|
blastxmlparser --filter 'iter.query_id=~/lcl/' test/data/nt_example_blastn.m7
|
108
108
|
```
|
109
109
|
|
110
|
-
prints a tab delimited
|
110
|
+
prints tab-delimited output to stdout (the default format)
|
111
111
|
|
112
112
|
```sh
|
113
113
|
1 1 lcl|1_0 lcl|I_74685 1 5.82208e-34
|
@@ -228,6 +228,59 @@ Likewise, using the RDF template
|
|
228
228
|
:evalue 8.1089e-12 .
|
229
229
|
```
|
230
230
|
|
231
|
+
### Metadata
|
232
|
+
|
233
|
+
Templates can also print data as a header of the JSON/YAML/RDF output. For this
|
234
|
+
use the '=' prefix with HEADER, BODY, FOOTER keywords in the template. A small example
|
235
|
+
can be
|
236
|
+
|
237
|
+
```Javascript
|
238
|
+
=HEADER
|
239
|
+
<% require 'json' %>
|
240
|
+
[
|
241
|
+
{ "HEADER": {
|
242
|
+
"options": <%= options.to_h.to_json %>,
|
243
|
+
"files": <%= ARGV %>,
|
244
|
+
"version": "<%= BLASTXML_VERSION %>"
|
245
|
+
},
|
246
|
+
=BODY
|
247
|
+
{ "<%= hit.parent.query_def %>": {
|
248
|
+
"num": <%= hit.hit_num %>,
|
249
|
+
"id": "<%= hit.hit_id %>",
|
250
|
+
"len": <%= hit.len %>,
|
251
|
+
"E-value": <%= hsp.evalue %>,
|
252
|
+
},
|
253
|
+
=FOOTER
|
254
|
+
]
|
255
|
+
```
|
256
|
+
|
257
|
+
may generate something like
|
258
|
+
|
259
|
+
```Javascript
|
260
|
+
[
|
261
|
+
{ "HEADER": {
|
262
|
+
"options": {"template":"template/blast2json2.erb","filter":"hsp.evalue>0.01"},
|
263
|
+
"files": ["test/data/nt_example_blastn.m7"],
|
264
|
+
"version": "2.0.2-pre1"
|
265
|
+
},
|
266
|
+
{ "I_1 [477 - 884] ": {
|
267
|
+
"num": 41,
|
268
|
+
"id": "lcl|X_42251",
|
269
|
+
"len": 153,
|
270
|
+
"E-value": 0.0247015,
|
271
|
+
},
|
272
|
+
{ "I_1 [477 - 884] ": {
|
273
|
+
"num": 43,
|
274
|
+
"id": "lcl|V_105720",
|
275
|
+
"len": 180,
|
276
|
+
"E-value": 0.0247015,
|
277
|
+
},
|
278
|
+
]
|
279
|
+
```
|
280
|
+
|
281
|
+
Note that the template is not smart enough to remove the final comma
|
282
|
+
from the last BODY element. To make the output valid JSON, that trailing comma needs to be removed.
|
283
|
+
|
231
284
|
## Additional options
|
232
285
|
|
233
286
|
To use the high-mem version of the parser (slightly faster on single core) use
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.0.
|
1
|
+
2.0.2
|
data/bin/blastxmlparser
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
#
|
3
|
-
#
|
3
|
+
# blastxmlparser
|
4
4
|
# Author:: Pjotr Prins
|
5
5
|
# License:: MIT License
|
6
6
|
#
|
@@ -75,7 +75,6 @@ opts = OptionParser.new do |o|
|
|
75
75
|
end
|
76
76
|
|
77
77
|
o.on("-t erb","--template erb",String, "Use ERB template for output") do |s|
|
78
|
-
require 'erb'
|
79
78
|
options.template = s
|
80
79
|
end
|
81
80
|
|
@@ -91,6 +90,7 @@ opts = OptionParser.new do |o|
|
|
91
90
|
|
92
91
|
o.on("-q", "--quiet", "Run quietly") do |q|
|
93
92
|
Bio::Log::CLI.trace('error')
|
93
|
+
options.quiet = true
|
94
94
|
end
|
95
95
|
|
96
96
|
o.on("-v", "--verbose", "Run verbosely") do |v|
|
@@ -118,13 +118,18 @@ begin
|
|
118
118
|
end
|
119
119
|
end
|
120
120
|
|
121
|
+
# Prepare the ERB template
|
121
122
|
if options.template
|
122
123
|
include BioRdf
|
123
|
-
|
124
|
-
|
125
|
-
template = ERB.new(File.read(fn))
|
124
|
+
require 'bio/writers/template'
|
125
|
+
template = Bio::Template.new(options.template)
|
126
126
|
end
|
127
127
|
|
128
|
+
$stderr.print options,"\n" if not options.quiet
|
129
|
+
|
130
|
+
raise "No input file(s) defined" if ARGV.size == 0
|
131
|
+
|
132
|
+
|
128
133
|
ARGV.each do | fn |
|
129
134
|
logger.info("XML parsing #{fn}")
|
130
135
|
parser_type = options.parser
|
@@ -145,6 +150,7 @@ begin
|
|
145
150
|
chunks = []
|
146
151
|
chunks_count = 0
|
147
152
|
NUM_CHUNKS=10_000
|
153
|
+
print template.header(binding) if template
|
148
154
|
|
149
155
|
process = lambda { |iter2,i| # Process one BLAST iter block
|
150
156
|
if parser_type == :nosplit
|
@@ -167,7 +173,7 @@ begin
|
|
167
173
|
if do_print
|
168
174
|
line_count += 1
|
169
175
|
if template
|
170
|
-
res << template.
|
176
|
+
res << template.body(binding)
|
171
177
|
elsif options.output_fasta
|
172
178
|
res << ">"+hit.accession+' '+iter.iter_num.to_s+'|'+iter.query_id+' '+hit.hit_id+' '+hit.hit_def+"\n"
|
173
179
|
res << hsp.qseq+"\n"
|
@@ -221,6 +227,7 @@ begin
|
|
221
227
|
process.call(iter,i)
|
222
228
|
}
|
223
229
|
end
|
230
|
+
print template.footer(binding) if template
|
224
231
|
end
|
225
232
|
rescue OptionParser::InvalidOption => e
|
226
233
|
$stderr.print e.message
|
data/bio-blastxmlparser.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "bio-blastxmlparser"
|
8
|
-
s.version = "2.0.
|
8
|
+
s.version = "2.0.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Pjotr Prins"]
|
12
|
-
s.date = "2014-
|
12
|
+
s.date = "2014-11-07"
|
13
13
|
s.description = "Fast big data BLAST XML parser and library; this libxml2 based version is 50x faster than BioRuby and comes with a nice CLI"
|
14
14
|
s.email = "pjotr.public01@thebird.nl"
|
15
15
|
s.executables = ["blastxmlparser"]
|
@@ -34,6 +34,7 @@ Gem::Specification.new do |s|
|
|
34
34
|
"lib/bio/db/blast/xmliterator.rb",
|
35
35
|
"lib/bio/db/blast/xmlsplitter.rb",
|
36
36
|
"lib/bio/writers/rdf.rb",
|
37
|
+
"lib/bio/writers/template.rb",
|
37
38
|
"sample/bioruby.rb",
|
38
39
|
"sample/blastxmlparserdemo.rb",
|
39
40
|
"sample/libxml_sax.rb",
|
@@ -43,6 +44,7 @@ Gem::Specification.new do |s|
|
|
43
44
|
"spec/bio-blastxmlparser_spec.rb",
|
44
45
|
"spec/spec_helper.rb",
|
45
46
|
"template/blast2json.erb",
|
47
|
+
"template/blast2json2.erb",
|
46
48
|
"template/blast2rdf-minimal.erb",
|
47
49
|
"template/blast2rdf.erb",
|
48
50
|
"test/data/aa_example.fasta",
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'erb'
|
2
|
+
|
3
|
+
module Bio

  # An ERB template that may be divided into up to three sections, each
  # rendered separately against a caller-supplied Binding.
  #
  # Sections are introduced by marker lines in the template file:
  #
  #   =HEADER   (optional) start of the header section; the line is dropped
  #   =BODY     ends the header section and starts the body section
  #   =FOOTER   ends the body section and starts the footer section
  #
  # A template without a (non-empty) body section is treated as body-only:
  # everything collected as header becomes the body and the header is empty.
  class Template

    # Markers must start the line (leading whitespace allowed) so that an
    # ordinary template line which merely contains e.g. "=BODY" somewhere
    # in its text is not mistaken for a section switch.
    HEADER_MARKER = /\A\s*=HEADER\b/
    BODY_MARKER   = /\A\s*=BODY\b/
    FOOTER_MARKER = /\A\s*=FOOTER\b/

    # Load and parse the template stored in file +fn+.
    #
    # Raises RuntimeError when +fn+ does not exist.
    def initialize(fn)
      raise "Can not find template #{fn}!" unless File.exist?(fn)
      parse(File.read(fn))
    end

    # Split +buf+ (the template text) into header, body and footer and
    # compile every non-empty section into its own ERB object.
    #
    # Lines are split on "\n" and re-joined with "\n", so a section's
    # rendered text carries no trailing newline unless the template has an
    # empty line at the end of that section.
    def parse(buf)
      sections = { header: [], body: [], footer: [] }
      where = :header
      buf.split("\n").each do |line|
        case where
        when :header
          next if line =~ HEADER_MARKER
          if line =~ BODY_MARKER
            where = :body
            next
          end
        when :body
          if line =~ FOOTER_MARKER
            where = :footer
            next
          end
        end
        # Once in the footer every remaining line belongs to it
        sections[where] << line
      end

      header, body, footer = sections.values_at(:header, :body, :footer)
      if body.empty?
        # No body section: the whole template is the body
        body = header
        header = []
      end

      # Array#size is always truthy in Ruby (even 0), so test emptiness
      # explicitly; sections left as nil render as "".
      @erb_header = header.empty? ? nil : ERB.new(header.join("\n"))
      @erb_body   = body.empty?   ? nil : ERB.new(body.join("\n"))
      @erb_footer = footer.empty? ? nil : ERB.new(footer.join("\n"))
    end

    # Render the body section against the Binding +env+. Same as #body;
    # this method previously read an instance variable that was never
    # assigned and therefore always raised NoMethodError.
    def result(env)
      body(env)
    end

    # Render the header section against the Binding +env+.
    # Returns "" when the template has no header.
    def header(env)
      render(@erb_header, env)
    end

    # Render the body section against the Binding +env+.
    # Returns "" when the template has no body.
    def body(env)
      render(@erb_body, env)
    end

    # Render the footer section against the Binding +env+.
    # Returns "" when the template has no footer.
    def footer(env)
      render(@erb_footer, env)
    end

    private

    # Evaluate +erb+ in +env+, or return "" for a missing section.
    def render(erb, env)
      erb ? erb.result(env) : ""
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
=HEADER
|
2
|
+
<% require 'json' %>
|
3
|
+
[
|
4
|
+
{ "HEADER": {
|
5
|
+
"options": <%= options.to_h.to_json %>,
|
6
|
+
"files": <%= ARGV %>,
|
7
|
+
"version": "<%= BLASTXML_VERSION %>"
|
8
|
+
},
|
9
|
+
|
10
|
+
=BODY
|
11
|
+
{ "<%= hit.parent.query_def %>": {
|
12
|
+
"query_id": "<%= hit.parent.query_id %>",
|
13
|
+
"num": <%= hit.hit_num %>,
|
14
|
+
"accession": "<%= hit.accession %>",
|
15
|
+
"id": "<%= hit.hit_id %>",
|
16
|
+
"len": <%= hit.len %>,
|
17
|
+
"E-value": <%= hsp.evalue %>,
|
18
|
+
"identity": <%= hsp.identity %>,
|
19
|
+
"align_len": <%= hsp.align_len %>,
|
20
|
+
"bitscore": <%= hsp.bit_score %>,
|
21
|
+
"qseq": "<%= hsp.qseq %>",
|
22
|
+
"midline": "<%= hsp.midline %>",
|
23
|
+
"hseq": "<%= hsp.hseq %>",
|
24
|
+
},
|
25
|
+
|
26
|
+
=FOOTER
|
27
|
+
]
|
28
|
+
|
@@ -4,11 +4,11 @@
|
|
4
4
|
%>
|
5
5
|
:<%= blastid %> :query :<%= id %>
|
6
6
|
:<%= id %>
|
7
|
-
:query_def "<%= hit.parent.query_def %>"
|
8
|
-
:num <%= hit.hit_num
|
9
|
-
:accession "<%= hit.accession %>"
|
10
|
-
:len <%= hit.len
|
11
|
-
:identity <%= hsp.identity
|
12
|
-
:align_len <%= hsp.align_len
|
13
|
-
:bitscore <%= hsp.bit_score
|
14
|
-
:evalue
|
7
|
+
:query_def "<%= hit.parent.query_def %>";
|
8
|
+
:num <%= hit.hit_num %>;
|
9
|
+
:accession "<%= hit.accession %>";
|
10
|
+
:len <%= hit.len %>;
|
11
|
+
:identity <%= hsp.identity %>;
|
12
|
+
:align_len <%= hsp.align_len %>;
|
13
|
+
:bitscore <%= hsp.bit_score %>;
|
14
|
+
:evalue <%= hsp.evalue %> .
|
data/template/blast2rdf.erb
CHANGED
@@ -4,17 +4,17 @@
|
|
4
4
|
%>
|
5
5
|
:<%= blastid %> :query :<%= id %>
|
6
6
|
:<%= id %>
|
7
|
-
:query_id "<%= hit.parent.query_id %>"
|
8
|
-
:query_def "<%= hit.parent.query_def %>"
|
9
|
-
:num <%= hit.hit_num
|
10
|
-
:accession "<%= hit.accession %>"
|
11
|
-
:id "<%= hit.hit_id %>"
|
12
|
-
:len <%= hit.len
|
13
|
-
:identity <%= hsp.identity
|
14
|
-
:align_len <%= hsp.align_len
|
15
|
-
:bitscore <%= hsp.bit_score
|
16
|
-
:qseq "<%= hsp.qseq %>"
|
7
|
+
:query_id "<%= hit.parent.query_id %>";
|
8
|
+
:query_def "<%= hit.parent.query_def %>";
|
9
|
+
:num <%= hit.hit_num %>;
|
10
|
+
:accession "<%= hit.accession %>";
|
11
|
+
:id "<%= hit.hit_id %>";
|
12
|
+
:len <%= hit.len %>;
|
13
|
+
:identity <%= hsp.identity %>;
|
14
|
+
:align_len <%= hsp.align_len %>;
|
15
|
+
:bitscore <%= hsp.bit_score %>;
|
16
|
+
:qseq "<%= hsp.qseq %>";
|
17
17
|
:midline "<%= hsp.midline %>",
|
18
|
-
:hseq "<%= hsp.hseq %>"
|
18
|
+
:hseq "<%= hsp.hseq %>";
|
19
19
|
:evalue <%= hsp.evalue %> .
|
20
20
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-blastxmlparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.
|
4
|
+
version: 2.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Pjotr Prins
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-11-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-logger
|
@@ -134,6 +134,7 @@ files:
|
|
134
134
|
- lib/bio/db/blast/xmliterator.rb
|
135
135
|
- lib/bio/db/blast/xmlsplitter.rb
|
136
136
|
- lib/bio/writers/rdf.rb
|
137
|
+
- lib/bio/writers/template.rb
|
137
138
|
- sample/bioruby.rb
|
138
139
|
- sample/blastxmlparserdemo.rb
|
139
140
|
- sample/libxml_sax.rb
|
@@ -143,6 +144,7 @@ files:
|
|
143
144
|
- spec/bio-blastxmlparser_spec.rb
|
144
145
|
- spec/spec_helper.rb
|
145
146
|
- template/blast2json.erb
|
147
|
+
- template/blast2json2.erb
|
146
148
|
- template/blast2rdf-minimal.erb
|
147
149
|
- template/blast2rdf.erb
|
148
150
|
- test/data/aa_example.fasta
|