bio-blastxmlparser 2.0.1 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7222df89b2f60ef4b027ea7ca766a30c04de567b
4
- data.tar.gz: b8d7c84c85dd58e7794a62b83a73b17a04b60ce1
3
+ metadata.gz: 76df7cd1f6e1bc4f2b04fdf3f0fc830110f2e376
4
+ data.tar.gz: 68f44a797aa5357690e6c4a10a1fe241a7b8fe37
5
5
  SHA512:
6
- metadata.gz: e9feee95e3063b0c6c9e9ac28c0f7389e4036130e51c107314216fe8e30f98342d2fbc5f1af0ef16f9c5a11be95aa97d86d16f5d9e2169eda2a54d2594c0dc84
7
- data.tar.gz: 63971bd220b178e7ff0dbd7c50a4df6277b9dc8035f610f173603e2e304655610e65894fb3bde7e67c17963d13557b22c3c1d4a3e6dbadbf65c7f170ddbd12f5
6
+ metadata.gz: f424b8cfedf921840dbf2fee412c191f1c9951d289b80db047744a56883b171b50d4b1161e86415f0e873be4bd43935031f966d32d394efea9208e65b2529903
7
+ data.tar.gz: 6214606ac08afa7306503c78e969046ab2a54d9c45e9dc42c597d8c033db2095fc5a25f15f56090a0337bd1049081699a81ceefa90d2acaf17cb782340074fc6
data/README.md CHANGED
@@ -107,7 +107,7 @@ Print result fields of iterations containing 'lcl', using a regex
107
107
  blastxmlparser --filter 'iter.query_id=~/lcl/' test/data/nt_example_blastn.m7
108
108
  ```
109
109
 
110
- prints a tab delimited
110
+ prints tab-delimited output (the default format) to stdout
111
111
 
112
112
  ```sh
113
113
  1 1 lcl|1_0 lcl|I_74685 1 5.82208e-34
@@ -228,6 +228,59 @@ Likewise, using the RDF template
228
228
  :evalue 8.1089e-12 .
229
229
  ```
230
230
 
231
+ ### Metadata
232
+
233
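+ Templates can also print metadata as a header (and footer) of the JSON/YAML/RDF output. To do
233
+ this, split the template into sections with the marker lines =HEADER, =BODY and =FOOTER. The
234
+ header and footer are printed once per input file; the body is printed for every result. A small
235
+ example template is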
236
+
237
+ ```Javascript
238
+ =HEADER
239
+ <% require 'json' %>
240
+ [
241
+ { "HEADER": {
242
+ "options": <%= options.to_h.to_json %>,
243
+ "files": <%= ARGV %>,
244
+ "version": "<%= BLASTXML_VERSION %>"
245
+ },
246
+ =BODY
247
+ { "<%= hit.parent.query_def %>": {
248
+ "num": <%= hit.hit_num %>,
249
+ "id": "<%= hit.hit_id %>",
250
+ "len": <%= hit.len %>,
251
+ "E-value": <%= hsp.evalue %>,
252
+ },
253
+ =FOOTER
254
+ ]
255
+ ```
256
+
257
+ may generate something like
258
+
259
+ ```Javascript
260
+ [
261
+ { "HEADER": {
262
+ "options": {"template":"template/blast2json2.erb","filter":"hsp.evalue>0.01"},
263
+ "files": ["test/data/nt_example_blastn.m7"],
264
+ "version": "2.0.2-pre1"
265
+ },
266
+ { "I_1 [477 - 884] ": {
267
+ "num": 41,
268
+ "id": "lcl|X_42251",
269
+ "len": 153,
270
+ "E-value": 0.0247015,
271
+ },
272
+ { "I_1 [477 - 884] ": {
273
+ "num": 43,
274
+ "id": "lcl|V_105720",
275
+ "len": 180,
276
+ "E-value": 0.0247015,
277
+ },
278
+ ]
279
+ ```
280
+
281
+ Note that the template mechanism is not smart enough to omit the comma after
282
+ the last BODY element; that final comma must be removed by hand to get valid JSON.
283
+
231
284
  ## Additional options
232
285
 
233
286
  To use the high-mem version of the parser (slightly faster on single core) use
data/VERSION CHANGED
@@ -1 +1 @@
1
- 2.0.1
1
+ 2.0.2
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  #
3
- # BioRuby bio-blastxmlparser Plugin
3
+ # blastxmlparser
4
4
  # Author:: Pjotr Prins
5
5
  # License:: MIT License
6
6
  #
@@ -75,7 +75,6 @@ opts = OptionParser.new do |o|
75
75
  end
76
76
 
77
77
  o.on("-t erb","--template erb",String, "Use ERB template for output") do |s|
78
- require 'erb'
79
78
  options.template = s
80
79
  end
81
80
 
@@ -91,6 +90,7 @@ opts = OptionParser.new do |o|
91
90
 
92
91
  o.on("-q", "--quiet", "Run quietly") do |q|
93
92
  Bio::Log::CLI.trace('error')
93
+ options.quiet = true
94
94
  end
95
95
 
96
96
  o.on("-v", "--verbose", "Run verbosely") do |v|
@@ -118,13 +118,18 @@ begin
118
118
  end
119
119
  end
120
120
 
121
+ # Prepare the ERB template
121
122
  if options.template
122
123
  include BioRdf
123
- fn = options.template
124
- raise "No template #{fn}!" if not File.exist?(fn)
125
- template = ERB.new(File.read(fn))
124
+ require 'bio/writers/template'
125
+ template = Bio::Template.new(options.template)
126
126
  end
127
127
 
128
+ $stderr.print options,"\n" if not options.quiet
129
+
130
+ raise "No input file(s) defined" if ARGV.size == 0
131
+
132
+
128
133
  ARGV.each do | fn |
129
134
  logger.info("XML parsing #{fn}")
130
135
  parser_type = options.parser
@@ -145,6 +150,7 @@ begin
145
150
  chunks = []
146
151
  chunks_count = 0
147
152
  NUM_CHUNKS=10_000
153
+ print template.header(binding) if template
148
154
 
149
155
  process = lambda { |iter2,i| # Process one BLAST iter block
150
156
  if parser_type == :nosplit
@@ -167,7 +173,7 @@ begin
167
173
  if do_print
168
174
  line_count += 1
169
175
  if template
170
- res << template.result(binding)
176
+ res << template.body(binding)
171
177
  elsif options.output_fasta
172
178
  res << ">"+hit.accession+' '+iter.iter_num.to_s+'|'+iter.query_id+' '+hit.hit_id+' '+hit.hit_def+"\n"
173
179
  res << hsp.qseq+"\n"
@@ -221,6 +227,7 @@ begin
221
227
  process.call(iter,i)
222
228
  }
223
229
  end
230
+ print template.footer(binding) if template
224
231
  end
225
232
  rescue OptionParser::InvalidOption => e
226
233
  $stderr.print e.message
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "bio-blastxmlparser"
8
- s.version = "2.0.1"
8
+ s.version = "2.0.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Pjotr Prins"]
12
- s.date = "2014-09-07"
12
+ s.date = "2014-11-07"
13
13
  s.description = "Fast big data BLAST XML parser and library; this libxml2 based version is 50x faster than BioRuby and comes with a nice CLI"
14
14
  s.email = "pjotr.public01@thebird.nl"
15
15
  s.executables = ["blastxmlparser"]
@@ -34,6 +34,7 @@ Gem::Specification.new do |s|
34
34
  "lib/bio/db/blast/xmliterator.rb",
35
35
  "lib/bio/db/blast/xmlsplitter.rb",
36
36
  "lib/bio/writers/rdf.rb",
37
+ "lib/bio/writers/template.rb",
37
38
  "sample/bioruby.rb",
38
39
  "sample/blastxmlparserdemo.rb",
39
40
  "sample/libxml_sax.rb",
@@ -43,6 +44,7 @@ Gem::Specification.new do |s|
43
44
  "spec/bio-blastxmlparser_spec.rb",
44
45
  "spec/spec_helper.rb",
45
46
  "template/blast2json.erb",
47
+ "template/blast2json2.erb",
46
48
  "template/blast2rdf-minimal.erb",
47
49
  "template/blast2rdf.erb",
48
50
  "test/data/aa_example.fasta",
@@ -0,0 +1,75 @@
require 'erb'

module Bio

  # An ERB template that may be divided into HEADER, BODY and FOOTER
  # sections by marker lines (=HEADER, =BODY, =FOOTER).
  #
  # A template without a =BODY marker is treated as consisting of a body
  # only. Each section is compiled to its own ERB object and rendered
  # against a caller-supplied Binding.
  class Template

    # A marker is only recognised at the start of a line (leading
    # whitespace allowed), so template text that merely contains
    # e.g. "=BODY" somewhere in a line is not mistaken for a marker.
    HEADER_MARKER = /\A\s*=HEADER\b/
    BODY_MARKER   = /\A\s*=BODY\b/
    FOOTER_MARKER = /\A\s*=FOOTER\b/

    # Read and parse the template file +fn+.
    #
    # Raises RuntimeError when the file does not exist.
    def initialize(fn)
      raise "Can not find template #{fn}!" if not File.exist?(fn)
      parse(File.read(fn))
    end

    # Split the template text +buf+ into its sections and compile each
    # non-empty section into an ERB object.
    def parse(buf)
      sections = { :header => [], :body => [], :footer => [] }
      where = :header
      seen_body = false
      buf.split("\n").each do |line|
        case where
        when :header
          next if line =~ HEADER_MARKER
          if line =~ BODY_MARKER
            where = :body
            seen_body = true
            next
          end
        when :body
          if line =~ FOOTER_MARKER
            where = :footer
            next
          end
        end
        # Everything after =FOOTER belongs to the footer verbatim
        sections[where] << line
      end
      # Without an explicit =BODY marker the whole template is the body.
      # (An explicitly declared but empty body stays empty.)
      unless seen_body
        sections[:body] = sections[:header]
        sections[:header] = []
      end
      @erb_header = compile(sections[:header])
      @erb_body   = compile(sections[:body])
      @erb_footer = compile(sections[:footer])
    end

    # Plain ERB-style rendering, kept for callers that still use
    # ERB#result; it renders the body section with the Binding +env+.
    def result(env)
      body(env)
    end

    # Render the header section with the Binding +env+ ("" when absent)
    def header(env)
      render(@erb_header, env)
    end

    # Render the body section with the Binding +env+ ("" when absent)
    def body(env)
      render(@erb_body, env)
    end

    # Render the footer section with the Binding +env+ ("" when absent)
    def footer(env)
      render(@erb_footer, env)
    end

    private

    # Compile a list of template lines, or return nil for an empty section.
    # Note that 0 is truthy in Ruby, so emptiness is tested explicitly.
    def compile(lines)
      lines.empty? ? nil : ERB.new(lines.join("\n"))
    end

    # Evaluate a compiled section; a missing section renders as ""
    def render(erb, env)
      erb ? erb.result(env) : ""
    end
  end
end
@@ -0,0 +1,28 @@
=HEADER
<%# JSON output template: the HEADER and FOOTER sections are printed once, BODY once per result. %>
<%# Each record is a complete JSON object; only the comma after the last BODY record still has %>
<%# to be removed by hand to obtain valid JSON. %>
<% require 'json' %>
[
  { "HEADER": {
    "options": <%= options.to_h.to_json %>,
    "files": <%= ARGV.to_json %>,
    "version": "<%= BLASTXML_VERSION %>"
  } },

=BODY
  { "<%= hit.parent.query_def %>": {
    "query_id": "<%= hit.parent.query_id %>",
    "num": <%= hit.hit_num %>,
    "accession": "<%= hit.accession %>",
    "id": "<%= hit.hit_id %>",
    "len": <%= hit.len %>,
    "E-value": <%= hsp.evalue %>,
    "identity": <%= hsp.identity %>,
    "align_len": <%= hsp.align_len %>,
    "bitscore": <%= hsp.bit_score %>,
    "qseq": "<%= hsp.qseq %>",
    "midline": "<%= hsp.midline %>",
    "hseq": "<%= hsp.hseq %>"
  } },

=FOOTER
]

@@ -4,11 +4,11 @@
4
4
  %>
5
5
  :<%= blastid %> :query :<%= id %>
6
6
  :<%= id %>
7
- :query_def "<%= hit.parent.query_def %>",
8
- :num <%= hit.hit_num %>,
9
- :accession "<%= hit.accession %>",
10
- :len <%= hit.len %>,
11
- :identity <%= hsp.identity %>,
12
- :align_len <%= hsp.align_len %>,
13
- :bitscore <%= hsp.bit_score %>,
14
- :evalue <%= hsp.evalue %> .
7
+ :query_def "<%= hit.parent.query_def %>";
8
+ :num <%= hit.hit_num %>;
9
+ :accession "<%= hit.accession %>";
10
+ :len <%= hit.len %>;
11
+ :identity <%= hsp.identity %>;
12
+ :align_len <%= hsp.align_len %>;
13
+ :bitscore <%= hsp.bit_score %>;
14
+ :evalue <%= hsp.evalue %> .
@@ -4,17 +4,17 @@
4
4
  %>
5
5
  :<%= blastid %> :query :<%= id %>
6
6
  :<%= id %>
7
- :query_id "<%= hit.parent.query_id %>",
8
- :query_def "<%= hit.parent.query_def %>",
9
- :num <%= hit.hit_num %>,
10
- :accession "<%= hit.accession %>",
11
- :id "<%= hit.hit_id %>",
12
- :len <%= hit.len %>,
13
- :identity <%= hsp.identity %>,
14
- :align_len <%= hsp.align_len %>,
15
- :bitscore <%= hsp.bit_score %>,
16
- :qseq "<%= hsp.qseq %>",
7
+ :query_id "<%= hit.parent.query_id %>";
8
+ :query_def "<%= hit.parent.query_def %>";
9
+ :num <%= hit.hit_num %>;
10
+ :accession "<%= hit.accession %>";
11
+ :id "<%= hit.hit_id %>";
12
+ :len <%= hit.len %>;
13
+ :identity <%= hsp.identity %>;
14
+ :align_len <%= hsp.align_len %>;
15
+ :bitscore <%= hsp.bit_score %>;
16
+ :qseq "<%= hsp.qseq %>";
17
17
  :midline "<%= hsp.midline %>",
18
- :hseq "<%= hsp.hseq %>",
18
+ :hseq "<%= hsp.hseq %>";
19
19
  :evalue <%= hsp.evalue %> .
20
20
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-blastxmlparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.1
4
+ version: 2.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Pjotr Prins
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-07 00:00:00.000000000 Z
11
+ date: 2014-11-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio-logger
@@ -134,6 +134,7 @@ files:
134
134
  - lib/bio/db/blast/xmliterator.rb
135
135
  - lib/bio/db/blast/xmlsplitter.rb
136
136
  - lib/bio/writers/rdf.rb
137
+ - lib/bio/writers/template.rb
137
138
  - sample/bioruby.rb
138
139
  - sample/blastxmlparserdemo.rb
139
140
  - sample/libxml_sax.rb
@@ -143,6 +144,7 @@ files:
143
144
  - spec/bio-blastxmlparser_spec.rb
144
145
  - spec/spec_helper.rb
145
146
  - template/blast2json.erb
147
+ - template/blast2json2.erb
146
148
  - template/blast2rdf-minimal.erb
147
149
  - template/blast2rdf.erb
148
150
  - test/data/aa_example.fasta