bio-blastxmlparser 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/Gemfile ADDED
@@ -0,0 +1,18 @@
1
+ source "http://rubygems.org"
2
+ # Add dependencies required to use your gem here.
3
+ # Example:
4
+ # gem "activesupport", ">= 2.3.5"
5
+
6
+ # Runtime dependencies
7
+ # gem "bio", ">= 1.3.1"
8
+ gem "bio-logger", "> 0.8.0"
9
+ gem "nokogiri", ">= 1.4.4"
10
+
11
+ # Add dependencies to develop your gem here.
12
+ # Include everything needed to run rake, tests, features, etc.
13
+ group :development do
14
+ gem "rspec", "~> 2.3.0"
15
+ gem "bundler", "~> 1.0.0"
16
+ gem "jeweler", "~> 1.5.2"
17
+ gem "rcov", ">= 0"
18
+ end
@@ -0,0 +1,34 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ bio-logger (0.9.0)
5
+ log4r (>= 1.1.9)
6
+ diff-lcs (1.1.2)
7
+ git (1.2.5)
8
+ jeweler (1.5.2)
9
+ bundler (~> 1.0.0)
10
+ git (>= 1.2.5)
11
+ rake
12
+ log4r (1.1.9)
13
+ nokogiri (1.4.4)
14
+ rake (0.8.7)
15
+ rcov (0.9.9)
16
+ rspec (2.3.0)
17
+ rspec-core (~> 2.3.0)
18
+ rspec-expectations (~> 2.3.0)
19
+ rspec-mocks (~> 2.3.0)
20
+ rspec-core (2.3.1)
21
+ rspec-expectations (2.3.0)
22
+ diff-lcs (~> 1.1.2)
23
+ rspec-mocks (2.3.0)
24
+
25
+ PLATFORMS
26
+ ruby
27
+
28
+ DEPENDENCIES
29
+ bio-logger (> 0.8.0)
30
+ bundler (~> 1.0.0)
31
+ jeweler (~> 1.5.2)
32
+ nokogiri (>= 1.4.4)
33
+ rcov
34
+ rspec (~> 2.3.0)
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Pjotr Prins
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,211 @@
1
+ = bio-blastxmlparser
2
+
3
+ blastxmlparser is a fast big-data BLAST XML file parser. Rather than
4
+ loading everything in memory, XML is parsed by BLAST query
5
+ (Iteration). Not only does this have the advantage of low memory use, it may
6
+ also be faster when IO continues in parallel (disks read ahead).
7
+
8
+ Next to the API, blastxmlparser comes as a command line utility, which
9
+ can be used to filter results and requires no understanding of Ruby.
10
+
11
+ == Performance
12
+
13
+ XML parsing is expensive. blastxmlparser uses the Nokogiri C, or Java, XML
14
+ parser, based on libxml2. Basically a DOM parser is used for subsections of a
15
+ document; tests show this is faster than a SAX parser with Ruby callbacks. To
16
+ see why libxml2 based Nokogiri is fast, see
17
+ http://www.rubyinside.com/ruby-xml-performance-benchmarks-1641.html and
18
+ http://www.xml.com/lpt/a/1703.
19
+
20
+ The parser is also designed with other optimizations, such as lazy evaluation,
21
+ only creating objects when required, and (future) parallelization. When parsing
22
+ a full BLAST result usually only a few fields are used. By using XPath queries
23
+ only the relevant fields are queried.
24
+
25
+ Timings for parsing test/data/nt_example_blastn.m7 (file size 3.4Mb)
26
+
27
+ Nokogiri DOM (default)
28
+
29
+ real 0m1.259s
30
+ user 0m1.052s
31
+ sys 0m0.144s
32
+
33
+ Nokogiri split DOM
34
+
35
+ real 0m1.713s
36
+ user 0m1.444s
37
+ sys 0m0.160s
38
+
39
+ BioRuby ReXML DOM parser
40
+
41
+ real 1m14.548s
42
+ user 1m13.065s
43
+ sys 0m0.472s
44
+
45
+ == Install
46
+
47
+ gem install bio-blastxmlparser
48
+
49
+ Nokogiri XML parser is required. To install it,
50
+ the libxml2 libraries and headers need to be installed first, for
51
+ example on Debian:
52
+
53
+ apt-get install libxslt-dev libxml2-dev
54
+ gem install bio-blastxmlparser
55
+
56
+ For more installation instructions for other platforms, see
57
+ http://nokogiri.org/tutorials/installing_nokogiri.html.
58
+
59
+ == API
60
+
61
+ To loop through a BLAST result:
62
+
63
+ >> require 'bio-blastxmlparser'
64
+ >> fn = 'test/data/nt_example_blastn.m7'
65
+ >> n = Bio::Blast::XmlIterator.new(fn).to_enum
66
+ >> n.each do | iter |
67
+ >> puts "Hits for " + iter.query_id
68
+ >> iter.each do | hit |
69
+ >> hit.each do | hsp |
70
+ >> print hit.hit_id, "\t", hsp.evalue, "\n" if hsp.evalue < 0.001
71
+ >> end
72
+ >> end
73
+ >> end
74
+
75
+ The next example parses XML using less memory
76
+
77
+ >> blast = Bio::Blast::XmlSplitterIterator.new(fn).to_enum
78
+ >> iter = blast.next
79
+ >> iter.iter_num
80
+ => 1
81
+ >> iter.query_id
82
+ => "lcl|1_0"
83
+
84
+ Get the first hit
85
+
86
+ >> hit = iter.hits.first
87
+ >> hit.hit_num
88
+ => 1
89
+ >> hit.hit_id
90
+ => "lcl|I_74685"
91
+ >> hit.hit_def
92
+ => "[57809 - 57666] (REVERSE SENSE) "
93
+ >> hit.accession
94
+ => "I_74685"
95
+ >> hit.len
96
+ => 144
97
+
98
+ Get the parent info
99
+
100
+ >> hit.parent.query_id
101
+ => "lcl|1_0"
102
+
103
+ Get the first Hsp
104
+
105
+ >> hsp = hit.hsps.first
106
+ >> hsp.hsp_num
107
+ => 1
108
+ >> hsp.bit_score
109
+ => 145.205
110
+ >> hsp.score
111
+ => 73
112
+ >> hsp.evalue
113
+ => 5.82208e-34
114
+ >> hsp.query_from
115
+ => 28
116
+ >> hsp.query_to
117
+ => 100
118
+ >> hsp.query_frame
119
+ => 1
120
+ >> hsp.hit_frame
121
+ => 1
122
+ >> hsp.identity
123
+ => 73
124
+ >> hsp.positive
125
+ => 73
126
+ >> hsp.align_len
127
+ => 73
128
+ >> hsp.qseq
129
+ => "AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCTGCCTGCCAACCTATATGCTCCTGTGTTTAG"
130
+ >> hsp.hseq
131
+ => "AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCTGCCTGCCAACCTATATGCTCCTGTGTTTAG"
132
+ >> hsp.midline
133
+ => "|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||"
134
+
135
+ It is also possible to use the XML element names instead of the accessor methods, e.g.
136
+
137
+ >> hsp.field("Hsp_bit-score")
138
+ => "145.205"
139
+ >> hsp["Hsp_bit-score"]
140
+ => "145.205"
141
+
142
+ Note that these are always String values.
143
+
144
+ Fetch the next result (Iteration)
145
+
146
+ >> iter2 = blast.next
147
+ >> iter2.iter_num
148
+ => 2
149
+ >> iter2.query_id
150
+ => "lcl|2_0"
151
+
152
+ etc. etc.
153
+
154
+ For more examples see the files in ./spec
155
+
156
+ == Usage
157
+
158
+ blastxmlparser [options] file(s)
159
+
160
+ -p, --parser name Use full|split parser (default full)
161
+ -n, --named fields Set named fields
162
+ -e, --exec filter Execute filter
163
+
164
+ --logger filename Log to file (default stderr)
165
+ --trace options Set log level (default INFO, see bio-logger)
166
+ -q, --quiet Run quietly
167
+ -v, --verbose Run verbosely
168
+ --debug Show debug messages
169
+ -h, --help Show help and examples
170
+
171
+ bioblastxmlparser filename(s)
172
+
173
+ Use --help switch for more information
174
+
175
+ == Examples
176
+
177
+ Print result fields of iterations containing 'lcl', using a regex
178
+
179
+ blastxmlparser -e 'iter.query_id=~/lcl/' test/data/nt_example_blastn.m7
180
+
181
+ Print fields where bit_score > 145
182
+
183
+ blastxmlparser -e 'hsp.bit_score>145' test/data/nt_example_blastn.m7
184
+
185
+ It is also possible to use the XML element names directly
186
+
187
+ blastxmlparser -e 'hsp["Hsp_bit-score"].to_i>145' test/data/nt_example_blastn.m7
188
+
189
+ Print named fields where E-value < 0.01 and hit length > 100
190
+
191
+ blastxmlparser -n 'hsp.evalue,hsp.qseq' -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
192
+
193
+ 1 5.82208e-34 AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCT...
194
+ 2 5.82208e-34 AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCT...
195
+ 3 2.76378e-11 AATATGGTAGCTACAGAAACGGTAGTACACTCTTC
196
+ 4 1.13373e-13 CTAAACACAGGAGCATATAGGTTGGCAGGCAGGCAAAAT
197
+ 5 2.76378e-11 GAAGAGTGTACTACCGTTTCTGTAGCTACCATATT
198
+ etc. etc.
199
+
200
+ To use the low-mem version use
201
+
202
+ blastxmlparser --parser split -n 'hsp.evalue,hsp.qseq' -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
203
+
204
+ == URL
205
+
206
+ The project lives at http://github.com/pjotrp/blastxmlparser. If you use this software, please cite http://dx.doi.org/10.1093/bioinformatics/btq475
207
+
208
+ == Copyright
209
+
210
+ Copyright (c) 2011 Pjotr Prins under the MIT licence. See LICENSE.txt and http://www.opensource.org/licenses/mit-license.html for further details.
211
+
@@ -0,0 +1,50 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'rake'
11
+
12
+ require 'jeweler'
13
+ Jeweler::Tasks.new do |gem|
14
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
15
+ gem.name = "bio-blastxmlparser"
16
+ gem.homepage = "http://github.com/pjotrp/bioruby-blastxmlparser"
17
+ gem.license = "MIT"
18
+ gem.summary = %Q{BLAST XML parser}
19
+ gem.description = %Q{Fast big data XML parser and library, written in Ruby}
20
+ gem.email = "pjotr.public01@thebird.nl"
21
+ gem.authors = ["Pjotr Prins"]
22
+ # Include your dependencies below. Runtime dependencies are required when using your gem,
23
+ # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
24
+ # gem.add_runtime_dependency 'jabber4r', '> 0.1'
25
+ # gem.add_development_dependency 'rspec', '> 1.2.3'
26
+ end
27
+ Jeweler::RubygemsDotOrgTasks.new
28
+
29
+ require 'rspec/core'
30
+ require 'rspec/core/rake_task'
31
+ RSpec::Core::RakeTask.new(:spec) do |spec|
32
+ spec.pattern = FileList['spec/**/*_spec.rb']
33
+ end
34
+
35
+ RSpec::Core::RakeTask.new(:rcov) do |spec|
36
+ spec.pattern = 'spec/**/*_spec.rb'
37
+ spec.rcov = true
38
+ end
39
+
40
+ task :default => :spec
41
+
42
+ require 'rake/rdoctask'
43
+ Rake::RDocTask.new do |rdoc|
44
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
45
+
46
+ rdoc.rdoc_dir = 'rdoc'
47
+ rdoc.title = "bio-blastxmlparser #{version}"
48
+ rdoc.rdoc_files.include('README*')
49
+ rdoc.rdoc_files.include('lib/**/*.rb')
50
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.6.0
@@ -0,0 +1,165 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # BioRuby bio-blastxmlparser Plugin
4
+ # Author:: Pjotr Prins
5
+ # Copyright:: 2011
6
+ # License:: MIT License
7
+ #
8
+ # Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>
9
+
10
+ rootpath = File.dirname(File.dirname(__FILE__))
11
+ $: << File.join(rootpath,'lib')
12
+
13
+ BLASTXML_VERSION = File.new(File.join(rootpath,'VERSION')).read.chomp
14
+
15
+ $stderr.print "BioRuby BLAST XML Parser "+BLASTXML_VERSION+" Copyright (C) 2011 Pjotr Prins <pjotr.prins@thebird.nl>\n\n"
16
+
17
+ USAGE = <<EOM
18
+
19
+ bioblastxmlparser filename(s)
20
+
21
+ Use --help switch for more information
22
+
23
+ == Examples
24
+
25
+ Print result fields of iterations containing 'lcl', using a regex
26
+
27
+ blastxmlparser -e 'iter.query_id=~/lcl/' test/data/nt_example_blastn.m7
28
+
29
+ Print fields where bit_score > 145
30
+
31
+ blastxmlparser -e 'hsp.bit_score>145' test/data/nt_example_blastn.m7
32
+
33
+ It is also possible to use the XML element names directly
34
+
35
+ blastxmlparser -e 'hsp["Hsp_bit-score"].to_i>145' test/data/nt_example_blastn.m7
36
+
37
+ Print named fields where E-value < 0.001 and hit length > 100
38
+
39
+ blastxmlparser -n 'hsp.evalue,hsp.qseq' -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
40
+
41
+ 1 5.82208e-34 AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCT...
42
+ 2 5.82208e-34 AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCT...
43
+ 3 2.76378e-11 AATATGGTAGCTACAGAAACGGTAGTACACTCTTC
44
+ 4 1.13373e-13 CTAAACACAGGAGCATATAGGTTGGCAGGCAGGCAAAAT
45
+ 5 2.76378e-11 GAAGAGTGTACTACCGTTTCTGTAGCTACCATATT
46
+ etc. etc.
47
+
48
+ To use the low-mem version use
49
+
50
+ blastxmlparser --parser split -n 'hsp.evalue,hsp.qseq' -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
51
+
52
+ == URL
53
+
54
+ The project lives at http://github.com/pjotrp/blastxmlparser. If you use this software, please cite http://dx.doi.org/10.1093/bioinformatics/btq475
55
+
56
+ == Copyright
57
+
58
+ Copyright (c) 2011 Pjotr Prins under the MIT licence. See LICENSE.txt and http://www.opensource.org/licenses/mit-license.html for further details.
59
+
60
+ EOM
61
+
62
+ if ARGV.size == 0
63
+ print USAGE
64
+ exit 1
65
+ end
66
+
67
+ require 'bio-blastxmlparser'
68
+ require 'optparse'
69
+ require 'ostruct'
70
+
71
+ require 'bio-logger'
72
+ Bio::Log::CLI.logger('stderr')
73
+ Bio::Log::CLI.trace('info')
74
+
75
+ options = OpenStruct.new()
76
+
77
+ opts = OptionParser.new do |o|
78
+
79
+ o.on_tail("-h", "--help", "Show help and examples") {
80
+ print(opts)
81
+ print USAGE
82
+ exit()
83
+ }
84
+
85
+ o.banner = "== Usage\n #{File.basename($0)} [options] file(s)"
86
+
87
+ o.separator ""
88
+
89
+ o.on("-p name", "--parser name", "Use full|split parser (default full)") do |p|
90
+ options.parser = p.to_sym
91
+ end
92
+
93
+ o.on("-n fields","--named fields",String, "Set named fields") do |s|
94
+ options.fields = s.split(/,/)
95
+ end
96
+
97
+ o.on("-e filter","--exec filter",String, "Execute filter") do |s|
98
+ options.exec = s
99
+ end
100
+
101
+ o.separator ""
102
+
103
+ o.on("--logger filename",String,"Log to file (default stderr)") do | name |
104
+ Bio::Log::CLI.logger(name)
105
+ end
106
+
107
+ o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
108
+ Bio::Log::CLI.trace(s)
109
+ end
110
+
111
+ o.on("-q", "--quiet", "Run quietly") do |q|
112
+ Bio::Log::CLI.trace('error')
113
+ end
114
+
115
+ o.on("-v", "--verbose", "Run verbosely") do |v|
116
+ Bio::Log::CLI.trace('info')
117
+ end
118
+
119
+ o.on("--debug", "Show debug messages") do |v|
120
+ Bio::Log::CLI.trace('debug')
121
+ end
122
+
123
+ end
124
+
125
+ begin
126
+ opts.parse!(ARGV)
127
+
128
+ Bio::Log::CLI.configure('bio-blastxmlparser')
129
+ logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
130
+
131
+ ARGV.each do | fn |
132
+ logger.info("XML parsing #{fn}")
133
+ n = if options.parser == :split
134
+ Bio::Blast::XmlSplitterIterator.new(fn).to_enum
135
+ else
136
+ Bio::Blast::XmlIterator.new(fn).to_enum
137
+ end
138
+ i = 1
139
+ n.each do | iter |
140
+ iter.each do | hit |
141
+ hit.each do | hsp |
142
+ do_print = if options.exec
143
+ eval(options.exec)
144
+ else
145
+ true
146
+ end
147
+ if do_print
148
+ if options.fields
149
+ print i,"\t"
150
+ options.fields.each do | f |
151
+ print eval(f),"\t"
152
+ end
153
+ print "\n"
154
+ else
155
+ print [i,iter.iter_num,iter.query_id,hit.hit_id,hsp.hsp_num,hsp.evalue].join("\t"),"\n"
156
+ end
157
+ i += 1
158
+ end
159
+ end
160
+ end
161
+ end
162
+ end
163
+ rescue OptionParser::InvalidOption => e
164
+ opts[:invalid_argument] = e.message
165
+ end