bio-blastxmlparser 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,88 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for bio-blastxmlparser 0.6.0.
#
# Declares the gem metadata, the packaged files and the dependencies.
# Dependencies are listed once and then registered with whichever API the
# running RubyGems offers (typed runtime/development dependencies, or the
# plain add_dependency fallback for old RubyGems).
Gem::Specification.new do |s|
  s.name = %q{bio-blastxmlparser}
  s.version = "0.6.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Pjotr Prins"]
  s.date = %q{2011-02-14}
  # default_executable= is deprecated by RubyGems; guard it the same way as
  # required_rubygems_version= so the spec still loads if the setter is gone.
  s.default_executable = %q{blastxmlparser} if s.respond_to? :default_executable=
  s.description = %q{Fast big data XML parser and library, written in Ruby}
  s.email = %q{pjotr.public01@thebird.nl}
  s.executables = ["blastxmlparser"]
  s.extra_rdoc_files = [
    "LICENSE.txt",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".rspec",
    "Gemfile",
    "Gemfile.lock",
    "LICENSE.txt",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "bin/blastxmlparser",
    "bio-blastxmlparser.gemspec",
    "lib/bio-blastxmlparser.rb",
    "lib/bio/db/blast/parser/nokogiri.rb",
    "lib/bio/db/blast/xmliterator.rb",
    "lib/bio/db/blast/xmlsplitter.rb",
    "sample/bioruby.rb",
    "sample/blastxmlparserdemo.rb",
    "sample/libxml_sax.rb",
    "sample/nokogiri_dom.rb",
    "sample/nokogiri_sax.rb",
    "sample/nokogiri_split_dom.rb",
    "spec/bio-blastxmlparser_spec.rb",
    "spec/spec_helper.rb",
    "test/data/aa_example.fasta",
    "test/data/aa_example_blastp.m7",
    "test/data/nt_example.fasta",
    "test/data/nt_example_blastn.m7",
    "timings.sh"
  ]
  s.homepage = %q{http://github.com/pjotrp/bioruby-blastxmlparser}
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.7}
  s.summary = %q{BLAST XML parser}
  s.test_files = [
    "spec/bio-blastxmlparser_spec.rb",
    "spec/spec_helper.rb"
  ]

  # Single source of truth for the dependency list: [name, requirement].
  runtime_deps = [
    [%q<bio-logger>, ["> 0.8.0"]],
    [%q<nokogiri>, [">= 1.4.4"]]
  ]
  development_deps = [
    [%q<rspec>, ["~> 2.3.0"]],
    [%q<bundler>, ["~> 1.0.0"]],
    [%q<jeweler>, ["~> 1.5.2"]],
    [%q<rcov>, [">= 0"]]
  ]

  # Typed dependencies are only used when the spec knows about
  # specification_version and RubyGems is at least 1.2.0.
  typed_dependencies = false
  if s.respond_to? :specification_version
    s.specification_version = 3
    typed_dependencies = Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
  end

  if typed_dependencies
    runtime_deps.each { |name, requirement| s.add_runtime_dependency(name, requirement) }
    development_deps.each { |name, requirement| s.add_development_dependency(name, requirement) }
  else
    # Old RubyGems: no dependency types, everything is a plain dependency.
    (runtime_deps + development_deps).each { |name, requirement| s.add_dependency(name, requirement) }
  end
end
88
+
@@ -0,0 +1,17 @@
1
# Locate bio-logger before loading the parser.
#
# If a directory ../bioruby-logger/lib exists relative to the parent of this
# file's directory, it is appended to the load path and used directly;
# otherwise the installed "bio-logger" gem is activated through RubyGems.
base_dir = File.dirname(File.dirname(__FILE__))
local_logger_lib = File.join(base_dir, '..', 'bioruby-logger', 'lib')

if File.directory?(local_logger_lib)
  $LOAD_PATH << local_logger_lib
  $stderr.print "bio-logger loaded directly\n"
else
  require "rubygems"
  gem "bio-logger"
end
require 'bio-logger'

# Create the named logger that the parser classes look up later.
Bio::Log::LoggerPlus.new('bio-blastxmlparser')

# Load the parser components in their original order.
%w[
  bio/db/blast/parser/nokogiri
  bio/db/blast/xmlsplitter
  bio/db/blast/xmliterator
].each { |component| require component }
@@ -0,0 +1,203 @@
1
+
2
+ require 'nokogiri'
3
+ require 'enumerator'
4
+
5
+ module Bio
6
+ module Blast
7
+
8
# Mixin that reads the text content of child elements through XPath.
#
# The including object must set @xml to something that responds to #xpath
# and returns a collection responding to #empty? and #to_s (the classes in
# this file store Nokogiri nodes there). An optional @prefix string is
# prepended to every element name before the lookup.
module XPath
  # Return the text of element +name+ as a String.
  #
  # The query evaluated against @xml is "<prefix><name>/text()". When the
  # query matches nothing, a warning is sent to the 'bio-blastxmlparser'
  # logger (if that logger exists) and the empty result is still converted
  # with #to_s.
  def field name
    res = @xml.xpath("#{@prefix}#{name}/text()")
    # An XPath query yields an (possibly empty) result set rather than nil,
    # so "no content" has to be detected with empty?.
    if res.empty?
      logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
      logger.warn("XML element <#{name}> has no content") if logger
    end
    res.to_s
  end
end
23
+
24
+ # Some magic to create XML -> method mappers, on the fly
25
# Generates reader methods from an { 'XML element name' => :method_name } map.
#
# Each define_* helper calls define_method with MapXPath itself as the
# receiver, so the generated readers live on this module and are visible to
# every class that includes it. The readers delegate to XPath#field.
module MapXPath
  include XPath

  # Define readers that return the element text as a String.
  def self.define_s(map)
    map.each do |element, reader|
      define_method(reader) { field(element) }
    end
  end

  # Define readers that return the element text converted with to_i.
  def self.define_i(map)
    map.each do |element, reader|
      define_method(reader) { field(element).to_i }
    end
  end

  # Define readers that return the element text converted with to_f.
  def self.define_f(map)
    map.each do |element, reader|
      define_method(reader) { field(element).to_f }
    end
  end

  # Look up an element by its XML name; same as field(name).
  def [](name)
    field(name)
  end
end
53
+
54
# Wrapper around one XML node whose Hsp_* child elements are exposed as
# reader methods. +parent+ is the object that created this wrapper; to_s
# expects it to respond to hit_num.
class NokogiriBlastHsp
  include MapXPath
  attr_reader :parent

  # String-valued elements
  MapXPath.define_s(
    'Hsp_id'      => :hsp_id,
    'Hsp_qseq'    => :qseq,
    'Hsp_hseq'    => :hseq,
    'Hsp_midline' => :midline
  )

  # Integer-valued elements
  MapXPath.define_i(
    'Hsp_num'         => :hsp_num,
    'Hsp_score'       => :score,
    'Hsp_query-from'  => :query_from,
    'Hsp_query-to'    => :query_to,
    'Hsp_hit-from'    => :hit_from,
    'Hsp_hit-to'      => :hit_to,
    'Hsp_query-frame' => :query_frame,
    'Hsp_hit-frame'   => :hit_frame,
    'Hsp_identity'    => :identity,
    'Hsp_positive'    => :positive,
    'Hsp_align-len'   => :align_len
  )

  # Float-valued elements
  MapXPath.define_f(
    'Hsp_bit-score' => :bit_score,
    'Hsp_evalue'    => :evalue
  )

  # xml:: node the readers query
  # parent:: owning object, kept for navigation back up
  def initialize(xml, parent)
    @xml = xml
    @parent = parent
  end

  # One-line summary terminated by a newline.
  def to_s
    <<EOM
hit_num=#{parent.hit_num}, hsp_num=#{hsp_num}, score=#{score}, bit_score=#{bit_score}
EOM
  end
end
87
+
88
# Wrapper around one 'Hit' XML node. Hit_* child elements are exposed as
# reader methods, and the HSPs below the 'Hit_hsps' child can be enumerated.
# +parent+ is the object that created this wrapper; to_s expects it to
# respond to iter_num.
class NokogiriBlastHit
  include MapXPath
  attr_reader :parent

  MapXPath.define_s 'Hit_id'        => :hit_id,
                    'Hit_def'       => :hit_def,
                    'Hit_accession' => :accession
  MapXPath.define_i 'Hit_num' => :hit_num,
                    'Hit_len' => :len

  # hit:: node the readers query
  # parent:: owning object, kept for navigation back up
  def initialize hit, parent
    @xml = hit
    @parent = parent
  end

  # Return an Enumerator producing a NokogiriBlastHsp for every child named
  # 'Hsp' found under a child named 'Hit_hsps'. Nothing is built until the
  # enumerator is consumed.
  def hsps
    Enumerator.new do |yielder|
      @xml.children.each do |hit_field|
        next unless hit_field.name == 'Hit_hsps'
        hit_field.children.each do |hsp|
          yielder.yield NokogiriBlastHsp.new(hsp, self) if hsp.name == 'Hsp'
        end
      end
    end
  end

  # Yield each HSP to the block. Without a block the enumerator from #hsps
  # is returned instead of failing inside yield.
  def each
    return hsps unless block_given?
    hsps.each { |h| yield h }
  end

  # One-line summary terminated by a newline.
  def to_s
    <<EOM
iter_num=#{parent.iter_num}, hit_id=#{hit_id}, hit_def=#{hit_def}, hit_num=#{hit_num}
EOM
  end
end
128
+
129
# Wrapper around one 'Iteration' XML node. Iteration_* child elements are
# exposed as reader methods, and the hits below the 'Iteration_hits' child
# can be enumerated.
class NokogiriBlastIterator
  include MapXPath
  attr_reader :parent

  MapXPath.define_s 'Iteration_query-ID'  => :query_id,
                    'Iteration_query-def' => :query_def
  MapXPath.define_i 'Iteration_iter-num'  => :iter_num,
                    'Iteration_query-len' => :query_len

  # iterator:: either the 'Iteration' node itself or a node named
  #            'document', in which case its first child is used
  # parent::   owning object, kept for navigation back up
  # opts::     :prefix is stored in @prefix and prepended to element names
  #            by the field lookup
  #
  # Raises RuntimeError when the resulting node is not named 'Iteration'.
  def initialize iterator, parent, opts = { :prefix => nil }
    @parent = parent
    @prefix = opts[:prefix]
    @xml = iterator.name == 'document' ? iterator.children.first : iterator
    name2 = @xml.name
    raise "Error in BLAST XML, expected Iteration node, but got #{name2}" if name2 != 'Iteration'
  end

  # Return an Enumerator producing a NokogiriBlastHit for every child named
  # 'Hit' found under a child named 'Iteration_hits'. Nothing is built until
  # the enumerator is consumed.
  def hits
    Enumerator.new do |yielder|
      @xml.children.each do |iter_field|
        next unless iter_field.name == 'Iteration_hits'
        iter_field.children.each do |hit|
          yielder.yield NokogiriBlastHit.new(hit, self) if hit.name == 'Hit'
        end
      end
    end
  end

  # Yield each hit to the block. Without a block the enumerator from #hits
  # is returned instead of failing inside yield.
  def each
    return hits unless block_given?
    hits.each { |h| yield h }
  end

  # One-line summary terminated by a newline.
  def to_s
    <<EOM
iter_num=#{iter_num}, query_id=#{query_id}
EOM
  end
end
177
+
178
# Parses a complete BLAST XML document and walks its iterations.
class NokogiriBlastXml
  # document:: whatever Nokogiri::XML accepts as input; it is handed over
  #            unchanged every time #each runs
  def initialize document
    @xml = document
  end

  # Return an Enumerator over the iterations produced by #each.
  # Note: this takes no arguments, unlike Object#to_enum which it overrides.
  def to_enum
    Enumerator.new { |yielder|
      each { | iterator | yielder.yield(iterator) }
    }
  end

  # Parse the document (blank nodes dropped via noblanks) and call the block
  # with a NokogiriBlastIterator for every node named 'Iteration' below a
  # root child named 'BlastOutput_iterations'.
  #
  # Without a block the enumerator from #to_enum is returned; previously
  # this case failed when calling the missing block.
  def each &block
    return to_enum unless block
    doc = Nokogiri::XML(@xml) { | cfg | cfg.noblanks }
    doc.root.children.each do |blastnode|
      next unless blastnode.name == 'BlastOutput_iterations'
      blastnode.children.each do | iteration |
        block.call(NokogiriBlastIterator.new(iteration, self)) if iteration.name == 'Iteration'
      end
    end
  end
end
202
+ end
203
+ end
@@ -0,0 +1,19 @@
1
+
2
+
3
+ module Bio
4
+ module Blast
5
+
6
# Iterate a BLAST file yielding (lazy) results
#
# The whole file is handed to NokogiriBlastXml, which parses it in one go;
# the iterations are then produced through an Enumerator.
class XmlIterator
  # blastfilename:: path of the BLAST XML file to read
  def initialize blastfilename
    @fn = blastfilename
  end

  # Log the start of parsing and return an Enumerator over the iterations.
  #
  # The file is opened inside the enumerator with the block form of
  # File.open, so the handle is closed when a pass finishes and every pass
  # starts from a fresh handle. As a consequence a missing file is reported
  # when enumeration starts, not when to_enum is called.
  def to_enum
    logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
    logger.info("parsing (full) #{@fn}")
    Enumerator.new do |yielder|
      File.open(@fn) do |f|
        NokogiriBlastXml.new(f).each { |iteration| yielder.yield(iteration) }
      end
    end
  end
end
18
+ end
19
+ end
@@ -0,0 +1,43 @@
1
+ module Bio
2
+ module Blast
3
# Reads a full XML result and splits it out into a buffer for each
# Iteration (query result).
#
# The file is scanned line by line; only the text between an "<Iteration>"
# line and the matching "</Iteration>" line is parsed at a time.
class XmlSplitterIterator
  # fn:: path of the BLAST XML file to read
  def initialize fn
    @fn = fn
  end

  # Return an Enumerator producing one NokogiriBlastIterator per Iteration.
  #
  # Logging, opening the file and parsing all happen when the enumerator is
  # consumed. The file is opened with the block form of File.open so the
  # handle is closed once enumeration finishes or is aborted by an error.
  def to_enum
    Enumerator.new do | yielder |
      logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
      logger.info("split file parsing #{@fn}")
      File.open(@fn) do | f |
        # Skip BLAST header: consume lines up to and including the first
        # line that is exactly "<Iteration>" (ignoring surrounding blanks).
        f.each_line do | line |
          break if line.strip == "<Iteration>"
        end
        # Return each Iteration as an XML DOM
        each_iteration(f) do | buf |
          iteration = Nokogiri::XML.parse(buf.join) { | cfg | cfg.noblanks }
          yielder.yield NokogiriBlastIterator.new(iteration, self, :prefix => nil)
        end
      end
    end
  end

  private

  # Yield an Array of lines for every Iteration read from +f+.
  #
  # The first buffer is seeded with "<Iteration>\n" because the header skip
  # in to_enum has already consumed that opening line; later buffers start
  # empty and pick the opening line up from the file. Lines after the last
  # "</Iteration>" are collected but never yielded.
  def each_iteration f
    b = ["<Iteration>\n"]
    f.each_line do | line |
      b << line
      if line.strip == "</Iteration>"
        yield b
        b = []
      end
    end
  end
end
42
+ end
43
+ end
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+
4
blast_xml = 'test/data/nt_example_blastn.m7'

# Walk every report in the file (the original note describes each "report"
# as a Bio::Blast::Report) and list the hits whose e-value is below 0.001.
# To read from ARGF instead, use: Bio::Blast.reports(ARGF) do |report|
Bio::Blast.reports(File.new(blast_xml)) do |report|
  heading = "Hits for " + report.query_def + " against " + report.db
  puts heading
  report.each do |hit|
    next unless hit.evalue < 0.001
    print hit.target_id, "\t", hit.evalue, "\n"
  end
end
@@ -0,0 +1,17 @@
1
+ #! /usr/bin/ruby
2
+
3
# Make the lib directory (one level above this file's directory) loadable.
base_dir = File.dirname(File.dirname(__FILE__))
$LOAD_PATH << File.join(base_dir, 'lib')

require 'bio-blastxmlparser'

blast_xml = 'test/data/nt_example_blastn.m7'
iterations = Bio::Blast::XmlIterator.new(blast_xml).to_enum

# For every iteration print its query id, then one line per HSP whose
# e-value is below 0.001: the hit id, a tab and the e-value.
iterations.each do |iteration|
  puts "Hits for " + iteration.query_id
  iteration.each do |hit|
    hit.each do |hsp|
      next unless hsp.evalue < 0.001
      print hit.hit_id, "\t", hsp.evalue, "\n"
    end
  end
end
17
+
@@ -0,0 +1,25 @@
1
+ #! /usr/bin/ruby
2
+
3
+ require 'rubygems'
4
+ require 'libxml'
5
+
6
+ include LibXML
7
+
8
# SAX callbacks that print a marker for every 'Iteration_iter-num' start tag.
class PostCallbacks
  include XML::SaxParser::Callbacks

  # Called with the element name and its attributes for each start tag;
  # only 'Iteration_iter-num' produces output (no trailing newline).
  # The same check could be made for 'Hsp_score'.
  def on_start_element(element, attributes)
    return unless element == 'Iteration_iter-num'
    # Process row of data here
    print "---- ", element
  end
end
21
+
22
# Run the SAX parser over the sample BLAST output with the callbacks above.
sax = XML::SaxParser.file("test/data/nt_example_blastn.m7")
sax.callbacks = PostCallbacks.new
sax.parse
25
+
@@ -0,0 +1,17 @@
1
+ #! /usr/bin/ruby
2
+
3
+ require 'rubygems'
4
+ require 'nokogiri'
5
+
6
+ include Nokogiri
7
+
8
# Load the whole sample BLAST output into a DOM.
doc = Nokogiri::XML(File.new("test/data/nt_example_blastn.m7"))

# For every Iteration element print a marker followed by its iteration
# number, then the list of Hsp_score texts found below it.
doc.root.xpath("//Iteration").each do |iteration|
  print "---- "
  print iteration.xpath("Iteration_iter-num/text()"), "\n"
  scores = iteration.xpath("Iteration_hits/Hit/Hit_hsps/Hsp/Hsp_score/text()").map(&:to_s)
  print scores, "\n"
end
16
+
17
+