bio-blastxmlparser 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,88 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for bio-blastxmlparser 0.6.0.
# NOTE: this file is generated by jeweler; regenerate with 'rake gemspec'
# after changing Jeweler::Tasks in the Rakefile.
Gem::Specification.new do |s|
  s.name = 'bio-blastxmlparser'
  s.version = '0.6.0'

  s.required_rubygems_version = Gem::Requirement.new('>= 0') if s.respond_to? :required_rubygems_version=
  s.authors = ['Pjotr Prins']
  s.date = '2011-02-14'
  # default_executable is deprecated in RubyGems; guard the writer the same
  # way required_rubygems_version= is guarded so the spec still loads when
  # the installed RubyGems does not provide it.
  s.default_executable = 'blastxmlparser' if s.respond_to? :default_executable=
  s.description = 'Fast big data XML parser and library, written in Ruby'
  s.email = 'pjotr.public01@thebird.nl'
  s.executables = ['blastxmlparser']
  s.extra_rdoc_files = [
    'LICENSE.txt',
    'README.rdoc'
  ]
  s.files = [
    '.document',
    '.rspec',
    'Gemfile',
    'Gemfile.lock',
    'LICENSE.txt',
    'README.rdoc',
    'Rakefile',
    'VERSION',
    'bin/blastxmlparser',
    'bio-blastxmlparser.gemspec',
    'lib/bio-blastxmlparser.rb',
    'lib/bio/db/blast/parser/nokogiri.rb',
    'lib/bio/db/blast/xmliterator.rb',
    'lib/bio/db/blast/xmlsplitter.rb',
    'sample/bioruby.rb',
    'sample/blastxmlparserdemo.rb',
    'sample/libxml_sax.rb',
    'sample/nokogiri_dom.rb',
    'sample/nokogiri_sax.rb',
    'sample/nokogiri_split_dom.rb',
    'spec/bio-blastxmlparser_spec.rb',
    'spec/spec_helper.rb',
    'test/data/aa_example.fasta',
    'test/data/aa_example_blastp.m7',
    'test/data/nt_example.fasta',
    'test/data/nt_example_blastn.m7',
    'timings.sh'
  ]
  s.homepage = 'http://github.com/pjotrp/bioruby-blastxmlparser'
  s.licenses = ['MIT']
  s.require_paths = ['lib']
  s.rubygems_version = '1.3.7'
  s.summary = 'BLAST XML parser'
  s.test_files = [
    'spec/bio-blastxmlparser_spec.rb',
    'spec/spec_helper.rb'
  ]

  # Dependencies are listed once; arrays of pairs keep declaration order.
  runtime_deps = [
    ['bio-logger', '> 0.8.0'],
    ['nokogiri', '>= 1.4.4']
  ]
  development_deps = [
    ['rspec', '~> 2.3.0'],
    ['bundler', '~> 1.0.0'],
    ['jeweler', '~> 1.5.2'],
    ['rcov', '>= 0']
  ]

  versioned_spec = s.respond_to?(:specification_version)
  s.specification_version = 3 if versioned_spec

  if versioned_spec && Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
    runtime_deps.each { |name, req| s.add_runtime_dependency(name, [req]) }
    development_deps.each { |name, req| s.add_development_dependency(name, [req]) }
  else
    # Older RubyGems: no runtime/development split, declare everything plainly.
    (runtime_deps + development_deps).each { |name, req| s.add_dependency(name, [req]) }
  end
end
88
+
@@ -0,0 +1,17 @@
1
# Use a sibling 'bioruby-logger' checkout when one exists next to this
# project; otherwise activate the installed bio-logger gem.
rootpath = File.dirname(File.dirname(__FILE__))
bio_logger_path = File.join(rootpath, '..', 'bioruby-logger', 'lib')
if File.directory?(bio_logger_path)
  $LOAD_PATH << bio_logger_path
  $stderr.print "bio-logger loaded directly\n"
else
  require 'rubygems'
  gem 'bio-logger'
end
require 'bio-logger'

# Register the logger that the parser classes look up by name.
Bio::Log::LoggerPlus.new('bio-blastxmlparser')

# Load the parser components in their original order.
%w[parser/nokogiri xmlsplitter xmliterator].each do |component|
  require "bio/db/blast/#{component}"
end
@@ -0,0 +1,203 @@
1
+
2
+ require 'nokogiri'
3
+ require 'enumerator'
4
+
5
+ module Bio
6
+ module Blast
7
+
8
# Mixin for objects that hold an XML node in @xml (and optionally an XPath
# prefix in @prefix); provides text lookup of child elements.
module XPath
  # Return the text content of element +name+ (looked up relative to @xml,
  # with @prefix prepended when set) as a String.
  #
  # A warning is logged when the lookup yields nothing; the return value is
  # then the empty string.
  def field(name)
    res = @xml.xpath("#{@prefix}#{name}/text()")
    # Nokogiri's xpath returns an empty node set rather than nil when nothing
    # matches, so test for emptiness as well as nil.
    if res.nil? || (res.respond_to?(:empty?) && res.empty?)
      logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
      # The logger may be unregistered when this file is loaded on its own.
      logger.warn("XML element <#{name}> has no content") if logger
    end
    res.to_s
  end
end
23
+
24
+ # Some magic to create XML -> method mappers, on the fly
25
# Generates reader methods that map XML element names to Ruby methods.
#
# NOTE: the define_* helpers call define_method with MapXPath as receiver,
# so every generated reader lives on this module and is visible to every
# class that includes it.
module MapXPath
  include XPath

  # For each 'ElementName' => :method pair, define a reader returning the
  # element text as a String.
  def self.define_s(map)
    map.each do |xml_name, method_name|
      define_method(method_name) { field(xml_name) }
    end
  end

  # As define_s, but the reader converts the text with to_i.
  def self.define_i(map)
    map.each do |xml_name, method_name|
      define_method(method_name) { field(xml_name).to_i }
    end
  end

  # As define_s, but the reader converts the text with to_f.
  def self.define_f(map)
    map.each do |xml_name, method_name|
      define_method(method_name) { field(xml_name).to_f }
    end
  end

  # Look up an arbitrary element by its XML name.
  def [](name)
    field(name)
  end
end
53
+
54
# Wraps one <Hsp> XML node and exposes its fields as reader methods.
class NokogiriBlastHsp
  include MapXPath

  # The hit object this HSP was produced from.
  attr_reader :parent

  # String fields
  MapXPath.define_s({
    'Hsp_id' => :hsp_id,
    'Hsp_qseq' => :qseq,
    'Hsp_hseq' => :hseq,
    'Hsp_midline' => :midline
  })
  # Integer fields
  MapXPath.define_i({
    'Hsp_num' => :hsp_num,
    'Hsp_score' => :score,
    'Hsp_query-from' => :query_from,
    'Hsp_query-to' => :query_to,
    'Hsp_hit-from' => :hit_from,
    'Hsp_hit-to' => :hit_to,
    'Hsp_query-frame' => :query_frame,
    'Hsp_hit-frame' => :hit_frame,
    'Hsp_identity' => :identity,
    'Hsp_positive' => :positive,
    'Hsp_align-len' => :align_len
  })
  # Float fields
  MapXPath.define_f({
    'Hsp_bit-score' => :bit_score,
    'Hsp_evalue' => :evalue
  })

  # xml is the <Hsp> node; parent is the owning hit.
  def initialize(xml, parent)
    @xml = xml
    @parent = parent
  end

  # One-line summary, terminated by a newline.
  def to_s
    "hit_num=#{parent.hit_num}, hsp_num=#{hsp_num}, score=#{score}, bit_score=#{bit_score}\n"
  end
end
87
+
88
# Wraps one <Hit> XML node and gives access to its fields and HSPs.
class NokogiriBlastHit
  include MapXPath

  # The iteration object this hit was produced from.
  attr_reader :parent

  # String fields
  MapXPath.define_s({
    'Hit_id' => :hit_id,
    'Hit_def' => :hit_def,
    'Hit_accession' => :accession
  })
  # Integer fields
  MapXPath.define_i({
    'Hit_num' => :hit_num,
    'Hit_len' => :len
  })

  # hit is the <Hit> node; parent is the owning iteration.
  def initialize(hit, parent)
    @xml = hit
    @parent = parent
  end

  # Enumerator over the <Hsp> nodes found under <Hit_hsps>, each wrapped in
  # a NokogiriBlastHsp whose parent is this hit.
  def hsps
    Enumerator.new do |yielder|
      @xml.children.each do |hit_field|
        next unless hit_field.name == 'Hit_hsps'

        hit_field.children.each do |hsp|
          yielder.yield NokogiriBlastHsp.new(hsp, self) if hsp.name == 'Hsp'
        end
      end
    end
  end

  # Yield every HSP of this hit to the given block.
  def each
    hsps.each { |hsp| yield hsp }
  end

  # One-line summary, terminated by a newline.
  def to_s
    "iter_num=#{parent.iter_num}, hit_id=#{hit_id}, hit_def=#{hit_def}, hit_num=#{hit_num}\n"
  end
end
128
+
129
# Wraps one <Iteration> XML node (one query result) and gives access to its
# fields and hits.
class NokogiriBlastIterator
  include MapXPath

  # The object that created this iteration wrapper.
  attr_reader :parent

  # String fields
  MapXPath.define_s({
    'Iteration_query-ID' => :query_id,
    'Iteration_query-def' => :query_def
  })
  # Integer fields
  MapXPath.define_i({
    'Iteration_iter-num' => :iter_num,
    'Iteration_query-len' => :query_len
  })

  # iterator is either an <Iteration> node or a document whose first child
  # is one; opts[:prefix] is prepended to every XPath lookup.
  # Raises a RuntimeError when the resolved node is not named 'Iteration'.
  def initialize(iterator, parent, opts = { :prefix => nil })
    @parent = parent
    @prefix = opts[:prefix]
    # A parsed document is unwrapped to its first child.
    @xml = iterator.name == 'document' ? iterator.children.first : iterator
    node_name = @xml.name
    unless node_name == 'Iteration'
      raise "Error in BLAST XML, expected Iteration node, but got #{node_name}"
    end
  end

  # Enumerator over the <Hit> nodes found under <Iteration_hits>, each
  # wrapped in a NokogiriBlastHit whose parent is this iteration.
  def hits
    Enumerator.new do |yielder|
      @xml.children.each do |iter_field|
        next unless iter_field.name == 'Iteration_hits'

        iter_field.children.each do |hit|
          yielder.yield NokogiriBlastHit.new(hit, self) if hit.name == 'Hit'
        end
      end
    end
  end

  # Yield every hit of this iteration to the given block.
  def each
    hits.each { |hit| yield hit }
  end

  # One-line summary, terminated by a newline.
  def to_s
    "iter_num=#{iter_num}, query_id=#{query_id}\n"
  end
end
177
+
178
# Parses a complete BLAST XML document and walks its <Iteration> nodes.
class NokogiriBlastXml
  # document is whatever Nokogiri::XML accepts as input.
  def initialize(document)
    @xml = document
  end

  # Enumerator that produces the same objects as #each.
  def to_enum
    Enumerator.new do |yielder|
      each { |iteration| yielder.yield(iteration) }
    end
  end

  # Parse the document and call the block with a NokogiriBlastIterator for
  # every <Iteration> under <BlastOutput_iterations>.
  def each(&block)
    doc = Nokogiri::XML(@xml) { |cfg| cfg.noblanks }
    doc.root.children.each do |blastnode|
      next unless blastnode.name == 'BlastOutput_iterations'

      blastnode.children.each do |iteration|
        block.call(NokogiriBlastIterator.new(iteration, self)) if iteration.name == 'Iteration'
      end
    end
  end
end
202
+ end
203
+ end
@@ -0,0 +1,19 @@
1
+
2
+
3
+ module Bio
4
+ module Blast
5
+
6
+ # Iterate a BLAST file yielding (lazy) results
7
# Parses a whole BLAST XML file and enumerates its iterations.
class XmlIterator
  # blastfilename is the path of the BLAST XML file to read.
  def initialize(blastfilename)
    @fn = blastfilename
  end

  # Return an Enumerator over the iterations of the file.
  #
  # The file is opened when enumeration starts and closed when it ends, so
  # no handle is leaked and the enumerator can be walked more than once.
  # A missing or unreadable file therefore raises on first use of the
  # enumerator rather than on this call.
  def to_enum
    logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
    logger.info("parsing (full) #{@fn}")
    Enumerator.new do |yielder|
      File.open(@fn) do |file|
        NokogiriBlastXml.new(file).each { |iteration| yielder.yield(iteration) }
      end
    end
  end
end
18
+ end
19
+ end
@@ -0,0 +1,43 @@
1
+ module Bio
2
+ module Blast
3
+ # Reads a full XML result and splits it out into a buffer for each
4
+ # Iteration (query result).
5
# Reads a BLAST XML file line by line and parses each <Iteration> section
# separately, instead of building one DOM for the whole file.
class XmlSplitterIterator
  # fn is the path of the BLAST XML file to read.
  def initialize(fn)
    @fn = fn
  end

  # Return an Enumerator yielding one NokogiriBlastIterator per <Iteration>.
  # The file is opened when enumeration starts and is closed when it ends,
  # including when the consumer stops early.
  def to_enum
    Enumerator.new do |yielder|
      logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
      logger.info("split file parsing #{@fn}")
      File.open(@fn) do |f|
        # Skip the BLAST header, consuming the first <Iteration> line.
        f.each_line do |line|
          break if line.strip == '<Iteration>'
        end
        # Parse each buffered Iteration into its own XML DOM.
        each_iteration(f) do |buf|
          iteration = Nokogiri::XML.parse(buf.join) { |cfg| cfg.noblanks }
          yielder.yield NokogiriBlastIterator.new(iteration, self, :prefix => nil)
        end
      end
    end
  end

  private

  # Yield an array of lines for every Iteration read from f. The first
  # buffer is seeded with the opening tag that the header skip consumed.
  # Lines after the last </Iteration> are read but never yielded.
  def each_iteration(f)
    buffer = ["<Iteration>\n"]
    f.each_line do |line|
      buffer << line
      next unless line.strip == '</Iteration>'

      yield buffer
      buffer = []
    end
  end
end
42
+ end
43
+ end
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+
4
fn = 'test/data/nt_example_blastn.m7'

# Walk every XML result in the file using BioRuby's own parser.
# The variable "report" is a Bio::Blast::Report object.
# Alternative reading from ARGF: Bio::Blast.reports(ARGF) do |report|
Bio::Blast.reports(File.new(fn)) do |report|
  heading = "Hits for " + report.query_def
  puts heading + " against " + report.db
  report.each do |hit|
    # Only report hits below the e-value cutoff.
    next unless hit.evalue < 0.001

    print hit.target_id, "\t", hit.evalue, "\n"
  end
end
@@ -0,0 +1,17 @@
1
+ #! /usr/bin/ruby
2
+
3
# Make the project's own lib directory loadable.
rootpath = File.dirname(File.dirname(__FILE__))
$LOAD_PATH << File.join(rootpath, 'lib')

require 'bio-blastxmlparser'
fn = 'test/data/nt_example_blastn.m7'
n = Bio::Blast::XmlIterator.new(fn).to_enum
n.each do |iter|
  puts "Hits for " + iter.query_id
  iter.each do |hit|
    hit.each do |hsp|
      # Only report HSPs below the e-value cutoff.
      next unless hsp.evalue < 0.001

      print hit.hit_id, "\t", hsp.evalue, "\n"
    end
  end
end
17
+
@@ -0,0 +1,25 @@
1
+ #! /usr/bin/ruby
2
+
3
+ require 'rubygems'
4
+ require 'libxml'
5
+
6
+ include LibXML
7
+
8
# SAX callbacks that print a marker for every Iteration_iter-num element.
class PostCallbacks
  include XML::SaxParser::Callbacks

  # Called for each opening tag; all elements other than
  # 'Iteration_iter-num' are ignored.
  def on_start_element(element, attributes)
    return unless element == 'Iteration_iter-num'

    # Process row of data here
    print "---- ", element
  end
end
21
+
22
# Stream the sample BLAST file through the SAX parser using PostCallbacks.
parser = XML::SaxParser.file("test/data/nt_example_blastn.m7")
parser.callbacks = PostCallbacks.new
parser.parse
25
+
@@ -0,0 +1,17 @@
1
+ #! /usr/bin/ruby
2
+
3
+ require 'rubygems'
4
+ require 'nokogiri'
5
+
6
+ include Nokogiri
7
+
8
# Load the whole sample file into a DOM.
input = Nokogiri::XML(File.new("test/data/nt_example_blastn.m7"))

# For every Iteration, print its number followed by all HSP scores.
input.root.xpath("//Iteration").each do |iteration|
  print "---- "
  print iteration.xpath("Iteration_iter-num/text()"), "\n"
  scores = iteration.xpath("Iteration_hits/Hit/Hit_hsps/Hsp/Hsp_score/text()")
  print scores.map { |score| score.to_s }, "\n"
end
16
+
17
+