bio-blastxmlparser 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +18 -0
- data/Gemfile.lock +34 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +211 -0
- data/Rakefile +50 -0
- data/VERSION +1 -0
- data/bin/blastxmlparser +165 -0
- data/bio-blastxmlparser.gemspec +88 -0
- data/lib/bio-blastxmlparser.rb +17 -0
- data/lib/bio/db/blast/parser/nokogiri.rb +203 -0
- data/lib/bio/db/blast/xmliterator.rb +19 -0
- data/lib/bio/db/blast/xmlsplitter.rb +43 -0
- data/sample/bioruby.rb +14 -0
- data/sample/blastxmlparserdemo.rb +17 -0
- data/sample/libxml_sax.rb +25 -0
- data/sample/nokogiri_dom.rb +17 -0
- data/sample/nokogiri_sax.rb +26 -0
- data/sample/nokogiri_split_dom.rb +34 -0
- data/spec/bio-blastxmlparser_spec.rb +104 -0
- data/spec/spec_helper.rb +12 -0
- data/test/data/aa_example.fasta +42 -0
- data/test/data/aa_example_blastp.m7 +5021 -0
- data/test/data/nt_example.fasta +88 -0
- data/test/data/nt_example_blastn.m7 +85538 -0
- data/timings.sh +28 -0
- metadata +180 -0
@@ -0,0 +1,88 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{bio-blastxmlparser}
|
8
|
+
s.version = "0.6.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Pjotr Prins"]
|
12
|
+
s.date = %q{2011-02-14}
|
13
|
+
s.default_executable = %q{blastxmlparser}
|
14
|
+
s.description = %q{Fast big data XML parser and library, written in Ruby}
|
15
|
+
s.email = %q{pjotr.public01@thebird.nl}
|
16
|
+
s.executables = ["blastxmlparser"]
|
17
|
+
s.extra_rdoc_files = [
|
18
|
+
"LICENSE.txt",
|
19
|
+
"README.rdoc"
|
20
|
+
]
|
21
|
+
s.files = [
|
22
|
+
".document",
|
23
|
+
".rspec",
|
24
|
+
"Gemfile",
|
25
|
+
"Gemfile.lock",
|
26
|
+
"LICENSE.txt",
|
27
|
+
"README.rdoc",
|
28
|
+
"Rakefile",
|
29
|
+
"VERSION",
|
30
|
+
"bin/blastxmlparser",
|
31
|
+
"bio-blastxmlparser.gemspec",
|
32
|
+
"lib/bio-blastxmlparser.rb",
|
33
|
+
"lib/bio/db/blast/parser/nokogiri.rb",
|
34
|
+
"lib/bio/db/blast/xmliterator.rb",
|
35
|
+
"lib/bio/db/blast/xmlsplitter.rb",
|
36
|
+
"sample/bioruby.rb",
|
37
|
+
"sample/blastxmlparserdemo.rb",
|
38
|
+
"sample/libxml_sax.rb",
|
39
|
+
"sample/nokogiri_dom.rb",
|
40
|
+
"sample/nokogiri_sax.rb",
|
41
|
+
"sample/nokogiri_split_dom.rb",
|
42
|
+
"spec/bio-blastxmlparser_spec.rb",
|
43
|
+
"spec/spec_helper.rb",
|
44
|
+
"test/data/aa_example.fasta",
|
45
|
+
"test/data/aa_example_blastp.m7",
|
46
|
+
"test/data/nt_example.fasta",
|
47
|
+
"test/data/nt_example_blastn.m7",
|
48
|
+
"timings.sh"
|
49
|
+
]
|
50
|
+
s.homepage = %q{http://github.com/pjotrp/bioruby-blastxmlparser}
|
51
|
+
s.licenses = ["MIT"]
|
52
|
+
s.require_paths = ["lib"]
|
53
|
+
s.rubygems_version = %q{1.3.7}
|
54
|
+
s.summary = %q{BLAST XML parser}
|
55
|
+
s.test_files = [
|
56
|
+
"spec/bio-blastxmlparser_spec.rb",
|
57
|
+
"spec/spec_helper.rb"
|
58
|
+
]
|
59
|
+
|
60
|
+
if s.respond_to? :specification_version then
|
61
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
62
|
+
s.specification_version = 3
|
63
|
+
|
64
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
65
|
+
s.add_runtime_dependency(%q<bio-logger>, ["> 0.8.0"])
|
66
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.4"])
|
67
|
+
s.add_development_dependency(%q<rspec>, ["~> 2.3.0"])
|
68
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
69
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
|
70
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
71
|
+
else
|
72
|
+
s.add_dependency(%q<bio-logger>, ["> 0.8.0"])
|
73
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
74
|
+
s.add_dependency(%q<rspec>, ["~> 2.3.0"])
|
75
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
76
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
77
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
78
|
+
end
|
79
|
+
else
|
80
|
+
s.add_dependency(%q<bio-logger>, ["> 0.8.0"])
|
81
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
82
|
+
s.add_dependency(%q<rspec>, ["~> 2.3.0"])
|
83
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
84
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
85
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Prefer a bio-logger source checkout that sits in a sibling 'bioruby-logger'
# directory; when no such directory exists, activate the installed gem.
project_root = File.dirname(File.dirname(__FILE__))
local_logger_lib = File.join(project_root, '..', 'bioruby-logger', 'lib')
if File.directory?(local_logger_lib)
  $LOAD_PATH << local_logger_lib
  $stderr.print "bio-logger loaded directly\n"
else
  require 'rubygems'
  gem 'bio-logger'
end
require 'bio-logger'

# Create the named logger that the parser classes look up through
# Bio::Log::LoggerPlus['bio-blastxmlparser'].
Bio::Log::LoggerPlus.new('bio-blastxmlparser')

require 'bio/db/blast/parser/nokogiri'
require 'bio/db/blast/xmlsplitter'
require 'bio/db/blast/xmliterator'
|
@@ -0,0 +1,203 @@
|
|
1
|
+
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'enumerator'
|
4
|
+
|
5
|
+
module Bio
|
6
|
+
module Blast
|
7
|
+
|
8
|
+
# Reads the text content of XML elements through XPath.
#
# The including object must set @xml to something that responds to #xpath;
# it may also set @prefix, which is prepended to every element name.
module XPath
  # Looks up the text node(s) of element +name+ and returns them as a String.
  #
  # name:: element name, used as a path relative to @xml (after @prefix)
  #
  # Returns the matched text, or an empty String when nothing matched. A
  # warning is logged on the 'bio-blastxmlparser' logger in the latter case.
  def field(name)
    # An unset @prefix interpolates to '', so one expression covers both cases.
    nodes = @xml.xpath("#{@prefix}#{name}/text()")
    # #xpath hands back a (possibly empty) collection rather than nil, so the
    # "no content" condition has to be detected with #empty?.
    if nodes.nil? || nodes.empty?
      logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
      logger.warn("XML element <#{name}> has no content")
    end
    nodes.to_s
  end
end
|
23
|
+
|
24
|
+
# Generates reader methods from "XML element name => method name" tables.
#
# The define_* helpers are called on MapXPath itself, so the generated readers
# are instance methods of this module and are available to every class that
# includes it.
module MapXPath
  include XPath

  # Defines readers that return the element text as a String.
  def self.define_s(map)
    map.each do |element, reader|
      define_method(reader) { field(element) }
    end
  end

  # Defines readers that convert the element text with String#to_i.
  def self.define_i(map)
    map.each do |element, reader|
      define_method(reader) { field(element).to_i }
    end
  end

  # Defines readers that convert the element text with String#to_f.
  def self.define_f(map)
    map.each do |element, reader|
      define_method(reader) { field(element).to_f }
    end
  end

  # Returns the text of an arbitrary element by its XML name.
  def [](name)
    field(name)
  end
end
|
53
|
+
|
54
|
+
# Read-only view of one Hsp XML node; +parent+ is the object handed to the
# constructor (to_s expects it to respond to #hit_num).
class NokogiriBlastHsp
  include MapXPath
  attr_reader :parent

  # String-valued elements
  MapXPath.define_s('Hsp_id'      => :hsp_id,
                    'Hsp_qseq'    => :qseq,
                    'Hsp_hseq'    => :hseq,
                    'Hsp_midline' => :midline)
  # Integer-valued elements
  MapXPath.define_i('Hsp_num'         => :hsp_num,
                    'Hsp_score'       => :score,
                    'Hsp_query-from'  => :query_from,
                    'Hsp_query-to'    => :query_to,
                    'Hsp_hit-from'    => :hit_from,
                    'Hsp_hit-to'      => :hit_to,
                    'Hsp_query-frame' => :query_frame,
                    'Hsp_hit-frame'   => :hit_frame,
                    'Hsp_identity'    => :identity,
                    'Hsp_positive'    => :positive,
                    'Hsp_align-len'   => :align_len)
  # Float-valued elements
  MapXPath.define_f('Hsp_bit-score' => :bit_score,
                    'Hsp_evalue'    => :evalue)

  # xml::    the Hsp node to read fields from
  # parent:: the owning hit object
  def initialize(xml, parent)
    @xml = xml
    @parent = parent
  end

  # One-line, newline-terminated summary of this HSP.
  def to_s
    "hit_num=#{parent.hit_num}, hsp_num=#{hsp_num}, score=#{score}, bit_score=#{bit_score}\n"
  end
end
|
87
|
+
|
88
|
+
# Read-only view of one Hit XML node; +parent+ is the object handed to the
# constructor (to_s expects it to respond to #iter_num).
class NokogiriBlastHit
  include MapXPath
  attr_reader :parent

  MapXPath.define_s 'Hit_id'        => :hit_id,
                    'Hit_def'       => :hit_def,
                    'Hit_accession' => :accession
  MapXPath.define_i 'Hit_num' => :hit_num,
                    'Hit_len' => :len

  # hit::    the Hit node to read fields from
  # parent:: the owning iteration object
  def initialize(hit, parent)
    @xml = hit
    @parent = parent
  end

  # Returns an Enumerator producing a NokogiriBlastHsp for every Hsp child
  # found under this hit's Hit_hsps children.
  def hsps
    Enumerator.new do |yielder|
      @xml.children.each do |hit_field|
        next unless hit_field.name == 'Hit_hsps'

        hit_field.children.each do |hsp|
          yielder.yield NokogiriBlastHsp.new(hsp, self) if hsp.name == 'Hsp'
        end
      end
    end
  end

  # Yields each HSP of this hit. Without a block the #hsps enumerator is
  # returned instead of failing on the first yield.
  def each
    return hsps unless block_given?

    hsps.each { |hsp| yield hsp }
  end

  # One-line, newline-terminated summary of this hit.
  def to_s
    "iter_num=#{parent.iter_num}, hit_id=#{hit_id}, hit_def=#{hit_def}, hit_num=#{hit_num}\n"
  end
end
|
128
|
+
|
129
|
+
# Read-only view of one Iteration XML node (the result for one query).
class NokogiriBlastIterator
  include MapXPath
  attr_reader :parent

  MapXPath.define_s 'Iteration_query-ID'  => :query_id,
                    'Iteration_query-def' => :query_def

  MapXPath.define_i 'Iteration_iter-num'  => :iter_num,
                    'Iteration_query-len' => :query_len

  # iterator:: an Iteration node, or a node named 'document' whose first child
  #            is the Iteration node
  # parent::   the object that created this iterator
  # opts::     :prefix is stored in @prefix and prepended to element names
  #            by the field lookup
  #
  # Raises RuntimeError when the resolved node is not named 'Iteration'.
  def initialize(iterator, parent, opts = { :prefix => nil })
    @parent = parent
    @prefix = opts[:prefix]
    # A node named 'document' is unwrapped to its first child.
    @xml = iterator.name == 'document' ? iterator.children.first : iterator
    node_name = @xml.name
    raise "Error in BLAST XML, expected Iteration node, but got #{node_name}" if node_name != 'Iteration'
  end

  # Returns an Enumerator producing a NokogiriBlastHit for every Hit child
  # found under this iteration's Iteration_hits children.
  def hits
    Enumerator.new do |yielder|
      @xml.children.each do |iter_field|
        next unless iter_field.name == 'Iteration_hits'

        iter_field.children.each do |hit|
          yielder.yield NokogiriBlastHit.new(hit, self) if hit.name == 'Hit'
        end
      end
    end
  end

  # Yields each hit of this iteration. Without a block the #hits enumerator
  # is returned instead of failing on the first yield.
  def each
    return hits unless block_given?

    hits.each { |hit| yield hit }
  end

  # One-line, newline-terminated summary of this iteration.
  def to_s
    "iter_num=#{iter_num}, query_id=#{query_id}\n"
  end
end
|
177
|
+
|
178
|
+
# Parses a complete BLAST XML document with Nokogiri and walks its
# Iteration nodes.
class NokogiriBlastXml
  # document:: the XML source handed to Nokogiri::XML on every #each call
  def initialize(document)
    @xml = document
  end

  # Returns an Enumerator over the NokogiriBlastIterator objects produced by
  # #each. Parsing happens when the enumerator is consumed.
  def to_enum
    Enumerator.new do |yielder|
      each { |iterator| yielder.yield(iterator) }
    end
  end

  # Parses the source and calls the block with a NokogiriBlastIterator for
  # every Iteration node below a BlastOutput_iterations child of the root.
  # Without a block the #to_enum enumerator is returned.
  #
  # Raises RuntimeError when the parsed document has no root element.
  def each(&block)
    return to_enum unless block

    doc = Nokogiri::XML(@xml) { |cfg| cfg.noblanks }
    root = doc.root
    # Fail with a descriptive message instead of a NoMethodError on nil.
    raise 'Error in BLAST XML, document has no root element' if root.nil?

    root.children.each do |blastnode|
      next unless blastnode.name == 'BlastOutput_iterations'

      blastnode.children.each do |iteration|
        block.call(NokogiriBlastIterator.new(iteration, self)) if iteration.name == 'Iteration'
      end
    end
  end
end
|
202
|
+
end
|
203
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module Blast
|
5
|
+
|
6
|
+
# Iterate a BLAST file yielding (lazy) results
class XmlIterator
  # blastfilename:: path of the BLAST XML file to parse
  def initialize(blastfilename)
    @fn = blastfilename
  end

  # Logs the file name and returns an Enumerator over the objects yielded by
  # NokogiriBlastXml#each for the file.
  #
  # The file is opened when the enumerator is consumed and closed again once
  # the traversal finishes, so no handle is leaked and the enumerator can be
  # traversed more than once. As a consequence, a missing or unreadable file
  # is reported at traversal time rather than when to_enum is called.
  def to_enum
    logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
    logger.info("parsing (full) #{@fn}")
    Enumerator.new do |yielder|
      File.open(@fn) do |io|
        NokogiriBlastXml.new(io).each { |iteration| yielder.yield(iteration) }
      end
    end
  end
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Bio
|
2
|
+
module Blast
|
3
|
+
# Reads a full XML result and splits it out into a buffer for each
# Iteration (query result).
class XmlSplitterIterator
  # fn:: path of the BLAST XML file to split
  def initialize(fn)
    @fn = fn
  end

  # Returns an Enumerator that, when consumed, reads the file line by line
  # and yields one NokogiriBlastIterator per Iteration section. Each section
  # is parsed on its own with Nokogiri::XML.parse.
  #
  # The file is opened with the block form of File.open so the handle is
  # closed when the traversal ends.
  def to_enum
    Enumerator.new do |yielder|
      logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
      logger.info("split file parsing #{@fn}")
      File.open(@fn) do |f|
        # Skip BLAST header: consume lines up to and including the first
        # opening Iteration tag.
        f.each_line { |line| break if line.strip == '<Iteration>' }
        # Return each Iteration as an XML DOM
        each_iteration(f) do |buf|
          iteration = Nokogiri::XML.parse(buf.join) { |cfg| cfg.noblanks }
          yielder.yield NokogiriBlastIterator.new(iteration, self, :prefix => nil)
        end
      end
    end
  end

  private

  # Collects lines from +f+ and yields the accumulated Array of lines each
  # time a closing Iteration tag is read. The first buffer is seeded with the
  # opening tag because the header skip in to_enum has already consumed it.
  # Lines after the last closing tag are collected but never yielded.
  def each_iteration(f)
    lines = ["<Iteration>\n"]
    f.each_line do |line|
      lines << line
      next unless line.strip == '</Iteration>'

      yield lines
      lines = []
    end
  end
end
|
42
|
+
end
|
43
|
+
end
|
data/sample/bioruby.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
require 'bio'

blast_file = 'test/data/nt_example_blastn.m7'

# Walk every report in the BLAST XML file with BioRuby's own parser and list
# the hits whose e-value is below 0.001.
# To read from ARGF instead: Bio::Blast.reports(ARGF) do |report|
Bio::Blast.reports(File.new(blast_file)) do |report|
  puts "Hits for " + report.query_def + " against " + report.db
  report.each do |hit|
    next unless hit.evalue < 0.001

    print hit.target_id, "\t", hit.evalue, "\n"
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#! /usr/bin/ruby

# Make the library in ../lib loadable when running from a source checkout.
project_root = File.dirname(File.dirname(__FILE__))
$LOAD_PATH << File.join(project_root, 'lib')

require 'bio-blastxmlparser'

blast_file = 'test/data/nt_example_blastn.m7'
iterations = Bio::Blast::XmlIterator.new(blast_file).to_enum

# Print every hit id together with each HSP e-value below 0.001.
iterations.each do |iter|
  puts "Hits for " + iter.query_id
  iter.each do |hit|
    hit.each do |hsp|
      next unless hsp.evalue < 0.001

      print hit.hit_id, "\t", hsp.evalue, "\n"
    end
  end
end
|
17
|
+
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#! /usr/bin/ruby

require 'rubygems'
require 'libxml'

include LibXML

# SAX callbacks that print a marker for every Iteration_iter-num start tag.
class PostCallbacks
  include XML::SaxParser::Callbacks

  # Called by the SAX parser for each opening tag; only the
  # Iteration_iter-num element is reported.
  def on_start_element(element, attributes)
    return unless element == 'Iteration_iter-num'

    # Process row of data here
    print "---- ", element
    # A further element such as 'Hsp_score' could be reported the same way.
  end
end

sax = XML::SaxParser.file("test/data/nt_example_blastn.m7")
sax.callbacks = PostCallbacks.new
sax.parse
|
25
|
+
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#! /usr/bin/ruby

require 'rubygems'
require 'nokogiri'

include Nokogiri

# Load the whole BLAST XML file into a DOM, then print the iteration number
# and all HSP scores of every Iteration element.
document = Nokogiri::XML(File.new("test/data/nt_example_blastn.m7"))

document.root.xpath("//Iteration").each do |iteration|
  print "---- "
  print iteration.xpath("Iteration_iter-num/text()"), "\n"
  scores = iteration.xpath("Iteration_hits/Hit/Hit_hsps/Hsp/Hsp_score/text()")
  print scores.map { |node| node.to_s }, "\n"
end
|
16
|
+
|
17
|
+
|