bio-blastxmlparser 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +18 -0
- data/Gemfile.lock +34 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +211 -0
- data/Rakefile +50 -0
- data/VERSION +1 -0
- data/bin/blastxmlparser +165 -0
- data/bio-blastxmlparser.gemspec +88 -0
- data/lib/bio-blastxmlparser.rb +17 -0
- data/lib/bio/db/blast/parser/nokogiri.rb +203 -0
- data/lib/bio/db/blast/xmliterator.rb +19 -0
- data/lib/bio/db/blast/xmlsplitter.rb +43 -0
- data/sample/bioruby.rb +14 -0
- data/sample/blastxmlparserdemo.rb +17 -0
- data/sample/libxml_sax.rb +25 -0
- data/sample/nokogiri_dom.rb +17 -0
- data/sample/nokogiri_sax.rb +26 -0
- data/sample/nokogiri_split_dom.rb +34 -0
- data/spec/bio-blastxmlparser_spec.rb +104 -0
- data/spec/spec_helper.rb +12 -0
- data/test/data/aa_example.fasta +42 -0
- data/test/data/aa_example_blastp.m7 +5021 -0
- data/test/data/nt_example.fasta +88 -0
- data/test/data/nt_example_blastn.m7 +85538 -0
- data/timings.sh +28 -0
- metadata +180 -0
@@ -0,0 +1,88 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{bio-blastxmlparser}
|
8
|
+
s.version = "0.6.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Pjotr Prins"]
|
12
|
+
s.date = %q{2011-02-14}
|
13
|
+
s.default_executable = %q{blastxmlparser}
|
14
|
+
s.description = %q{Fast big data XML parser and library, written in Ruby}
|
15
|
+
s.email = %q{pjotr.public01@thebird.nl}
|
16
|
+
s.executables = ["blastxmlparser"]
|
17
|
+
s.extra_rdoc_files = [
|
18
|
+
"LICENSE.txt",
|
19
|
+
"README.rdoc"
|
20
|
+
]
|
21
|
+
s.files = [
|
22
|
+
".document",
|
23
|
+
".rspec",
|
24
|
+
"Gemfile",
|
25
|
+
"Gemfile.lock",
|
26
|
+
"LICENSE.txt",
|
27
|
+
"README.rdoc",
|
28
|
+
"Rakefile",
|
29
|
+
"VERSION",
|
30
|
+
"bin/blastxmlparser",
|
31
|
+
"bio-blastxmlparser.gemspec",
|
32
|
+
"lib/bio-blastxmlparser.rb",
|
33
|
+
"lib/bio/db/blast/parser/nokogiri.rb",
|
34
|
+
"lib/bio/db/blast/xmliterator.rb",
|
35
|
+
"lib/bio/db/blast/xmlsplitter.rb",
|
36
|
+
"sample/bioruby.rb",
|
37
|
+
"sample/blastxmlparserdemo.rb",
|
38
|
+
"sample/libxml_sax.rb",
|
39
|
+
"sample/nokogiri_dom.rb",
|
40
|
+
"sample/nokogiri_sax.rb",
|
41
|
+
"sample/nokogiri_split_dom.rb",
|
42
|
+
"spec/bio-blastxmlparser_spec.rb",
|
43
|
+
"spec/spec_helper.rb",
|
44
|
+
"test/data/aa_example.fasta",
|
45
|
+
"test/data/aa_example_blastp.m7",
|
46
|
+
"test/data/nt_example.fasta",
|
47
|
+
"test/data/nt_example_blastn.m7",
|
48
|
+
"timings.sh"
|
49
|
+
]
|
50
|
+
s.homepage = %q{http://github.com/pjotrp/bioruby-blastxmlparser}
|
51
|
+
s.licenses = ["MIT"]
|
52
|
+
s.require_paths = ["lib"]
|
53
|
+
s.rubygems_version = %q{1.3.7}
|
54
|
+
s.summary = %q{BLAST XML parser}
|
55
|
+
s.test_files = [
|
56
|
+
"spec/bio-blastxmlparser_spec.rb",
|
57
|
+
"spec/spec_helper.rb"
|
58
|
+
]
|
59
|
+
|
60
|
+
if s.respond_to? :specification_version then
|
61
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
62
|
+
s.specification_version = 3
|
63
|
+
|
64
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
65
|
+
s.add_runtime_dependency(%q<bio-logger>, ["> 0.8.0"])
|
66
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.4"])
|
67
|
+
s.add_development_dependency(%q<rspec>, ["~> 2.3.0"])
|
68
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
69
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
|
70
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
71
|
+
else
|
72
|
+
s.add_dependency(%q<bio-logger>, ["> 0.8.0"])
|
73
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
74
|
+
s.add_dependency(%q<rspec>, ["~> 2.3.0"])
|
75
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
76
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
77
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
78
|
+
end
|
79
|
+
else
|
80
|
+
s.add_dependency(%q<bio-logger>, ["> 0.8.0"])
|
81
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
82
|
+
s.add_dependency(%q<rspec>, ["~> 2.3.0"])
|
83
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
84
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
85
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Library entry point.
#
# Prefer a bio-logger checkout that sits next to this project
# (../bioruby-logger/lib relative to the project root); when no such
# directory exists, activate the installed bio-logger gem instead.
local_logger_lib = File.join(File.dirname(File.dirname(__FILE__)),
                             '..', 'bioruby-logger', 'lib')
if File.directory?(local_logger_lib)
  $LOAD_PATH << local_logger_lib
  $stderr.print "bio-logger loaded directly\n"
else
  require "rubygems"
  gem "bio-logger"
end
require 'bio-logger'

# Create the logger that the parser classes look up by name
Bio::Log::LoggerPlus.new('bio-blastxmlparser')

require 'bio/db/blast/parser/nokogiri'
require 'bio/db/blast/xmlsplitter'
require 'bio/db/blast/xmliterator'
|
@@ -0,0 +1,203 @@
|
|
1
|
+
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'enumerator'
|
4
|
+
|
5
|
+
module Bio
|
6
|
+
module Blast
|
7
|
+
|
8
|
+
# Mixin giving access to the text of child elements of the node held in
# <tt>@xml</tt> by means of XPath queries.
module XPath
  # Return the text content of element +name+, looked up with the XPath
  # query <tt>name/text()</tt> on <tt>@xml</tt>. When the including object
  # has set <tt>@prefix</tt>, the prefix is prepended to the query.
  #
  # A warning is logged on the 'bio-blastxmlparser' logger when the query
  # yields nothing.
  #
  # name:: XML element name, e.g. 'Hsp_score'
  #
  # Returns the query result converted with to_s (an empty String when
  # nothing matched).
  def field name
    res = @xml.xpath("#{@prefix}#{name}/text()")
    # A query without matches comes back as an empty node set rather than
    # nil, so emptiness has to be tested as well for the warning to fire.
    if res.nil? || res.empty?
      logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
      # The logger is looked up by name; skip the warning when it is absent
      logger.warn("XML element <#{name}> has no content") if logger
    end
    res.to_s
  end
end
|
23
|
+
|
24
|
+
# Some magic to create XML -> method mappers, on the fly
#
# Every define_* helper takes a Hash of XML element name => method name and
# defines one reader method per entry. The readers are defined on MapXPath
# itself, so they become available to every class that includes this module.
module MapXPath
  include XPath

  # Define readers returning the element text as a String.
  def self.define_s map
    map.each_pair do |xml_name, method_name|
      define_method(method_name) { field(xml_name) }
    end
  end

  # Define readers returning the element text converted with to_i.
  def self.define_i map
    map.each_pair do |xml_name, method_name|
      define_method(method_name) { field(xml_name).to_i }
    end
  end

  # Define readers returning the element text converted with to_f.
  def self.define_f map
    map.each_pair do |xml_name, method_name|
      define_method(method_name) { field(xml_name).to_f }
    end
  end

  # Fetch the text of an arbitrary element by its XML name.
  def [] name
    field(name)
  end
end
|
53
|
+
|
54
|
+
# Wrapper around one <Hsp> XML node. The Hsp_* child elements are exposed
# through the reader methods declared below; +parent+ is the object passed
# in at construction (NokogiriBlastHit#hsps passes the hit).
class NokogiriBlastHsp
  include MapXPath
  attr_reader :parent

  # String valued fields
  MapXPath.define_s(
    'Hsp_id'      => :hsp_id,
    'Hsp_qseq'    => :qseq,
    'Hsp_hseq'    => :hseq,
    'Hsp_midline' => :midline
  )
  # Integer valued fields
  MapXPath.define_i(
    'Hsp_num'         => :hsp_num,
    'Hsp_score'       => :score,
    'Hsp_query-from'  => :query_from,
    'Hsp_query-to'    => :query_to,
    'Hsp_hit-from'    => :hit_from,
    'Hsp_hit-to'      => :hit_to,
    'Hsp_query-frame' => :query_frame,
    'Hsp_hit-frame'   => :hit_frame,
    'Hsp_identity'    => :identity,
    'Hsp_positive'    => :positive,
    'Hsp_align-len'   => :align_len
  )
  # Float valued fields
  MapXPath.define_f(
    'Hsp_bit-score' => :bit_score,
    'Hsp_evalue'    => :evalue
  )

  # xml:: the Hsp node
  # parent:: the owning object, available through #parent
  def initialize xml, parent
    @xml = xml
    @parent = parent
  end

  # One-line summary of the HSP, terminated by a newline.
  def to_s
    "hit_num=#{parent.hit_num}, hsp_num=#{hsp_num}, score=#{score}, bit_score=#{bit_score}\n"
  end
end
|
87
|
+
|
88
|
+
# Wrapper around one <Hit> XML node. The Hit_* child elements are exposed
# through the reader methods declared below; the HSPs of the hit are
# reachable through #hsps and #each.
class NokogiriBlastHit
  include MapXPath
  attr_reader :parent

  # String valued fields
  MapXPath.define_s(
    'Hit_id'        => :hit_id,
    'Hit_def'       => :hit_def,
    'Hit_accession' => :accession
  )
  # Integer valued fields
  MapXPath.define_i(
    'Hit_num' => :hit_num,
    'Hit_len' => :len
  )

  # hit:: the Hit node
  # parent:: the owning object, available through #parent
  def initialize hit, parent
    @xml = hit
    @parent = parent
  end

  # Enumerator producing a NokogiriBlastHsp (with this hit as parent) for
  # every <Hsp> node found below a <Hit_hsps> child of the hit.
  def hsps
    Enumerator.new do |yielder|
      @xml.children.each do |hit_field|
        next unless hit_field.name == 'Hit_hsps'

        hit_field.children.each do |hsp|
          yielder.yield NokogiriBlastHsp.new(hsp, self) if hsp.name == 'Hsp'
        end
      end
    end
  end

  # Yield every HSP of this hit to the given block.
  def each
    hsps.each { |hsp| yield hsp }
  end

  # One-line summary of the hit, terminated by a newline.
  def to_s
    "iter_num=#{parent.iter_num}, hit_id=#{hit_id}, hit_def=#{hit_def}, hit_num=#{hit_num}\n"
  end
end
|
128
|
+
|
129
|
+
# Wrapper around one <Iteration> XML node. The Iteration_* child elements
# are exposed through the reader methods declared below; the hits are
# reachable through #hits and #each.
class NokogiriBlastIterator
  include MapXPath
  attr_reader :parent

  # String valued fields
  MapXPath.define_s(
    'Iteration_query-ID'  => :query_id,
    'Iteration_query-def' => :query_def
  )
  # Integer valued fields
  MapXPath.define_i(
    'Iteration_iter-num'  => :iter_num,
    'Iteration_query-len' => :query_len
  )

  # iterator:: either an Iteration node, or a node named 'document' whose
  #            first child is the Iteration node
  # parent:: the owning object, available through #parent
  # opts:: Hash; the :prefix entry is stored as the XPath prefix used by the
  #        field lookups
  #
  # Raises a RuntimeError when the resulting node is not named 'Iteration'.
  def initialize iterator, parent, opts = { :prefix => nil }
    @parent = parent
    @prefix = opts[:prefix]
    @xml = iterator.name == 'document' ? iterator.children.first : iterator
    node_name = @xml.name
    unless node_name == 'Iteration'
      raise "Error in BLAST XML, expected Iteration node, but got #{node_name}"
    end
  end

  # Enumerator producing a NokogiriBlastHit (with this iteration as parent)
  # for every <Hit> node found below an <Iteration_hits> child.
  def hits
    Enumerator.new do |yielder|
      @xml.children.each do |iter_field|
        next unless iter_field.name == 'Iteration_hits'

        iter_field.children.each do |hit|
          yielder.yield NokogiriBlastHit.new(hit, self) if hit.name == 'Hit'
        end
      end
    end
  end

  # Yield every hit of this iteration to the given block.
  def each
    hits.each { |hit| yield hit }
  end

  # One-line summary of the iteration, terminated by a newline.
  def to_s
    "iter_num=#{iter_num}, query_id=#{query_id}\n"
  end
end
|
177
|
+
|
178
|
+
# Parses a BLAST XML document with Nokogiri and hands out its iterations.
class NokogiriBlastXml
  # document:: the input handed to Nokogiri::XML when iterating
  def initialize document
    @xml = document
  end

  # Enumerator over the iterations produced by #each.
  def to_enum
    Enumerator.new do |yielder|
      each { |iterator| yielder.yield(iterator) }
    end
  end

  # Parse the document (with the noblanks option) and call the block with a
  # NokogiriBlastIterator for every <Iteration> node found below a
  # <BlastOutput_iterations> child of the document root.
  def each &block
    doc = Nokogiri::XML(@xml) { |cfg| cfg.noblanks }
    doc.root.children.each do |blastnode|
      next unless blastnode.name == 'BlastOutput_iterations'

      blastnode.children.each do |iteration|
        next unless iteration.name == 'Iteration'

        block.call(NokogiriBlastIterator.new(iteration, self))
      end
    end
  end
end
|
202
|
+
end
|
203
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module Blast
|
5
|
+
|
6
|
+
# Iterate a BLAST file yielding (lazy) results
class XmlIterator
  # blastfilename:: path of the BLAST XML file
  def initialize blastfilename
    @fn = blastfilename
  end

  # Return an Enumerator over the iterations found in the file.
  #
  # The file is opened when enumeration starts and closed again when it
  # ends, so no file handle is left open and the Enumerator can be walked
  # more than once. As a consequence, errors from opening the file are
  # raised when enumeration starts rather than when to_enum is called.
  def to_enum
    logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
    logger.info("parsing (full) #{@fn}")
    Enumerator.new do |yielder|
      # Block form of File.open closes the handle, also on errors
      File.open(@fn) do |f|
        NokogiriBlastXml.new(f).each { |iteration| yielder.yield(iteration) }
      end
    end
  end
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Bio
|
2
|
+
module Blast
|
3
|
+
# Reads a full XML result and splits it out into a buffer for each
# Iteration (query result).
class XmlSplitterIterator
  # fn:: path of the BLAST XML file
  def initialize fn
    @fn = fn
  end

  # Return an Enumerator producing one NokogiriBlastIterator per Iteration.
  #
  # The file is read line by line; the lines of each Iteration are collected
  # and parsed on their own. The file is opened when enumeration starts and
  # is closed again when it ends (block form of File.open), so no file
  # handle is left open.
  def to_enum
    Enumerator.new do |yielder|
      logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
      logger.info("split file parsing #{@fn}")
      File.open(@fn) do |f|
        # Skip BLAST header, i.e. everything up to and including the first
        # <Iteration> line
        f.each_line do |line|
          break if line.strip == "<Iteration>"
        end
        # Return each Iteration as an XML DOM
        each_iteration(f) do |buf|
          iteration = Nokogiri::XML.parse(buf.join) { |cfg| cfg.noblanks }
          yielder.yield NokogiriBlastIterator.new(iteration, self, :prefix => nil)
        end
      end
    end
  end

  private

  # Collect lines from +f+ and yield the collected lines (an Array of
  # Strings) every time a closing </Iteration> line has been read.
  #
  # The opening <Iteration> line of the first iteration has already been
  # consumed by the caller, so the first buffer is seeded with it. Lines
  # read after the last </Iteration> are never yielded.
  def each_iteration f
    buffer = ["<Iteration>\n"]
    f.each_line do |line|
      buffer << line
      next unless line.strip == "</Iteration>"

      yield buffer
      buffer = []
    end
  end
end
|
42
|
+
end
|
43
|
+
end
|
data/sample/bioruby.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Sample: list the hits with an e-value below 0.001 using BioRuby's own
# BLAST XML support.
require 'bio'

fn = 'test/data/nt_example_blastn.m7'

# Iterates over each XML result.
# The variable "report" is a Bio::Blast::Report object.
# Bio::Blast.reports(ARGF) do |report|
#
# The block form of File.open closes the file once all reports are handled.
File.open(fn) do |io|
  Bio::Blast.reports(io) do |report|
    puts "Hits for " + report.query_def + " against " + report.db
    report.each do |hit|
      print hit.target_id, "\t", hit.evalue, "\n" if hit.evalue < 0.001
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#! /usr/bin/ruby
# Sample: list hit id and e-value of every HSP with an e-value below 0.001,
# using the XmlIterator of this library.

# Make the lib directory of this project loadable
$: << File.join(File.dirname(File.dirname(__FILE__)), 'lib')

require 'bio-blastxmlparser'

blast_file = 'test/data/nt_example_blastn.m7'
iterations = Bio::Blast::XmlIterator.new(blast_file).to_enum
iterations.each do |iter|
  puts "Hits for #{iter.query_id}"
  iter.each do |hit|
    hit.each do |hsp|
      evalue = hsp.evalue
      next unless evalue < 0.001

      print hit.hit_id, "\t", evalue, "\n"
    end
  end
end
|
17
|
+
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#! /usr/bin/ruby
# Sample: walk the BLAST XML file with the libxml SAX parser and print a
# marker for every Iteration_iter-num start tag.

require 'rubygems'
require 'libxml'

include LibXML

# SAX callbacks reacting on element start tags
class PostCallbacks
  include XML::SaxParser::Callbacks

  # Print a marker followed by the element name when the element is
  # 'Iteration_iter-num'; other elements (e.g. 'Hsp_score') are ignored.
  def on_start_element(element, attributes)
    return unless element == 'Iteration_iter-num'

    # Process row of data here
    print "---- ", element
  end
end

sax = XML::SaxParser.file("test/data/nt_example_blastn.m7")
sax.callbacks = PostCallbacks.new
sax.parse
|
25
|
+
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#! /usr/bin/ruby
# Sample: parse the whole BLAST XML file into a Nokogiri DOM and print, per
# Iteration, its iter-num followed by the scores of its HSPs.

require 'rubygems'
require 'nokogiri'

include Nokogiri

# The document is parsed inside the block, so the file handle is closed
# as soon as parsing is done.
input = File.open("test/data/nt_example_blastn.m7") { |f| Nokogiri::XML(f) }

input.root.xpath("//Iteration").each do | e |
  print "---- "
  print e.xpath("Iteration_iter-num/text()"),"\n"
  print e.xpath("Iteration_hits/Hit/Hit_hsps/Hsp/Hsp_score/text()").map {|n| n.to_s}, "\n"
end
|
16
|
+
|
17
|
+
|