bio-lazyblastxml 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +52 -1
- data/VERSION +1 -1
- data/bio-lazyblastxml.gemspec +2 -2
- data/lib/bio/appl/blast/lazyblastxml.rb +22 -11
- data/test/test_bio-lazyblastxml.rb +12 -5
- metadata +3 -3
data/README.rdoc
CHANGED
@@ -1,6 +1,57 @@
|
|
1
1
|
= bio-lazyblastxml
|
2
2
|
|
3
|
-
|
3
|
+
Provides a libxml-based lazy parser for reading through large BLAST XML files with a small memory footprint.
|
4
|
+
|
5
|
+
== Requirements
|
6
|
+
|
7
|
+
* Ruby 1.9.x
|
8
|
+
* libxml
|
9
|
+
|
10
|
+
If you're on Ubuntu, libxml is most easily installed with:
|
11
|
+
|
12
|
+
sudo apt-get install libxml-ruby
|
13
|
+
|
14
|
+
And then install the gem with:
|
15
|
+
|
16
|
+
gem install bio-lazyblastxml
|
17
|
+
# If you need to be root to install gems, try:
|
18
|
+
sudo gem install bio-lazyblastxml
|
19
|
+
|
20
|
+
== Overview
|
21
|
+
|
22
|
+
The parser uses a LibXML::XML::Reader instance to read the XML file one node at a time, keeping very little in memory. You can think of the parser as having a very short memory, only able to recall the object that it happens to be looking at right now. The parser only runs through the document once unless Bio::LazyBlast::Report#rewind is called.
|
23
|
+
|
24
|
+
== Example Usage
|
25
|
+
|
26
|
+
Each report is an enumerable that yields iterations.
|
27
|
+
Each iteration is an enumerable that yields hits (if there are any hits).
|
28
|
+
|
29
|
+
require 'bio-lazyblastxml'
|
30
|
+
|
31
|
+
# Generate your new report object
|
32
|
+
report = Bio::LazyBlast::Report.new('my_huge_blastfile.xml')
|
33
|
+
|
34
|
+
# How many hits does each query have?
|
35
|
+
report.each_iteration do |iteration|
|
36
|
+
puts [iteration.query_def, iteration.count].join("\t")
|
37
|
+
end
|
38
|
+
|
39
|
+
Each hit is an enumerable that yields hsps:
|
40
|
+
|
41
|
+
require 'bio-lazyblastxml'
|
42
|
+
|
43
|
+
# Generate your new report object
|
44
|
+
report = Bio::LazyBlast::Report.new('my_huge_blastfile.xml')
|
45
|
+
|
46
|
+
report.each_iteration do |iteration|
|
47
|
+
iteration.each_hit do |hit|
|
48
|
+
# Sum up the lengths of all the hsps
|
49
|
+
hsp_length_sum = hit.inject(0){|count,hsp| count += hsp.align_len}
|
50
|
+
puts [iteration.query_def, hit.definition, hsp_length_sum].join("\t")
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
|
4
55
|
|
5
56
|
== Contributing to bio-lazyblastxml
|
6
57
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.4.0
|
data/bio-lazyblastxml.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{bio-lazyblastxml}
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.4.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["robsyme"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-06-01}
|
13
13
|
s.description = %q{This is very scrappy at the moment, and will need to be seriously cleaned up. It does what I need it to do for now. I'll fix it up in the coming weeks. Promise :)}
|
14
14
|
s.email = %q{rob.syme@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -4,10 +4,11 @@ module Bio
|
|
4
4
|
class LazyBlast
|
5
5
|
class Report
|
6
6
|
include Enumerable
|
7
|
-
attr_reader :
|
7
|
+
attr_reader :program, :version, :db, :query_id, :query_def, :query_len, :statistics
|
8
8
|
|
9
9
|
def initialize(filename)
|
10
|
-
@
|
10
|
+
@filename = filename
|
11
|
+
@reader = LibXML::XML::Reader.file(@filename)
|
11
12
|
@nodes = Enumerator.new do |yielder|
|
12
13
|
while @reader.read
|
13
14
|
yielder << @reader if @reader.node_type == LibXML::XML::Reader::TYPE_ELEMENT
|
@@ -17,9 +18,9 @@ module Bio
|
|
17
18
|
end
|
18
19
|
|
19
20
|
def setup_report_values
|
20
|
-
@
|
21
|
+
@statistics = Hash.new
|
21
22
|
@nodes.each do |node|
|
22
|
-
return
|
23
|
+
return if node.name == "BlastOutput_iterations"
|
23
24
|
case node.name
|
24
25
|
when 'BlastOutput_program'
|
25
26
|
@program = node.read_inner_xml
|
@@ -34,15 +35,15 @@ module Bio
|
|
34
35
|
when 'BlastOutput_query-len'
|
35
36
|
@query_len = node.read_inner_xml.to_i
|
36
37
|
when 'Parameters_matrix'
|
37
|
-
@
|
38
|
+
@statistics['matrix'] = node.read_inner_xml
|
38
39
|
when 'Parameters_expect'
|
39
|
-
@
|
40
|
+
@statistics['expect'] = node.read_inner_xml.to_i
|
40
41
|
when 'Parameters_gap-open'
|
41
|
-
@
|
42
|
+
@statistics['gap-open'] = node.read_inner_xml.to_i
|
42
43
|
when 'Parameters_gap-extend'
|
43
|
-
@
|
44
|
+
@statistics['gap-extend'] = node.read_inner_xml.to_i
|
44
45
|
when 'Parameters_filter'
|
45
|
-
@
|
46
|
+
@statistics['filter'] = node.read_inner_xml
|
46
47
|
end
|
47
48
|
end
|
48
49
|
end
|
@@ -52,6 +53,16 @@ module Bio
|
|
52
53
|
end
|
53
54
|
alias :each_iteration :each
|
54
55
|
|
56
|
+
def rewind
|
57
|
+
@reader.close
|
58
|
+
@reader = LibXML::XML::Reader.file(@filename)
|
59
|
+
@nodes = Enumerator.new do |yielder|
|
60
|
+
while @reader.read
|
61
|
+
yielder << @reader if @reader.node_type == LibXML::XML::Reader::TYPE_ELEMENT
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
55
66
|
class Iteration
|
56
67
|
include Enumerable
|
57
68
|
attr_reader :num, :query_id, :query_def, :query_len, :message, :parameters
|
@@ -67,7 +78,7 @@ module Bio
|
|
67
78
|
|
68
79
|
def setup_iteration_values
|
69
80
|
@nodes.each do |node|
|
70
|
-
return
|
81
|
+
return if node.name == 'Iteration_hits'
|
71
82
|
case node.name
|
72
83
|
when 'Iteration_iter-num'
|
73
84
|
@num = node.read_inner_xml.to_i
|
@@ -103,7 +114,7 @@ module Bio
|
|
103
114
|
|
104
115
|
def setup_hit_values
|
105
116
|
@nodes.each do |node|
|
106
|
-
return
|
117
|
+
return if node.name == 'Hit_hsps'
|
107
118
|
case node.name
|
108
119
|
when 'Hit_num'
|
109
120
|
@num = node.read_inner_xml.to_i
|
@@ -35,17 +35,24 @@ class TestIteration < MiniTest::Unit::TestCase
|
|
35
35
|
@report = Bio::LazyBlast::Report.new(@blast_filename)
|
36
36
|
end
|
37
37
|
|
38
|
+
def test_rewind_ability
|
39
|
+
assert_equal 2, @report.count, "Test report should contain 2 iterations"
|
40
|
+
assert_equal 0, @report.count, "Report should contain no more iterations once the file has been read through"
|
41
|
+
@report.rewind
|
42
|
+
assert_equal 2, @report.count, "Test report should contain 2 iterations once the file has been rewound."
|
43
|
+
end
|
44
|
+
|
38
45
|
def test_iteration_creation
|
39
46
|
@iteration = @report.first
|
40
47
|
assert_kind_of Bio::LazyBlast::Report::Iteration, @iteration
|
41
48
|
assert_equal 'blastp', @report.program
|
42
49
|
assert_equal 'blastp 2.2.21 [Jun-14-2009]', @report.version
|
43
50
|
assert_equal 'db.fasta', @report.db
|
44
|
-
assert_equal 'BLOSUM62', @report.
|
45
|
-
assert_equal 10, @report.
|
46
|
-
assert_equal 11, @report.
|
47
|
-
assert_equal 1, @report.
|
48
|
-
assert_equal 'F', @report.
|
51
|
+
assert_equal 'BLOSUM62', @report.statistics['matrix']
|
52
|
+
assert_equal 10, @report.statistics['expect']
|
53
|
+
assert_equal 11, @report.statistics['gap-open']
|
54
|
+
assert_equal 1, @report.statistics['gap-extend']
|
55
|
+
assert_equal 'F', @report.statistics['filter']
|
49
56
|
end
|
50
57
|
|
51
58
|
def test_example_usage
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: bio-lazyblastxml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.
|
5
|
+
version: 0.4.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- robsyme
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-
|
13
|
+
date: 2011-06-01 00:00:00 +08:00
|
14
14
|
default_executable:
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
@@ -116,7 +116,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
116
116
|
requirements:
|
117
117
|
- - ">="
|
118
118
|
- !ruby/object:Gem::Version
|
119
|
-
hash: -
|
119
|
+
hash: -1790620181502382914
|
120
120
|
segments:
|
121
121
|
- 0
|
122
122
|
version: "0"
|