bio-lazyblastxml 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +52 -1
- data/VERSION +1 -1
- data/bio-lazyblastxml.gemspec +2 -2
- data/lib/bio/appl/blast/lazyblastxml.rb +22 -11
- data/test/test_bio-lazyblastxml.rb +12 -5
- metadata +3 -3
data/README.rdoc
CHANGED
@@ -1,6 +1,57 @@
|
|
1
1
|
= bio-lazyblastxml
|
2
2
|
|
3
|
-
|
3
|
+
Provides a libxml-based lazy parser for reading through large BLAST XML files with a small memory footprint.
|
4
|
+
|
5
|
+
== Requirements
|
6
|
+
|
7
|
+
* Ruby 1.9.x
|
8
|
+
* libxml
|
9
|
+
|
10
|
+
If you're on Ubuntu, libxml is most easily installed with:
|
11
|
+
|
12
|
+
sudo apt-get install libxml-ruby
|
13
|
+
|
14
|
+
And then install the gem with:
|
15
|
+
|
16
|
+
gem install bio-lazyblastxml
|
17
|
+
# If you need to be root to install gems, try:
|
18
|
+
sudo gem install bio-lazyblastxml
|
19
|
+
|
20
|
+
== Overview
|
21
|
+
|
22
|
+
The parser uses a LibXML::XML::Reader instance to read the XML file one node at a time, keeping very little in memory. You can think of the parser as having a very short memory, only able to recall the object that it happens to be looking at right now. The parser only runs through the document once unless Bio::LazyBlast::Report#rewind is called.
|
23
|
+
|
24
|
+
== Example Usage
|
25
|
+
|
26
|
+
Each report is an enumerable that yields iterations.
|
27
|
+
Each iteration is an enumerable that yields hits (if there are any hits).
|
28
|
+
|
29
|
+
require 'bio-lazyblastxml'
|
30
|
+
|
31
|
+
# Generate your new report object
|
32
|
+
report = Bio::LazyBlast::Report.new('my_huge_blastfile.xml')
|
33
|
+
|
34
|
+
# How many hits does each query have?
|
35
|
+
report.each_iteration do |iteration|
|
36
|
+
puts [iteration.query_def, iteration.count].join("\t")
|
37
|
+
end
|
38
|
+
|
39
|
+
Each hit is an enumerable that yields hsps:
|
40
|
+
|
41
|
+
require 'bio-lazyblastxml'
|
42
|
+
|
43
|
+
# Generate your new report object
|
44
|
+
report = Bio::LazyBlast::Report.new('my_huge_blastfile.xml')
|
45
|
+
|
46
|
+
report.each_iteration do |iteration|
|
47
|
+
iteration.each_hit do |hit|
|
48
|
+
# Sum up the lengths of all the hsps
|
49
|
+
hsp_length_sum = hit.inject(0){|count,hsp| count += hsp.align_len}
|
50
|
+
puts [iteration.query_def, hit.definition, hsp_length_sum].join("\t")
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
|
4
55
|
|
5
56
|
== Contributing to bio-lazyblastxml
|
6
57
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.4.0
|
data/bio-lazyblastxml.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{bio-lazyblastxml}
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.4.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["robsyme"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-06-01}
|
13
13
|
s.description = %q{This is very scrappy at the moment, and will need to be seriously cleaned up. It does what I need it to do for now. I'll fix it up in the coming weeks. Promise :)}
|
14
14
|
s.email = %q{rob.syme@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -4,10 +4,11 @@ module Bio
|
|
4
4
|
class LazyBlast
|
5
5
|
class Report
|
6
6
|
include Enumerable
|
7
|
-
attr_reader :
|
7
|
+
attr_reader :program, :version, :db, :query_id, :query_def, :query_len, :statistics
|
8
8
|
|
9
9
|
def initialize(filename)
|
10
|
-
@
|
10
|
+
@filename = filename
|
11
|
+
@reader = LibXML::XML::Reader.file(@filename)
|
11
12
|
@nodes = Enumerator.new do |yielder|
|
12
13
|
while @reader.read
|
13
14
|
yielder << @reader if @reader.node_type == LibXML::XML::Reader::TYPE_ELEMENT
|
@@ -17,9 +18,9 @@ module Bio
|
|
17
18
|
end
|
18
19
|
|
19
20
|
def setup_report_values
|
20
|
-
@
|
21
|
+
@statistics = Hash.new
|
21
22
|
@nodes.each do |node|
|
22
|
-
return
|
23
|
+
return if node.name == "BlastOutput_iterations"
|
23
24
|
case node.name
|
24
25
|
when 'BlastOutput_program'
|
25
26
|
@program = node.read_inner_xml
|
@@ -34,15 +35,15 @@ module Bio
|
|
34
35
|
when 'BlastOutput_query-len'
|
35
36
|
@query_len = node.read_inner_xml.to_i
|
36
37
|
when 'Parameters_matrix'
|
37
|
-
@
|
38
|
+
@statistics['matrix'] = node.read_inner_xml
|
38
39
|
when 'Parameters_expect'
|
39
|
-
@
|
40
|
+
@statistics['expect'] = node.read_inner_xml.to_i
|
40
41
|
when 'Parameters_gap-open'
|
41
|
-
@
|
42
|
+
@statistics['gap-open'] = node.read_inner_xml.to_i
|
42
43
|
when 'Parameters_gap-extend'
|
43
|
-
@
|
44
|
+
@statistics['gap-extend'] = node.read_inner_xml.to_i
|
44
45
|
when 'Parameters_filter'
|
45
|
-
@
|
46
|
+
@statistics['filter'] = node.read_inner_xml
|
46
47
|
end
|
47
48
|
end
|
48
49
|
end
|
@@ -52,6 +53,16 @@ module Bio
|
|
52
53
|
end
|
53
54
|
alias :each_iteration :each
|
54
55
|
|
56
|
+
def rewind
|
57
|
+
@reader.close
|
58
|
+
@reader = LibXML::XML::Reader.file(@filename)
|
59
|
+
@nodes = Enumerator.new do |yielder|
|
60
|
+
while @reader.read
|
61
|
+
yielder << @reader if @reader.node_type == LibXML::XML::Reader::TYPE_ELEMENT
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
55
66
|
class Iteration
|
56
67
|
include Enumerable
|
57
68
|
attr_reader :num, :query_id, :query_def, :query_len, :message, :parameters
|
@@ -67,7 +78,7 @@ module Bio
|
|
67
78
|
|
68
79
|
def setup_iteration_values
|
69
80
|
@nodes.each do |node|
|
70
|
-
return
|
81
|
+
return if node.name == 'Iteration_hits'
|
71
82
|
case node.name
|
72
83
|
when 'Iteration_iter-num'
|
73
84
|
@num = node.read_inner_xml.to_i
|
@@ -103,7 +114,7 @@ module Bio
|
|
103
114
|
|
104
115
|
def setup_hit_values
|
105
116
|
@nodes.each do |node|
|
106
|
-
return
|
117
|
+
return if node.name == 'Hit_hsps'
|
107
118
|
case node.name
|
108
119
|
when 'Hit_num'
|
109
120
|
@num = node.read_inner_xml.to_i
|
@@ -35,17 +35,24 @@ class TestIteration < MiniTest::Unit::TestCase
|
|
35
35
|
@report = Bio::LazyBlast::Report.new(@blast_filename)
|
36
36
|
end
|
37
37
|
|
38
|
+
def test_rewind_ability
|
39
|
+
assert_equal 2, @report.count, "Test report should contain 2 iterations"
|
40
|
+
assert_equal 0, @report.count, "Report should contain no more iterations once the file has been read through"
|
41
|
+
@report.rewind
|
42
|
+
assert_equal 2, @report.count, "Test report should contain 2 iterations once the file has been rewound."
|
43
|
+
end
|
44
|
+
|
38
45
|
def test_iteration_creation
|
39
46
|
@iteration = @report.first
|
40
47
|
assert_kind_of Bio::LazyBlast::Report::Iteration, @iteration
|
41
48
|
assert_equal 'blastp', @report.program
|
42
49
|
assert_equal 'blastp 2.2.21 [Jun-14-2009]', @report.version
|
43
50
|
assert_equal 'db.fasta', @report.db
|
44
|
-
assert_equal 'BLOSUM62', @report.
|
45
|
-
assert_equal 10, @report.
|
46
|
-
assert_equal 11, @report.
|
47
|
-
assert_equal 1, @report.
|
48
|
-
assert_equal 'F', @report.
|
51
|
+
assert_equal 'BLOSUM62', @report.statistics['matrix']
|
52
|
+
assert_equal 10, @report.statistics['expect']
|
53
|
+
assert_equal 11, @report.statistics['gap-open']
|
54
|
+
assert_equal 1, @report.statistics['gap-extend']
|
55
|
+
assert_equal 'F', @report.statistics['filter']
|
49
56
|
end
|
50
57
|
|
51
58
|
def test_example_usage
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: bio-lazyblastxml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.
|
5
|
+
version: 0.4.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- robsyme
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-
|
13
|
+
date: 2011-06-01 00:00:00 +08:00
|
14
14
|
default_executable:
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
@@ -116,7 +116,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
116
116
|
requirements:
|
117
117
|
- - ">="
|
118
118
|
- !ruby/object:Gem::Version
|
119
|
-
hash: -
|
119
|
+
hash: -1790620181502382914
|
120
120
|
segments:
|
121
121
|
- 0
|
122
122
|
version: "0"
|