bio-lazyblastxml 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,57 @@
1
1
  = bio-lazyblastxml
2
2
 
3
- Description goes here.
3
+ Provides a libxml-based lazy parser for reading through large BLAST XML files with a small memory footprint.
4
+
5
+ == Requirements
6
+
7
+ * Ruby 1.9.x
8
+ * libxml
9
+
10
+ If you're on Ubuntu, libxml is most easily installed with:
11
+
12
+ sudo apt-get install libxml-ruby
13
+
14
+ And then install the gem with:
15
+
16
+ gem install bio-lazyblastxml
17
+ # If you need to be root to install gems, try:
18
+ sudo gem install bio-lazyblastxml
19
+
20
+ == Overview
21
+
22
+ The parser uses a LibXML::XML::Reader instance to read the XML file one node at a time, keeping very little in memory. You can think of the parser as having a very short memory, only able to recall the object that it happens to be looking at right now. The parser only runs through the document once unless Bio::LazyBlast::Report#rewind is called.
23
+
24
+ == Example Usage
25
+
26
+ Each report is an enumerable that yields iterations.
27
+ Each iteration is an enumerable that yields hits (if there are any hits).
28
+
29
+ require 'bio-lazyblastxml'
30
+
31
+ # Generate your new report object
32
+ report = Bio::LazyBlast::Report.new('my_huge_blastfile.xml')
33
+
34
+ # How many hits does each query have?
35
+ report.each_iteration do |iteration|
36
+ puts [iteration.query_def, iteration.count].join("\t")
37
+ end
38
+
39
+ Each hit is an enumerable that yields hsps:
40
+
41
+ require 'bio-lazyblastxml'
42
+
43
+ # Generate your new report object
44
+ report = Bio::LazyBlast::Report.new('my_huge_blastfile.xml')
45
+
46
+ report.each_iteration do |iteration|
47
+ iteration.each_hit do |hit|
48
+ # Sum up the lengths of all the hsps
49
+ hsp_length_sum = hit.inject(0){|count,hsp| count += hsp.align_len}
50
+ puts [iteration.query_def, hit.definition, hsp_length_sum].join("\t")
51
+ end
52
+ end
53
+
54
+
4
55
 
5
56
  == Contributing to bio-lazyblastxml
6
57
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.0
1
+ 0.4.0
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{bio-lazyblastxml}
8
- s.version = "0.3.0"
8
+ s.version = "0.4.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["robsyme"]
12
- s.date = %q{2011-05-31}
12
+ s.date = %q{2011-06-01}
13
13
  s.description = %q{This is very scrappy at the moment, and will need to be seriously cleaned up. It does what I need it to do for now. I'll fix it up in the coming weeks. Promise :)}
14
14
  s.email = %q{rob.syme@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -4,10 +4,11 @@ module Bio
4
4
  class LazyBlast
5
5
  class Report
6
6
  include Enumerable
7
- attr_reader :reader, :program, :version, :db, :query_id, :query_def, :query_len, :parameters
7
+ attr_reader :program, :version, :db, :query_id, :query_def, :query_len, :statistics
8
8
 
9
9
  def initialize(filename)
10
- @reader = LibXML::XML::Reader.file(filename)
10
+ @filename = filename
11
+ @reader = LibXML::XML::Reader.file(@filename)
11
12
  @nodes = Enumerator.new do |yielder|
12
13
  while @reader.read
13
14
  yielder << @reader if @reader.node_type == LibXML::XML::Reader::TYPE_ELEMENT
@@ -17,9 +18,9 @@ module Bio
17
18
  end
18
19
 
19
20
  def setup_report_values
20
- @parameters = Hash.new
21
+ @statistics = Hash.new
21
22
  @nodes.each do |node|
22
- return node if node.name == "BlastOutput_iterations"
23
+ return if node.name == "BlastOutput_iterations"
23
24
  case node.name
24
25
  when 'BlastOutput_program'
25
26
  @program = node.read_inner_xml
@@ -34,15 +35,15 @@ module Bio
34
35
  when 'BlastOutput_query-len'
35
36
  @query_len = node.read_inner_xml.to_i
36
37
  when 'Parameters_matrix'
37
- @parameters['matrix'] = node.read_inner_xml
38
+ @statistics['matrix'] = node.read_inner_xml
38
39
  when 'Parameters_expect'
39
- @parameters['expect'] = node.read_inner_xml.to_i
40
+ @statistics['expect'] = node.read_inner_xml.to_i
40
41
  when 'Parameters_gap-open'
41
- @parameters['gap-open'] = node.read_inner_xml.to_i
42
+ @statistics['gap-open'] = node.read_inner_xml.to_i
42
43
  when 'Parameters_gap-extend'
43
- @parameters['gap-extend'] = node.read_inner_xml.to_i
44
+ @statistics['gap-extend'] = node.read_inner_xml.to_i
44
45
  when 'Parameters_filter'
45
- @parameters['filter'] = node.read_inner_xml
46
+ @statistics['filter'] = node.read_inner_xml
46
47
  end
47
48
  end
48
49
  end
@@ -52,6 +53,16 @@ module Bio
52
53
  end
53
54
  alias :each_iteration :each
54
55
 
56
+ def rewind
57
+ @reader.close
58
+ @reader = LibXML::XML::Reader.file(@filename)
59
+ @nodes = Enumerator.new do |yielder|
60
+ while @reader.read
61
+ yielder << @reader if @reader.node_type == LibXML::XML::Reader::TYPE_ELEMENT
62
+ end
63
+ end
64
+ end
65
+
55
66
  class Iteration
56
67
  include Enumerable
57
68
  attr_reader :num, :query_id, :query_def, :query_len, :message, :parameters
@@ -67,7 +78,7 @@ module Bio
67
78
 
68
79
  def setup_iteration_values
69
80
  @nodes.each do |node|
70
- return node if node.name == 'Iteration_hits'
81
+ return if node.name == 'Iteration_hits'
71
82
  case node.name
72
83
  when 'Iteration_iter-num'
73
84
  @num = node.read_inner_xml.to_i
@@ -103,7 +114,7 @@ module Bio
103
114
 
104
115
  def setup_hit_values
105
116
  @nodes.each do |node|
106
- return node if node.name == 'Hit_hsps'
117
+ return if node.name == 'Hit_hsps'
107
118
  case node.name
108
119
  when 'Hit_num'
109
120
  @num = node.read_inner_xml.to_i
@@ -35,17 +35,24 @@ class TestIteration < MiniTest::Unit::TestCase
35
35
  @report = Bio::LazyBlast::Report.new(@blast_filename)
36
36
  end
37
37
 
38
+ def test_rewind_ability
39
+ assert_equal 2, @report.count, "Test report should contain 2 iterations"
40
+ assert_equal 0, @report.count, "Report should contain no more iterations once the file has been read through"
41
+ @report.rewind
42
+ assert_equal 2, @report.count, "Test report should contain 2 iterations once the file has been rewound."
43
+ end
44
+
38
45
  def test_iteration_creation
39
46
  @iteration = @report.first
40
47
  assert_kind_of Bio::LazyBlast::Report::Iteration, @iteration
41
48
  assert_equal 'blastp', @report.program
42
49
  assert_equal 'blastp 2.2.21 [Jun-14-2009]', @report.version
43
50
  assert_equal 'db.fasta', @report.db
44
- assert_equal 'BLOSUM62', @report.parameters['matrix']
45
- assert_equal 10, @report.parameters['expect']
46
- assert_equal 11, @report.parameters['gap-open']
47
- assert_equal 1, @report.parameters['gap-extend']
48
- assert_equal 'F', @report.parameters['filter']
51
+ assert_equal 'BLOSUM62', @report.statistics['matrix']
52
+ assert_equal 10, @report.statistics['expect']
53
+ assert_equal 11, @report.statistics['gap-open']
54
+ assert_equal 1, @report.statistics['gap-extend']
55
+ assert_equal 'F', @report.statistics['filter']
49
56
  end
50
57
 
51
58
  def test_example_usage
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: bio-lazyblastxml
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.3.0
5
+ version: 0.4.0
6
6
  platform: ruby
7
7
  authors:
8
8
  - robsyme
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-05-31 00:00:00 +08:00
13
+ date: 2011-06-01 00:00:00 +08:00
14
14
  default_executable:
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
@@ -116,7 +116,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
116
116
  requirements:
117
117
  - - ">="
118
118
  - !ruby/object:Gem::Version
119
- hash: -3799468829096378627
119
+ hash: -1790620181502382914
120
120
  segments:
121
121
  - 0
122
122
  version: "0"