bio-lazyblastxml 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,57 @@
1
1
  = bio-lazyblastxml
2
2
 
3
- Description goes here.
3
+ Provides a libxml-based lazy parser for reading through large blast xml files with a small memory footprint.
4
+
5
+ == Requirements
6
+
7
+ * Ruby 1.9.x
8
+ * libxml
9
+
10
+ If you're on Ubuntu, libxml is most easily installed with:
11
+
12
+ sudo apt-get install libxml-ruby
13
+
14
+ Then install the gem with:
15
+
16
+ gem install bio-lazyblastxml
17
+ # If you need to be root to install gems, try:
18
+ sudo gem install bio-lazyblastxml
19
+
20
+ == Overview
21
+
22
+ The parser uses a LibXML::Reader instance to read the XML file one node at a time, keeping very little in memory. You can think of the parser as having a very short memory, only able to recall the object that it happens to be looking at right now. The parser only runs through the document once unless Bio::LazyBlast#rewind is called.
23
+
24
+ == Example Usage
25
+
26
+ Each report is an enumerable that yields iterations.
27
+ Each iteration is an enumerable that yields hits (if there are any hits).
28
+
29
+ require 'bio-lazyblastxml'
30
+
31
+ # Generate your new report object
32
+ report = Bio::LazyBlast::Report.new('my_huge_blastfile.xml')
33
+
34
+ # How many hits does each query have?
35
+ report.each_iteration do |iteration|
36
+ puts [iteration.query_def, iteration.count].join("\t")
37
+ end
38
+
39
+ Each hit is an enumerable that yields hsps:
40
+
41
+ require 'bio-lazyblastxml'
42
+
43
+ # Generate your new report object
44
+ report = Bio::LazyBlast::Report.new('my_huge_blastfile.xml')
45
+
46
+ report.each_iteration do |iteration|
47
+ iteration.each_hit do |hit|
48
+ # Sum up the lengths of all the hsps
49
+ hsp_length_sum = hit.inject(0){|count,hsp| count += hsp.align_len}
50
+ puts [iteration.query_def, hit.definition, hsp_length_sum].join("\t")
51
+ end
52
+ end
53
+
54
+
4
55
 
5
56
  == Contributing to bio-lazyblastxml
6
57
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.0
1
+ 0.4.0
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{bio-lazyblastxml}
8
- s.version = "0.3.0"
8
+ s.version = "0.4.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["robsyme"]
12
- s.date = %q{2011-05-31}
12
+ s.date = %q{2011-06-01}
13
13
  s.description = %q{This is very scrappy at the moment, and will need to be seriously cleaned up. It does what I need it to do for now. I'll fix it up in the coming weeks. Promise :)}
14
14
  s.email = %q{rob.syme@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -4,10 +4,11 @@ module Bio
4
4
  class LazyBlast
5
5
  class Report
6
6
  include Enumerable
7
- attr_reader :reader, :program, :version, :db, :query_id, :query_def, :query_len, :parameters
7
+ attr_reader :program, :version, :db, :query_id, :query_def, :query_len, :statistics
8
8
 
9
9
  def initialize(filename)
10
- @reader = LibXML::XML::Reader.file(filename)
10
+ @filename = filename
11
+ @reader = LibXML::XML::Reader.file(@filename)
11
12
  @nodes = Enumerator.new do |yielder|
12
13
  while @reader.read
13
14
  yielder << @reader if @reader.node_type == LibXML::XML::Reader::TYPE_ELEMENT
@@ -17,9 +18,9 @@ module Bio
17
18
  end
18
19
 
19
20
  def setup_report_values
20
- @parameters = Hash.new
21
+ @statistics = Hash.new
21
22
  @nodes.each do |node|
22
- return node if node.name == "BlastOutput_iterations"
23
+ return if node.name == "BlastOutput_iterations"
23
24
  case node.name
24
25
  when 'BlastOutput_program'
25
26
  @program = node.read_inner_xml
@@ -34,15 +35,15 @@ module Bio
34
35
  when 'BlastOutput_query-len'
35
36
  @query_len = node.read_inner_xml.to_i
36
37
  when 'Parameters_matrix'
37
- @parameters['matrix'] = node.read_inner_xml
38
+ @statistics['matrix'] = node.read_inner_xml
38
39
  when 'Parameters_expect'
39
- @parameters['expect'] = node.read_inner_xml.to_i
40
+ @statistics['expect'] = node.read_inner_xml.to_i
40
41
  when 'Parameters_gap-open'
41
- @parameters['gap-open'] = node.read_inner_xml.to_i
42
+ @statistics['gap-open'] = node.read_inner_xml.to_i
42
43
  when 'Parameters_gap-extend'
43
- @parameters['gap-extend'] = node.read_inner_xml.to_i
44
+ @statistics['gap-extend'] = node.read_inner_xml.to_i
44
45
  when 'Parameters_filter'
45
- @parameters['filter'] = node.read_inner_xml
46
+ @statistics['filter'] = node.read_inner_xml
46
47
  end
47
48
  end
48
49
  end
@@ -52,6 +53,16 @@ module Bio
52
53
  end
53
54
  alias :each_iteration :each
54
55
 
56
+ def rewind
57
+ @reader.close
58
+ @reader = LibXML::XML::Reader.file(@filename)
59
+ @nodes = Enumerator.new do |yielder|
60
+ while @reader.read
61
+ yielder << @reader if @reader.node_type == LibXML::XML::Reader::TYPE_ELEMENT
62
+ end
63
+ end
64
+ end
65
+
55
66
  class Iteration
56
67
  include Enumerable
57
68
  attr_reader :num, :query_id, :query_def, :query_len, :message, :parameters
@@ -67,7 +78,7 @@ module Bio
67
78
 
68
79
  def setup_iteration_values
69
80
  @nodes.each do |node|
70
- return node if node.name == 'Iteration_hits'
81
+ return if node.name == 'Iteration_hits'
71
82
  case node.name
72
83
  when 'Iteration_iter-num'
73
84
  @num = node.read_inner_xml.to_i
@@ -103,7 +114,7 @@ module Bio
103
114
 
104
115
  def setup_hit_values
105
116
  @nodes.each do |node|
106
- return node if node.name == 'Hit_hsps'
117
+ return if node.name == 'Hit_hsps'
107
118
  case node.name
108
119
  when 'Hit_num'
109
120
  @num = node.read_inner_xml.to_i
@@ -35,17 +35,24 @@ class TestIteration < MiniTest::Unit::TestCase
35
35
  @report = Bio::LazyBlast::Report.new(@blast_filename)
36
36
  end
37
37
 
38
+ def test_rewind_ability
39
+ assert_equal 2, @report.count, "Test report should contain 2 iterations"
40
+ assert_equal 0, @report.count, "Report should contain no more iterations once the file has been read through"
41
+ @report.rewind
42
+ assert_equal 2, @report.count, "Test report should contain 2 iterations once the file has been rewound."
43
+ end
44
+
38
45
  def test_iteration_creation
39
46
  @iteration = @report.first
40
47
  assert_kind_of Bio::LazyBlast::Report::Iteration, @iteration
41
48
  assert_equal 'blastp', @report.program
42
49
  assert_equal 'blastp 2.2.21 [Jun-14-2009]', @report.version
43
50
  assert_equal 'db.fasta', @report.db
44
- assert_equal 'BLOSUM62', @report.parameters['matrix']
45
- assert_equal 10, @report.parameters['expect']
46
- assert_equal 11, @report.parameters['gap-open']
47
- assert_equal 1, @report.parameters['gap-extend']
48
- assert_equal 'F', @report.parameters['filter']
51
+ assert_equal 'BLOSUM62', @report.statistics['matrix']
52
+ assert_equal 10, @report.statistics['expect']
53
+ assert_equal 11, @report.statistics['gap-open']
54
+ assert_equal 1, @report.statistics['gap-extend']
55
+ assert_equal 'F', @report.statistics['filter']
49
56
  end
50
57
 
51
58
  def test_example_usage
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: bio-lazyblastxml
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.3.0
5
+ version: 0.4.0
6
6
  platform: ruby
7
7
  authors:
8
8
  - robsyme
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-05-31 00:00:00 +08:00
13
+ date: 2011-06-01 00:00:00 +08:00
14
14
  default_executable:
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
@@ -116,7 +116,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
116
116
  requirements:
117
117
  - - ">="
118
118
  - !ruby/object:Gem::Version
119
- hash: -3799468829096378627
119
+ hash: -1790620181502382914
120
120
  segments:
121
121
  - 0
122
122
  version: "0"