bioruby-phyloxml 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,227 @@
1
+ #
2
+ # = bio/db/phyloxml_writer.rb - PhyloXML writer
3
+ #
4
+ # Copyright:: Copyright (C) 2009
5
+ # Diana Jaunzeikare <latvianlinuxgirl@gmail.com>
6
+ # License:: The Ruby License
7
+ #
8
+ #
9
+ # == Description
10
+ #
11
+ # This file contains the writer for PhyloXML.
12
+ #
13
+ # == Requirements
14
+ #
15
+ # Libxml2 XML parser is required. Install libxml-ruby bindings from
16
+ # http://libxml.rubyforge.org or
17
+ #
18
+ # gem install -r libxml-ruby
19
+ #
20
+ # == References
21
+ #
22
+ # * http://www.phyloxml.org
23
+ #
24
+ # * https://www.nescent.org/wg_phyloinformatics/PhyloSoC:PhyloXML_support_in_BioRuby
25
+
26
+ require 'libxml'
27
+ require 'bio-phyloxml/phyloxml_elements'
28
+
29
+ module Bio
30
+
31
+ module PhyloXML
32
+
33
+ # == Description
34
+ #
35
+ # Bio::PhyloXML::Writer is for writing phyloXML (version 1.10) format files.
36
+ #
37
+ # == Requirements
38
+ #
39
+ # Libxml2 XML parser is required. Install libxml-ruby bindings from
40
+ # http://libxml.rubyforge.org or
41
+ #
42
+ # gem install -r libxml-ruby
43
+ #
44
+ # == Usage
45
+ #
46
+ # require 'bio'
47
+ #
48
+ # # Create new phyloxml parser
49
+ # phyloxml = Bio::PhyloXML::Parser.open('example.xml')
50
+ #
51
+ # # Read in some trees from file
52
+ # tree1 = phyloxml.next_tree
53
+ # tree2 = phyloxml.next_tree
54
+ #
55
+ # # Create new phyloxml writer
56
+ # writer = Bio::PhyloXML::Writer.new('tree.xml')
57
+ #
58
+ # # Write tree to the file tree.xml
59
+ # writer.write(tree1)
60
+ #
61
+ # # Add another tree to the file
62
+ # writer.write(tree2)
63
+ #
64
+ # == References
65
+ #
66
+ # http://www.phyloxml.org/documentation/version_100/phyloxml.xsd.html
67
+
68
+ class Writer
69
+
70
+ include LibXML
71
+
72
+ SCHEMA_LOCATION = 'http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd'
73
+
74
+ attr_accessor :write_branch_length_as_subelement
75
+
76
+ #
77
+ # Create a new Writer object. The first parameter is the name of the XML
78
+ # file to create. The optional second parameter says whether to indent
79
+ # the output (default: true). By default branch_length is written as a
80
+ # subelement of the clade element.
81
+ #
82
+ def initialize(filename, indent=true)
83
+ @write_branch_length_as_subelement = true #default value
84
+ @filename = filename
85
+ @indent = indent
86
+
87
+ @doc = XML::Document.new()
88
+ @doc.root = XML::Node.new('phyloxml')
89
+ @root = @doc.root
90
+ @root['xmlns:xsi'] = 'http://www.w3.org/2001/XMLSchema-instance'
91
+ @root['xsi:schemaLocation'] = SCHEMA_LOCATION
92
+ @root['xmlns'] = 'http://www.phyloxml.org'
93
+
94
+ #@todo save encoding to be UTF-8. (However it is the default one).
95
+ #it gives error NameError: uninitialized constant LibXML::XML::Encoding
96
+ #@doc.encoding = XML::Encoding::UTF_8
97
+
98
+ @doc.save(@filename, :indent => true)
99
+ end
100
+
101
+ #
102
+ # Write a tree to a file in phyloxml format.
103
+ #
104
+ # require 'bio-phyloxml'
105
+ # writer = Bio::PhyloXML::Writer.new('tree.xml')
106
+ # writer.write(tree)
107
+ #
108
+ def write(tree)
109
+ @root << phylogeny = XML::Node.new('phylogeny')
110
+
111
+ PhyloXML::Writer.generate_xml(phylogeny, tree, [
112
+ [:attr, 'rooted'],
113
+ [:simple, 'name', tree.name],
114
+ [:complex, 'id', tree.phylogeny_id],
115
+ [:simple, 'description', tree.description],
116
+ [:simple, 'date', tree.date],
117
+ [:objarr, 'confidence', 'confidences']])
118
+
119
+ root_clade = tree.root.to_xml(nil, @write_branch_length_as_subelement)
120
+
121
+ phylogeny << root_clade
122
+
123
+ tree.children(tree.root).each do |node|
124
+ root_clade << node_to_xml(tree, node, tree.root)
125
+ end
126
+
127
+ Bio::PhyloXML::Writer::generate_xml(phylogeny, tree, [
128
+ [:objarr, 'clade_relation', 'clade_relations'],
129
+ [:objarr, 'sequence_relation', 'sequence_relations'],
130
+ [:objarr, 'property', 'properties']] )
131
+
132
+ @doc.save(@filename, :indent => @indent)
133
+ end #writer#write
134
+
135
+
136
+ #
137
+ # The PhyloXML schema allows data in other XML formats to be stored after
138
+ # all phylogeny elements. This method writes such additional data.
139
+ #
140
+ # parser = PhyloXML::Parser.open('phyloxml_examples.xml')
141
+ # writer = PhyloXML::Writer.new('new.xml')
142
+ #
143
+ # parser.each do |tree|
144
+ # writer.write(tree)
145
+ # end
146
+ #
147
+ # # When all the trees are read in by the parser, what's left is saved at
148
+ # # PhyloXML::Parser#other
149
+ # writer.write_other(parser.other)
150
+ #
151
+
152
+ def write_other(other_arr)
153
+ other_arr.each do |other_obj|
154
+ @root << other_obj.to_xml
155
+ end
156
+ @doc.save(@filename, :indent => @indent)
157
+ end
158
+
159
+ #class method
160
+
161
+ #
162
+ # Used by to_xml methods of PhyloXML element classes. Generally not to be
163
+ # invoked directly.
164
+ #
165
+ def self.generate_xml(root, elem, subelement_array)
166
+ #example usage: generate_xml(node, self, [[ :complex,'accession', ], [:simple, 'name', @name], [:simple, 'location', @location]])
167
+ subelement_array.each do |subelem|
168
+ if subelem[0] == :simple
169
+ root << XML::Node.new(subelem[1], subelem[2].to_s) if subelem[2] != nil and not subelem[2].to_s.empty?
170
+
171
+ elsif subelem[0] == :complex
172
+ root << subelem[2].send("to_xml") if subelem[2] != nil
173
+
174
+ elsif subelem[0] == :pattern
175
+ #seq, self, [[:pattern, 'symbol', @symbol, "\S{1,10}"]
176
+ if subelem[2] != nil
177
+ if subelem[2] =~ subelem[3]
178
+ root << XML::Node.new(subelem[1], subelem[2])
179
+ else
180
+ raise "#{subelem[2]} is not a valid value of #{subelem[1]}. It should follow pattern #{subelem[3]}"
181
+ end
182
+ end
183
+
184
+ elsif subelem[0] == :objarr
185
+ #[:objarr, 'annotation', 'annotations']])
186
+ obj_arr = elem.send(subelem[2])
187
+ obj_arr.each do |arr_elem|
188
+ root << arr_elem.to_xml
189
+ end
190
+
191
+ elsif subelem[0] == :simplearr
192
+ # [:simplearr, 'common_name', @common_names]
193
+ subelem[2].each do |elem_val|
194
+ root << XML::Node.new(subelem[1], elem_val)
195
+ end
196
+ elsif subelem[0] == :attr
197
+ #[:attr, 'rooted']
198
+ obj = elem.send(subelem[1])
199
+ if obj != nil
200
+ root[subelem[1]] = obj.to_s
201
+ end
202
+ else
203
+ raise "Not supported type of element by method generate_xml."
204
+ end
205
+ end
206
+ return root
207
+ end
208
+
209
+ private
210
+
211
+ def node_to_xml(tree, node, parent)
212
+ edge = tree.get_edge(parent, node)
213
+ branch_length = edge.distance
214
+
215
+ clade = node.to_xml(branch_length, @write_branch_length_as_subelement)
216
+
217
+ tree.children(node).each do |new_node|
218
+ clade << node_to_xml(tree, new_node, node)
219
+ end
220
+
221
+ return clade
222
+ end
223
+
224
+ end
225
+
226
+ end
227
+ end
@@ -0,0 +1,7 @@
1
+
2
+ module Bio
3
+ module PhyloXML
4
+ # version number of the bio-phyloxml gem package
5
+ VERSION = "1.0.0"
6
+ end
7
+ end
@@ -0,0 +1,4 @@
1
+ # for migration from old bioruby to bio-phyloxml
2
+ if require 'bio-phyloxml/compat/stub_phyloxml_elements' then
3
+ require 'bio-phyloxml'
4
+ end
@@ -0,0 +1,4 @@
1
+ # for migration from old bioruby to bio-phyloxml
2
+ if require 'bio-phyloxml/compat/stub_phyloxml_parser' then
3
+ require 'bio-phyloxml'
4
+ end
@@ -0,0 +1,4 @@
1
+ # for migration from old bioruby to bio-phyloxml
2
+ if require 'bio-phyloxml/compat/stub_phyloxml_writer' then
3
+ require 'bio-phyloxml'
4
+ end
@@ -0,0 +1,10 @@
1
+ #
2
+
3
+ if RUBY_VERSION >= "1.9" then
4
+ require_relative "bio-phyloxml.rb"
5
+ else
6
+ dir = File.dirname(__FILE__)
7
+ target = File.join(dir, "bio-phyloxml.rb")
8
+ require target
9
+ end
10
+
@@ -0,0 +1,205 @@
1
+ #
2
+ # = sample/test_phyloxml_big.rb - Tests for Bio::PhyloXML. Testing very big files.
3
+ #
4
+ # Copyright:: Copyright (C) 2009
5
+ # Diana Jaunzeikare <latvianlinuxgirl@gmail.com>
6
+ # Naohisa Goto <ng@bioruby.org>
7
+ # License:: The Ruby License
8
+ #
9
+
10
+ # libraries needed for the tests
11
+ require 'libxml'
12
+ require 'pathname'
13
+ require 'test/unit'
14
+ require 'digest/sha1'
15
+
16
+ require 'bio/command'
17
+ require 'bio/db/phyloxml/phyloxml_parser'
18
+ require 'bio/db/phyloxml/phyloxml_writer'
19
+
20
+ PhyloXMLBigDataPath = ARGV.shift
21
+
22
+ if !PhyloXMLBigDataPath then
23
+ exit_code = 0
24
+ elsif !File.directory?(PhyloXMLBigDataPath) then
25
+ exit_code = 1
26
+ else
27
+ exit_code = false
28
+ end
29
+
30
+ if exit_code then
31
+ puts "Usage: #{$0} path_to_data (test options...)"
32
+ puts ""
33
+ puts "Requirements:"
34
+ puts " - Write permission to the path_to_data"
35
+ puts " - Internet connection for downloading test data"
36
+ puts " - unzip command to extract downloaded test data"
37
+ puts ""
38
+ puts "You may want to run Ruby with -rubygems and -I<path_to_bioruby_lib>."
39
+ puts ""
40
+ puts "Example of usage using /tmp:"
41
+ puts " $ mkdir /tmp/phyloxml"
42
+ puts " $ ruby -rubygems -I lib #{$0} /tmp/phyloxml -v"
43
+ puts ""
44
+ exit(exit_code)
45
+ end
46
+
47
+ module TestPhyloXMLBigData
48
+
49
+ module_function
50
+
51
+ def metazoa_xml
52
+ #puts "Metazoa 30MB"
53
+ filename = 'ncbi_taxonomy_metazoa.xml'
54
+ uri = "http://www.phylosoft.org/archaeopteryx/examples/data/ncbi_taxonomy_metazoa.xml.zip"
55
+ download_and_unzip_if_not_found(filename, uri, "1M", "33M")
56
+ end
57
+
58
+ def metazoa_test_xml
59
+ #puts "writing Metazoa 30MB"
60
+ File.join PhyloXMLBigDataPath, 'writer_test_ncbi_taxonomy_metazoa.xml'
61
+ end
62
+
63
+ def metazoa_roundtrip_xml
64
+ #puts "writing Metazoa 30MB roundtrip"
65
+ File.join PhyloXMLBigDataPath, 'roundtrip_test_ncbi_taxonomy_metazoa.xml'
66
+ end
67
+
68
+ def mollusca_xml
69
+ #puts "Mollusca 1.5MB"
70
+ filename = 'ncbi_taxonomy_mollusca.xml'
71
+ uri = "http://www.phylosoft.org/archaeopteryx/examples/data/ncbi_taxonomy_mollusca.xml.zip"
72
+ download_and_unzip_if_not_found(filename, uri, "67K", "1.5M")
73
+ end
74
+
75
+ def mollusca_test_xml
76
+ #puts "Writing Mollusca 1.5MB"
77
+ File.join PhyloXMLBigDataPath, 'writer_test_ncbi_taxonomy_mollusca.xml'
78
+ end
79
+
80
+ def mollusca_roundtrip_xml
81
+ #puts "Writing Mollusca 1.5MB roundtrip"
82
+ File.join PhyloXMLBigDataPath, 'roundtrip_test_ncbi_taxonomy_mollusca.xml'
83
+ end
84
+
85
+ def life_xml
86
+ #Right now this file is not compatible with xsd 1.10
87
+ filename = 'tol_life_on_earth_1.xml'
88
+ uri = "http://www.phylosoft.org/archaeopteryx/examples/data/tol_life_on_earth_1.xml.zip"
89
+
90
+ download_and_unzip_if_not_found(filename, uri, '10M', '45M')
91
+ end
92
+
93
+ def life_test_xml
94
+ File.join PhyloXMLBigDataPath, 'writer_test_tol_life_on_earth_1.xml'
95
+ end
96
+
97
+ def life_roundtrip_xml
98
+ File.join PhyloXMLBigDataPath, 'roundtrip_test_tol_life_on_earth_1.xml'
99
+ end
100
+
101
+ def unzip_file(file, target_dir)
102
+ flag = system('unzip', "#{file}.zip", "-d", target_dir)
103
+ unless flag then
104
+ raise "Failed to unzip #{file}.zip"
105
+ end
106
+ file
107
+ end
108
+
109
+ def download_and_unzip_if_not_found(basename, uri, zipsize, origsize)
110
+ file = File.join PhyloXMLBigDataPath, basename
111
+ return file if File.exists?(file)
112
+
113
+ if File.exists?("#{file}.zip")
114
+ unzip_file(file, PhyloXMLBigDataPath)
115
+ return file
116
+ end
117
+
118
+ puts "File #{basename} does not exist. Do you want to download it? (If yes, ~#{zipsize}B zip file will be downloaded and extracted (to #{origsize}B), if no, the test will be skipped.) y/n?"
119
+ res = gets
120
+ if res.to_s.chomp.downcase == "y"
121
+ File.open("#{file}.zip", "wb") do |f|
122
+ f.write(Bio::Command.read_uri(uri))
123
+ end
124
+ puts "File downloaded."
125
+ self.unzip_file(file, PhyloXMLBigDataPath)
126
+ return file
127
+ else
128
+ return nil
129
+ #return File.join PHYLOXML_TEST_DATA, "#{basename}.stub"
130
+ end
131
+ end
132
+
133
+ end #end module TestPhyloXMLBigData
134
+
135
+ module Bio
136
+
137
+ class TestPhyloXMLBig < Test::Unit::TestCase
138
+
139
+ def do_test_next_tree(readfilename)
140
+ raise "the test is skipped" unless readfilename
141
+ filesizeMB = File.size(readfilename) / 1048576.0
142
+ printf "Reading %s (%2.1f MB)\n", readfilename, filesizeMB
143
+
144
+ begin
145
+ phyloxml = Bio::PhyloXML::Parser.open(readfilename)
146
+ rescue NoMethodError
147
+ phyloxml = Bio::PhyloXML::Parser.new(readfilename)
148
+ end
149
+ tree = nil
150
+ assert_nothing_raised {
151
+ tree = phyloxml.next_tree
152
+ }
153
+ tree
154
+ end
155
+ private :do_test_next_tree
156
+
157
+ def do_test_write(tree, writefilename)
158
+ printf "Writing to %s\n", writefilename
159
+ writer = Bio::PhyloXML::Writer.new(writefilename)
160
+ assert_nothing_raised {
161
+ writer.write(tree)
162
+ }
163
+
164
+ # checks file size and sha1sum
165
+ str = File.open(writefilename, 'rb') { |f| f.read }
166
+ sha1 = Digest::SHA1.hexdigest(str)
167
+ puts "Wrote #{str.length} bytes."
168
+ puts "sha1: #{sha1}"
169
+ end
170
+ private :do_test_write
171
+
172
+ def test_mollusca
173
+ tree = do_test_next_tree(TestPhyloXMLBigData.mollusca_xml)
174
+ do_test_write(tree, TestPhyloXMLBigData.mollusca_test_xml)
175
+
176
+ tree2 = do_test_next_tree(TestPhyloXMLBigData.mollusca_test_xml)
177
+ do_test_write(tree2, TestPhyloXMLBigData.mollusca_roundtrip_xml)
178
+ end
179
+
180
+ def test_metazoa
181
+ tree = do_test_next_tree(TestPhyloXMLBigData.metazoa_xml)
182
+ do_test_write(tree, TestPhyloXMLBigData.metazoa_test_xml)
183
+
184
+ tree2 = do_test_next_tree(TestPhyloXMLBigData.metazoa_test_xml)
185
+ do_test_write(tree2, TestPhyloXMLBigData.metazoa_roundtrip_xml)
186
+ end
187
+
188
+ if false
189
+ # Disabled because of the error.
190
+ # LibXML::XML::Error: Fatal error: Input is not proper UTF-8,
191
+ # indicate encoding !
192
+ # Bytes: 0xE9 0x6B 0x65 0x73 at tol_life_on_earth_1.xml:132170.
193
+ #
194
+ def test_life
195
+ tree = do_test_next_tree(TestPhyloXMLBigData.life_xml)
196
+ do_test_write(tree, TestPhyloXMLBigData.life_test_xml)
197
+
198
+ tree2 = do_test_next_tree(TestPhyloXMLBigData.life_test_xml)
199
+ do_test_write(tree2, TestPhyloXMLBigData.life_roundtrip_xml)
200
+ end
201
+ end #if false
202
+
203
+ end
204
+
205
+ end