bioruby-phyloxml 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/BSDL +22 -0
- data/COPYING +57 -0
- data/COPYING.ja +51 -0
- data/GPL +340 -0
- data/Gemfile +4 -0
- data/LEGAL +36 -0
- data/LGPL +504 -0
- data/README.md +214 -0
- data/Rakefile +20 -0
- data/bioruby-phyloxml.gemspec +36 -0
- data/doc/Tutorial.rd +152 -0
- data/lib/bio-phyloxml.rb +27 -0
- data/lib/bio-phyloxml/compat/cleanup.rb +13 -0
- data/lib/bio-phyloxml/compat/stub_phyloxml_elements.rb +1 -0
- data/lib/bio-phyloxml/compat/stub_phyloxml_parser.rb +1 -0
- data/lib/bio-phyloxml/compat/stub_phyloxml_writer.rb +1 -0
- data/lib/bio-phyloxml/phyloxml.xsd +582 -0
- data/lib/bio-phyloxml/phyloxml_elements.rb +1186 -0
- data/lib/bio-phyloxml/phyloxml_parser.rb +1001 -0
- data/lib/bio-phyloxml/phyloxml_writer.rb +227 -0
- data/lib/bio-phyloxml/version.rb +7 -0
- data/lib/bio/db/phyloxml/phyloxml_elements.rb +4 -0
- data/lib/bio/db/phyloxml/phyloxml_parser.rb +4 -0
- data/lib/bio/db/phyloxml/phyloxml_writer.rb +4 -0
- data/lib/bioruby-phyloxml.rb +10 -0
- data/sample/test_phyloxml_big.rb +205 -0
- metadata +156 -0
@@ -0,0 +1,227 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/phyloxml_writer.rb - PhyloXML writer
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2009
|
5
|
+
# Diana Jaunzeikare <latvianlinuxgirl@gmail.com>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
#
|
9
|
+
# == Description
|
10
|
+
#
|
11
|
+
# This file contains the writer for PhyloXML.
|
12
|
+
#
|
13
|
+
# == Requirements
|
14
|
+
#
|
15
|
+
# Libxml2 XML parser is required. Install libxml-ruby bindings from
|
16
|
+
# http://libxml.rubyforge.org or
|
17
|
+
#
|
18
|
+
# gem install -r libxml-ruby
|
19
|
+
#
|
20
|
+
# == References
|
21
|
+
#
|
22
|
+
# * http://www.phyloxml.org
|
23
|
+
#
|
24
|
+
# * https://www.nescent.org/wg_phyloinformatics/PhyloSoC:PhyloXML_support_in_BioRuby
|
25
|
+
|
26
|
+
require 'libxml'
|
27
|
+
require 'bio-phyloxml/phyloxml_elements'
|
28
|
+
|
29
|
+
module Bio
|
30
|
+
|
31
|
+
module PhyloXML
|
32
|
+
|
33
|
+
# == Description
|
34
|
+
#
|
35
|
+
# Bio::PhyloXML::Writer is for writing phyloXML (version 1.10) format files.
|
36
|
+
#
|
37
|
+
# == Requirements
|
38
|
+
#
|
39
|
+
# Libxml2 XML parser is required. Install libxml-ruby bindings from
|
40
|
+
# http://libxml.rubyforge.org or
|
41
|
+
#
|
42
|
+
# gem install -r libxml-ruby
|
43
|
+
#
|
44
|
+
# == Usage
|
45
|
+
#
|
46
|
+
# require 'bio'
|
47
|
+
#
|
48
|
+
# # Create new phyloxml parser
|
49
|
+
# phyloxml = Bio::PhyloXML::Parser.open('example.xml')
|
50
|
+
#
|
51
|
+
# # Read in some trees from file
|
52
|
+
# tree1 = phyloxml.next_tree
|
53
|
+
# tree2 = phyloxml.next_tree
|
54
|
+
#
|
55
|
+
# # Create new phyloxml writer
|
56
|
+
# writer = Bio::PhyloXML::Writer.new('tree.xml')
|
57
|
+
#
|
58
|
+
# # Write tree to the file tree.xml
|
59
|
+
# writer.write(tree1)
|
60
|
+
#
|
61
|
+
# # Add another tree to the file
|
62
|
+
# writer.write(tree2)
|
63
|
+
#
|
64
|
+
# == References
|
65
|
+
#
|
66
|
+
# http://www.phyloxml.org/documentation/version_100/phyloxml.xsd.html
|
67
|
+
|
68
|
+
class Writer
|
69
|
+
|
70
|
+
include LibXML
|
71
|
+
|
72
|
+
SCHEMA_LOCATION = 'http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd'
|
73
|
+
|
74
|
+
attr_accessor :write_branch_length_as_subelement
|
75
|
+
|
76
|
+
#
|
77
|
+
# Create new Writer object. As parameters provide filename of xml file
|
78
|
+
# you wish to create. Optional parameter is whether to indent or not.
|
79
|
+
# Default is true. By default branch_length is written as subelement of
|
80
|
+
# clade element.
|
81
|
+
#
|
82
|
+
#
# Create a new Writer and immediately save an empty phyloxml document
# (root element only) to +filename+.
#
# filename:: path of the XML file to create
# indent::   whether saved XML is indented (default true); used for this
#            first save as well as for every later save
#
# branch_length is written as a subelement of clade unless
# write_branch_length_as_subelement is set to false afterwards.
#
def initialize(filename, indent=true)
  @write_branch_length_as_subelement = true #default value
  @filename = filename
  @indent = indent

  @doc = XML::Document.new()
  @doc.root = XML::Node.new('phyloxml')
  @root = @doc.root
  @root['xmlns:xsi'] = 'http://www.w3.org/2001/XMLSchema-instance'
  @root['xsi:schemaLocation'] = SCHEMA_LOCATION
  @root['xmlns'] = 'http://www.phyloxml.org'

  #@todo save encoding to be UTF-8. (However it is the default one).
  #it gives error NameError: uninitialized constant LibXML::XML::Encoding
  #@doc.encoding = XML::Encoding::UTF_8

  # Honour the caller's choice here too; this save used to force :indent => true.
  @doc.save(@filename, :indent => @indent)
end
|
100
|
+
|
101
|
+
#
|
102
|
+
# Write a tree to a file in phyloxml format.
|
103
|
+
#
|
104
|
+
# require 'bio'
|
105
|
+
# writer = Bio::PhyloXML::Writer.new('tree.xml')
|
106
|
+
# writer.write(tree)
|
107
|
+
#
|
108
|
+
#
# Append +tree+ to the document as a new phylogeny element and save the
# document to the file given to the constructor.
#
# The phylogeny gets, in this order: the rooted attribute, name, id,
# description, date and confidences; the clade of the tree root with the
# clades of all descendants nested inside; then clade relations, sequence
# relations and properties.
#
def write(tree)
  phylogeny = XML::Node.new('phylogeny')
  @root << phylogeny

  leading_specs = [
    [:attr, 'rooted'],
    [:simple, 'name', tree.name],
    [:complex, 'id', tree.phylogeny_id],
    [:simple, 'description', tree.description],
    [:simple, 'date', tree.date],
    [:objarr, 'confidence', 'confidences']]
  Bio::PhyloXML::Writer.generate_xml(phylogeny, tree, leading_specs)

  # The root clade has no parent edge, hence no branch length (nil).
  top = tree.root
  root_clade = top.to_xml(nil, @write_branch_length_as_subelement)
  phylogeny << root_clade
  tree.children(top).each do |child|
    root_clade << node_to_xml(tree, child, top)
  end

  trailing_specs = [
    [:objarr, 'clade_relation', 'clade_relations'],
    [:objarr, 'sequence_relation', 'sequence_relations'],
    [:objarr, 'property', 'properties']]
  Bio::PhyloXML::Writer.generate_xml(phylogeny, tree, trailing_specs)

  @doc.save(@filename, :indent => @indent)
end #writer#write
|
134
|
+
|
135
|
+
|
136
|
+
#
|
137
|
+
# PhyloXML Schema allows to save data in different xml format after all
|
138
|
+
# phylogeny elements. This method is to write these additional data.
|
139
|
+
#
|
140
|
+
# parser = PhyloXML::Parser.open('phyloxml_examples.xml')
|
141
|
+
# writer = PhyloXML::Writer.new('new.xml')
|
142
|
+
#
|
143
|
+
# parser.each do |tree|
|
144
|
+
# writer.write(tree)
|
145
|
+
# end
|
146
|
+
#
|
147
|
+
# # When all the trees are read in by the parser, what's left is saved at
|
148
|
+
# # PhyloXML::Parser#other
|
149
|
+
# writer.write_other(parser.other)
|
150
|
+
#
|
151
|
+
|
152
|
+
# Append the to_xml output of every object in +other_arr+ directly under the
# document root, then save the document.
def write_other(other_arr)
  other_arr.each { |extra| @root << extra.to_xml }
  @doc.save(@filename, :indent => @indent)
end
|
158
|
+
|
159
|
+
#class method
|
160
|
+
|
161
|
+
#
|
162
|
+
# Used by to_xml methods of PhyloXML element classes. Generally not to be
|
163
|
+
# invoked directly.
|
164
|
+
#
|
165
|
+
#
# Append the children and attributes described by +subelement_array+ to
# +root+ and return +root+.
#
# root::             node that receives the generated nodes / attributes
# elem::             object whose accessors are read for :objarr and :attr
# subelement_array:: array of specs, one of
#   [:simple,    tag, value]          text node; skipped if value is nil or ''
#   [:complex,   tag, object]         object.to_xml; skipped if object is nil
#   [:pattern,   tag, value, regexp]  text node; skipped if value is nil,
#                                     raises unless value matches regexp
#   [:objarr,    tag, accessor]       to_xml of each item of elem.accessor
#   [:simplearr, tag, values]         one text node per value
#   [:attr,      accessor]            attribute named accessor, set from
#                                     elem.accessor unless that is nil
#
# A nil collection for :objarr or :simplearr is treated as empty.
#
# Raises RuntimeError for an unknown spec type and for a :pattern value
# that does not match.
#
def self.generate_xml(root, elem, subelement_array)
  #example usage: generate_xml(node, self, [[ :complex,'accession', ], [:simple, 'name', @name], [:simple, 'location', @location]])
  subelement_array.each do |subelem|
    kind, tag, value, pattern = subelem

    case kind
    when :simple
      text = value.to_s
      root << XML::Node.new(tag, text) unless value.nil? || text.empty?

    when :complex
      root << value.to_xml unless value.nil?

    when :pattern
      #seq, self, [[:pattern, 'symbol', @symbol, "\S{1,10}"]
      next if value.nil?
      unless value =~ pattern
        raise "#{value} is not a valid value of #{tag}. It should follow pattern #{pattern}"
      end
      root << XML::Node.new(tag, value)

    when :objarr
      #[:objarr, 'annotation', 'annotations']])
      # Here the third entry is the name of the accessor, not the data itself.
      (elem.send(value) || []).each do |arr_elem|
        root << arr_elem.to_xml
      end

    when :simplearr
      # [:simplearr, 'common_name', @common_names]
      (value || []).each do |elem_val|
        root << XML::Node.new(tag, elem_val)
      end

    when :attr
      #[:attr, 'rooted']
      attr_value = elem.send(tag)
      root[tag] = attr_value.to_s unless attr_value.nil?

    else
      raise "Not supported type of element by method generate_xml."
    end
  end
  root
end
|
208
|
+
|
209
|
+
private
|
210
|
+
|
211
|
+
# Build the clade XML for +node+ and, recursively, for everything below it.
# The branch length passed to node.to_xml is the distance of the edge between
# +parent+ and +node+. Returns the clade node.
def node_to_xml(tree, node, parent)
  distance = tree.get_edge(parent, node).distance
  clade = node.to_xml(distance, @write_branch_length_as_subelement)

  tree.children(node).each { |child| clade << node_to_xml(tree, child, node) }

  clade
end
|
223
|
+
|
224
|
+
end
|
225
|
+
|
226
|
+
end
|
227
|
+
end
|
@@ -0,0 +1,205 @@
|
|
1
|
+
#
|
2
|
+
# = sample/test_phyloxml_big.rb - Tests for Bio::PhyloXML. Testing very big files.
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2009
|
5
|
+
# Diana Jaunzeikare <latvianlinuxgirl@gmail.com>
|
6
|
+
# Naohisa Goto <ng@bioruby.org>
|
7
|
+
# License:: The Ruby License
|
8
|
+
#
|
9
|
+
|
10
|
+
# libraries needed for the tests
|
11
|
+
require 'libxml'
|
12
|
+
require 'pathname'
|
13
|
+
require 'test/unit'
|
14
|
+
require 'digest/sha1'
|
15
|
+
|
16
|
+
require 'bio/command'
|
17
|
+
require 'bio/db/phyloxml/phyloxml_parser'
|
18
|
+
require 'bio/db/phyloxml/phyloxml_writer'
|
19
|
+
|
20
|
+
# Directory holding the big test data: the first command-line argument.
# The remaining arguments are left in ARGV.
PhyloXMLBigDataPath = ARGV.shift

# Exit status to stop with, or false to go on with the tests:
# 0 when no path was given, 1 when the path is not a directory.
# 0 is truthy in Ruby, so both cases print the usage text below.
exit_code = if PhyloXMLBigDataPath.nil?
  0
elsif !File.directory?(PhyloXMLBigDataPath)
  1
else
  false
end

if exit_code
  puts [
    "Usage: #{$0} path_to_data (test options...)",
    "",
    "Requirements:",
    " - Write permission to the path_to_data",
    " - Internet connection for downloading test data",
    " - unzip command to extract downloaded test data",
    "",
    "You may want to run Ruby with -rubygems and -I<path_to_bioruby_lib>.",
    "",
    "Example of usage using /tmp:",
    " $ mkdir /tmp/phyloxml",
    " $ ruby -rubygems -I lib #{$0} /tmp/phyloxml -v",
    ""
  ]
  exit(exit_code)
end
|
46
|
+
|
47
|
+
# Locates the big PhyloXML sample files (offering to download and unzip the
# missing ones) and names the files written by the tests. Every path is
# inside the directory PhyloXMLBigDataPath.
module TestPhyloXMLBigData

  module_function

  # Path of the NCBI Metazoa taxonomy sample, or nil if it is missing and
  # the download is declined.
  def metazoa_xml
    #puts "Metazoa 30MB"
    filename = 'ncbi_taxonomy_metazoa.xml'
    uri = "http://www.phylosoft.org/archaeopteryx/examples/data/ncbi_taxonomy_metazoa.xml.zip"
    download_and_unzip_if_not_found(filename, uri, "1M", "33M")
  end

  # Output path for the Metazoa tree written by the writer test.
  def metazoa_test_xml
    #puts "writing Metazoa 30MB"
    File.join PhyloXMLBigDataPath, 'writer_test_ncbi_taxonomy_metazoa.xml'
  end

  # Output path for the Metazoa tree after a read/write round trip.
  def metazoa_roundtrip_xml
    #puts "writing Metazoa 30MB roundtrip"
    File.join PhyloXMLBigDataPath, 'roundtrip_test_ncbi_taxonomy_metazoa.xml'
  end

  # Path of the NCBI Mollusca taxonomy sample, or nil if it is missing and
  # the download is declined.
  def mollusca_xml
    #puts "Mollusca 1.5MB"
    filename = 'ncbi_taxonomy_mollusca.xml'
    uri = "http://www.phylosoft.org/archaeopteryx/examples/data/ncbi_taxonomy_mollusca.xml.zip"
    download_and_unzip_if_not_found(filename, uri, "67K", "1.5M")
  end

  # Output path for the Mollusca tree written by the writer test.
  def mollusca_test_xml
    #puts "Writing Mollusca 1.5MB"
    File.join PhyloXMLBigDataPath, 'writer_test_ncbi_taxonomy_mollusca.xml'
  end

  # Output path for the Mollusca tree after a read/write round trip.
  def mollusca_roundtrip_xml
    #puts "Writing Mollusca 1.5MB roundtrip"
    File.join PhyloXMLBigDataPath, 'roundtrip_test_ncbi_taxonomy_mollusca.xml'
  end

  # Path of the "life on earth" sample, or nil if it is missing and the
  # download is declined.
  def life_xml
    #Right now this file is not compatible with xsd 1.10
    filename = 'tol_life_on_earth_1.xml'
    uri = "http://www.phylosoft.org/archaeopteryx/examples/data/tol_life_on_earth_1.xml.zip"

    download_and_unzip_if_not_found(filename, uri, '10M', '45M')
  end

  # Output path for the "life on earth" tree written by the writer test.
  def life_test_xml
    File.join PhyloXMLBigDataPath, 'writer_test_tol_life_on_earth_1.xml'
  end

  # Output path for the "life on earth" tree after a read/write round trip.
  def life_roundtrip_xml
    File.join PhyloXMLBigDataPath, 'roundtrip_test_tol_life_on_earth_1.xml'
  end

  # Extract "#{file}.zip" into +target_dir+ with the external unzip command.
  # Returns +file+. Raises RuntimeError when unzip reports failure or cannot
  # be executed (system returns false or nil).
  def unzip_file(file, target_dir)
    flag = system('unzip', "#{file}.zip", "-d", target_dir)
    unless flag then
      raise "Failed to unzip #{file}.zip"
    end
    file
  end

  # Return the path of +basename+ inside PhyloXMLBigDataPath, obtaining the
  # file first when necessary:
  # * the file exists: its path is returned right away;
  # * only "<file>.zip" exists: it is unzipped;
  # * neither exists: the user is asked on standard input whether +uri+
  #   should be downloaded; "y" downloads and unzips it, anything else
  #   returns nil (the caller skips the test).
  #
  # +zipsize+ and +origsize+ are only shown in the prompt.
  def download_and_unzip_if_not_found(basename, uri, zipsize, origsize)
    file = File.join PhyloXMLBigDataPath, basename
    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    return file if File.exist?(file)

    if File.exist?("#{file}.zip")
      unzip_file(file, PhyloXMLBigDataPath)
      return file
    end

    puts "File #{basename} does not exist. Do you want to download it? (If yes, ~#{zipsize}B zip file will be downloaded and extracted (to #{origsize}B), if no, the test will be skipped.) y/n?"
    # Read the answer from $stdin explicitly: a bare gets reads from the files
    # named in ARGV whenever ARGV is not empty, and this script leaves the
    # test options there.
    res = $stdin.gets
    return nil unless res.to_s.chomp.downcase == "y"
    #return File.join PHYLOXML_TEST_DATA, "#{basename}.stub"

    File.open("#{file}.zip", "wb") do |f|
      f.write(Bio::Command.read_uri(uri))
    end
    puts "File downloaded."
    unzip_file(file, PhyloXMLBigDataPath)
    file
  end

end #end module TestPhyloXMLBigData
|
134
|
+
|
135
|
+
module Bio
|
136
|
+
|
137
|
+
class TestPhyloXMLBig < Test::Unit::TestCase
|
138
|
+
|
139
|
+
# Open +readfilename+ with Bio::PhyloXML::Parser, read its first tree and
# return it. The file name and size are printed first.
#
# Raises RuntimeError ("the test is skipped") when +readfilename+ is nil or
# false, which is what the data helpers return when a download is declined.
def do_test_next_tree(readfilename)
  raise "the test is skipped" unless readfilename
  filesizeMB = File.size(readfilename) / 1048576.0
  printf "Reading %s (%2.1f MB)\n", readfilename, filesizeMB

  # Fall back to Parser.new when the parser has no public Parser.open. Asking
  # with respond_to? instead of rescuing NoMethodError keeps a NoMethodError
  # raised *inside* Parser.open visible rather than silently retrying.
  phyloxml = if Bio::PhyloXML::Parser.respond_to?(:open)
    Bio::PhyloXML::Parser.open(readfilename)
  else
    Bio::PhyloXML::Parser.new(readfilename)
  end

  tree = nil
  assert_nothing_raised {
    tree = phyloxml.next_tree
  }
  tree
end
|
155
|
+
private :do_test_next_tree
|
156
|
+
|
157
|
+
# Write +tree+ to +writefilename+ with Bio::PhyloXML::Writer, then print the
# size and the SHA1 digest of the written file.
def do_test_write(tree, writefilename)
  printf "Writing to %s\n", writefilename
  writer = Bio::PhyloXML::Writer.new(writefilename)
  assert_nothing_raised {
    writer.write(tree)
  }

  # checks file size and sha1sum
  # The digest is computed from the file and the size is asked from the file
  # system, so the whole output is not held in memory as one string.
  sha1 = Digest::SHA1.file(writefilename).hexdigest
  puts "Wrote #{File.size(writefilename)} bytes."
  puts "sha1: #{sha1}"
end
|
170
|
+
private :do_test_write
|
171
|
+
|
172
|
+
# Mollusca sample: read it and write it out, then read the written file
# back and write it again as the round-trip file.
def test_mollusca
  written_path = TestPhyloXMLBigData.mollusca_test_xml

  original = do_test_next_tree(TestPhyloXMLBigData.mollusca_xml)
  do_test_write(original, written_path)

  reread = do_test_next_tree(written_path)
  do_test_write(reread, TestPhyloXMLBigData.mollusca_roundtrip_xml)
end
|
179
|
+
|
180
|
+
# Metazoa sample: read it and write it out, then read the written file
# back and write it again as the round-trip file.
def test_metazoa
  written_path = TestPhyloXMLBigData.metazoa_test_xml

  original = do_test_next_tree(TestPhyloXMLBigData.metazoa_xml)
  do_test_write(original, written_path)

  reread = do_test_next_tree(written_path)
  do_test_write(reread, TestPhyloXMLBigData.metazoa_roundtrip_xml)
end
|
187
|
+
|
188
|
+
if false
  # Disabled because of the error.
  #   LibXML::XML::Error: Fatal error: Input is not proper UTF-8,
  #   indicate encoding !
  #   Bytes: 0xE9 0x6B 0x65 0x73 at tol_life_on_earth_1.xml:132170.
  #
  # Same read / write / re-read / re-write sequence as the other tests,
  # for the "life on earth" sample. Never defined while guarded by "if false".
  def test_life
    written_path = TestPhyloXMLBigData.life_test_xml

    original = do_test_next_tree(TestPhyloXMLBigData.life_xml)
    do_test_write(original, written_path)

    reread = do_test_next_tree(written_path)
    do_test_write(reread, TestPhyloXMLBigData.life_roundtrip_xml)
  end
end #if false
|
202
|
+
|
203
|
+
end
|
204
|
+
|
205
|
+
end
|