bioruby-phyloxml 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/BSDL +22 -0
- data/COPYING +57 -0
- data/COPYING.ja +51 -0
- data/GPL +340 -0
- data/Gemfile +4 -0
- data/LEGAL +36 -0
- data/LGPL +504 -0
- data/README.md +214 -0
- data/Rakefile +20 -0
- data/bioruby-phyloxml.gemspec +36 -0
- data/doc/Tutorial.rd +152 -0
- data/lib/bio-phyloxml.rb +27 -0
- data/lib/bio-phyloxml/compat/cleanup.rb +13 -0
- data/lib/bio-phyloxml/compat/stub_phyloxml_elements.rb +1 -0
- data/lib/bio-phyloxml/compat/stub_phyloxml_parser.rb +1 -0
- data/lib/bio-phyloxml/compat/stub_phyloxml_writer.rb +1 -0
- data/lib/bio-phyloxml/phyloxml.xsd +582 -0
- data/lib/bio-phyloxml/phyloxml_elements.rb +1186 -0
- data/lib/bio-phyloxml/phyloxml_parser.rb +1001 -0
- data/lib/bio-phyloxml/phyloxml_writer.rb +227 -0
- data/lib/bio-phyloxml/version.rb +7 -0
- data/lib/bio/db/phyloxml/phyloxml_elements.rb +4 -0
- data/lib/bio/db/phyloxml/phyloxml_parser.rb +4 -0
- data/lib/bio/db/phyloxml/phyloxml_writer.rb +4 -0
- data/lib/bioruby-phyloxml.rb +10 -0
- data/sample/test_phyloxml_big.rb +205 -0
- metadata +156 -0
@@ -0,0 +1,227 @@
|
|
1
|
+
#
|
2
|
+
# = bio/db/phyloxml_writer.rb - PhyloXML writer
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2009
|
5
|
+
# Diana Jaunzeikare <latvianlinuxgirl@gmail.com>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
#
|
9
|
+
# == Description
|
10
|
+
#
|
11
|
+
# This file contains a writer for PhyloXML.
|
12
|
+
#
|
13
|
+
# == Requirements
|
14
|
+
#
|
15
|
+
# Libxml2 XML parser is required. Install libxml-ruby bindings from
|
16
|
+
# http://libxml.rubyforge.org or
|
17
|
+
#
|
18
|
+
# gem install -r libxml-ruby
|
19
|
+
#
|
20
|
+
# == References
|
21
|
+
#
|
22
|
+
# * http://www.phyloxml.org
|
23
|
+
#
|
24
|
+
# * https://www.nescent.org/wg_phyloinformatics/PhyloSoC:PhyloXML_support_in_BioRuby
|
25
|
+
|
26
|
+
require 'libxml'
|
27
|
+
require 'bio-phyloxml/phyloxml_elements'
|
28
|
+
|
29
|
+
module Bio
|
30
|
+
|
31
|
+
module PhyloXML
|
32
|
+
|
33
|
+
# == Description
|
34
|
+
#
|
35
|
+
# Bio::PhyloXML::Writer is for writing phyloXML (version 1.10) format files.
|
36
|
+
#
|
37
|
+
# == Requirements
|
38
|
+
#
|
39
|
+
# Libxml2 XML parser is required. Install libxml-ruby bindings from
|
40
|
+
# http://libxml.rubyforge.org or
|
41
|
+
#
|
42
|
+
# gem install -r libxml-ruby
|
43
|
+
#
|
44
|
+
# == Usage
|
45
|
+
#
|
46
|
+
# require 'bio'
|
47
|
+
#
|
48
|
+
# # Create new phyloxml parser
|
49
|
+
# phyloxml = Bio::PhyloXML::Parser.open('example.xml')
|
50
|
+
#
|
51
|
+
# # Read in some trees from file
|
52
|
+
# tree1 = phyloxml.next_tree
|
53
|
+
# tree2 = phyloxml.next_tree
|
54
|
+
#
|
55
|
+
# # Create new phyloxml writer
|
56
|
+
# writer = Bio::PhyloXML::Writer.new('tree.xml')
|
57
|
+
#
|
58
|
+
# # Write tree to the file tree.xml
|
59
|
+
# writer.write(tree1)
|
60
|
+
#
|
61
|
+
# # Add another tree to the file
|
62
|
+
# writer.write(tree2)
|
63
|
+
#
|
64
|
+
# == References
|
65
|
+
#
|
66
|
+
# http://www.phyloxml.org/documentation/version_100/phyloxml.xsd.html
|
67
|
+
|
68
|
+
class Writer

  include LibXML

  # Value written to the xsi:schemaLocation attribute of the root element.
  SCHEMA_LOCATION = 'http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd'

  # Flag handed to every node's to_xml call. Default is true, i.e.
  # branch_length is written as a subelement of the clade element.
  attr_accessor :write_branch_length_as_subelement

  #
  # Create new Writer object. As parameters provide filename of xml file
  # you wish to create. Optional parameter is whether to indent or not.
  # Default is true. By default branch_length is written as subelement of
  # clade element.
  #
  # The document (containing only the empty <phyloxml> root element) is
  # saved to +filename+ immediately.
  #
  def initialize(filename, indent=true)
    @write_branch_length_as_subelement = true #default value
    @filename = filename
    @indent = indent

    @doc = XML::Document.new()
    @doc.root = XML::Node.new('phyloxml')
    @root = @doc.root
    @root['xmlns:xsi'] = 'http://www.w3.org/2001/XMLSchema-instance'
    @root['xsi:schemaLocation'] = SCHEMA_LOCATION
    @root['xmlns'] = 'http://www.phyloxml.org'

    #@todo save encoding to be UTF-8. (However it is the default one).
    #it gives error NameError: uninitialized constant LibXML::XML::Encoding
    #@doc.encoding = XML::Encoding::UTF_8

    # Honour the caller's indent choice here as well, exactly as
    # #write and #write_other do.
    @doc.save(@filename, :indent => @indent)
  end

  #
  # Write a tree to a file in phyloxml format. The tree is appended to the
  # document as a new <phylogeny> element and the whole document is saved
  # again to the file given to the constructor.
  #
  #   require 'bio'
  #   writer = Bio::PhyloXML::Writer.new('tree.xml')
  #   writer.write(tree)
  #
  def write(tree)
    phylogeny = XML::Node.new('phylogeny')
    @root << phylogeny

    # Attributes and elements that come before the clades.
    Writer.generate_xml(phylogeny, tree, [
        [:attr, 'rooted'],
        [:simple, 'name', tree.name],
        [:complex, 'id', tree.phylogeny_id],
        [:simple, 'description', tree.description],
        [:simple, 'date', tree.date],
        [:objarr, 'confidence', 'confidences']])

    # The root clade has no parent edge, hence no branch length.
    root_clade = tree.root.to_xml(nil, @write_branch_length_as_subelement)
    phylogeny << root_clade

    tree.children(tree.root).each do |node|
      root_clade << node_to_xml(tree, node, tree.root)
    end

    # Elements that come after the clades.
    Writer.generate_xml(phylogeny, tree, [
        [:objarr, 'clade_relation', 'clade_relations'],
        [:objarr, 'sequence_relation', 'sequence_relations'],
        [:objarr, 'property', 'properties']])

    @doc.save(@filename, :indent => @indent)
  end #writer#write

  #
  # PhyloXML Schema allows to save data in different xml format after all
  # phylogeny elements. This method is to write these additional data.
  # Each element of +other_arr+ must respond to to_xml.
  #
  #   parser = PhyloXML::Parser.open('phyloxml_examples.xml')
  #   writer = PhyloXML::Writer.new('new.xml')
  #
  #   parser.each do |tree|
  #     writer.write(tree)
  #   end
  #
  #   # When all the trees are read in by the parser, whats left is saved at
  #   # PhyloXML::Parser#other
  #   writer.write_other(parser.other)
  #
  def write_other(other_arr)
    other_arr.each do |other_obj|
      @root << other_obj.to_xml
    end
    @doc.save(@filename, :indent => @indent)
  end

  #class method

  #
  # Used by to_xml methods of PhyloXML element classes. Generally not to be
  # invoked directly.
  #
  # Appends to +root+ one child (or attribute) per entry of
  # +subelement_array+ and returns +root+. Every entry is an array whose
  # first item selects the handling:
  #
  #   [:simple,    name, value]           text node; skipped when value is nil
  #                                       or its to_s is empty
  #   [:complex,   name, obj]             obj.to_xml; skipped when obj is nil
  #   [:pattern,   name, value, regexp]   text node; raises unless value
  #                                       matches regexp; skipped when nil
  #   [:objarr,    name, method_name]     to_xml of every item returned by
  #                                       elem.send(method_name)
  #   [:simplearr, name, values]          one text node per item of values
  #   [:attr,      name]                  attribute taken from elem.send(name);
  #                                       skipped when nil
  #
  # Raises RuntimeError for any other kind and for a :pattern mismatch.
  #
  def self.generate_xml(root, elem, subelement_array)
    #example usage: generate_xml(node, self, [[ :complex,'accession', ], [:simple, 'name', @name], [:simple, 'location', @location]])
    subelement_array.each do |subelem|
      kind, name, value, pattern = subelem

      case kind
      when :simple
        text = value.to_s
        root << XML::Node.new(name, text) unless value.nil? || text.empty?

      when :complex
        root << value.to_xml unless value.nil?

      when :pattern
        #seq, self, [[:pattern, 'symbol', @symbol, "\S{1,10}"]
        unless value.nil?
          unless value =~ pattern
            raise "#{value} is not a valid value of #{name}. It should follow pattern #{pattern}"
          end
          root << XML::Node.new(name, value)
        end

      when :objarr
        #[:objarr, 'annotation', 'annotations']])
        elem.send(value).each do |arr_elem|
          root << arr_elem.to_xml
        end

      when :simplearr
        # [:simplearr, 'common_name', @common_names]
        value.each do |elem_val|
          root << XML::Node.new(name, elem_val)
        end

      when :attr
        #[:attr, 'rooted']
        attr_value = elem.send(name)
        root[name] = attr_value.to_s unless attr_value.nil?

      else
        raise "Not supported type of element by method generate_xml."
      end
    end
    root
  end

  private

  # Builds the clade element for +node+ (a child of +parent+ in +tree+),
  # using the distance of the connecting edge as branch length, and
  # recursively appends the clades of all descendants. Returns the clade.
  def node_to_xml(tree, node, parent)
    branch_length = tree.get_edge(parent, node).distance
    clade = node.to_xml(branch_length, @write_branch_length_as_subelement)

    tree.children(node).each do |new_node|
      clade << node_to_xml(tree, new_node, node)
    end

    clade
  end

end
|
225
|
+
|
226
|
+
end
|
227
|
+
end
|
@@ -0,0 +1,205 @@
|
|
1
|
+
#
|
2
|
+
# = sample/test_phyloxml_big.rb - Tests for Bio::PhyloXML. Testing very big files.
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2009
|
5
|
+
# Diana Jaunzeikare <latvianlinuxgirl@gmail.com>
|
6
|
+
# Naohisa Goto <ng@bioruby.org>
|
7
|
+
# License:: The Ruby License
|
8
|
+
#
|
9
|
+
|
10
|
+
# libraries needed for the tests
|
11
|
+
require 'libxml'
|
12
|
+
require 'pathname'
|
13
|
+
require 'test/unit'
|
14
|
+
require 'digest/sha1'
|
15
|
+
|
16
|
+
require 'bio/command'
|
17
|
+
require 'bio/db/phyloxml/phyloxml_parser'
|
18
|
+
require 'bio/db/phyloxml/phyloxml_writer'
|
19
|
+
|
20
|
+
# First command line argument: directory where the big test data lives.
# It is removed from ARGV; the remaining arguments are left untouched.
PhyloXMLBigDataPath = ARGV.shift

# Decide whether to stop before running any test:
#   0     - no path given, just show the usage
#   1     - the given path is not a directory
#   false - the path is usable, carry on with the tests
exit_code =
  if PhyloXMLBigDataPath.nil?
    0
  elsif File.directory?(PhyloXMLBigDataPath)
    false
  else
    1
  end

if exit_code
  # puts prints every array element on its own line.
  puts [
    "Usage: #{$0} path_to_data (test options...)",
    "",
    "Requirements:",
    " - Write permission to the path_to_data",
    " - Internet connection for downloading test data",
    " - unzip command to extract downloaded test data",
    "",
    "You may want to run Ruby with -rubygems and -I<path_to_bioruby_lib>.",
    "",
    "Example of usage using /tmp:",
    " $ mkdir /tmp/phyloxml",
    " $ ruby -rubygems -I lib #{$0} /tmp/phyloxml -v",
    ""
  ]
  exit(exit_code)
end
|
46
|
+
|
47
|
+
# Helpers that locate (and, after asking the user, download and extract)
# the big phyloXML example files used by the tests. All paths are inside
# the directory named by the PhyloXMLBigDataPath constant.
module TestPhyloXMLBigData

  module_function

  # Path of the Metazoa example (about 1M zipped, 33M extracted),
  # or nil when the file is missing and the user declines the download.
  def metazoa_xml
    filename = 'ncbi_taxonomy_metazoa.xml'
    uri = "http://www.phylosoft.org/archaeopteryx/examples/data/ncbi_taxonomy_metazoa.xml.zip"
    download_and_unzip_if_not_found(filename, uri, "1M", "33M")
  end

  # Output path used when writing the Metazoa tree.
  def metazoa_test_xml
    File.join PhyloXMLBigDataPath, 'writer_test_ncbi_taxonomy_metazoa.xml'
  end

  # Output path used when re-writing the Metazoa tree (round trip).
  def metazoa_roundtrip_xml
    File.join PhyloXMLBigDataPath, 'roundtrip_test_ncbi_taxonomy_metazoa.xml'
  end

  # Path of the Mollusca example (about 67K zipped, 1.5M extracted),
  # or nil when the file is missing and the user declines the download.
  def mollusca_xml
    filename = 'ncbi_taxonomy_mollusca.xml'
    uri = "http://www.phylosoft.org/archaeopteryx/examples/data/ncbi_taxonomy_mollusca.xml.zip"
    download_and_unzip_if_not_found(filename, uri, "67K", "1.5M")
  end

  # Output path used when writing the Mollusca tree.
  def mollusca_test_xml
    File.join PhyloXMLBigDataPath, 'writer_test_ncbi_taxonomy_mollusca.xml'
  end

  # Output path used when re-writing the Mollusca tree (round trip).
  def mollusca_roundtrip_xml
    File.join PhyloXMLBigDataPath, 'roundtrip_test_ncbi_taxonomy_mollusca.xml'
  end

  # Path of the "life on earth" example (about 10M zipped, 45M extracted),
  # or nil when the file is missing and the user declines the download.
  def life_xml
    #Right now this file is not compatible with xsd 1.10
    filename = 'tol_life_on_earth_1.xml'
    uri = "http://www.phylosoft.org/archaeopteryx/examples/data/tol_life_on_earth_1.xml.zip"
    download_and_unzip_if_not_found(filename, uri, '10M', '45M')
  end

  # Output path used when writing the "life on earth" tree.
  def life_test_xml
    File.join PhyloXMLBigDataPath, 'writer_test_tol_life_on_earth_1.xml'
  end

  # Output path used when re-writing the "life on earth" tree (round trip).
  def life_roundtrip_xml
    File.join PhyloXMLBigDataPath, 'roundtrip_test_tol_life_on_earth_1.xml'
  end

  # Extracts "#{file}.zip" into +target_dir+ with the external unzip
  # command. Returns +file+; raises RuntimeError when unzip fails or
  # cannot be run.
  def unzip_file(file, target_dir)
    unzipped = system('unzip', "#{file}.zip", "-d", target_dir)
    raise "Failed to unzip #{file}.zip" unless unzipped
    file
  end

  # Returns the path of +basename+ inside PhyloXMLBigDataPath, making the
  # file available first if needed:
  #
  # * the file exists: its path is returned right away;
  # * only "<file>.zip" exists: it is extracted;
  # * neither exists: the user is asked on standard input whether to
  #   download +uri+; on "y" the zip is downloaded and extracted,
  #   otherwise nil is returned.
  #
  # +zipsize+ and +origsize+ are only used in the question shown to the
  # user.
  def download_and_unzip_if_not_found(basename, uri, zipsize, origsize)
    file = File.join PhyloXMLBigDataPath, basename
    # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
    return file if File.exist?(file)

    zipfile = "#{file}.zip"
    return unzip_file(file, PhyloXMLBigDataPath) if File.exist?(zipfile)

    puts "File #{basename} does not exist. Do you want to download it? (If yes, ~#{zipsize}B zip file will be downloaded and extracted (to #{origsize}B), if no, the test will be skipped.) y/n?"
    # Read from $stdin explicitly: a bare gets goes through ARGF and would
    # try to open the remaining command line arguments (e.g. "-v") as files.
    res = $stdin.gets
    return nil unless res.to_s.chomp.downcase == "y"

    # Download before creating the zip file, so that a failed download does
    # not leave an empty archive behind that a later run would try to extract.
    data = Bio::Command.read_uri(uri)
    File.open(zipfile, "wb") do |f|
      f.write(data)
    end
    puts "File downloaded."
    unzip_file(file, PhyloXMLBigDataPath)
  end

end #end module TestPhyloXMLBigData
|
134
|
+
|
135
|
+
module Bio

  # Reads the big phyloXML example files, writes them out again with
  # Bio::PhyloXML::Writer, then reads the written file and writes it
  # once more (round trip). Size and SHA1 of every written file are printed.
  class TestPhyloXMLBig < Test::Unit::TestCase

    # Opens +readfilename+ with the phyloXML parser and returns the first
    # tree, asserting that reading it raises nothing.
    # Raises RuntimeError when +readfilename+ is nil (data not available).
    def do_test_next_tree(readfilename)
      raise "the test is skipped" unless readfilename
      filesizeMB = File.size(readfilename) / 1048576.0
      printf "Reading %s (%2.1f MB)\n", readfilename, filesizeMB

      # Use Parser.open when the parser class provides it and fall back to
      # Parser.new otherwise. Checking with respond_to? (instead of rescuing
      # NoMethodError) keeps real NoMethodErrors raised inside the parser
      # visible.
      phyloxml =
        if Bio::PhyloXML::Parser.respond_to?(:open)
          Bio::PhyloXML::Parser.open(readfilename)
        else
          Bio::PhyloXML::Parser.new(readfilename)
        end

      tree = nil
      assert_nothing_raised {
        tree = phyloxml.next_tree
      }
      tree
    end
    private :do_test_next_tree

    # Writes +tree+ to +writefilename+, asserting that writing raises
    # nothing, then prints the size and the SHA1 digest of the written file.
    def do_test_write(tree, writefilename)
      printf "Writing to %s\n", writefilename
      writer = Bio::PhyloXML::Writer.new(writefilename)
      assert_nothing_raised {
        writer.write(tree)
      }

      # checks file size and sha1sum
      # (the digest is computed from the file directly, so the possibly
      # very large output never has to be loaded into memory as a whole)
      sha1 = Digest::SHA1.file(writefilename).hexdigest
      puts "Wrote #{File.size(writefilename)} bytes."
      puts "sha1: #{sha1}"
    end
    private :do_test_write

    def test_mollusca
      tree = do_test_next_tree(TestPhyloXMLBigData.mollusca_xml)
      do_test_write(tree, TestPhyloXMLBigData.mollusca_test_xml)

      tree2 = do_test_next_tree(TestPhyloXMLBigData.mollusca_test_xml)
      do_test_write(tree2, TestPhyloXMLBigData.mollusca_roundtrip_xml)
    end

    def test_metazoa
      tree = do_test_next_tree(TestPhyloXMLBigData.metazoa_xml)
      do_test_write(tree, TestPhyloXMLBigData.metazoa_test_xml)

      tree2 = do_test_next_tree(TestPhyloXMLBigData.metazoa_test_xml)
      do_test_write(tree2, TestPhyloXMLBigData.metazoa_roundtrip_xml)
    end

    if false
      # Disabled because of the error.
      # LibXML::XML::Error: Fatal error: Input is not proper UTF-8,
      # indicate encoding !
      # Bytes: 0xE9 0x6B 0x65 0x73 at tol_life_on_earth_1.xml:132170.
      #
      def test_life
        tree = do_test_next_tree(TestPhyloXMLBigData.life_xml)
        do_test_write(tree, TestPhyloXMLBigData.life_test_xml)

        tree2 = do_test_next_tree(TestPhyloXMLBigData.life_test_xml)
        do_test_write(tree2, TestPhyloXMLBigData.life_roundtrip_xml)
      end
    end #if false

  end

end
|