bio-phyloxml 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.travis.yml +12 -0
- data/Gemfile +13 -0
- data/LICENSE.txt +20 -0
- data/README.md +199 -0
- data/README.rdoc +48 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/lib/bio-phyloxml.rb +12 -0
- data/lib/bio/phyloxml.rb +3 -0
- data/lib/bio/phyloxml/elements.rb +1166 -0
- data/lib/bio/phyloxml/parser.rb +1000 -0
- data/lib/bio/phyloxml/phyloxml.xsd +582 -0
- data/lib/bio/phyloxml/writer.rb +227 -0
- data/sample/test_phyloxml_big.rb +205 -0
- data/test/data/phyloxml/apaf.xml +666 -0
- data/test/data/phyloxml/bcl_2.xml +2097 -0
- data/test/data/phyloxml/made_up.xml +144 -0
- data/test/data/phyloxml/ncbi_taxonomy_mollusca_short.xml +65 -0
- data/test/data/phyloxml/phyloxml_examples.xml +415 -0
- data/test/helper.rb +25 -0
- data/test/unit/bio/test_phyloxml.rb +821 -0
- data/test/unit/bio/test_phyloxml_writer.rb +334 -0
- metadata +155 -0
@@ -0,0 +1,1000 @@
|
|
1
|
+
#
|
2
|
+
# = bio/phyloxml/parser.rb - PhyloXML parser
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2009
|
5
|
+
# Diana Jaunzeikare <latvianlinuxgirl@gmail.com>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
# $Id:$
|
9
|
+
#
|
10
|
+
# == Description
|
11
|
+
#
|
12
|
+
# This file contains the parser for PhyloXML.
|
13
|
+
#
|
14
|
+
# == Requirements
|
15
|
+
#
|
16
|
+
# Libxml2 XML parser is required. Install libxml-ruby bindings from
|
17
|
+
# http://libxml.rubyforge.org or
|
18
|
+
#
|
19
|
+
# gem install -r libxml-ruby
|
20
|
+
#
|
21
|
+
# == References
|
22
|
+
#
|
23
|
+
# * http://www.phyloxml.org
|
24
|
+
#
|
25
|
+
# * https://www.nescent.org/wg_phyloinformatics/PhyloSoC:PhyloXML_support_in_BioRuby
|
26
|
+
|
27
|
+
|
28
|
+
require 'uri'
|
29
|
+
require 'libxml'
|
30
|
+
|
31
|
+
require 'bio/tree'
|
32
|
+
|
33
|
+
|
34
|
+
module Bio
|
35
|
+
|
36
|
+
module PhyloXML
|
37
|
+
|
38
|
+
|
39
|
+
|
40
|
+
|
41
|
+
# == Description
|
42
|
+
#
|
43
|
+
# Bio::PhyloXML::Parser is for parsing phyloXML format files.
|
44
|
+
#
|
45
|
+
# == Requirements
|
46
|
+
#
|
47
|
+
# Libxml2 XML parser is required. Install libxml-ruby bindings from
|
48
|
+
# http://libxml.rubyforge.org or
|
49
|
+
#
|
50
|
+
# gem install -r libxml-ruby
|
51
|
+
#
|
52
|
+
# == Usage
|
53
|
+
#
|
54
|
+
# require 'bio'
|
55
|
+
#
|
56
|
+
# # Create new phyloxml parser
|
57
|
+
# phyloxml = Bio::PhyloXML::Parser.open('example.xml')
|
58
|
+
#
|
59
|
+
# # Print the names of all trees in the file
|
60
|
+
# phyloxml.each do |tree|
|
61
|
+
# puts tree.name
|
62
|
+
# end
|
63
|
+
#
|
64
|
+
#
|
65
|
+
# == References
|
66
|
+
#
|
67
|
+
# http://www.phyloxml.org/documentation/version_100/phyloxml.xsd.html
|
68
|
+
#
|
69
|
+
class Parser
|
70
|
+
|
71
|
+
include LibXML
|
72
|
+
|
73
|
+
# After parsing all the trees, if there is anything else in other xml format,
|
74
|
+
# it is saved in this array of PhyloXML::Other objects
|
75
|
+
attr_reader :other
|
76
|
+
|
77
|
+
# Opens a PhyloXML file with LibXML::Reader and advances to the first
# phylogeny element.
#
# Example: Create a new Bio::PhyloXML::Parser object.
#
#   p = Bio::PhyloXML::Parser.open("./phyloxml_examples.xml")
#
# When a block is given, the parser is passed to the block, closed when the
# block finishes (also when it raises), and the value of the block is
# returned.
#
# Example: Get the first tree in the file.
#
#   tree = Bio::PhyloXML::Parser.open("example.xml") do |px|
#     px.next_tree
#   end
#
# ---
# *Arguments*:
# * (required) _filename_: Path to the file to parse.
# * (optional) _validate_: Whether to validate the file against schema or not. Default value is true.
# *Returns*:: (without block) Bio::PhyloXML::Parser object
# *Returns*:: (with block) the value of the block
def self.open(filename, validate=true)
  parser = new(nil, validate)
  parser.instance_eval do
    # URI-like names are rewritten so the reader treats them as local paths.
    safe_name = _secure_filename(filename)
    _validate(:file, safe_name) if validate
    # XML::Parser::Options::NONET for security reason
    @reader = XML::Reader.file(safe_name,
                               { :options =>
                                 LibXML::XML::Parser::Options::NONET })
    _skip_leader
  end
  return parser unless block_given?

  begin
    yield parser
  ensure
    parser.close if parser and !parser.closed?
  end
end
|
123
|
+
|
124
|
+
# Opens the data at the given URI with LibXML::Reader and advances to the
# first phylogeny element.
#
# Create a new Bio::PhyloXML::Parser object.
#
#   p = Bio::PhyloXML::Parser.open_uri("http://www.phyloxml.org/examples/apaf.xml")
#
# When a block is given, the parser is passed to the block, closed when the
# block finishes (also when it raises), and the value of the block is
# returned.
#
# ---
# *Arguments*:
# * (required) _uri_: (URI or String) URI to the data to parse
# * (optional) _validate_: For URI reader, the "validate" option is ignored and no validation is executed.
# *Returns*:: (without block) Bio::PhyloXML::Parser object
# *Returns*:: (with block) the value of the block
def self.open_uri(uri, validate=true)
  if uri.kind_of?(URI)
    uri = uri.to_s
  else
    # raises error if not a String
    uri = uri.to_str
    # raises error if invalid URI
    URI.parse(uri)
  end

  parser = new(nil, validate)
  parser.instance_eval do
    @reader = XML::Reader.file(uri)
    _skip_leader
  end
  return parser unless block_given?

  begin
    yield parser
  ensure
    parser.close if parser and !parser.closed?
  end
end
|
168
|
+
|
169
|
+
# Placeholder that replaces the reader of a closed PhyloXML::Parser.
# Every method not defined on Object ends up in method_missing and raises
# LibXML::XML::Error.
#
# Bio::PhyloXML internal use only.
class ClosedPhyloXMLParser #:nodoc:
  # Rejects any call made on a closed parser's reader.
  def method_missing(*_args)
    raise LibXML::XML::Error, 'closed PhyloXML::Parser object'
  end
end #class ClosedPhyloXMLParser
|
178
|
+
|
179
|
+
# Closes the LibXML::Reader held by this parser and swaps in a
# ClosedPhyloXMLParser placeholder, so that any later use of the reader
# (including a second close) raises LibXML::XML::Error.
# ---
# *Returns*:: nil
def close
  # Close first: if this raises, the parser keeps its current reader.
  @reader.close
  @reader = ClosedPhyloXMLParser.new
  nil
end
|
192
|
+
|
193
|
+
# Tells whether the parser has been closed, i.e. whether its reader has
# been replaced by a ClosedPhyloXMLParser placeholder.
# ---
# *Returns*:: true or false
def closed?
  @reader.kind_of?(ClosedPhyloXMLParser)
end
|
204
|
+
|
205
|
+
# Wraps an IO object in a LibXML::Reader (network access disabled) and
# advances to the first phylogeny element.
#
# Create a new Bio::PhyloXML::Parser object.
#
#   p = Bio::PhyloXML::Parser.for_io($stdin)
#
# ---
# *Arguments*:
# * (required) _io_: IO object
# * (optional) _validate_: For IO reader, the "validate" option is ignored and no validation is executed.
# *Returns*:: Bio::PhyloXML::Parser object
def self.for_io(io, validate=true)
  parser = new(nil, validate)
  parser.instance_eval do
    reader_opts = { :options => LibXML::XML::Parser::Options::NONET }
    @reader = XML::Reader.io(io, reader_opts)
    _skip_leader
  end
  parser
end
|
227
|
+
|
228
|
+
# (private) Loads phyloxml.xsd from the directory of this source file and
# returns it as an XML::Schema. The file is re-read on every call.
def _schema
  xsd_path = File.join(File.dirname(__FILE__), 'phyloxml.xsd')
  xsd_document = XML::Document.file(xsd_path)
  XML::Schema.document(xsd_document)
end
|
232
|
+
private :_schema
|
233
|
+
|
234
|
+
# (private) Validates a document against the bundled phyloxml.xsd schema.
# Special files (e.g. named pipes) and unknown data types are silently
# skipped.
# ---
# *Arguments*:
# * (required) <em>data_type</em>: :file for filename, :string for string
# * (required) _arg_: filename or string
# *Returns*:: (undefined)
# *Raises*:: RuntimeError when validation fails or an XML error occurs
def _validate(data_type, arg)
  quiet_offline = LibXML::XML::Parser::Options::NOERROR |   # no error messages
                  LibXML::XML::Parser::Options::NOWARNING | # no warning messages
                  LibXML::XML::Parser::Options::NONET       # no network access
  options = { :options => quiet_offline }

  case data_type
  when :file
    # No validation when special file e.g. FIFO (named pipe)
    return unless File.file?(arg)
    xml_instance = XML::Document.file(arg, options)
  when :string
    xml_instance = XML::Document.string(arg, options)
  else
    # no validation for unknown data type
    return
  end

  # Loaded outside the begin/rescue: schema loading errors propagate as-is.
  schema = _schema
  begin
    valid = xml_instance.validate_schema(schema) do |msg, _error_flag|
      # The document of libxml-ruby says that the block is called
      # when validation failed, but it seems it is never called
      # even when validation failed!
      raise "Validation of the XML document against phyloxml.xsd schema failed. #{msg}"
    end
  rescue LibXML::XML::Error => evar
    raise "Validation of the XML document against phyloxml.xsd schema failed, or XML error occurred. #{evar.message}"
  end

  raise "Validation of the XML document against phyloxml.xsd schema failed." unless valid
end
|
273
|
+
private :_validate
|
274
|
+
|
275
|
+
# (private) Defuses URI-like file names.
# It seems that LibXML::XML::Reader reads from the network even if
# LibXML::XML::Parser::Options::NONET is set, so the first '://' of a name
# starting with "letters://" is replaced with ':/'.
# For example, "http://a/b" is changed to "http:/a/b".
# Any other name is returned unchanged.
def _secure_filename(filename)
  return filename unless /\A[a-zA-Z]+\:\/\// =~ filename

  filename.sub(/\:\/\//, ':/')
end
|
286
|
+
private :_secure_filename
|
287
|
+
|
288
|
+
# (private) Advances the reader until it stands on the first <phylogeny>
# start tag.
#
# The scan also stops at the closing </phyloxml> tag or when the reader
# reports that no more nodes are available. Without these two stops a
# document containing no phylogeny element would keep this method polling
# an exhausted reader forever.
# ---
# *Returns*:: nil
def _skip_leader
  # Have to leave this way, if accepting strings, instead of files
  until is_element?('phylogeny') || is_end_element?('phyloxml')
    # read returns false once the whole document has been consumed
    break unless @reader.read
  end
  nil
end
|
295
|
+
private :_skip_leader
|
296
|
+
|
297
|
+
# Initializes LibXML::Reader and reads the PhyloXML-formatted string
# until it reaches the first phylogeny element.
#
# Create a new Bio::PhyloXML::Parser object.
#
#   str = File.read("./phyloxml_examples.xml")
#   p = Bio::PhyloXML::Parser.new(str)
#
#
# Deprecated usage: Reads data from a file. <em>str</em> is a filename.
#
#   p = Bio::PhyloXML::Parser.new("./phyloxml_examples.xml")
#
# Taking filename is deprecated. Use Bio::PhyloXML::Parser.open(filename).
#
# When _str_ is nil no reader is created; the class-level constructors
# (open, open_uri, for_io) use this and install the reader themselves.
#
# ---
# *Arguments*:
# * (required) _str_: PhyloXML-formatted string
# * (optional) _validate_: Whether to validate against the schema. Default value is true. In this method it is only applied to the deprecated filename form; string input is not validated.
# *Returns*:: Bio::PhyloXML::Parser object
def initialize(str, validate=true)

  @other = []

  return unless str

  # For compatibility, if filename-like string is given,
  # treat it as a filename.
  if /[\<\>\r\n]/ !~ str and File.exist?(str) then
    # assume that str is filename
    warn "Bio::PhyloXML::Parser.new(filename) is deprecated. Use Bio::PhyloXML::Parser.open(filename)."
    filename = _secure_filename(str)
    _validate(:file, filename) if validate
    # NONET for security reason, consistent with Parser.open
    @reader = XML::Reader.file(filename,
                               { :options =>
                                 LibXML::XML::Parser::Options::NONET })
    _skip_leader
    return
  end

  # initialize for string
  @reader = XML::Reader.string(str,
                               { :options =>
                                 LibXML::XML::Parser::Options::NONET })
  _skip_leader
end
|
341
|
+
|
342
|
+
|
343
|
+
# Iterate through all trees in the file.
#
#  phyloxml = Bio::PhyloXML::Parser.open('example.xml')
#  phyloxml.each do |tree|
#    puts tree.name
#  end
#
# Iteration stops at the first nil returned by next_tree. Trees are read
# from the underlying stream, so they can only be iterated once.
#
# ---
# *Returns*:: nil (with block), an Enumerator (without block)
def each
  return enum_for(:each) unless block_given?

  while tree = next_tree
    yield tree
  end
end
|
355
|
+
|
356
|
+
# Access the specified tree in the file by calling next_tree (i + 1)
# times and returning the last result. The count starts from the
# parser's current position, and the trees passed over are consumed.
#
#  # Get 3rd tree in the file (starts counting from 0).
#  parser = PhyloXML::Parser.open('phyloxml_examples.xml')
#  tree = parser[2]
#
def [](i)
  latest = nil
  (i + 1).times { latest = next_tree }
  latest
end
|
370
|
+
|
371
|
+
# Parse and return the next phylogeny tree. If there are no more phylogeny
# elements, nil is returned. If the reader stands on something other than
# a phylogeny element, that element is saved in PhyloXML::Parser#other and
# nil is returned for this call.
#
#  p = Bio::PhyloXML::Parser.open("./phyloxml_examples.xml")
#  tree = p.next_tree
#
# ---
# *Returns*:: Bio::PhyloXML::Tree, or nil
def next_tree()

  # The document is exhausted: nothing left to parse. Without this check an
  # exhausted reader would be handed to parse_other, which waits for an end
  # tag that never comes.
  return nil if @reader.node_type == XML::Reader::TYPE_NONE

  unless is_element?('phylogeny')
    if @reader.node_type == XML::Reader::TYPE_END_ELEMENT
      return nil if is_end_element?('phyloxml')
      # Standing on the end tag of a previously parsed "other" element:
      # move to the next tag and see whether the document is finished.
      _skip_to_next_tag
      return nil if is_end_element?('phyloxml')
      return nil if @reader.node_type == XML::Reader::TYPE_NONE
    end
    # phyloxml can hold only phylogeny and "other" elements. If this is not
    # phylogeny element then it is other. Also, "other" always comes after
    # all phylogenies
    @other << parse_other
    #return nil for tree, since this is not valid phyloxml tree.
    return nil
  end

  tree = Bio::PhyloXML::Tree.new

  # keep track of current node in clades array/stack. Current node is the
  # last element in the clades array
  clades = []
  clades.push tree

  #keep track of current edge to be able to parse branch_length tag
  current_edge = nil

  # we are going to parse clade iteratively by pointing (and changing) to
  # the current node in the tree. Since the property element is both in
  # clade and in the phylogeny, we need some boolean to know if we are
  # parsing the clade (there can be only max 1 clade in phylogeny) or
  # parsing phylogeny
  parsing_clade = false

  until is_end_element?('phylogeny')
    break if is_end_element?('phyloxml')

    # parse phylogeny elements, except clade
    unless parsing_clade

      if is_element?('phylogeny')
        tree.rooted = (@reader["rooted"] == "true")
        tree.rerootable = (@reader["rerootable"] == "true")
        parse_attributes(tree, ["branch_length_unit", 'type'])
      end

      parse_simple_elements(tree, [ "name", 'description', "date"])

      if is_element?('confidence')
        tree.confidences << parse_confidence
      end

    end

    if @reader.node_type == XML::Reader::TYPE_ELEMENT
      case @reader.name
      when 'clade'
        #parse clade element

        parsing_clade = true

        node = Bio::PhyloXML::Node.new

        branch_length = @reader['branch_length']

        parse_attributes(node, ["id_source"])

        #add new node to the tree
        tree.add_node(node)
        # The first clade will always be root since by xsd schema phyloxml can
        # have 0 to 1 clades in it.
        if tree.root.nil?
          tree.root = node
        else
          current_edge = tree.add_edge(clades[-1], node,
                                       Bio::Tree::Edge.new(branch_length))
        end
        clades.push node
        #end if clade element
      else
        parse_clade_elements(clades[-1], current_edge) if parsing_clade
      end
    end

    #end clade element, go one parent up
    if is_end_element?('clade')

      #if we have reached the closing tag of the top-most clade, then our
      # current node should point to the root. If that is the case, we are
      # done parsing the clade element
      if clades[-1] == tree.root
        parsing_clade = false
      else
        # set current node (clades[-1]) to the previous clade in the array
        clades.pop
      end
    end

    #parsing phylogeny elements
    unless parsing_clade

      if @reader.node_type == XML::Reader::TYPE_ELEMENT
        case @reader.name
        when 'property'
          tree.properties << parse_property

        when 'clade_relation'
          clade_relation = CladeRelation.new
          parse_attributes(clade_relation, ["id_ref_0", "id_ref_1", "distance", "type"])

          #@ add unit test for this
          if not @reader.empty_element?
            @reader.read
            if is_element?('confidence')
              clade_relation.confidence = parse_confidence
            end
          end
          tree.clade_relations << clade_relation

        when 'sequence_relation'
          sequence_relation = SequenceRelation.new
          parse_attributes(sequence_relation, ["id_ref_0", "id_ref_1", "distance", "type"])
          if not @reader.empty_element?
            @reader.read
            if is_element?('confidence')
              sequence_relation.confidence = parse_confidence
            end
          end
          tree.sequence_relations << sequence_relation
        when 'phylogeny'
          #do nothing
        else
          tree.other << parse_other
          #puts "Not recognized element. #{@reader.name}"
        end
      end
    end
    # go to next element; stop if the reader has run out of nodes
    break unless @reader.read
  end #end until </phylogeny>

  # Move past </phylogeny> to the next start or end tag. The number of
  # nodes in between depends on the whitespace in the document, so they are
  # skipped by type rather than by a fixed count of reads.
  _skip_to_next_tag

  return tree
end

# (private) Advances the reader to the next start tag or end tag, skipping
# any other node type. Stops quietly when the reader has no more nodes.
# ---
# *Returns*:: nil
def _skip_to_next_tag
  while @reader.read
    type = @reader.node_type
    break if type == XML::Reader::TYPE_ELEMENT || type == XML::Reader::TYPE_END_ELEMENT
  end
  nil
end
private :_skip_to_next_tag
|
533
|
+
|
534
|
+
# return tree of specified name.
|
535
|
+
# @todo Implement this method.
|
536
|
+
# def get_tree_by_name(name)
|
537
|
+
|
538
|
+
# while not is_end_element?('phyloxml')
|
539
|
+
# if is_element?('phylogeny')
|
540
|
+
# @reader.read
|
541
|
+
# @reader.read
|
542
|
+
#
|
543
|
+
# if is_element?('name')
|
544
|
+
# @reader.read
|
545
|
+
# if @reader.value == name
|
546
|
+
# puts "equasl"
|
547
|
+
# tree = next_tree
|
548
|
+
# puts tree
|
549
|
+
# end
|
550
|
+
# end
|
551
|
+
# end
|
552
|
+
# @reader.read
|
553
|
+
# end
|
554
|
+
#
|
555
|
+
# end
|
556
|
+
|
557
|
+
|
558
|
+
private
|
559
|
+
|
560
|
+
####
|
561
|
+
# Utility methods
|
562
|
+
###
|
563
|
+
|
564
|
+
# True when the reader stands on a start tag whose name equals +str+.
def is_element?(str)
  on_start_tag = (@reader.node_type == XML::Reader::TYPE_ELEMENT)
  on_start_tag && @reader.name == str
end
|
567
|
+
|
568
|
+
# True when the reader stands on an end tag whose name equals +str+.
def is_end_element?(str)
  on_end_tag = (@reader.node_type == XML::Reader::TYPE_END_ELEMENT)
  on_end_tag && @reader.name == str
end
|
571
|
+
|
572
|
+
# Sanity check used after reading an element's content: raises a
# RuntimeError unless the reader now stands on the end tag named +str+.
# Returns nil otherwise.
def has_reached_end_element?(str)
  raise "Warning: Should have reached </#{str}> element here" unless is_end_element?(str)
end
|
577
|
+
|
578
|
+
# Parses a simple XML element, for example <speciations>1</speciations>.
# If the reader stands on the start tag +name+, the following node's value
# is assigned through object.name= (here: object.speciations = "1"), and
# the node after that must be the matching end tag, otherwise an error is
# raised. If the reader is not on that start tag, nothing is consumed.
def parse_simple_element(object, name)
  return unless is_element?(name)

  @reader.read                       # move onto the element's content
  setter = "#{name}="
  object.send(setter, @reader.value)
  @reader.read                       # move onto the end tag
  has_reached_end_element?(name)
end
|
590
|
+
|
591
|
+
# Tries parse_simple_element for each element name in +elements+, in order.
# Returns +elements+.
def parse_simple_elements(object, elements)
  elements.each { |tag_name| parse_simple_element(object, tag_name) }
end
|
596
|
+
|
597
|
+
# Copies XML attributes of the current node onto +object+.
# For every name in +arr_of_attrs+ it performs the equivalent of
#   object.type = @reader["type"]
# Returns +arr_of_attrs+.
def parse_attributes(object, arr_of_attrs)
  arr_of_attrs.each do |attr_name|
    attr_value = @reader[attr_name]
    object.send("#{attr_name}=", attr_value)
  end
end
|
604
|
+
|
605
|
+
# Handles one child element of a <clade>. The reader must stand on the
# child's start tag; nothing happens for any other node type. There is no
# loop here: the caller advances the reader and calls this once per node.
#
# *Arguments*:
# * _current_node_: node that receives the parsed data
# * _current_edge_: edge leading to current_node, or nil (used for branch_length)
def parse_clade_elements(current_node, current_edge)
  return unless @reader.node_type == XML::Reader::TYPE_ELEMENT

  case @reader.name
  when 'branch_length'
    # @todo add unit test for this. current_edge is nil, if the root clade
    # has branch_length attribute.
    @reader.read
    branch_length = @reader.value
    current_edge.distance = branch_length.to_f if current_edge != nil
    @reader.read
  when 'width'
    @reader.read
    current_node.width = @reader.value
    @reader.read
  when 'name'
    @reader.read
    current_node.name = @reader.value
    @reader.read
  when 'events'
    current_node.events = parse_events
  when 'confidence'
    current_node.confidences << parse_confidence
  when 'sequence'
    current_node.sequences << parse_sequence
  when 'property'
    current_node.properties << parse_property
  when 'taxonomy'
    current_node.taxonomies << parse_taxonomy
  when 'distribution'
    current_node.distributions << parse_distribution
  when 'node_id'
    id = Id.new
    id.type = @reader["type"]
    @reader.read
    id.value = @reader.value
    @reader.read
    #has_reached_end_element?('node_id')
    #@todo write unit test for this. There is no example of this in the example files
    current_node.id = id
  when 'color'
    color = BranchColor.new
    # The reader stands on <color> itself, so step inside and visit every
    # child up to </color>; parse_simple_element only acts when the reader
    # is on the start tag it is asked for.
    unless @reader.empty_element?
      @reader.read
      until is_end_element?('color')
        parse_simple_elements(color, ['red', 'green', 'blue'])
        break unless @reader.read
      end
    end
    current_node.color = color
    #@todo add unit test for this
  when 'date'
    date = Date.new
    date.unit = @reader["unit"]
    unless @reader.empty_element?
      # Step inside <date>. Only one read here: the loop below skips any
      # whitespace node by itself, so a document without whitespace between
      # tags does not lose the first child.
      @reader.read
      until is_end_element?('date')
        parse_simple_element(date, 'desc')
        parse_simple_element(date, 'value')
        parse_simple_element(date, 'minimum')
        parse_simple_element(date, 'maximum')
        break unless @reader.read
      end
    end
    current_node.date = date
  when 'reference'
    reference = Reference.new()
    reference.doi = @reader['doi']
    if not @reader.empty_element?
      while not is_end_element?('reference')
        parse_simple_element(reference, 'desc')
        @reader.read
      end
    end
    current_node.references << reference
  when 'binary_characters'
    current_node.binary_characters = parse_binary_characters
  when 'clade'
    #do nothing
  else
    current_node.other << parse_other
    #puts "No match found in parse_clade_elements.(#{@reader.name})"
  end
end #parse_clade_elements
|
690
|
+
|
691
|
+
# Parses an <events> element. The reader must stand on its start tag and is
# left on </events>.
# *Returns*:: PhyloXML::Events
def parse_events
  result = PhyloXML::Events.new
  simple_fields = ['type', 'duplications', 'speciations', 'losses']

  @reader.read #go to next element
  until is_end_element?('events')
    parse_simple_elements(result, simple_fields)
    #@todo could add unit test for this (example file does not have this case)
    result.confidence = parse_confidence if is_element?('confidence')
    @reader.read
  end
  result
end #parse_events
|
705
|
+
|
706
|
+
# Parses a <taxonomy> element. The reader must stand on its start tag and
# is left on </taxonomy>. Unrecognized child elements are stored in the
# taxonomy's "other" list.
# *Returns*:: PhyloXML::Taxonomy
def parse_taxonomy
  taxon = PhyloXML::Taxonomy.new
  parse_attributes(taxon, ["id_source"])

  @reader.read
  until is_end_element?('taxonomy')
    if @reader.node_type == XML::Reader::TYPE_ELEMENT
      tag = @reader.name
      case tag
      when 'code', 'scientific_name', 'rank', 'authority'
        # plain text fields: <tag>text</tag> is stored through taxon.tag=
        @reader.read
        taxon.send("#{tag}=", @reader.value)
        @reader.read
      when 'id'
        taxon.taxonomy_id = parse_id('id')
      when 'common_name'
        @reader.read
        taxon.common_names << @reader.value
        @reader.read
        #has_reached_end_element?('common_name')
      when 'synonym'
        @reader.read
        taxon.synonyms << @reader.value
        @reader.read
        #has_reached_end_element?('synonym')
      when 'uri'
        taxon.uri = parse_uri
      else
        taxon.other << parse_other
      end
    end

    @reader.read #move to next tag in the loop
  end
  taxon
end #parse_taxonomy
|
753
|
+
|
754
|
+
private
|
755
|
+
|
756
|
+
# Parses a <sequence> element. The reader must stand on its start tag and
# is left on </sequence>. Unrecognized child elements are stored in the
# sequence's "other" list.
# *Returns*:: Sequence
def parse_sequence
  sequence = Sequence.new
  parse_attributes(sequence, ["type", "id_source", "id_ref"])

  @reader.read
  until is_end_element?('sequence')

    if @reader.node_type == XML::Reader::TYPE_ELEMENT
      tag = @reader.name
      case tag
      when 'symbol', 'name', 'location'
        # plain text fields: <tag>text</tag> is stored through sequence.tag=
        @reader.read
        sequence.send("#{tag}=", @reader.value)
        @reader.read
      when 'mol_seq'
        sequence.is_aligned = @reader["is_aligned"]
        @reader.read
        sequence.mol_seq = @reader.value
        @reader.read
        has_reached_end_element?('mol_seq')
      when 'accession'
        sequence.accession = Accession.new
        sequence.accession.source = @reader["source"]
        @reader.read
        sequence.accession.value = @reader.value
        @reader.read
        has_reached_end_element?('accession')
      when 'uri'
        sequence.uri = parse_uri
      when 'annotation'
        sequence.annotations << parse_annotation
      when 'domain_architecture'
        sequence.domain_architecture = DomainArchitecture.new
        sequence.domain_architecture.length = @reader["length"]
        unless @reader.empty_element?
          @reader.read
          # Find each <domain> by its tag rather than by counting reads, so
          # the result does not depend on whitespace between the tags.
          until is_end_element?('domain_architecture')
            if is_element?('domain')
              # parse_domain leaves the reader one node past </domain>,
              # so no extra read here.
              sequence.domain_architecture.domains << parse_domain
            else
              break unless @reader.read
            end
          end
        end
      else
        sequence.other << parse_other
        #@todo add unit test
      end
    end

    @reader.read
  end
  return sequence
end #parse_sequence
|
813
|
+
|
814
|
+
# Parses a <uri> element: its desc and type attributes plus its text,
# which is stored through Uri#uri=.
# *Returns*:: Uri
def parse_uri
  link = Uri.new
  parse_attributes(link, ["desc", "type"])
  parse_simple_element(link, 'uri')
  link
end
|
820
|
+
|
821
|
+
# Parses an <annotation> element: its ref, source, evidence and type
# attributes and, unless the element is empty, its desc, confidence,
# property and uri children. Left on </annotation> for a non-empty element.
# *Returns*:: Annotation
def parse_annotation
  annotation = Annotation.new

  parse_attributes(annotation, ['ref', 'source', 'evidence', 'type'])
  return annotation if @reader.empty_element?

  until is_end_element?('annotation')
    parse_simple_element(annotation, 'desc') if is_element?('desc')
    annotation.confidence = parse_confidence if is_element?('confidence')
    annotation.properties << parse_property if is_element?('property')
    annotation.uri = parse_uri if is_element?('uri')

    @reader.read
  end
  annotation
end
|
844
|
+
|
845
|
+
# Parses a <property> element: its attributes and the value of the node
# following the start tag. Raises unless the node after that is
# </property>.
# *Returns*:: Property
def parse_property
  prop = Property.new
  parse_attributes(prop, ["ref", "unit", "datatype", "applies_to", "id_ref"])

  @reader.read                 # onto the content
  prop.value = @reader.value
  @reader.read                 # onto the end tag
  has_reached_end_element?('property')
  prop
end #parse_property
|
854
|
+
|
855
|
+
# Parses a <confidence> element: its type attribute and the following
# node's value converted with to_f. Raises unless the node after that is
# </confidence>.
# *Returns*:: Confidence
def parse_confidence
  kind = @reader["type"]

  @reader.read                   # onto the content
  score = @reader.value.to_f
  @reader.read                   # onto the end tag
  has_reached_end_element?('confidence')

  Confidence.new(kind, score)
end #parse_confidence
|
863
|
+
|
864
|
+
# Parses a <distribution> element with its desc, point and polygon
# children. Left on </distribution>.
# *Returns*:: Distribution
def parse_distribution
  area = Distribution.new

  @reader.read
  until is_end_element?('distribution')
    parse_simple_element(area, 'desc')

    area.points << parse_point if is_element?('point')
    area.polygons << parse_polygon if is_element?('polygon')

    @reader.read
  end
  area
end #parse_distribution
|
878
|
+
|
879
|
+
# Parses a <point> element: the geodetic_datum and alt_unit attributes and
# the lat, long and alt children (alt is converted with to_f).
# Left on </point>.
# *Returns*:: Point
def parse_point
  point = Point.new
  parse_attributes(point, ["geodetic_datum", "alt_unit"])

  @reader.read
  until is_end_element?('point')
    parse_simple_elements(point, ['lat', 'long'])

    if is_element?('alt')
      @reader.read
      point.alt = @reader.value.to_f
      @reader.read
      has_reached_end_element?('alt')
    end

    #advance reader
    @reader.read
  end
  point
end #parse_point
|
901
|
+
|
902
|
+
# Parses a <polygon> element by collecting its <point> children.
# Left on </polygon>. A polygon with fewer than 3 points is still
# returned, but a warning is written to standard error.
# *Returns*:: Polygon
def parse_polygon
  polygon = Polygon.new

  @reader.read
  until is_end_element?('polygon')
    polygon.points << parse_point if is_element?('point')
    @reader.read
  end

  #@todo should check for it at all? Probably not if xml is valid.
  # warn (stderr) rather than puts, so that library diagnostics do not end
  # up in the program's standard output.
  if polygon.points.length < 3
    warn "Warning: <polygon> should have at least 3 points"
  end
  polygon
end #parse_polygon
|
916
|
+
|
917
|
+
# Parses an id-like element named +tag_name+: its provider attribute and
# the value of the node following the start tag. Raises unless the node
# after that is the matching end tag.
# *Returns*:: Id
def parse_id(tag_name)
  identifier = Id.new
  identifier.provider = @reader["provider"]

  @reader.read                     # onto the content
  identifier.value = @reader.value
  @reader.read                     # onto the end tag  #@todo shouldn't there be another read?
  has_reached_end_element?(tag_name)
  identifier
end #parse_id
|
926
|
+
|
927
|
+
# Parses a <domain> element: its from, to, confidence and id attributes
# and the value of the node following the start tag. Unlike most parse_*
# methods it reads once more after </domain>, leaving the reader one node
# past the end tag.
# *Returns*:: ProteinDomain
def parse_domain
  protein_domain = ProteinDomain.new
  parse_attributes(protein_domain, ["from", "to", "confidence", "id"])

  @reader.read                        # onto the content
  protein_domain.value = @reader.value
  @reader.read                        # onto the end tag
  has_reached_end_element?('domain')
  @reader.read                        # one node past </domain>
  protein_domain
end
|
937
|
+
|
938
|
+
# Parses a <binary_characters> element: the type attribute (stored as
# bc_type), the four *_count attributes and, unless the element is empty,
# the lost/gained/absent/present lists. Left on </binary_characters> for a
# non-empty element.
# *Returns*:: PhyloXML::BinaryCharacters
def parse_binary_characters
  chars = PhyloXML::BinaryCharacters.new
  chars.bc_type = @reader['type']

  parse_attributes(chars, ['gained_count', 'absent_count', 'lost_count', 'present_count'])
  return chars if @reader.empty_element?

  @reader.read
  until is_end_element?('binary_characters')
    ['lost', 'gained', 'absent', 'present'].each do |state|
      parse_bc(chars, state)
    end

    @reader.read
  end
  chars
end #parse_binary_characters
|
957
|
+
|
958
|
+
# Parses one list of <bc> entries. If the reader stands on the start tag
# +element+ (e.g. 'lost'), the text of each <bc> child is appended to
# object.send(element) and the reader is left on the matching end tag.
# Does nothing when the reader is on any other node.
def parse_bc(object, element)
  return unless is_element?(element)

  @reader.read
  until is_end_element?(element)
    if is_element?('bc')
      @reader.read
      object.send(element) << @reader.value
      @reader.read
      has_reached_end_element?('bc')
    end
    @reader.read
  end
end #parse_bc
|
972
|
+
|
973
|
+
# Parses an element that is not part of the known phyloXML vocabulary into
# a PhyloXML::Other object: element name, attributes, text value and
# (recursively) child elements.
#
# The reader must stand on the element's start tag. For a non-empty element
# it is left on the matching end tag; for an empty element (<x/>), which
# has no end tag, it is left on the element itself.
#
# *Returns*:: PhyloXML::Other
# *Raises*:: RuntimeError if the input ends before the end tag is found
def parse_other
  other_obj = PhyloXML::Other.new
  other_obj.element_name = @reader.name

  # Remember these before visiting the attributes, because afterwards the
  # reader is positioned on an attribute node rather than on the element.
  depth = @reader.depth
  empty = @reader.empty_element?

  #parse attributes
  code = @reader.move_to_first_attribute
  while code == 1
    other_obj.attributes[@reader.name] = @reader.value
    code = @reader.move_to_next_attribute
  end

  if empty
    # No content and no end tag to wait for.
    @reader.move_to_element
    return other_obj
  end

  loop do
    # read returns false at the end of input: stop instead of spinning
    break unless @reader.read

    case @reader.node_type
    when XML::Reader::TYPE_ELEMENT
      other_obj.children << parse_other #recursive call to parse children
    when XML::Reader::TYPE_TEXT
      other_obj.value = @reader.value
    when XML::Reader::TYPE_END_ELEMENT
      # Compare the depth as well, so that the end tag of a nested element
      # with the same name is not mistaken for our own.
      break if @reader.name == other_obj.element_name && @reader.depth == depth
    end
  end

  #just a check
  has_reached_end_element?(other_obj.element_name)
  return other_obj
end #parse_other
|
995
|
+
|
996
|
+
end #class phyloxmlParser
|
997
|
+
|
998
|
+
end #module PhyloXML
|
999
|
+
|
1000
|
+
end #module Bio
|