bio-phyloxml 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,1000 @@
1
+ #
2
+ # = bio/db/phyloxml_parser.rb - PhyloXML parser
3
+ #
4
+ # Copyright:: Copyright (C) 2009
5
+ # Diana Jaunzeikare <latvianlinuxgirl@gmail.com>
6
+ # License:: The Ruby License
7
+ #
8
+ # $Id:$
9
+ #
10
+ # == Description
11
+ #
12
+ # This file contains a parser for PhyloXML.
13
+ #
14
+ # == Requirements
15
+ #
16
+ # Libxml2 XML parser is required. Install libxml-ruby bindings from
17
+ # http://libxml.rubyforge.org or
18
+ #
19
+ # gem install -r libxml-ruby
20
+ #
21
+ # == References
22
+ #
23
+ # * http://www.phyloxml.org
24
+ #
25
+ # * https://www.nescent.org/wg_phyloinformatics/PhyloSoC:PhyloXML_support_in_BioRuby
26
+
27
+
28
+ require 'uri'
29
+ require 'libxml'
30
+
31
+ require 'bio/tree'
32
+
33
+
34
+ module Bio
35
+
36
+ module PhyloXML
37
+
38
+
39
+
40
+
41
+ # == Description
42
+ #
43
+ # Bio::PhyloXML::Parser is for parsing phyloXML format files.
44
+ #
45
+ # == Requirements
46
+ #
47
+ # Libxml2 XML parser is required. Install libxml-ruby bindings from
48
+ # http://libxml.rubyforge.org or
49
+ #
50
+ # gem install -r libxml-ruby
51
+ #
52
+ # == Usage
53
+ #
54
+ # require 'bio'
55
+ #
56
+ # # Create new phyloxml parser
57
+ # phyloxml = Bio::PhyloXML::Parser.open('example.xml')
58
+ #
59
+ # # Print the names of all trees in the file
60
+ # phyloxml.each do |tree|
61
+ # puts tree.name
62
+ # end
63
+ #
64
+ #
65
+ # == References
66
+ #
67
+ # http://www.phyloxml.org/documentation/version_100/phyloxml.xsd.html
68
+ #
69
+ class Parser
70
+
71
+ include LibXML
72
+
73
+ # After parsing all the trees, if there is anything else in other xml format,
74
+ # it is saved in this array of PhyloXML::Other objects
75
+ attr_reader :other
76
+
77
+ # Initializes LibXML::Reader and reads the file until it reaches the first
78
+ # phylogeny element.
79
+ #
80
+ # Example: Create a new Bio::PhyloXML::Parser object.
81
+ #
82
+ # p = Bio::PhyloXML::Parser.open("./phyloxml_examples.xml")
83
+ #
84
+ # If the optional code block is given, Bio::PhyloXML object is passed to
85
+ # the block as an argument. When the block terminates, the Bio::PhyloXML
86
+ # object is automatically closed, and the open method returns the value
87
+ # of the block.
88
+ #
89
+ # Example: Get the first tree in the file.
90
+ #
91
+ # tree = Bio::PhyloXML::Parser.open("example.xml") do |px|
92
+ # px.next_tree
93
+ # end
94
+ #
95
+ # ---
96
+ # *Arguments*:
97
+ # * (required) _filename_: Path to the file to parse.
98
+ # * (optional) _validate_: Whether to validate the file against schema or not. Default value is true.
99
+ # *Returns*:: (without block) Bio::PhyloXML::Parser object
100
+ # *Returns*:: (with block) the value of the block
101
def self.open(filename, validate=true)
  parser = new(nil, validate)
  parser.instance_eval do
    path = _secure_filename(filename)
    _validate(:file, path) if validate
    # NONET is passed for security reasons (no network access).
    reader_opts = { :options => LibXML::XML::Parser::Options::NONET }
    @reader = XML::Reader.file(path, reader_opts)
    _skip_leader
  end

  # Without a block the caller owns the parser and must close it.
  return parser unless block_given?

  # With a block the parser is closed on the way out, even when the
  # block raises; the value of the block is the return value.
  begin
    yield parser
  ensure
    parser.close if parser and !parser.closed?
  end
end
123
+
124
+ # Initializes LibXML::Reader and reads the file until it reaches the first
125
+ # phylogeny element.
126
+ #
127
+ # Create a new Bio::PhyloXML::Parser object.
128
+ #
129
+ # p = Bio::PhyloXML::Parser.open_uri("http://www.phyloxml.org/examples/apaf.xml")
130
+ #
131
+ # If the optional code block is given, Bio::PhyloXML object is passed to
132
+ # the block as an argument. When the block terminates, the Bio::PhyloXML
133
+ # object is automatically closed, and the open_uri method returns the
134
+ # value of the block.
135
+ #
136
+ # ---
137
+ # *Arguments*:
138
+ # * (required) _uri_: (URI or String) URI to the data to parse
139
+ # * (optional) _validate_: For URI reader, the "validate" option is ignored and no validation is executed.
140
+ # *Returns*:: (without block) Bio::PhyloXML::Parser object
141
+ # *Returns*:: (with block) the value of the block
142
def self.open_uri(uri, validate=true)
  location =
    if URI === uri
      uri.to_s
    else
      # to_str raises unless the argument is String-like;
      # URI.parse raises when the text is not a valid URI.
      text = uri.to_str
      URI.parse(text)
      text
    end

  parser = new(nil, validate)
  parser.instance_eval do
    @reader = XML::Reader.file(location)
    _skip_leader
  end

  return parser unless block_given?

  # Block form: hand the parser to the block, always close it afterwards,
  # and return the value of the block.
  begin
    yield parser
  ensure
    parser.close if parser and !parser.closed?
  end
end
168
+
169
+ # Special class for closed PhyloXML::Parser object.
170
+ # It raises error for any methods except essential methods.
171
+ #
172
+ # Bio::PhyloXML internal use only.
173
class ClosedPhyloXMLParser #:nodoc:
  # Any method that is not otherwise defined on this object is rejected
  # with an error saying the parser is closed.
  def method_missing(*_ignored)
    message = 'closed PhyloXML::Parser object'
    raise LibXML::XML::Error, message
  end
end #class ClosedPhyloXMLParser
178
+
179
+ # Closes the LibXML::Reader inside the object.
180
+ # It also closes the opened file if it is created by using
181
+ # Bio::PhyloXML::Parser.open method.
182
+ #
183
+ # When closed object is closed again, or closed object is used,
184
+ # it raises LibXML::XML::Error.
185
+ # ---
186
+ # *Returns*:: nil
187
def close
  # Close whatever @reader currently is, then swap in a
  # ClosedPhyloXMLParser placeholder so closed? can detect the state.
  current_reader = @reader
  current_reader.close
  @reader = ClosedPhyloXMLParser.new
  nil
end
192
+
193
+ # If the object is closed by using the close method or equivalent,
194
+ # returns true. Otherwise, returns false.
195
+ # ---
196
+ # *Returns*:: true or false
197
def closed?
  # close replaces @reader with a ClosedPhyloXMLParser instance, so the
  # class of @reader tells whether the parser has been closed.
  @reader.kind_of?(ClosedPhyloXMLParser) ? true : false
end
204
+
205
+ # Initializes LibXML::Reader and reads from the IO until it reaches
206
+ # the first phylogeny element.
207
+ #
208
+ # Create a new Bio::PhyloXML::Parser object.
209
+ #
210
+ # p = Bio::PhyloXML::Parser.for_io($stdin)
211
+ #
212
+ # ---
213
+ # *Arguments*:
214
+ # * (required) _io_: IO object
215
+ # * (optional) _validate_: For IO reader, the "validate" option is ignored and no validation is executed.
216
+ # *Returns*:: Bio::PhyloXML::Parser object
217
def self.for_io(io, validate=true)
  parser = new(nil, validate)
  parser.instance_eval do
    # Same NONET parser option that the file-based reader uses.
    reader_opts = { :options => LibXML::XML::Parser::Options::NONET }
    @reader = XML::Reader.io(io, reader_opts)
    _skip_leader
  end
  parser
end
227
+
228
+ # (private) returns PhyloXML schema
229
def _schema
  # phyloxml.xsd is looked up in the directory that contains this file.
  xsd_path = File.join(File.dirname(__FILE__), 'phyloxml.xsd')
  xsd_document = XML::Document.file(xsd_path)
  XML::Schema.document(xsd_document)
end
232
+ private :_schema
233
+
234
+ # (private) do validation
235
+ # ---
236
+ # *Arguments*:
237
+ # * (required) <em>data_type</em>: :file for filename, :string for string
238
+ # * (required) _arg_: filename or string
239
+ # *Returns*:: (undefined)
240
def _validate(data_type, arg)
  quiet_offline =
    LibXML::XML::Parser::Options::NOERROR |   # no error messages
    LibXML::XML::Parser::Options::NOWARNING | # no warning messages
    LibXML::XML::Parser::Options::NONET       # no network access
  options = { :options => quiet_offline }

  if :file == data_type
    # No validation when special file e.g. FIFO (named pipe)
    return unless File.file?(arg)
    document = XML::Document.file(arg, options)
  elsif :string == data_type
    document = XML::Document.string(arg, options)
  else
    # no validation for unknown data type
    return
  end

  schema = _schema
  begin
    # The original author notes that libxml-ruby documents this block as
    # the failure callback, but that it never seemed to be called; the
    # returned flag is therefore checked as well.
    valid = document.validate_schema(schema) do |msg, _unused|
      raise "Validation of the XML document against phyloxml.xsd schema failed. #{msg}"
    end
  rescue LibXML::XML::Error => err
    raise "Validation of the XML document against phyloxml.xsd schema failed, or XML error occurred. #{err.message}"
  end

  raise "Validation of the XML document against phyloxml.xsd schema failed." unless valid
end
273
+ private :_validate
274
+
275
+ # (private) It seems that LibXML::XML::Reader reads from the network
276
+ # even if LibXML::XML::Parser::Options::NONET is set.
277
+ # So, for URI-like filename, '://' is replaced with ':/'.
278
def _secure_filename(filename)
  # Only names that start like "scheme://" are rewritten; anything else
  # is returned untouched.
  return filename unless /\A[a-zA-Z]+\:\/\// =~ filename

  # for example, "http://a/b" is changed to "http:/a/b".
  filename.sub(/\:\/\//, ':/')
end
286
+ private :_secure_filename
287
+
288
+ # (private) loops through until reaches phylogeny stuff
289
def _skip_leader
  # Advance node by node until the reader sits on a <phylogeny> start tag.
  until is_element?('phylogeny')
    @reader.read
  end
  nil
end
295
+ private :_skip_leader
296
+
297
+ # Initializes LibXML::Reader and reads the PhyloXML-formatted string
298
+ # until it reaches the first phylogeny element.
299
+ #
300
+ # Create a new Bio::PhyloXML::Parser object.
301
+ #
302
+ # str = File.read("./phyloxml_examples.xml")
303
+ # p = Bio::PhyloXML::Parser.new(str)
304
+ #
305
+ #
306
+ # Deprecated usage: Reads data from a file. <em>str</em> is a filename.
307
+ #
308
+ # p = Bio::PhyloXML::Parser.new("./phyloxml_examples.xml")
309
+ #
310
+ # Taking filename is deprecated. Use Bio::PhyloXML::Parser.open(filename).
311
+ #
312
+ # ---
313
+ # *Arguments*:
314
+ # * (required) _str_: PhyloXML-formatted string
315
+ # * (optional) _validate_: Whether to validate the file against schema or not. Default value is true.
316
+ # *Returns*:: Bio::PhyloXML::Parser object
317
def initialize(str, validate=true)
  @other = []

  # Parser.open, Parser.open_uri and Parser.for_io pass nil and install
  # their own reader afterwards.
  return unless str

  # NONET is passed for security reasons; the same option is used for
  # every reader this class creates from a file, an IO or a string.
  reader_opts = { :options => LibXML::XML::Parser::Options::NONET }

  # For compatibility, if filename-like string is given,
  # treat it as a filename.
  if /[\<\>\r\n]/ !~ str and File.exist?(str) then
    # assume that str is filename
    warn "Bio::PhyloXML::Parser.new(filename) is deprecated. Use Bio::PhyloXML::Parser.open(filename)."
    filename = _secure_filename(str)
    _validate(:file, filename) if validate
    @reader = XML::Reader.file(filename, reader_opts)
    _skip_leader
    return
  end

  # initialize for string
  @reader = XML::Reader.string(str, reader_opts)
  _skip_leader
end
341
+
342
+
343
+ # Iterate through all trees in the file.
344
+ #
345
+ # phyloxml = Bio::PhyloXML::Parser.open('example.xml')
346
+ # phyloxml.each do |tree|
347
+ # puts tree.name
348
+ # end
349
+ #
350
def each
  # Without a block, hand back an Enumerator instead of failing with
  # LocalJumpError at the first tree.
  return enum_for(:each) unless block_given?

  # next_tree returns nil when there is nothing more to yield.
  while (tree = next_tree)
    yield tree
  end
end
355
+
356
+ # Access the specified tree in the file. It parses trees until the specified
357
+ # tree is reached.
358
+ #
359
+ # # Get 3rd tree in the file (starts counting from 0).
360
+ # parser = PhyloXML::Parser.open('phyloxml_examples.xml')
361
+ # tree = parser[2]
362
+ #
363
def [](i)
  # Trees are consumed in document order; the (i+1)-th call to next_tree
  # produces the requested one.
  found = nil
  (i+1).times { found = self.next_tree }
  found
end
370
+
371
+ # Parse and return the next phylogeny tree. If there are no more phylogeny
372
+ # element, nil is returned. If there is something else besides phylogeny
373
+ # elements, it is saved in the PhyloXML::Parser#other.
374
+ #
375
+ # p = Bio::PhyloXML::Parser.open("./phyloxml_examples.xml")
376
+ # tree = p.next_tree
377
+ #
378
+ # ---
379
+ # *Returns*:: Bio::PhyloXML::Tree
380
def next_tree()
  # Not on a <phylogeny> start tag: either the document is finished or a
  # non-phylogeny ("other") element follows.
  unless is_element?('phylogeny')
    if @reader.node_type == XML::Reader::TYPE_END_ELEMENT
      return nil if is_end_element?('phyloxml')
      @reader.read
      @reader.read
      return nil if is_end_element?('phyloxml')
    end
    # phyloxml can hold only phylogeny and "other" elements. If this is not
    # phylogeny element then it is other. Also, "other" always comes after
    # all phylogenies
    @other << parse_other
    # return nil for tree, since this is not valid phyloxml tree.
    return nil
  end

  tree = Bio::PhyloXML::Tree.new

  # Stack of enclosing nodes; the current node is always the last entry.
  # The tree itself sits at the bottom of the stack.
  clades = [tree]

  # Edge leading to the current node, needed for the branch_length tag.
  current_edge = nil

  # The clade is parsed iteratively by moving the "current node" pointer.
  # Since the property element occurs both in clade and in phylogeny, this
  # flag records which of the two is being parsed (a phylogeny holds at
  # most one top-level clade).
  parsing_clade = false

  until is_end_element?('phylogeny')
    break if is_end_element?('phyloxml')

    # phylogeny-level attributes and simple elements (everything but clade)
    unless parsing_clade
      if is_element?('phylogeny')
        tree.rooted = (@reader["rooted"] == "true")
        tree.rerootable = (@reader["rerootable"] == "true")
        parse_attributes(tree, ["branch_length_unit", 'type'])
      end

      parse_simple_elements(tree, ["name", 'description', "date"])

      tree.confidences << parse_confidence if is_element?('confidence')
    end

    if @reader.node_type == XML::Reader::TYPE_ELEMENT
      if @reader.name == 'clade'
        # start of a clade: create a node and hook it into the tree
        parsing_clade = true

        node = Bio::PhyloXML::Node.new
        branch_length = @reader['branch_length']
        parse_attributes(node, ["id_source"])

        tree.add_node(node)
        # The first clade will always be root since by xsd schema phyloxml
        # can have 0 to 1 clades in it.
        if tree.root == nil
          tree.root = node
        else
          current_edge = tree.add_edge(clades[-1], node,
                                       Bio::Tree::Edge.new(branch_length))
        end
        clades.push node
      elsif parsing_clade
        parse_clade_elements(clades[-1], current_edge)
      end
    end

    # end of a clade element: go one parent up
    if is_end_element?('clade')
      if clades[-1] == tree.root
        # closing tag of the top-most clade: clade parsing is finished
        parsing_clade = false
      else
        # the previous clade in the stack becomes the current node
        clades.pop
      end
    end

    # remaining phylogeny-level elements
    if !parsing_clade and @reader.node_type == XML::Reader::TYPE_ELEMENT
      case @reader.name
      when 'property'
        tree.properties << parse_property

      when 'clade_relation'
        relation = CladeRelation.new
        parse_attributes(relation, ["id_ref_0", "id_ref_1", "distance", "type"])

        #@ add unit test for this
        unless @reader.empty_element?
          @reader.read
          relation.confidence = parse_confidence if is_element?('confidence')
        end
        tree.clade_relations << relation

      when 'sequence_relation'
        relation = SequenceRelation.new
        parse_attributes(relation, ["id_ref_0", "id_ref_1", "distance", "type"])
        unless @reader.empty_element?
          @reader.read
          relation.confidence = parse_confidence if is_element?('confidence')
        end
        tree.sequence_relations << relation

      when 'phylogeny'
        #do nothing
      else
        tree.other << parse_other
      end
    end

    # go to next element
    @reader.read
  end #end until </phylogeny>

  # Move past the phylogeny end tag and the node that follows it, so the
  # reader is on the next meaningful element (hence two reads).
  @reader.read
  @reader.read

  tree
end
533
+
534
+ # return tree of specified name.
535
+ # @todo Implement this method.
536
+ # def get_tree_by_name(name)
537
+
538
+ # while not is_end_element?('phyloxml')
539
+ # if is_element?('phylogeny')
540
+ # @reader.read
541
+ # @reader.read
542
+ #
543
+ # if is_element?('name')
544
+ # @reader.read
545
+ # if @reader.value == name
546
+ # puts "equasl"
547
+ # tree = next_tree
548
+ # puts tree
549
+ # end
550
+ # end
551
+ # end
552
+ # @reader.read
553
+ # end
554
+ #
555
+ # end
556
+
557
+
558
+ private
559
+
560
+ ####
561
+ # Utility methods
562
+ ###
563
+
564
# True when the reader is positioned on a start tag named +str+.
def is_element?(str)
  (@reader.node_type == XML::Reader::TYPE_ELEMENT) && (@reader.name == str)
end
567
+
568
# True when the reader is positioned on an end tag named +str+.
def is_end_element?(str)
  (@reader.node_type == XML::Reader::TYPE_END_ELEMENT) && (@reader.name == str)
end
571
+
572
# Sanity check: raises unless the reader is on the end tag named +str+.
# Returns nil when the check passes.
def has_reached_end_element?(str)
  return if is_end_element?(str)

  raise "Warning: Should have reached </#{str}> element here"
end
577
+
578
+ # Parses a simple XML element. for example <speciations>1</speciations>
579
+ # It reads in the value and assigns it to object.speciation = 1
580
+ # Also checks if have reached end tag (</speciations> and gives warning
581
+ # if not
582
def parse_simple_element(object, name)
  # Nothing to do unless the reader is on the <name> start tag.
  return unless is_element?(name)

  @reader.read                            # move to the node after the start tag
  object.send("#{name}=", @reader.value)  # store its value through object.name=
  @reader.read                            # move on; the end tag is expected here
  has_reached_end_element?(name)
end
590
+
591
# Tries parse_simple_element for every tag name in +elements+, in order.
def parse_simple_elements(object, elements)
  elements.each { |tag| parse_simple_element(object, tag) }
end
596
+
597
+ #Parses list of attributes
598
+ #use for the code like: clade_relation.type = @reader["type"]
599
# Copies each listed XML attribute of the current node onto +object+
# through the setter of the same name, e.g. object.type = @reader["type"].
def parse_attributes(object, arr_of_attrs)
  arr_of_attrs.each do |attr_name|
    attr_value = @reader[attr_name]
    object.send("#{attr_name}=", attr_value)
  end
end
604
+
605
# Handles the single element the reader is currently on, when that element
# is a child of <clade>. There is no loop here: the caller advances the
# reader and calls this method again for the next node.
#
# * _current_node_: node object that receives the parsed data
# * _current_edge_: edge leading to that node, or nil (see branch_length)
def parse_clade_elements(current_node, current_edge)
  return unless @reader.node_type == XML::Reader::TYPE_ELEMENT

  case @reader.name
  when 'branch_length'
    # @todo add unit test for this. current_edge is nil, if the root clade
    # has branch_length attribute.
    @reader.read
    branch_length = @reader.value
    current_edge.distance = branch_length.to_f if current_edge != nil
    @reader.read
  when 'width'
    @reader.read
    current_node.width = @reader.value
    @reader.read
  when 'name'
    @reader.read
    current_node.name = @reader.value
    @reader.read
  when 'events'
    current_node.events = parse_events
  when 'confidence'
    current_node.confidences << parse_confidence
  when 'sequence'
    current_node.sequences << parse_sequence
  when 'property'
    current_node.properties << parse_property
  when 'taxonomy'
    current_node.taxonomies << parse_taxonomy
  when 'distribution'
    current_node.distributions << parse_distribution
  when 'node_id'
    id = Id.new
    id.type = @reader["type"]
    @reader.read
    id.value = @reader.value
    @reader.read
    #has_reached_end_element?('node_id')
    #@todo write unit test for this. There is no example of this in the example files
    current_node.id = id
  when 'color'
    color = BranchColor.new
    # The reader is still on <color> here, so step inside and walk the
    # children until </color>. Looking for <red>/<green>/<blue> without
    # moving would never match and leave the color empty.
    unless @reader.empty_element?
      @reader.read
      until is_end_element?('color')
        parse_simple_elements(color, ['red', 'green', 'blue'])
        @reader.read
      end
    end
    current_node.color = color
    #@todo add unit test for this
  when 'date'
    date = Date.new
    date.unit = @reader["unit"]
    # Step inside <date> with a single read. The loop skips any node that
    # is not one of the known children, so whitespace between tags is
    # tolerated and a child directly after <date> is not lost.
    unless @reader.empty_element?
      @reader.read
      until is_end_element?('date')
        parse_simple_elements(date, ['desc', 'value', 'minimum', 'maximum'])
        @reader.read
      end
    end
    current_node.date = date
  when 'reference'
    reference = Reference.new()
    reference.doi = @reader['doi']
    if not @reader.empty_element?
      while not is_end_element?('reference')
        parse_simple_element(reference, 'desc')
        @reader.read
      end
    end
    current_node.references << reference
  when 'binary_characters'
    current_node.binary_characters = parse_binary_characters
  when 'clade'
    #do nothing
  else
    # any unrecognized child is kept as a generic "other" element
    current_node.other << parse_other
  end
end #parse_clade_elements
690
+
691
# Parses an <events> element; the reader is on its start tag on entry and
# on its end tag on return.
def parse_events()
  parsed = PhyloXML::Events.new
  simple_tags = ['type', 'duplications', 'speciations', 'losses']

  @reader.read # step inside <events>
  until is_end_element?('events')
    parse_simple_elements(parsed, simple_tags)
    #@todo could add unit test for this (example file does not have this case)
    parsed.confidence = parse_confidence if is_element?('confidence')
    @reader.read
  end
  parsed
end #parse_events
705
+
706
# Parses a <taxonomy> element; the reader is on its start tag on entry and
# on its end tag on return.
def parse_taxonomy
  taxonomy = PhyloXML::Taxonomy.new
  parse_attributes(taxonomy, ["id_source"])

  @reader.read
  until is_end_element?('taxonomy')
    if @reader.node_type == XML::Reader::TYPE_ELEMENT
      case @reader.name
      when 'code', 'scientific_name', 'rank', 'authority'
        # plain text children map onto the setter of the same name
        field = @reader.name
        @reader.read
        taxonomy.send("#{field}=", @reader.value)
        @reader.read
      when 'id'
        taxonomy.taxonomy_id = parse_id('id')
      when 'common_name'
        @reader.read
        taxonomy.common_names << @reader.value
        @reader.read
        #has_reached_end_element?('common_name')
      when 'synonym'
        @reader.read
        taxonomy.synonyms << @reader.value
        @reader.read
        #has_reached_end_element?('synonym')
      when 'uri'
        taxonomy.uri = parse_uri
      else
        taxonomy.other << parse_other
      end
    end

    @reader.read # move to the next node inside <taxonomy>
  end
  taxonomy
end #parse_taxonomy
753
+
754
+ private
755
+
756
# Parses a <sequence> element; the reader is on its start tag on entry and
# on its end tag on return.
def parse_sequence
  sequence = Sequence.new
  parse_attributes(sequence, ["type", "id_source", "id_ref"])

  @reader.read
  while not(is_end_element?('sequence'))

    if @reader.node_type == XML::Reader::TYPE_ELEMENT
      case @reader.name
      when 'symbol'
        @reader.read
        sequence.symbol = @reader.value
        @reader.read
      when 'name'
        @reader.read
        sequence.name = @reader.value
        @reader.read
      when 'location'
        @reader.read
        sequence.location = @reader.value
        @reader.read
      when 'mol_seq'
        sequence.is_aligned = @reader["is_aligned"]
        @reader.read
        sequence.mol_seq = @reader.value
        @reader.read
        has_reached_end_element?('mol_seq')
      when 'accession'
        sequence.accession = Accession.new
        sequence.accession.source = @reader["source"]
        @reader.read
        sequence.accession.value = @reader.value
        @reader.read
        has_reached_end_element?('accession')
      when 'uri'
        sequence.uri = parse_uri
      when 'annotation'
        sequence.annotations << parse_annotation
      when 'domain_architecture'
        sequence.domain_architecture = DomainArchitecture.new
        sequence.domain_architecture.length = @reader["length"]
        # Walk the children node by node and parse only real <domain>
        # start tags, so the result does not depend on whitespace nodes
        # being present between the tags.
        unless @reader.empty_element?
          @reader.read
          until is_end_element?('domain_architecture')
            if is_element?('domain')
              # parse_domain already reads one node past </domain>, so
              # the loop condition is re-checked without another read.
              sequence.domain_architecture.domains << parse_domain
            else
              @reader.read
            end
          end
        end
      else
        sequence.other << parse_other
        #@todo add unit test
      end
    end

    @reader.read
  end
  return sequence
end #parse_sequence
813
+
814
# Parses a <uri> element: its desc/type attributes and its text content.
def parse_uri
  link = Uri.new
  parse_attributes(link, ["desc", "type"])
  # the text content is stored through link.uri=
  parse_simple_element(link, 'uri')
  link
end
820
+
821
# Parses an <annotation> element. A self-closing annotation carries only
# attributes; otherwise its children are read until the end tag.
def parse_annotation
  result = Annotation.new
  parse_attributes(result, ['ref', 'source', 'evidence', 'type'])

  return result if @reader.empty_element?

  until is_end_element?('annotation')
    parse_simple_element(result, 'desc') if is_element?('desc')
    result.confidence = parse_confidence if is_element?('confidence')
    result.properties << parse_property if is_element?('property')
    result.uri = parse_uri if is_element?('uri')

    @reader.read
  end
  result
end
844
+
845
# Parses a <property> element: its attributes plus its text value.
def parse_property
  prop = Property.new
  parse_attributes(prop, ["ref", "unit", "datatype", "applies_to", "id_ref"])
  @reader.read                 # node after the start tag
  prop.value = @reader.value
  @reader.read                 # the end tag is expected here
  has_reached_end_element?('property')
  prop
end #parse_property
854
+
855
# Parses a <confidence> element into Confidence.new(type, value), where
# the value is the element text converted with to_f.
def parse_confidence
  kind = @reader["type"]
  @reader.read                 # node after the start tag
  score = @reader.value.to_f
  @reader.read                 # the end tag is expected here
  has_reached_end_element?('confidence')
  Confidence.new(kind, score)
end #parse_confidence
863
+
864
# Parses a <distribution> element: an optional desc plus any number of
# point and polygon children.
def parse_distribution
  dist = Distribution.new
  @reader.read
  until is_end_element?('distribution')
    parse_simple_element(dist, 'desc')

    dist.points << parse_point if is_element?('point')
    dist.polygons << parse_polygon if is_element?('polygon')

    @reader.read
  end
  dist
end #parse_distribution
878
+
879
# Parses a <point> element: geodetic_datum/alt_unit attributes and the
# lat, long and alt children (alt is converted with to_f).
def parse_point
  pt = Point.new
  parse_attributes(pt, ["geodetic_datum", "alt_unit"])

  @reader.read
  until is_end_element?('point')
    parse_simple_elements(pt, ['lat', 'long'])

    if is_element?('alt')
      @reader.read
      pt.alt = @reader.value.to_f
      @reader.read
      has_reached_end_element?('alt')
    end

    @reader.read # advance reader
  end
  pt
end #parse_point
901
+
902
# Parses a <polygon> element into a Polygon holding its <point> children.
# Emits a warning when fewer than 3 points were found.
def parse_polygon
  polygon = Polygon.new
  @reader.read
  while not(is_end_element?('polygon')) do
    polygon.points << parse_point if is_element?('point')
    @reader.read
  end

  #@todo should check for it at all? Probably not if xml is valid.
  if polygon.points.length < 3
    # warn (stderr) rather than puts, so a library warning never ends up
    # in the standard output of the calling program
    warn "Warning: <polygon> should have at least 3 points"
  end
  return polygon
end #parse_polygon
916
+
917
# Parses an id-like element named +tag_name+: the provider attribute plus
# the text value.
def parse_id(tag_name)
  ident = Id.new
  ident.provider = @reader["provider"]
  @reader.read                 # node after the start tag
  ident.value = @reader.value
  @reader.read #@todo shouldn't there be another read?
  has_reached_end_element?(tag_name)
  ident
end #parse_id
926
+
927
# Parses a <domain> element. Unlike the other parse_* helpers it reads one
# node past the </domain> end tag before returning.
def parse_domain
  dom = ProteinDomain.new
  parse_attributes(dom, ["from", "to", "confidence", "id"])
  @reader.read                 # node after the start tag
  dom.value = @reader.value
  @reader.read                 # the end tag is expected here
  has_reached_end_element?('domain')
  @reader.read                 # step past </domain>
  dom
end
937
+
938
# Parses a <binary_characters> element: the type and count attributes, and
# (unless the element is self-closing) the lost/gained/absent/present lists.
def parse_binary_characters
  chars = PhyloXML::BinaryCharacters.new
  chars.bc_type = @reader['type']
  parse_attributes(chars, ['gained_count', 'absent_count', 'lost_count', 'present_count'])

  return chars if @reader.empty_element?

  @reader.read
  until is_end_element?('binary_characters')
    ['lost', 'gained', 'absent', 'present'].each do |state|
      parse_bc(chars, state)
    end
    @reader.read
  end
  chars
end #parse_binary_characters
957
+
958
# When the reader is on the start tag named +element+, appends the text of
# every <bc> child to the list returned by object.<element>.
def parse_bc(object, element)
  return unless is_element?(element)

  @reader.read
  until is_end_element?(element)
    if is_element?('bc')
      @reader.read
      object.send(element) << @reader.value
      @reader.read
      has_reached_end_element?('bc')
    end
    @reader.read
  end
end #parse_bc
972
+
973
# Parses the element the reader is on into a generic PhyloXML::Other
# object: element name, attributes, text value and (recursively) child
# elements.
#
# On return the reader is on the element's end tag, or, for a self-closing
# element such as <foo/>, still on the element itself.
def parse_other
  other_obj = PhyloXML::Other.new
  other_obj.element_name = @reader.name
  # Must be asked before visiting the attributes, while the reader is
  # still positioned on the element node.
  self_closing = @reader.empty_element?

  #parse attributes
  code = @reader.move_to_first_attribute
  while code == 1
    other_obj.attributes[@reader.name] = @reader.value
    code = @reader.move_to_next_attribute
  end

  if self_closing
    # A self-closing element has no children and no end tag to wait for.
    # Go back from the last attribute to the element itself.
    @reader.move_to_element
    return other_obj
  end

  loop do
    # Always advance before testing for the end tag: after a recursive
    # call the reader sits on the child's end tag, which must not be
    # mistaken for this element's end tag when both share a name.
    @reader.read
    break if is_end_element?(other_obj.element_name)

    if @reader.node_type == XML::Reader::TYPE_ELEMENT
      other_obj.children << parse_other #recursive call to parse children
    elsif @reader.node_type == XML::Reader::TYPE_TEXT
      other_obj.value = @reader.value
    end
  end

  #just a check
  has_reached_end_element?(other_obj.element_name)
  return other_obj
end #parse_other
995
+
996
+ end #class phyloxmlParser
997
+
998
+ end #module PhyloXML
999
+
1000
+ end #module Bio