bio-phyloxml 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1000 @@
1
+ #
2
+ # = bio/db/phyloxml_parser.rb - PhyloXML parser
3
+ #
4
+ # Copyright:: Copyright (C) 2009
5
+ # Diana Jaunzeikare <latvianlinuxgirl@gmail.com>
6
+ # License:: The Ruby License
7
+ #
8
+ # $Id:$
9
+ #
10
+ # == Description
11
+ #
12
+ # This file contains a parser for PhyloXML.
13
+ #
14
+ # == Requirements
15
+ #
16
+ # Libxml2 XML parser is required. Install libxml-ruby bindings from
17
+ # http://libxml.rubyforge.org or
18
+ #
19
+ # gem install -r libxml-ruby
20
+ #
21
+ # == References
22
+ #
23
+ # * http://www.phyloxml.org
24
+ #
25
+ # * https://www.nescent.org/wg_phyloinformatics/PhyloSoC:PhyloXML_support_in_BioRuby
26
+
27
+
28
+ require 'uri'
29
+ require 'libxml'
30
+
31
+ require 'bio/tree'
32
+
33
+
34
+ module Bio
35
+
36
+ module PhyloXML
37
+
38
+
39
+
40
+
41
+ # == Description
42
+ #
43
+ # Bio::PhyloXML::Parser is for parsing phyloXML format files.
44
+ #
45
+ # == Requirements
46
+ #
47
+ # Libxml2 XML parser is required. Install libxml-ruby bindings from
48
+ # http://libxml.rubyforge.org or
49
+ #
50
+ # gem install -r libxml-ruby
51
+ #
52
+ # == Usage
53
+ #
54
+ # require 'bio'
55
+ #
56
+ # # Create new phyloxml parser
57
+ # phyloxml = Bio::PhyloXML::Parser.open('example.xml')
58
+ #
59
+ # # Print the names of all trees in the file
60
+ # phyloxml.each do |tree|
61
+ # puts tree.name
62
+ # end
63
+ #
64
+ #
65
+ # == References
66
+ #
67
+ # http://www.phyloxml.org/documentation/version_100/phyloxml.xsd.html
68
+ #
69
+ class Parser
70
+
71
+ include LibXML
72
+
73
+ # After parsing all the trees, if there is anything else in other xml format,
74
+ # it is saved in this array of PhyloXML::Other objects
75
+ attr_reader :other
76
+
77
# Opens a phyloXML file and positions the underlying LibXML::Reader on the
# first phylogeny element.
#
# Example: Create a new Bio::PhyloXML::Parser object.
#
#   p = Bio::PhyloXML::Parser.open("./phyloxml_examples.xml")
#
# When a block is given, the parser is yielded to it, closed when the block
# finishes (also when it raises), and the value of the block is returned.
#
# Example: Get the first tree in the file.
#
#   tree = Bio::PhyloXML::Parser.open("example.xml") do |px|
#     px.next_tree
#   end
#
# ---
# *Arguments*:
# * (required) _filename_: Path to the file to parse.
# * (optional) _validate_: Whether to validate the file against schema or not. Default value is true.
# *Returns*:: (without block) Bio::PhyloXML::Parser object
# *Returns*:: (with block) the value of the block
def self.open(filename, validate=true)
  parser = new(nil, validate)
  parser.instance_eval do
    path = _secure_filename(filename)
    _validate(:file, path) if validate
    # XML::Parser::Options::NONET for security reason
    reader_options = { :options => LibXML::XML::Parser::Options::NONET }
    @reader = XML::Reader.file(path, reader_options)
    _skip_leader
  end
  return parser unless block_given?

  begin
    # the value of the begin/ensure expression is the value of the block
    yield parser
  ensure
    parser.close if parser and !parser.closed?
  end
end
123
+
124
# Opens phyloXML data located at a URI and positions the underlying
# LibXML::Reader on the first phylogeny element.
#
# Create a new Bio::PhyloXML::Parser object.
#
#   p = Bio::PhyloXML::Parser.open_uri("http://www.phyloxml.org/examples/apaf.xml")
#
# When a block is given, the parser is yielded to it, closed when the block
# finishes (also when it raises), and the value of the block is returned.
#
# ---
# *Arguments*:
# * (required) _uri_: (URI or String) URI to the data to parse
# * (optional) _validate_: For URI reader, the "validate" option is ignored and no validation is executed.
# *Returns*:: (without block) Bio::PhyloXML::Parser object
# *Returns*:: (with block) the value of the block
def self.open_uri(uri, validate=true)
  location =
    if uri.kind_of?(URI)
      uri.to_s
    else
      # raises error if not a String
      text = uri.to_str
      # raises error if invalid URI
      URI.parse(text)
      text
    end

  parser = new(nil, validate)
  parser.instance_eval do
    @reader = XML::Reader.file(location)
    _skip_leader
  end
  return parser unless block_given?

  begin
    yield parser
  ensure
    parser.close if parser and !parser.closed?
  end
end
168
+
169
# Placeholder that replaces the reader of a closed PhyloXML::Parser.
# Any method it does not define itself ends up in method_missing, which
# raises LibXML::XML::Error.
#
# Bio::PhyloXML internal use only.
class ClosedPhyloXMLParser #:nodoc:
  # Rejects every forwarded call, whatever its name and arguments.
  def method_missing(*_ignored)
    raise(LibXML::XML::Error, 'closed PhyloXML::Parser object')
  end
end #class ClosedPhyloXMLParser
178
+
179
# Closes the LibXML::Reader held by this parser and replaces it with a
# ClosedPhyloXMLParser placeholder.
#
# Closing an already closed parser, or using a closed parser, raises
# LibXML::XML::Error (the placeholder raises for every call).
# ---
# *Returns*:: nil
def close
  live_reader = @reader
  live_reader.close
  @reader = ClosedPhyloXMLParser.new
  nil
end
192
+
193
# Tells whether this parser has been closed, i.e. whether its reader has
# been replaced by a ClosedPhyloXMLParser placeholder.
# ---
# *Returns*:: true or false
def closed?
  ClosedPhyloXMLParser === @reader
end
204
+
205
# Creates a parser that reads from an IO object and positions the
# underlying LibXML::Reader on the first phylogeny element.
#
# Create a new Bio::PhyloXML::Parser object.
#
#   p = Bio::PhyloXML::Parser.for_io($stdin)
#
# ---
# *Arguments*:
# * (required) _io_: IO object
# * (optional) _validate_: For IO reader, the "validate" option is ignored and no validation is executed.
# *Returns*:: Bio::PhyloXML::Parser object
def self.for_io(io, validate=true)
  parser = new(nil, validate)
  parser.instance_eval do
    reader_options = { :options => LibXML::XML::Parser::Options::NONET }
    @reader = XML::Reader.io(io, reader_options)
    _skip_leader
  end
  parser
end
227
+
228
# (private) Builds the PhyloXML schema object from the phyloxml.xsd file
# located in the same directory as this source file.
def _schema
  xsd_path = File.join(File.dirname(__FILE__), 'phyloxml.xsd')
  xsd_document = XML::Document.file(xsd_path)
  XML::Schema.document(xsd_document)
end
private :_schema
233
+
234
# (private) Validates a document against the phyloxml.xsd schema and raises
# a RuntimeError when validation fails or an XML error occurs.
# Nothing is validated when _data_type_ is unknown, or when a :file argument
# is not a regular file (e.g. a FIFO / named pipe).
# ---
# *Arguments*:
# * (required) <em>data_type</em>_: :file for filename, :string for string
# * (required) _arg_: filename or string
# *Returns*:: (undefined)
def _validate(data_type, arg)
  parser_flags =
    LibXML::XML::Parser::Options::NOERROR |   # no error messages
    LibXML::XML::Parser::Options::NOWARNING | # no warning messages
    LibXML::XML::Parser::Options::NONET       # no network access
  options = { :options => parser_flags }

  if :file == data_type
    # No validation when special file e.g. FIFO (named pipe)
    return unless File.file?(arg)
    xml_instance = XML::Document.file(arg, options)
  elsif :string == data_type
    xml_instance = XML::Document.string(arg, options)
  else
    # no validation for unknown data type
    return
  end

  schema = _schema
  begin
    valid = xml_instance.validate_schema(schema) do |msg, _error_flag|
      # The document of libxml-ruby says that the block is called
      # when validation failed, but it seems it is never called
      # even when validation failed!
      raise "Validation of the XML document against phyloxml.xsd schema failed. #{msg}"
    end
  rescue LibXML::XML::Error => evar
    raise "Validation of the XML document against phyloxml.xsd schema failed, or XML error occurred. #{evar.message}"
  end
  raise "Validation of the XML document against phyloxml.xsd schema failed." unless valid
end
private :_validate
274
+
275
# (private) It seems that LibXML::XML::Reader reads from the network
# even if LibXML::XML::Parser::Options::NONET is set.
# So, for a URI-like filename, the slashes following the scheme are
# collapsed into a single one: "http://a/b" becomes "http:/a/b".
#
# Every slash of the run is collapsed, not only the first pair; otherwise
# "http:///a/b" would be turned into "http://a/b", which is URI-like again.
# ---
# *Arguments*:
# * (required) _filename_: String
# *Returns*:: the given object when it is not URI-like, otherwise a new String
def _secure_filename(filename)
  # for safety, URI-like filename is checked.
  if /\A[a-zA-Z]+:\/{2,}/ =~ filename then
    # The scheme consists of letters only, so the first ":/" run in the
    # string is the one directly after the scheme.
    filename = filename.sub(/:\/+/, ':/')
  end
  filename
end
private :_secure_filename
287
+
288
# (private) Advances the reader until it is positioned on a phylogeny
# start tag.
#
# The loop also stops when the input is exhausted, so that a document
# without any phylogeny element does not loop forever; in that case the
# reader is left at the end of the input.
# ---
# *Returns*:: nil
def _skip_leader
  # Have to leave this way, if accepting strings, instead of files
  until is_element?('phylogeny')
    # End of input is detected both through the return value of read and
    # through the node type, because some libxml-ruby releases appear to
    # return an Integer status (truthy 0) instead of false -- TODO confirm.
    break unless @reader.read && @reader.node_type != XML::Reader::TYPE_NONE
  end
  nil
end
private :_skip_leader
296
+
297
# Initializes LibXML::Reader and reads the PhyloXML-formatted string
# until it reaches the first phylogeny element.
#
# Create a new Bio::PhyloXML::Parser object.
#
#   str = File.read("./phyloxml_examples.xml")
#   p = Bio::PhyloXML::Parser.new(str)
#
#
# Deprecated usage: Reads data from a file. <em>str</em> is a filename.
#
#   p = Bio::PhyloXML::Parser.new("./phyloxml_examples.xml")
#
# Taking filename is deprecated. Use Bio::PhyloXML::Parser.open(filename).
#
# When _str_ is nil, only the internal state is initialized and no reader
# is created (this is how the open, open_uri and for_io class methods
# use the constructor).
#
# ---
# *Arguments*:
# * (required) _str_: PhyloXML-formatted string
# * (optional) _validate_: Whether to validate the file against schema or not. Default value is true. It is only consulted in the deprecated filename form; a PhyloXML string is not validated here.
# *Returns*:: Bio::PhyloXML::Parser object
def initialize(str, validate=true)

  @other = []

  return unless str

  # XML::Parser::Options::NONET for security reason; used for every reader
  # created here, the same way as in Bio::PhyloXML::Parser.open.
  reader_options = { :options => LibXML::XML::Parser::Options::NONET }

  # For compatibility, if filename-like string is given,
  # treat it as a filename.
  if /[\<\>\r\n]/ !~ str and File.exist?(str) then
    # assume that str is filename
    warn "Bio::PhyloXML::Parser.new(filename) is deprecated. Use Bio::PhyloXML::Parser.open(filename)."
    filename = _secure_filename(str)
    _validate(:file, filename) if validate
    @reader = XML::Reader.file(filename, reader_options)
    _skip_leader
    return
  end

  # initialize for string
  @reader = XML::Reader.string(str, reader_options)
  _skip_leader
end
341
+
342
+
343
# Iterate through all trees in the file.
#
#   phyloxml = Bio::PhyloXML::Parser.open('example.xml')
#   phyloxml.each do |tree|
#     puts tree.name
#   end
#
# Iteration stops as soon as next_tree returns nil. The trees are consumed
# from the underlying reader, so each tree is yielded only once.
#
# Without a block an Enumerator over the remaining trees is returned.
# ---
# *Returns*:: (with block) nil
# *Returns*:: (without block) Enumerator
def each
  return to_enum(:each) unless block_given?

  while tree = next_tree
    yield tree
  end
end
355
+
356
# Access the specified tree in the file. Trees are parsed one after
# another with next_tree until the requested one has been read; the result
# of the last next_tree call is returned.
#
#   # Get 3rd tree in the file (starts counting from 0).
#   parser = PhyloXML::Parser.open('phyloxml_examples.xml')
#   tree = parser[2]
#
# Counting starts at the current position of the reader, because every
# call consumes trees. For a negative index nothing is read and nil is
# returned.
def [](i)
  (i + 1).times.inject(nil) { |_previous, _count| self.next_tree }
end
370
+
371
# Parse and return the next phylogeny tree. If there are no more phylogeny
# elements, nil is returned. If there is something else besides phylogeny
# elements, it is saved in the PhyloXML::Parser#other (and nil is returned
# for that call).
#
#   p = Bio::PhyloXML::Parser.open("./phyloxml_examples.xml")
#   tree = p.next_tree
#
# nil is also returned when the reader has already consumed the whole
# input. After a tree has been parsed the reader is moved to the next
# start or end tag, skipping whatever lies in between, so documents
# written without whitespace between the tags are handled as well.
#
# ---
# *Returns*:: Bio::PhyloXML::Tree, or nil
def next_tree()

  if not is_element?('phylogeny')
    if @reader.node_type == XML::Reader::TYPE_END_ELEMENT &&
       !is_end_element?('phyloxml')
      # closing tag of something parsed by an earlier call: step over it
      # and over the node that follows it
      @reader.read
      @reader.read
    end
    # nothing left: end of the document element, or input exhausted
    if is_end_element?('phyloxml') ||
       @reader.node_type == XML::Reader::TYPE_NONE
      return nil
    end
    # phyloxml can hold only phylogeny and "other" elements. If this is not
    # phylogeny element then it is other. Also, "other" always comes after
    # all phylogenies
    @other << parse_other
    #return nil for tree, since this is not valid phyloxml tree.
    return nil
  end

  tree = Bio::PhyloXML::Tree.new

  # keep track of current node in clades array/stack. Current node is the
  # last element in the clades array
  clades = []
  clades.push tree

  #keep track of current edge to be able to parse branch_length tag
  current_edge = nil

  # we are going to parse clade iteratively by pointing (and changing) to
  # the current node in the tree. Since the property element is both in
  # clade and in the phylogeny, we need some boolean to know if we are
  # parsing the clade (there can be only max 1 clade in phylogeny) or
  # parsing phylogeny
  parsing_clade = false

  while not is_end_element?('phylogeny') do
    break if is_end_element?('phyloxml') ||
             @reader.node_type == XML::Reader::TYPE_NONE

    # parse phylogeny elements, except clade
    if not parsing_clade

      if is_element?('phylogeny')
        tree.rooted = (@reader["rooted"] == "true")
        tree.rerootable = (@reader["rerootable"] == "true")
        parse_attributes(tree, ["branch_length_unit", 'type'])
        # <phylogeny/> has no closing tag to wait for; without this the
        # following sibling elements would be read into this tree
        break if @reader.empty_element?
      end

      parse_simple_elements(tree, [ "name", 'description', "date"])

      if is_element?('confidence')
        tree.confidences << parse_confidence
      end

    end

    if @reader.node_type == XML::Reader::TYPE_ELEMENT
      case @reader.name
      when 'clade'
        #parse clade element

        parsing_clade = true

        node = Bio::PhyloXML::Node.new

        branch_length = @reader['branch_length']

        parse_attributes(node, ["id_source"])

        #add new node to the tree
        tree.add_node(node)
        # The first clade will always be root since by xsd schema phyloxml can
        # have 0 to 1 clades in it.
        if tree.root == nil
          tree.root = node
        else
          current_edge = tree.add_edge(clades[-1], node,
                                       Bio::Tree::Edge.new(branch_length))
        end
        clades.push node
        #end if clade element
      else
        parse_clade_elements(clades[-1], current_edge) if parsing_clade
      end
    end

    #end clade element, go one parent up
    if is_end_element?('clade')

      #if we have reached the closing tag of the top-most clade, then our
      # current node should point to the root. If that is the case, we are
      # done parsing the clade element
      if clades[-1] == tree.root
        parsing_clade = false
      else
        # set current node (clades[-1]) to the previous clade in the array
        clades.pop
      end
    end

    #parsing phylogeny elements
    if not parsing_clade

      if @reader.node_type == XML::Reader::TYPE_ELEMENT
        case @reader.name
        when 'property'
          tree.properties << parse_property

        when 'clade_relation'
          clade_relation = CladeRelation.new
          parse_attributes(clade_relation, ["id_ref_0", "id_ref_1", "distance", "type"])

          #@ add unit test for this
          if not @reader.empty_element?
            @reader.read
            if is_element?('confidence')
              clade_relation.confidence = parse_confidence
            end
          end
          tree.clade_relations << clade_relation

        when 'sequence_relation'
          sequence_relation = SequenceRelation.new
          parse_attributes(sequence_relation, ["id_ref_0", "id_ref_1", "distance", "type"])
          if not @reader.empty_element?
            @reader.read
            if is_element?('confidence')
              sequence_relation.confidence = parse_confidence
            end
          end
          tree.sequence_relations << sequence_relation
        when 'phylogeny'
          #do nothing
        else
          tree.other << parse_other
          #puts "Not recognized element. #{@reader.name}"
        end
      end
    end
    # go to next element
    @reader.read
  end #end while not </phylogeny>

  # Leave the phylogeny and move on to the next start or end tag (or to the
  # end of the input). Nodes in between, such as the whitespace text that
  # usually follows </phylogeny>, are skipped.
  stop_types = [XML::Reader::TYPE_ELEMENT,
                XML::Reader::TYPE_END_ELEMENT,
                XML::Reader::TYPE_NONE]
  @reader.read
  until stop_types.include?(@reader.node_type)
    break unless @reader.read
  end

  return tree
end
533
+
534
+ # return tree of specified name.
535
+ # @todo Implement this method.
536
+ # def get_tree_by_name(name)
537
+
538
+ # while not is_end_element?('phyloxml')
539
+ # if is_element?('phylogeny')
540
+ # @reader.read
541
+ # @reader.read
542
+ #
543
+ # if is_element?('name')
544
+ # @reader.read
545
+ # if @reader.value == name
546
+ # puts "equasl"
547
+ # tree = next_tree
548
+ # puts tree
549
+ # end
550
+ # end
551
+ # end
552
+ # @reader.read
553
+ # end
554
+ #
555
+ # end
556
+
557
+
558
+ private
559
+
560
+ ####
561
+ # Utility methods
562
+ ###
563
+
564
# Returns true when the reader is positioned on a start tag whose name is
# _str_, false otherwise.
def is_element?(str)
  return false unless @reader.node_type == XML::Reader::TYPE_ELEMENT
  @reader.name == str
end
567
+
568
# Returns true when the reader is positioned on an end tag whose name is
# _str_, false otherwise.
def is_end_element?(str)
  return false unless @reader.node_type == XML::Reader::TYPE_END_ELEMENT
  @reader.name == str
end
571
+
572
# Sanity check: raises a RuntimeError unless the reader is positioned on the
# end tag named _str_. Returns nil otherwise.
def has_reached_end_element?(str)
  return if is_end_element?(str)
  raise "Warning: Should have reached </#{str}> element here"
end
577
+
578
# Parses a simple XML element, for example <speciations>1</speciations>.
# When the reader is positioned on the start tag called _name_, the text of
# the element is passed to the "_name_=" writer of _object_ (here:
# object.speciations = "1") and the reader is left on the end tag. An error
# is raised when the end tag is not reached. Nothing happens when the
# reader is positioned on anything else.
#
# An element without any text (<desc></desc>) is accepted as well: the
# writer is not called and the reader is left on the end tag.
def parse_simple_element(object, name)
  return unless is_element?(name)

  @reader.read
  # <name></name>: the end tag directly follows, there is no text to assign
  return if is_end_element?(name)

  object.send("#{name}=", @reader.value)
  @reader.read
  has_reached_end_element?(name)
end
590
+
591
# Tries parse_simple_element on _object_ for every element name listed in
# _elements_, in the given order.
def parse_simple_elements(object, elements)
  elements.each { |tag_name| parse_simple_element(object, tag_name) }
end
596
+
597
# Copies XML attributes of the current node to _object_: for every name in
# _arr_of_attrs_ the attribute value read from the reader is passed to the
# writer of the same name.
# Use it for code like: clade_relation.type = @reader["type"]
def parse_attributes(object, arr_of_attrs)
  arr_of_attrs.each do |attr_name|
    writer = "#{attr_name}="
    object.send(writer, @reader[attr_name])
  end
end
604
+
605
# Handles one child element of a clade. The reader is expected to be
# positioned on the start tag of that child; nothing is done for any other
# kind of node. The caller loops over the nodes, so there is no loop over
# sibling elements in here.
#
# * _current_node_: the node the parsed data is stored in
# * _current_edge_: edge leading to _current_node_, or nil (the root clade
#   has no edge); only used for branch_length
#
# Elements that are not known here are stored in current_node.other.
def parse_clade_elements(current_node, current_edge)
  return unless @reader.node_type == XML::Reader::TYPE_ELEMENT

  case @reader.name
  when 'branch_length'
    # @todo add unit test for this. current_edge is nil, if the root clade
    # has branch_length attribute.
    @reader.read
    branch_length = @reader.value
    current_edge.distance = branch_length.to_f if current_edge != nil
    @reader.read
  when 'width'
    @reader.read
    current_node.width = @reader.value
    @reader.read
  when 'name'
    @reader.read
    current_node.name = @reader.value
    @reader.read
  when 'events'
    current_node.events = parse_events
  when 'confidence'
    current_node.confidences << parse_confidence
  when 'sequence'
    current_node.sequences << parse_sequence
  when 'property'
    current_node.properties << parse_property
  when 'taxonomy'
    current_node.taxonomies << parse_taxonomy
  when 'distribution'
    current_node.distributions << parse_distribution
  when 'node_id'
    id = Id.new
    id.type = @reader["type"]
    @reader.read
    id.value = @reader.value
    @reader.read
    #has_reached_end_element?('node_id')
    #@todo write unit test for this. There is no example of this in the example files
    current_node.id = id
  when 'color'
    color = BranchColor.new
    # The reader is on <color> itself, so the children have to be walked
    # through until </color> before red, green and blue can be picked up.
    if not @reader.empty_element?
      @reader.read
      while not(is_end_element?('color'))
        parse_simple_elements(color, ['red', 'green', 'blue'])
        @reader.read
      end
    end
    current_node.color = color
    #@todo add unit test for this
  when 'date'
    date = Date.new
    date.unit = @reader["unit"]
    #move to the next token, which is always empty, since date tag does not
    # have text associated with it
    @reader.read
    @reader.read #now the token is the first tag under date tag
    while not(is_end_element?('date'))
      parse_simple_element(date, 'desc')
      parse_simple_element(date, 'value')
      parse_simple_element(date, 'minimum')
      parse_simple_element(date, 'maximum')
      @reader.read
    end
    current_node.date = date
  when 'reference'
    reference = Reference.new()
    reference.doi = @reader['doi']
    if not @reader.empty_element?
      while not is_end_element?('reference')
        parse_simple_element(reference, 'desc')
        @reader.read
      end
    end
    current_node.references << reference
  when 'binary_characters'
    current_node.binary_characters = parse_binary_characters
  when 'clade'
    #do nothing
  else
    current_node.other << parse_other
    #puts "No match found in parse_clade_elements.(#{@reader.name})"
  end

end #parse_clade_elements
690
+
691
# Parses an <events> element. Called with the reader on the <events> start
# tag; reads up to the matching end tag and returns the filled
# PhyloXML::Events object.
def parse_events()
  parsed = PhyloXML::Events.new
  simple_tags = ['type', 'duplications', 'speciations', 'losses']
  @reader.read #go to next element
  until is_end_element?('events')
    parse_simple_elements(parsed, simple_tags)
    #@todo could add unit test for this (example file does not have this case)
    parsed.confidence = parse_confidence if is_element?('confidence')
    @reader.read
  end
  parsed
end #parse_events
705
+
706
# Parses a <taxonomy> element. Called with the reader on the <taxonomy>
# start tag; reads up to the matching end tag and returns the filled
# PhyloXML::Taxonomy object. Child elements that are not known here are
# stored in taxonomy.other.
def parse_taxonomy
  taxonomy = PhyloXML::Taxonomy.new
  parse_attributes(taxonomy, ["id_source"])
  @reader.read
  until is_end_element?('taxonomy')

    if @reader.node_type == XML::Reader::TYPE_ELEMENT
      tag = @reader.name
      case tag
      when 'code', 'scientific_name', 'rank', 'authority'
        # plain text children: taxonomy.<tag> = text
        @reader.read
        taxonomy.send("#{tag}=", @reader.value)
        @reader.read
      when 'id'
        taxonomy.taxonomy_id = parse_id('id')
      when 'common_name'
        @reader.read
        taxonomy.common_names << @reader.value
        @reader.read
        #has_reached_end_element?('common_name')
      when 'synonym'
        @reader.read
        taxonomy.synonyms << @reader.value
        @reader.read
        #has_reached_end_element?('synonym')
      when 'uri'
        taxonomy.uri = parse_uri
      else
        taxonomy.other << parse_other
      end
    end

    @reader.read #move to next tag in the loop
  end
  taxonomy
end #parse_taxonomy
753
+
754
+ private
755
+
756
# Parses a <sequence> element. Called with the reader on the <sequence>
# start tag; reads up to the matching end tag and returns the filled
# Sequence object. Child elements that are not known here are stored in
# sequence.other.
def parse_sequence
  sequence = Sequence.new
  parse_attributes(sequence, ["type", "id_source", "id_ref"])

  @reader.read
  until is_end_element?('sequence')

    if @reader.node_type == XML::Reader::TYPE_ELEMENT
      tag = @reader.name
      case tag
      when 'symbol', 'name', 'location'
        # plain text children: sequence.<tag> = text
        @reader.read
        sequence.send("#{tag}=", @reader.value)
        @reader.read
      when 'mol_seq'
        sequence.is_aligned = @reader["is_aligned"]
        @reader.read
        sequence.mol_seq = @reader.value
        @reader.read
        has_reached_end_element?('mol_seq')
      when 'accession'
        sequence.accession = Accession.new
        sequence.accession.source = @reader["source"]
        @reader.read
        sequence.accession.value = @reader.value
        @reader.read
        has_reached_end_element?('accession')
      when 'uri'
        sequence.uri = parse_uri
      when 'annotation'
        sequence.annotations << parse_annotation
      when 'domain_architecture'
        sequence.domain_architecture = DomainArchitecture.new
        sequence.domain_architecture.length = @reader["length"]
        # two reads are done before the first domain is parsed
        2.times { @reader.read }
        until is_end_element?('domain_architecture')
          sequence.domain_architecture.domains << parse_domain
          @reader.read #go to next domain element
        end
      else
        sequence.other << parse_other
        #@todo add unit test
      end
    end

    @reader.read
  end
  sequence
end #parse_sequence
813
+
814
# Parses a <uri> element: the desc and type attributes and the text of the
# element are stored in a new Uri object, which is returned.
def parse_uri
  parsed = Uri.new
  parse_attributes(parsed, ["desc", "type"])
  parse_simple_element(parsed, 'uri')
  parsed
end
820
+
821
# Parses an <annotation> element. Called with the reader on the
# <annotation> start tag. The attributes are always read; unless the
# element is empty, its desc, confidence, property and uri children are
# read up to the matching end tag. Returns the Annotation object.
def parse_annotation
  annotation = Annotation.new

  parse_attributes(annotation, ['ref', 'source', 'evidence', 'type'])

  return annotation if @reader.empty_element?

  until is_end_element?('annotation')
    # parse_simple_element does nothing unless the reader is on <desc>
    parse_simple_element(annotation, 'desc')

    annotation.confidence = parse_confidence if is_element?('confidence')

    annotation.properties << parse_property if is_element?('property')

    annotation.uri = parse_uri if is_element?('uri')

    @reader.read
  end

  annotation
end
844
+
845
# Parses a <property> element: its attributes and its text are stored in a
# new Property object. Raises unless the reader ends up on </property>.
def parse_property
  parsed = Property.new
  attribute_names = ["ref", "unit", "datatype", "applies_to", "id_ref"]
  parse_attributes(parsed, attribute_names)
  @reader.read
  parsed.value = @reader.value
  @reader.read
  has_reached_end_element?('property')
  parsed
end #parse_property
854
+
855
# Parses a <confidence> element: the type attribute and the text converted
# with to_f are used to create the Confidence object that is returned.
# Raises unless the reader ends up on </confidence>.
def parse_confidence
  kind = @reader["type"]
  @reader.read
  score = @reader.value.to_f
  @reader.read
  has_reached_end_element?('confidence')
  Confidence.new(kind, score)
end #parse_confidence
863
+
864
# Parses a <distribution> element. Called with the reader on the
# <distribution> start tag; collects its desc, point and polygon children
# up to the matching end tag and returns the Distribution object.
def parse_distribution
  parsed = Distribution.new
  @reader.read
  until is_end_element?('distribution')

    parse_simple_element(parsed, 'desc')

    if is_element?('point')
      parsed.points << parse_point
    end
    if is_element?('polygon')
      parsed.polygons << parse_polygon
    end

    @reader.read
  end
  parsed
end #parse_distribution
878
+
879
# Parses a <point> element. Called with the reader on the <point> start
# tag; reads the geodetic_datum and alt_unit attributes and the lat, long
# and alt children (alt is converted with to_f) up to the matching end tag.
# Returns the Point object.
def parse_point
  parsed = Point.new

  parsed.geodetic_datum = @reader["geodetic_datum"]
  parsed.alt_unit = @reader["alt_unit"]

  @reader.read
  until is_end_element?('point')

    parse_simple_elements(parsed, ['lat', 'long'] )

    if is_element?('alt')
      @reader.read
      altitude = @reader.value.to_f
      parsed.alt = altitude
      @reader.read
      has_reached_end_element?('alt')
    end
    #advance reader
    @reader.read
  end
  parsed
end #parse_point
901
+
902
# Parses a <polygon> element. Called with the reader on the <polygon> start
# tag; collects its point children up to the matching end tag and returns
# the Polygon object.
#
# A polygon with fewer than 3 points is still returned; a warning is
# issued with Kernel#warn (standard error), so the standard output of the
# calling program is not polluted.
def parse_polygon
  polygon = Polygon.new
  @reader.read
  while not(is_end_element?('polygon')) do
    polygon.points << parse_point if is_element?('point')
    @reader.read
  end

  #@todo should check for it at all? Probably not if xml is valid.
  if polygon.points.length <3
    warn "Warning: <polygon> should have at least 3 points"
  end
  return polygon
end #parse_polygon
916
+
917
# Parses an id-like element named _tag_name_: the provider attribute and
# the text are stored in a new Id object. Raises unless the reader ends up
# on the end tag named _tag_name_.
def parse_id(tag_name)
  parsed = Id.new
  parsed.provider = @reader["provider"]
  @reader.read
  parsed.value = @reader.value
  @reader.read #@todo shouldn't there be another read?
  has_reached_end_element?(tag_name)
  parsed
end #parse_id
926
+
927
# Parses a <domain> element: the from, to, confidence and id attributes and
# the text are stored in a new ProteinDomain object. Raises unless
# </domain> is reached; afterwards the reader is advanced by one more node.
def parse_domain
  parsed = ProteinDomain.new
  parse_attributes(parsed, ["from", "to", "confidence", "id"])
  @reader.read
  parsed.value = @reader.value
  @reader.read
  has_reached_end_element?('domain')
  # step past </domain>
  @reader.read
  parsed
end
937
+
938
# Parses a <binary_characters> element. Called with the reader on its start
# tag. The type attribute is stored as bc_type, the *_count attributes are
# copied, and unless the element is empty its lost, gained, absent and
# present children are read up to the matching end tag.
# Returns the PhyloXML::BinaryCharacters object.
def parse_binary_characters
  characters = PhyloXML::BinaryCharacters.new
  characters.bc_type = @reader['type']

  count_attributes = ['gained_count', 'absent_count', 'lost_count', 'present_count']
  parse_attributes(characters, count_attributes)

  return characters if @reader.empty_element?

  @reader.read
  until is_end_element?('binary_characters')
    ['lost', 'gained', 'absent', 'present'].each do |group|
      parse_bc(characters, group)
    end

    @reader.read
  end
  characters
end #parse_binary_characters
957
+
958
# Parses one group of binary characters. When the reader is positioned on
# the start tag named _element_, the text of every <bc> child is appended
# to the collection returned by object.<element>, up to the matching end
# tag. Nothing happens when the reader is positioned on anything else.
def parse_bc(object, element)
  return unless is_element?(element)

  @reader.read
  until is_end_element?(element)
    if is_element?('bc')
      @reader.read
      object.send(element) << @reader.value
      @reader.read
      has_reached_end_element?('bc')
    end
    @reader.read
  end
end #parse_bc
972
+
973
# Parses an element that is not part of the phyloXML vocabulary handled by
# this parser into a PhyloXML::Other object: element name, attributes, text
# value and, recursively, child elements.
#
# Called with the reader on the node to be parsed. For an element with
# content the reader is left on its end tag; an empty element (<x/>) has no
# end tag, so the reader stays on the element itself.
#
# Raises a RuntimeError when the input ends before the end tag is reached.
# ---
# *Returns*:: PhyloXML::Other
def parse_other
  other_obj = PhyloXML::Other.new
  other_obj.element_name = @reader.name
  # Has to be asked before the reader is moved to the attributes.
  is_empty = (@reader.node_type == XML::Reader::TYPE_ELEMENT &&
              @reader.empty_element?)

  #parse attributes
  code = @reader.move_to_first_attribute
  while code ==1
    other_obj.attributes[@reader.name] = @reader.value
    code = @reader.move_to_next_attribute
  end
  # go back from the attributes to the element that owns them
  @reader.move_to_element

  # <x/>: there is neither content nor an end tag to look for
  return other_obj if is_empty

  if not is_end_element?(other_obj.element_name)
    loop do
      # stop at the end of the input; the check below reports it
      break unless @reader.read &&
                   @reader.node_type != XML::Reader::TYPE_NONE

      case @reader.node_type
      when XML::Reader::TYPE_ELEMENT
        # recursive call to parse children. It consumes the end tag of the
        # child, so a child carrying the same name as this element does not
        # end this element.
        other_obj.children << parse_other
      when XML::Reader::TYPE_TEXT
        other_obj.value = @reader.value
      when XML::Reader::TYPE_END_ELEMENT
        break if @reader.name == other_obj.element_name
      end
    end
  end
  #just a check
  has_reached_end_element?(other_obj.element_name)
  return other_obj
end #parse_other
995
+
996
+ end #class phyloxmlParser
997
+
998
+ end #module PhyloXML
999
+
1000
+ end #module Bio