bioruby-phyloxml 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,1001 @@
1
+ #
2
+ # = bio/db/phyloxml_parser.rb - PhyloXML parser
3
+ #
4
+ # Copyright:: Copyright (C) 2009
5
+ # Diana Jaunzeikare <latvianlinuxgirl@gmail.com>
6
+ # License:: The Ruby License
7
+ #
8
+ #
9
+ # == Description
10
+ #
11
+ # This file contains a parser for PhyloXML.
12
+ #
13
+ # == Requirements
14
+ #
15
+ # Libxml2 XML parser is required. Install libxml-ruby bindings from
16
+ # http://libxml.rubyforge.org or
17
+ #
18
+ # gem install -r libxml-ruby
19
+ #
20
+ # == References
21
+ #
22
+ # * http://www.phyloxml.org
23
+ #
24
+ # * https://www.nescent.org/wg_phyloinformatics/PhyloSoC:PhyloXML_support_in_BioRuby
25
+
26
+
27
+ require 'uri'
28
+ require 'libxml'
29
+
30
+ require 'bio/tree'
31
+ require 'bio-phyloxml/phyloxml_elements'
32
+
33
+
34
+ module Bio
35
+
36
+ module PhyloXML
37
+
38
+
39
+
40
+
41
+ # == Description
42
+ #
43
+ # Bio::PhyloXML::Parser is for parsing phyloXML format files.
44
+ #
45
+ # == Requirements
46
+ #
47
+ # Libxml2 XML parser is required. Install libxml-ruby bindings from
48
+ # http://libxml.rubyforge.org or
49
+ #
50
+ # gem install -r libxml-ruby
51
+ #
52
+ # == Usage
53
+ #
54
+ # require 'bio'
55
+ #
56
+ # # Create new phyloxml parser
57
+ # phyloxml = Bio::PhyloXML::Parser.open('example.xml')
58
+ #
59
+ # # Print the names of all trees in the file
60
+ # phyloxml.each do |tree|
61
+ # puts tree.name
62
+ # end
63
+ #
64
+ #
65
+ # == References
66
+ #
67
+ # http://www.phyloxml.org/documentation/version_100/phyloxml.xsd.html
68
+ #
69
+ class Parser
70
+
71
+ include LibXML
72
+
73
+ # After parsing all the trees, if there is anything else in other xml format,
74
+ # it is saved in this array of PhyloXML::Other objects
75
+ attr_reader :other
76
+
77
+ # Initializes LibXML::Reader and reads the file until it reaches the first
78
+ # phylogeny element.
79
+ #
80
+ # Example: Create a new Bio::PhyloXML::Parser object.
81
+ #
82
+ # p = Bio::PhyloXML::Parser.open("./phyloxml_examples.xml")
83
+ #
84
+ # If the optional code block is given, Bio::PhyloXML object is passed to
85
+ # the block as an argument. When the block terminates, the Bio::PhyloXML
86
+ # object is automatically closed, and the open method returns the value
87
+ # of the block.
88
+ #
89
+ # Example: Get the first tree in the file.
90
+ #
91
+ # tree = Bio::PhyloXML::Parser.open("example.xml") do |px|
92
+ # px.next_tree
93
+ # end
94
+ #
95
+ # ---
96
+ # *Arguments*:
97
+ # * (required) _filename_: Path to the file to parse.
98
+ # * (optional) _validate_: Whether to validate the file against schema or not. Default value is true.
99
+ # *Returns*:: (without block) Bio::PhyloXML::Parser object
100
+ # *Returns*:: (with block) the value of the block
101
def self.open(filename, validate=true)
  parser = new(nil, validate)
  parser.instance_eval do
    # The block runs with the new parser as self, which is what makes the
    # private helpers and @reader reachable from this class method.
    filename = _secure_filename(filename)
    _validate(:file, filename) if validate
    # XML::Parser::Options::NONET for security reason
    reader_options = { :options => LibXML::XML::Parser::Options::NONET }
    @reader = XML::Reader.file(filename, reader_options)
    _skip_leader
  end

  # Without a block the parser is handed back as is; closing it is then
  # up to the caller.
  return parser unless block_given?

  begin
    # The value of the block becomes the return value of this method.
    yield parser
  ensure
    parser.close if parser && !parser.closed?
  end
end
123
+
124
+ # Initializes LibXML::Reader and reads the file until it reaches the first
125
+ # phylogeny element.
126
+ #
127
+ # Create a new Bio::PhyloXML::Parser object.
128
+ #
129
+ # p = Bio::PhyloXML::Parser.open_uri("http://www.phyloxml.org/examples/apaf.xml")
130
+ #
131
+ # If the optional code block is given, Bio::PhyloXML object is passed to
132
+ # the block as an argument. When the block terminates, the Bio::PhyloXML
133
+ # object is automatically closed, and the open_uri method returns the
134
+ # value of the block.
135
+ #
136
+ # ---
137
+ # *Arguments*:
138
+ # * (required) _uri_: (URI or String) URI to the data to parse
139
+ # * (optional) _validate_: For URI reader, the "validate" option is ignored and no validation is executed.
140
+ # *Returns*:: (without block) Bio::PhyloXML::Parser object
141
+ # *Returns*:: (with block) the value of the block
142
def self.open_uri(uri, validate=true)
  # Reduce the argument to a String. Objects that are neither URI nor
  # string-like fail in to_str; malformed text fails in URI.parse.
  if URI === uri
    uri = uri.to_s
  else
    uri = uri.to_str
    URI.parse(uri)
  end

  parser = new(nil, validate)
  parser.instance_eval do
    # No schema validation is run here, and the reader is opened without
    # extra options.
    @reader = XML::Reader.file(uri)
    _skip_leader
  end

  # Without a block the parser is handed back as is.
  return parser unless block_given?

  begin
    yield parser
  ensure
    parser.close if parser && !parser.closed?
  end
end
169
+
170
+ # Special class for closed PhyloXML::Parser object.
171
+ # It raises error for any methods except essential methods.
172
+ #
173
+ # Bio::PhyloXML internal use only.
174
class ClosedPhyloXMLParser #:nodoc:
  # Placeholder stored in @reader once the parser is closed. It defines no
  # methods of its own, so any reader call ends up here and fails.
  def method_missing(*_ignored)
    raise(LibXML::XML::Error, 'closed PhyloXML::Parser object')
  end
end #class ClosedPhyloXMLParser
179
+
180
+ # Closes the LibXML::Reader inside the object.
181
+ # It also closes the opened file if it is created by using
182
+ # Bio::PhyloXML::Parser.open method.
183
+ #
184
+ # When closed object is closed again, or closed object is used,
185
+ # it raises LibXML::XML::Error.
186
+ # ---
187
+ # *Returns*:: nil
188
def close
  # Closing first means a failure here leaves @reader untouched. On an
  # already closed parser this call is what raises, via the placeholder.
  active_reader = @reader
  active_reader.close
  @reader = ClosedPhyloXMLParser.new
  nil
end
193
+
194
+ # If the object is closed by using the close method or equivalent,
195
+ # returns true. Otherwise, returns false.
196
+ # ---
197
+ # *Returns*:: true or false
198
def closed?
  # close swaps the reader for a ClosedPhyloXMLParser placeholder, so the
  # type of @reader is the closed flag.
  @reader.kind_of?(ClosedPhyloXMLParser)
end
205
+
206
+ # Initializes LibXML::Reader and reads from the IO until it reaches
207
+ # the first phylogeny element.
208
+ #
209
+ # Create a new Bio::PhyloXML::Parser object.
210
+ #
211
+ # p = Bio::PhyloXML::Parser.for_io($stdin)
212
+ #
213
+ # ---
214
+ # *Arguments*:
215
+ # * (required) _io_: IO object
216
+ # * (optional) _validate_: For IO reader, the "validate" option is ignored and no validation is executed.
217
+ # *Returns*:: Bio::PhyloXML::Parser object
218
def self.for_io(io, validate=true)
  parser = new(nil, validate)
  parser.instance_eval do
    # NONET: no network access while reading from the IO.
    reader_options = { :options => LibXML::XML::Parser::Options::NONET }
    @reader = XML::Reader.io(io, reader_options)
    _skip_leader
  end
  parser
end
228
+
229
+ # (private) returns PhyloXML schema
230
def _schema
  # phyloxml.xsd is looked up in the directory of this source file.
  xsd_path = File.join(File.dirname(__FILE__), 'phyloxml.xsd')
  xsd_document = XML::Document.file(xsd_path)
  XML::Schema.document(xsd_document)
end
private :_schema
234
+
235
+ # (private) do validation
236
+ # ---
237
+ # *Arguments*:
238
+ # * (required) <em>data_type</em>: :file for filename, :string for string
239
+ # * (required) _arg_: filename or string
240
+ # *Returns*:: (undefined)
241
def _validate(data_type, arg)
  # Parse quietly (no error or warning messages) and without network access.
  quiet_offline = LibXML::XML::Parser::Options::NOERROR |
                  LibXML::XML::Parser::Options::NOWARNING |
                  LibXML::XML::Parser::Options::NONET
  options = { :options => quiet_offline }

  if data_type == :file
    # No validation when special file e.g. FIFO (named pipe)
    return unless File.file?(arg)
    xml_instance = XML::Document.file(arg, options)
  elsif data_type == :string
    xml_instance = XML::Document.string(arg, options)
  else
    # no validation for unknown data type
    return
  end

  schema = _schema
  valid =
    begin
      xml_instance.validate_schema(schema) do |msg, _|
        # The libxml-ruby documentation says this block runs on validation
        # failure; the original author observed that it is never called.
        raise "Validation of the XML document against phyloxml.xsd schema failed. #{msg}"
      end
    rescue LibXML::XML::Error => evar
      raise "Validation of the XML document against phyloxml.xsd schema failed, or XML error occurred. #{evar.message}"
    end

  # A falsy result without any exception still counts as a failure.
  raise "Validation of the XML document against phyloxml.xsd schema failed." unless valid
end
private :_validate
275
+
276
+ # (private) It seems that LibXML::XML::Reader reads from the network
277
+ # even if LibXML::XML::Parser::Options::NONET is set.
278
+ # So, for URI-like filename, '://' is replaced with ':/'.
279
def _secure_filename(filename)
  # Only names that start with a scheme-like "letters://" prefix are touched.
  return filename unless /\A[a-zA-Z]+\:\/\// =~ filename

  # for example, "http://a/b" is changed to "http:/a/b".
  filename.sub(/\:\/\//, ':/')
end
private :_secure_filename
288
+
289
+ # (private) loops through until reaches phylogeny stuff
290
# (private) Advances the reader until it sits on a <phylogeny> start tag.
#
# The scan also stops when the reader reports that nothing more could be
# read (XML::Reader#read returning a falsy value). Without that check an
# input with no <phylogeny> element would keep this loop spinning forever.
# ---
# *Returns*:: nil
def _skip_leader
  until is_element?('phylogeny')
    break unless @reader.read
  end
  nil
end
private :_skip_leader
297
+
298
+ # Initializes LibXML::Reader and reads the PhyloXML-formatted string
299
+ # until it reaches the first phylogeny element.
300
+ #
301
+ # Create a new Bio::PhyloXML::Parser object.
302
+ #
303
+ # str = File.read("./phyloxml_examples.xml")
304
+ # p = Bio::PhyloXML::Parser.new(str)
305
+ #
306
+ #
307
+ # Deprecated usage: Reads data from a file. <em>str</em> is a filename.
308
+ #
309
+ # p = Bio::PhyloXML::Parser.new("./phyloxml_examples.xml")
310
+ #
311
+ # Taking filename is deprecated. Use Bio::PhyloXML::Parser.open(filename).
312
+ #
313
+ # ---
314
+ # *Arguments*:
315
+ # * (required) _str_: PhyloXML-formatted string
316
+ # * (optional) _validate_: Whether to validate the file against schema or not. Default value is true.
317
+ # *Returns*:: Bio::PhyloXML::Parser object
318
def initialize(str, validate=true)
  @other = []

  # The class-level constructors pass nil and install @reader themselves.
  return unless str

  # NONET: no network access while reading. Both branches below use it;
  # the deprecated filename branch used to open the reader without it.
  reader_options = { :options => LibXML::XML::Parser::Options::NONET }

  # For compatibility, a string without markup characters or line breaks
  # that names an existing file is treated as a filename.
  if /[\<\>\r\n]/ !~ str && File.exist?(str)
    warn "Bio::PhyloXML::Parser.new(filename) is deprecated. Use Bio::PhyloXML::Parser.open(filename)."
    filename = _secure_filename(str)
    _validate(:file, filename) if validate
    @reader = XML::Reader.file(filename, reader_options)
  else
    # Anything else is parsed as PhyloXML text. No schema validation is
    # run on this path, whatever the value of validate.
    @reader = XML::Reader.string(str, reader_options)
  end

  _skip_leader
end
342
+
343
+
344
+ # Iterate through all trees in the file.
345
+ #
346
+ # phyloxml = Bio::PhyloXML::Parser.open('example.xml')
347
+ # phyloxml.each do |tree|
348
+ # puts tree.name
349
+ # end
350
+ #
351
def each
  # Without a block, hand back an Enumerator instead of failing with
  # LocalJumpError at the first yield.
  return enum_for(:each) unless block_given?

  # next_tree returns nil when there is nothing more to yield.
  while (tree = next_tree)
    yield tree
  end
end
356
+
357
+ # Access the specified tree in the file. It parses trees until the specified
358
+ # tree is reached.
359
+ #
360
+ # # Get 3rd tree in the file (starts counting from 0).
361
+ # parser = PhyloXML::Parser.open('phyloxml_examples.xml')
362
+ # tree = parser[2]
363
+ #
364
def [](i)
  # Trees are consumed from the current reader position: next_tree is
  # called i+1 times and the result of the last call is returned (nil when
  # no call is made).
  (i + 1).times.inject(nil) { |_previous, _count| next_tree }
end
371
+
372
+ # Parse and return the next phylogeny tree. If there are no more phylogeny
373
+ # element, nil is returned. If there is something else besides phylogeny
374
+ # elements, it is saved in the PhyloXML::Parser#other.
375
+ #
376
+ # p = Bio::PhyloXML::Parser.open("./phyloxml_examples.xml")
377
+ # tree = p.next_tree
378
+ #
379
+ # ---
380
+ # *Returns*:: Bio::PhyloXML::Tree
381
def next_tree()
  unless is_element?('phylogeny')
    # No current node at all (for instance the input ran out before any
    # phylogeny was found): nothing is left to parse.
    return nil if @reader.node_type == XML::Reader::TYPE_NONE

    if @reader.node_type == XML::Reader::TYPE_END_ELEMENT
      return nil if is_end_element?('phyloxml')
      # Step over this end tag and the node after it, then look again.
      @reader.read
      @reader.read
      return nil if is_end_element?('phyloxml')
    end
    # phyloxml can hold only phylogeny and "other" elements. If this is not
    # a phylogeny element then it is other. Also, "other" always comes after
    # all phylogenies.
    @other << parse_other
    # return nil for tree, since this is not a valid phyloxml tree.
    return nil
  end

  tree = Bio::PhyloXML::Tree.new

  # Stack of open containers; the last entry is the node currently being
  # filled. The tree itself sits at the bottom.
  clades = [tree]

  # Edge leading to the most recently created non-root clade, kept so that
  # a <branch_length> child element can set its distance.
  current_edge = nil

  # <property> (among others) is legal both inside a clade and directly
  # under phylogeny, so we track which of the two is being parsed.
  parsing_clade = false

  until is_end_element?('phylogeny')
    break if is_end_element?('phyloxml')

    # Set when the clade handled in this iteration was written as <clade/>.
    self_closed_clade = false

    # parse phylogeny elements, except clade
    unless parsing_clade
      if is_element?('phylogeny')
        tree.rooted = (@reader["rooted"] == "true")
        tree.rerootable = (@reader["rerootable"] == "true")
        parse_attributes(tree, ["branch_length_unit", 'type'])
      end

      parse_simple_elements(tree, [ "name", 'description', "date"])

      tree.confidences << parse_confidence if is_element?('confidence')
    end

    if @reader.node_type == XML::Reader::TYPE_ELEMENT
      if @reader.name == 'clade'
        parsing_clade = true

        # The reader produces no end-element node for <clade/>, so the
        # closing bookkeeping below has to be triggered from here.
        self_closed_clade = @reader.empty_element?

        node = Bio::PhyloXML::Node.new
        branch_length = @reader['branch_length']
        parse_attributes(node, ["id_source"])

        tree.add_node(node)
        # The first clade becomes the root; every later one hangs off the
        # clade that is currently open.
        if tree.root.nil?
          tree.root = node
        else
          current_edge = tree.add_edge(clades[-1], node,
                                       Bio::Tree::Edge.new(branch_length))
        end
        clades.push node
      elsif parsing_clade
        parse_clade_elements(clades[-1], current_edge)
      end
    end

    # A clade ended (explicit end tag or self-closing start tag): go one
    # level up, or leave clade mode when the one that ended is the root.
    if self_closed_clade || is_end_element?('clade')
      if clades[-1] == tree.root
        parsing_clade = false
      else
        clades.pop
      end
    end

    # parsing phylogeny-level elements
    if !parsing_clade && @reader.node_type == XML::Reader::TYPE_ELEMENT
      case @reader.name
      when 'property'
        tree.properties << parse_property
      when 'clade_relation'
        tree.clade_relations <<
          _parse_relation(CladeRelation.new, 'clade_relation')
      when 'sequence_relation'
        tree.sequence_relations <<
          _parse_relation(SequenceRelation.new, 'sequence_relation')
      when 'phylogeny', 'clade'
        # Already handled above; 'clade' reaches this point only for a
        # self-closing root clade.
      else
        tree.other << parse_other
      end
    end

    # go to next node
    @reader.read
  end

  # Move past the </phylogeny> end tag and the node that follows it.
  @reader.read
  @reader.read

  tree
end

# (private) Fills a CladeRelation or SequenceRelation from the element the
# reader is on: the four attributes first, then an optional <confidence>
# child. The children are scanned up to the relation's own end tag, so
# whitespace between the tags does not hide the confidence element.
# ---
# *Arguments*:
# * (required) _relation_: object receiving the parsed values
# * (required) <em>tag_name</em>: name of the element being parsed
# *Returns*:: the relation object
def _parse_relation(relation, tag_name)
  parse_attributes(relation, ["id_ref_0", "id_ref_1", "distance", "type"])
  return relation if @reader.empty_element?

  @reader.read
  until is_end_element?(tag_name)
    relation.confidence = parse_confidence if is_element?('confidence')
    break unless @reader.read
  end
  relation
end
private :_parse_relation
534
+
535
+ # return tree of specified name.
536
+ # @todo Implement this method.
537
+ # def get_tree_by_name(name)
538
+
539
+ # while not is_end_element?('phyloxml')
540
+ # if is_element?('phylogeny')
541
+ # @reader.read
542
+ # @reader.read
543
+ #
544
+ # if is_element?('name')
545
+ # @reader.read
546
+ # if @reader.value == name
547
+ # puts "equasl"
548
+ # tree = next_tree
549
+ # puts tree
550
+ # end
551
+ # end
552
+ # end
553
+ # @reader.read
554
+ # end
555
+ #
556
+ # end
557
+
558
+
559
+ private
560
+
561
+ ####
562
+ # Utility methods
563
+ ###
564
+
565
# True when the reader is on a start tag named +str+, false otherwise.
def is_element?(str)
  @reader.node_type == XML::Reader::TYPE_ELEMENT && @reader.name == str
end
568
+
569
# True when the reader is on an end tag named +str+, false otherwise.
def is_end_element?(str)
  @reader.node_type == XML::Reader::TYPE_END_ELEMENT && @reader.name == str
end
572
+
573
# Sanity check used after reading an element's text: raises RuntimeError
# unless the reader is now on the end tag named +str+. Returns nil when
# the check passes.
def has_reached_end_element?(str)
  raise "Warning: Should have reached </#{str}> element here" unless is_end_element?(str)
end
578
+
579
+ # Parses a simple XML element. for example <speciations>1</speciations>
580
+ # It reads in the value and assigns it to object.speciation = 1
581
+ # Also checks if have reached end tag (</speciations> and gives warning
582
+ # if not
583
# Parses a text-only element such as <speciations>1</speciations> by
# passing its text to the matching writer (object.speciations = "1").
# Does nothing when the reader is not on a start tag called +name+.
#
# An element without text, written as <name/> or <name></name>, leaves the
# attribute untouched. On return the reader is on the element's end tag,
# or still on the start tag for <name/>, so a caller's usual follow-up
# read moves on correctly in every case.
#
# Raises RuntimeError when the node after the text is not the expected
# end tag.
def parse_simple_element(object, name)
  return unless is_element?(name)

  # <name/>: the reader produces no text node and no end tag for it.
  return if @reader.empty_element?

  @reader.read
  # <name></name>: the node after the start tag is already the end tag.
  return if is_end_element?(name)

  object.send("#{name}=", @reader.value)
  @reader.read
  has_reached_end_element?(name)
end
591
+
592
# Tries parse_simple_element for every tag name in +elements+, in order,
# against the current reader position. Returns +elements+.
def parse_simple_elements(object, elements)
  elements.each { |tag| parse_simple_element(object, tag) }
end
597
+
598
+ #Parses list of attributes
599
+ #use for the code like: clade_relation.type = @reader["type"]
600
# Copies XML attributes of the current element onto +object+: for every
# name in +arr_of_attrs+ the writer "name=" receives @reader[name].
# Shorthand for lines like clade_relation.type = @reader["type"].
# Returns +arr_of_attrs+.
def parse_attributes(object, arr_of_attrs)
  arr_of_attrs.each do |attribute_name|
    writer = "#{attribute_name}="
    object.send(writer, @reader[attribute_name])
  end
end
605
+
606
# Handles one child element of a <clade>. The caller owns the loop: this
# method looks only at the element the reader is on, stores what it finds
# on +current_node+ (or +current_edge+ for branch_length) and leaves the
# reader inside or at the end of that element. Non-element nodes are
# ignored.
#
# * _current_node_: node of the clade being filled
# * _current_edge_: edge leading to that node, or nil
def parse_clade_elements(current_node, current_edge)
  #no loop inside, loop is already outside
  return unless @reader.node_type == XML::Reader::TYPE_ELEMENT

  case @reader.name
  when 'branch_length'
    # current_edge is nil when no edge has been created yet; the value is
    # then read and dropped.
    @reader.read
    branch_length = @reader.value
    current_edge.distance = branch_length.to_f unless current_edge.nil?
    @reader.read
  when 'width'
    @reader.read
    current_node.width = @reader.value
    @reader.read
  when 'name'
    @reader.read
    current_node.name = @reader.value
    @reader.read
  when 'events'
    current_node.events = parse_events
  when 'confidence'
    current_node.confidences << parse_confidence
  when 'sequence'
    current_node.sequences << parse_sequence
  when 'property'
    current_node.properties << parse_property
  when 'taxonomy'
    current_node.taxonomies << parse_taxonomy
  when 'distribution'
    current_node.distributions << parse_distribution
  when 'node_id'
    # NOTE(review): parse_id copies a "provider" attribute, this branch
    # copies "type" - confirm which one Id is meant to carry here.
    id = Id.new
    id.type = @reader["type"]
    @reader.read
    id.value = @reader.value
    @reader.read
    current_node.id = id
  when 'color'
    color = BranchColor.new
    # Walk the children up to </color>. The channel elements are children
    # of <color>, so probing for them on the <color> start tag itself
    # never matched and they were collected as unknown elements instead.
    unless @reader.empty_element?
      @reader.read
      until is_end_element?('color')
        parse_simple_elements(color, ['red', 'green', 'blue'])
        break unless @reader.read
      end
    end
    current_node.color = color
  when 'date'
    date = Date.new
    date.unit = @reader["unit"]
    # A single read before the loop is enough: whitespace nodes are simply
    # skipped by the loop, and a first child that follows <date> directly
    # is no longer stepped over. <date/> has no children and no end tag.
    unless @reader.empty_element?
      @reader.read
      until is_end_element?('date')
        parse_simple_elements(date, ['desc', 'value', 'minimum', 'maximum'])
        break unless @reader.read
      end
    end
    current_node.date = date
  when 'reference'
    reference = Reference.new()
    reference.doi = @reader['doi']
    unless @reader.empty_element?
      until is_end_element?('reference')
        parse_simple_element(reference, 'desc')
        @reader.read
      end
    end
    current_node.references << reference
  when 'binary_characters'
    current_node.binary_characters = parse_binary_characters
  when 'clade'
    # nested clades are handled by the caller
  else
    current_node.other << parse_other
  end
end #parse_clade_elements
691
+
692
# Reads an <events> element, with the reader starting on its start tag,
# into a PhyloXML::Events object. Consumes nodes up to </events>.
def parse_events()
  events = PhyloXML::Events.new
  @reader.read #go to next element
  until is_end_element?('events')
    parse_simple_elements(events, ['type', 'duplications',
                                   'speciations', 'losses'])
    events.confidence = parse_confidence if is_element?('confidence')
    @reader.read
  end
  events
end #parse_events
706
+
707
# Reads a <taxonomy> element, with the reader starting on its start tag,
# into a PhyloXML::Taxonomy object. Consumes nodes up to </taxonomy>;
# child elements that are not recognised are kept in taxonomy.other.
def parse_taxonomy
  taxonomy = PhyloXML::Taxonomy.new
  parse_attributes(taxonomy, ["id_source"])
  @reader.read
  until is_end_element?('taxonomy')
    if @reader.node_type == XML::Reader::TYPE_ELEMENT
      tag = @reader.name
      case tag
      when 'code', 'scientific_name', 'rank', 'authority'
        # Single-valued text fields: the writer has the name of the tag.
        @reader.read
        taxonomy.send("#{tag}=", @reader.value)
        @reader.read
      when 'id'
        taxonomy.taxonomy_id = parse_id('id')
      when 'common_name'
        # May occur several times, so the texts are collected.
        @reader.read
        taxonomy.common_names << @reader.value
        @reader.read
      when 'synonym'
        @reader.read
        taxonomy.synonyms << @reader.value
        @reader.read
      when 'uri'
        taxonomy.uri = parse_uri
      else
        taxonomy.other << parse_other
      end
    end

    @reader.read #move to next tag in the loop
  end
  taxonomy
end #parse_taxonomy
754
+
755
+ private
756
+
757
# Reads a <sequence> element, with the reader starting on its start tag,
# into a Sequence object. Consumes nodes up to </sequence>; child
# elements that are not recognised are kept in sequence.other.
def parse_sequence
  sequence = Sequence.new
  parse_attributes(sequence, ["type", "id_source", "id_ref"])

  @reader.read
  until is_end_element?('sequence')
    if @reader.node_type == XML::Reader::TYPE_ELEMENT
      tag = @reader.name
      case tag
      when 'symbol', 'name', 'location'
        # Plain text fields: the writer has the name of the tag.
        @reader.read
        sequence.send("#{tag}=", @reader.value)
        @reader.read
      when 'mol_seq'
        sequence.is_aligned = @reader["is_aligned"]
        @reader.read
        sequence.mol_seq = @reader.value
        @reader.read
        has_reached_end_element?('mol_seq')
      when 'accession'
        sequence.accession = Accession.new
        sequence.accession.source = @reader["source"]
        @reader.read
        sequence.accession.value = @reader.value
        @reader.read
        has_reached_end_element?('accession')
      when 'uri'
        sequence.uri = parse_uri
      when 'annotation'
        sequence.annotations << parse_annotation
      when 'domain_architecture'
        sequence.domain_architecture = DomainArchitecture.new
        sequence.domain_architecture.length = @reader["length"]
        # Two reads are made before the first parse_domain call.
        @reader.read
        @reader.read
        until is_end_element?('domain_architecture')
          sequence.domain_architecture.domains << parse_domain
          @reader.read #go to next domain element
        end
      else
        sequence.other << parse_other
      end
    end

    @reader.read
  end
  sequence
end #parse_sequence
814
+
815
# Builds a Uri object from the <uri> element the reader is on: its desc
# and type attributes plus the element text.
def parse_uri
  Uri.new.tap do |link|
    parse_attributes(link, ["desc", "type"])
    parse_simple_element(link, 'uri')
  end
end
821
+
822
# Reads an <annotation> element, with the reader starting on its start
# tag, into an Annotation object. For a self-closing element only the
# attributes are read; otherwise nodes are consumed up to </annotation>.
def parse_annotation
  annotation = Annotation.new
  parse_attributes(annotation, ['ref', 'source', 'evidence', 'type'])
  return annotation if @reader.empty_element?

  until is_end_element?('annotation')
    # Each sub-parser leaves the reader off a start tag of its own kind,
    # so at most one of these branches applies per iteration.
    if is_element?('desc')
      parse_simple_element(annotation, 'desc')
    elsif is_element?('confidence')
      annotation.confidence = parse_confidence
    elsif is_element?('property')
      annotation.properties << parse_property
    elsif is_element?('uri')
      annotation.uri = parse_uri
    end

    @reader.read
  end
  annotation
end
845
+
846
# Reads a <property> element (attributes plus text) into a Property
# object. Raises RuntimeError unless the node after the text is
# </property>.
def parse_property
  Property.new.tap do |property|
    parse_attributes(property, ["ref", "unit", "datatype", "applies_to", "id_ref"])
    @reader.read
    property.value = @reader.value
    @reader.read
    has_reached_end_element?('property')
  end
end #parse_property
855
+
856
# Reads a <confidence> element: its type attribute and its text converted
# with to_f. Raises RuntimeError unless the node after the text is
# </confidence>.
def parse_confidence
  kind = @reader["type"]
  @reader.read
  score = @reader.value.to_f
  @reader.read
  has_reached_end_element?('confidence')
  Confidence.new(kind, score)
end #parse_confidence
864
+
865
# Reads a <distribution> element, with the reader starting on its start
# tag, into a Distribution object: an optional desc plus any number of
# points and polygons. Consumes nodes up to </distribution>.
def parse_distribution
  distribution = Distribution.new
  @reader.read
  until is_end_element?('distribution')
    parse_simple_element(distribution, 'desc')

    distribution.points << parse_point if is_element?('point')
    distribution.polygons << parse_polygon if is_element?('polygon')

    @reader.read
  end
  distribution
end #parse_distribution
879
+
880
# Reads a <point> element, with the reader starting on its start tag,
# into a Point object: geodetic_datum and alt_unit attributes, lat and
# long texts, and alt converted with to_f. Consumes nodes up to </point>.
def parse_point
  point = Point.new
  parse_attributes(point, ["geodetic_datum", "alt_unit"])

  @reader.read
  until is_end_element?('point')
    parse_simple_elements(point, ['lat', 'long'])

    if is_element?('alt')
      @reader.read
      point.alt = @reader.value.to_f
      @reader.read
      has_reached_end_element?('alt')
    end

    #advance reader
    @reader.read
  end
  point
end #parse_point
902
+
903
# Reads a <polygon> element, with the reader starting on its start tag,
# into a Polygon object by collecting its <point> children. Consumes
# nodes up to </polygon>.
#
# A polygon with fewer than three points is still returned. The
# diagnostic about it goes through Kernel#warn, like the deprecation
# notice in initialize, rather than being printed on standard output.
def parse_polygon
  polygon = Polygon.new
  @reader.read
  until is_end_element?('polygon')
    polygon.points << parse_point if is_element?('point')
    @reader.read
  end

  #@todo should check for it at all? Probably not if xml is valid.
  warn "Warning: <polygon> should have at least 3 points" if polygon.points.length < 3

  polygon
end #parse_polygon
917
+
918
# Reads an identifier element (provider attribute plus text) into an Id
# object. +tag_name+ is the element name whose end tag must follow the
# text; RuntimeError is raised otherwise.
def parse_id(tag_name)
  Id.new.tap do |identifier|
    identifier.provider = @reader["provider"]
    @reader.read
    identifier.value = @reader.value
    @reader.read
    has_reached_end_element?(tag_name)
  end
end #parse_id
927
+
928
# Reads one <domain> element (attributes plus text) into a ProteinDomain
# object. Raises RuntimeError unless the node after the text is
# </domain>, then reads one more node past that end tag.
def parse_domain
  ProteinDomain.new.tap do |domain|
    parse_attributes(domain, ["from", "to", "confidence", "id"])
    @reader.read
    domain.value = @reader.value
    @reader.read
    has_reached_end_element?('domain')
    @reader.read
  end
end
938
+
939
# Reads a <binary_characters> element, with the reader starting on its
# start tag, into a PhyloXML::BinaryCharacters object: the type and count
# attributes, then the lost/gained/absent/present lists. A self-closing
# element contributes attributes only.
def parse_binary_characters
  characters = PhyloXML::BinaryCharacters.new
  characters.bc_type = @reader['type']

  parse_attributes(characters, ['gained_count', 'absent_count', 'lost_count', 'present_count'])
  return characters if @reader.empty_element?

  @reader.read
  until is_end_element?('binary_characters')
    # Only the list whose start tag the reader is on gets parsed.
    ['lost', 'gained', 'absent', 'present'].each do |state|
      parse_bc(characters, state)
    end

    @reader.read
  end
  characters
end #parse_binary_characters
958
+
959
# If the reader is on a start tag called +element+, appends the text of
# each <bc> child to the list returned by object.<element> and stops on
# that element's end tag. Does nothing otherwise.
def parse_bc(object, element)
  return unless is_element?(element)

  @reader.read
  until is_end_element?(element)
    if is_element?('bc')
      @reader.read
      object.send(element) << @reader.value
      @reader.read
      has_reached_end_element?('bc')
    end
    @reader.read
  end
end #parse_bc
973
+
974
# Captures the element the reader is on into a PhyloXML::Other object:
# its name, its attributes, its text and, recursively, its child
# elements.
#
# A self-closing element is returned right after its attributes, because
# the reader produces no end tag for it to wait for. For other elements
# the reader is advanced before the end-tag test on every pass, so a
# child that carries the same name as its parent does not end the parent
# early. On return the reader is on the element's end tag (or still
# inside the start tag for a self-closing element).
#
# Raises RuntimeError if the input ends before the end tag is seen.
def parse_other
  other_obj = PhyloXML::Other.new
  element_name = @reader.name
  other_obj.element_name = element_name

  # Must be asked while the reader is still on the element itself, before
  # it is moved onto the attributes.
  self_closing = @reader.empty_element?

  #parse attributes
  code = @reader.move_to_first_attribute
  while code == 1
    other_obj.attributes[@reader.name] = @reader.value
    code = @reader.move_to_next_attribute
  end

  return other_obj if self_closing

  loop do
    break unless @reader.read
    break if is_end_element?(element_name)

    if @reader.node_type == XML::Reader::TYPE_ELEMENT
      other_obj.children << parse_other # recursive call to parse children
    elsif @reader.node_type == XML::Reader::TYPE_TEXT
      other_obj.value = @reader.value
    end
  end

  #just a check
  has_reached_end_element?(element_name)
  other_obj
end #parse_other
996
+
997
+ end #class phyloxmlParser
998
+
999
+ end #module PhyloXML
1000
+
1001
+ end #module Bio