bio-nexml 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. data/Gemfile +15 -0
  2. data/Gemfile.lock +24 -0
  3. data/LICENSE.txt +20 -0
  4. data/README.rdoc +47 -0
  5. data/Rakefile +55 -0
  6. data/TODO.txt +6 -0
  7. data/VERSION +1 -0
  8. data/bio-nexml.gemspec +126 -0
  9. data/extconf.rb +2 -0
  10. data/lib/bio-nexml.rb +0 -0
  11. data/lib/bio.rb +321 -0
  12. data/lib/bio/db/nexml.rb +109 -0
  13. data/lib/bio/db/nexml/mapper.rb +113 -0
  14. data/lib/bio/db/nexml/mapper/framework.rb +157 -0
  15. data/lib/bio/db/nexml/mapper/inflection.rb +99 -0
  16. data/lib/bio/db/nexml/mapper/repository.rb +59 -0
  17. data/lib/bio/db/nexml/matrix.rb +1046 -0
  18. data/lib/bio/db/nexml/parser.rb +622 -0
  19. data/lib/bio/db/nexml/schema/README.txt +21 -0
  20. data/lib/bio/db/nexml/schema/abstract.xsd +159 -0
  21. data/lib/bio/db/nexml/schema/characters/README.txt +1 -0
  22. data/lib/bio/db/nexml/schema/characters/abstractcharacters.xsd +361 -0
  23. data/lib/bio/db/nexml/schema/characters/characters.xsd +22 -0
  24. data/lib/bio/db/nexml/schema/characters/continuous.xsd +190 -0
  25. data/lib/bio/db/nexml/schema/characters/dna.xsd +282 -0
  26. data/lib/bio/db/nexml/schema/characters/protein.xsd +280 -0
  27. data/lib/bio/db/nexml/schema/characters/restriction.xsd +239 -0
  28. data/lib/bio/db/nexml/schema/characters/rna.xsd +283 -0
  29. data/lib/bio/db/nexml/schema/characters/standard.xsd +261 -0
  30. data/lib/bio/db/nexml/schema/external/sawsdl.xsd +21 -0
  31. data/lib/bio/db/nexml/schema/external/xhtml-datatypes-1.xsd +177 -0
  32. data/lib/bio/db/nexml/schema/external/xlink.xsd +75 -0
  33. data/lib/bio/db/nexml/schema/external/xml.xsd +145 -0
  34. data/lib/bio/db/nexml/schema/meta/README.txt +2 -0
  35. data/lib/bio/db/nexml/schema/meta/annotations.xsd +100 -0
  36. data/lib/bio/db/nexml/schema/meta/meta.xsd +294 -0
  37. data/lib/bio/db/nexml/schema/nexml.xsd +104 -0
  38. data/lib/bio/db/nexml/schema/taxa/README.txt +2 -0
  39. data/lib/bio/db/nexml/schema/taxa/taxa.xsd +39 -0
  40. data/lib/bio/db/nexml/schema/trees/README.txt +2 -0
  41. data/lib/bio/db/nexml/schema/trees/abstracttrees.xsd +135 -0
  42. data/lib/bio/db/nexml/schema/trees/network.xsd +113 -0
  43. data/lib/bio/db/nexml/schema/trees/tree.xsd +149 -0
  44. data/lib/bio/db/nexml/schema/trees/trees.xsd +36 -0
  45. data/lib/bio/db/nexml/taxa.rb +147 -0
  46. data/lib/bio/db/nexml/trees.rb +663 -0
  47. data/lib/bio/db/nexml/writer.rb +265 -0
  48. data/test/data/nexml/test.xml +69 -0
  49. data/test/test_bio-nexml.rb +17 -0
  50. data/test/unit/bio/db/nexml/tc_factory.rb +119 -0
  51. data/test/unit/bio/db/nexml/tc_mapper.rb +78 -0
  52. data/test/unit/bio/db/nexml/tc_matrix.rb +551 -0
  53. data/test/unit/bio/db/nexml/tc_parser.rb +21 -0
  54. data/test/unit/bio/db/nexml/tc_taxa.rb +118 -0
  55. data/test/unit/bio/db/nexml/tc_trees.rb +370 -0
  56. data/test/unit/bio/db/nexml/tc_writer.rb +633 -0
  57. metadata +253 -0
@@ -0,0 +1,622 @@
1
+ #require "ruby-debug"
2
+ #Debugger.start
3
+
4
+ module Bio
5
+ module NeXML
6
+ include LibXML
7
+
8
+ #def self.parse( nexml, validate = false )
9
+ #Parser.new( nexml, validate ).parse
10
+ #end
11
+
12
+ class Parser
13
+
14
# Set up a parser for one NeXML document.
#
# nexml    - the document source; it is handed to #read, which chooses the
#            matching XML::Reader constructor.
# validate - when true, #validate_nexml is called once the reader exists
#            (default: false). Its return value is not used here.
def initialize(nexml, validate = false)
  # id => object lookup table, filled through #cache while parsing
  @cache = {}

  # cursor over the document
  @reader = read(nexml)

  validate_nexml if validate
end
25
+
26
# Factory method: walk the document and build a NeXML::Nexml object.
#
# The result is memoised in @nexml, so a second call returns the object
# built by the first one without touching the reader again.
#
# Returns the NeXML::Nexml object.
def parse
  return @nexml if @nexml

  # move the cursor onto the root element
  skip_leader

  # 'version' and 'generator' are read from the root element
  @nexml = NeXML::Nexml.new(attribute('version'), attribute('generator'))

  # dispatch on every remaining top-level section; other nodes are ignored
  while next_node
    section = local_name
    if section == "otus"
      @nexml.add_otus(parse_otus)
    elsif section == "trees"
      @nexml.add_trees(parse_trees)
    elsif section == "characters"
      @nexml.add_characters(parse_characters)
    end
  end

  # the reader is intentionally left open here; see #close
  @nexml
end
60
+
61
# Close the associated XML::Reader object.
#
# Only the reader is closed; the cached @nexml result is left untouched.
# Returns whatever the reader's #close returns.
def close
  @reader.close
end
66
+
67
+ private
68
+
69
# Reader/writer for the id-indexed object cache.
#
# object - when given, it is stored under object.id and returned;
#          when omitted (or nil), the cache hash itself is returned.
#
# Used by the parse_* methods for otus, otu, states, state, char and node
# objects so that later elements can refer to them by id.
def cache(object = nil)
  if object
    @cache[object.id] = object
  else
    @cache
  end
end
74
+
75
# Work out whether 'nexml' is a file name, an XML string or an IO-like
# object and return the matching XML::Reader.
#
# nexml - one of:
#         * a String ending in ".xml"  -> opened as a file,
#         * any other String           -> parsed as XML text,
#         * an object responding to #read (IO, StringIO, ...) -> read as a stream.
#
# Returns an XML::Reader created with #parse_opts.
# Raises ArgumentError for any other kind of input, instead of silently
# returning nil and failing later on the first reader call.
def read(nexml)
  case nexml
  when /\.xml\z/
    # \z (not $) so that an XML string which merely contains a line ending
    # in ".xml" is not mistaken for a file name
    XML::Reader.file(nexml, :options => parse_opts)
  when String
    XML::Reader.string(nexml, :options => parse_opts)
  else
    unless nexml.respond_to?(:read)
      raise ArgumentError,
            "cannot read NeXML from #{nexml.class}: expected a file name, an XML string or an IO"
    end
    XML::Reader.io(nexml, :options => parse_opts)
  end
end
87
+
88
# Advance the cursor until it sits on the 'nexml' element.
#
# Nothing is read if the cursor is already there.
# Raises RuntimeError when the reader runs out of nodes first; previously
# the loop ignored the reader's end-of-input signal and never terminated.
def skip_leader
  until local_name == "nexml"
    raise "no 'nexml' element found in the document" unless @reader.read
  end
end
91
+
92
# Local name reported by the reader for the node under the cursor.
def local_name
  @reader.local_name
end
95
+
96
# Value reported by the reader for the node under the cursor
# (used by #parse_seq to pick up the text of a 'seq' element).
def value
  @reader.value
end
99
+
100
# Look up an attribute of the node under the cursor.
#
# name - the attribute name as written in the document, e.g. 'id' or 'xsi:type'.
#
# Returns whatever the reader's #[] returns for that name.
def attribute(name)
  @reader[name]
end
103
+
104
# Move the cursor to the next node the parser cares about: an element
# start, an element end or a text node. Every other node type is skipped.
#
# Returns true when such a node was reached, false once the reader has
# nothing more to read.
def next_node
  loop do
    return false unless @reader.read
    return true if element_start? || element_end? || text_node?
  end
end
110
+
111
# Option mask passed to every XML::Reader constructor in #read.
#
# Combines three XML::Parser::Options flags:
#   NOBLANKS - remove blank nodes
#   NOENT    - substitute entities
#   NONET    - forbid network access
#
# NOTE(review): NOENT turns entity substitution on; confirm this is wanted
# when documents come from untrusted sources.
def parse_opts
  flags = XML::Parser::Options
  flags::NOBLANKS | flags::NOENT | flags::NONET
end
120
+
121
# Ask the reader to validate against the bundled schema/nexml.xsd
# (resolved relative to this file).
#
# Returns true when the reader's schema_validate call answers 0,
# nil otherwise.
def validate_nexml
  schema = File.join(File.dirname(__FILE__), "schema/nexml.xsd")
  status = @reader.schema_validate(schema)
  true if status == 0
end
126
+
127
# True when the node under the cursor is the start of an element
# (reader node type XML::Reader::TYPE_ELEMENT).
def element_start?
  @reader.node_type == XML::Reader::TYPE_ELEMENT
end
131
+
132
# True when the node under the cursor is the end of an element
# (reader node type XML::Reader::TYPE_END_ELEMENT).
def element_end?
  @reader.node_type == XML::Reader::TYPE_END_ELEMENT
end
136
+
137
# True when the node under the cursor is a text node
# (reader node type XML::Reader::TYPE_TEXT).
def text_node?
  @reader.node_type == XML::Reader::TYPE_TEXT
end
140
+
141
# Delegates to the reader: is the element under the cursor an empty one?
# The parse_* methods use this to return early instead of scanning for
# a closing tag.
def empty_element?
  @reader.empty_element?
end
144
+
145
# Parse an 'otus' element. The cursor must be on that element when this
# method is called.
#
# The new NeXML::Otus is cached by id before its children are read.
#
# Returns the NeXML::Otus object with every child 'otu' appended.
def parse_otus
  otus = NeXML::Otus.new(attribute('id'), :label => attribute('label'))
  cache otus

  # according to the schema an 'otus' may have no child element
  return otus if empty_element?

  while next_node
    tag = local_name
    # seeing 'otus' again means the current element has ended
    break if tag == "otus"
    otus << parse_otu if tag == "otu"
  end

  otus
end
173
+
174
# Parse an 'otu' element. The cursor must be on that element when this
# method is called.
#
# The new NeXML::Otu is cached by id. Child nodes, if any, are skipped.
#
# Returns the NeXML::Otu object.
def parse_otu
  otu = NeXML::Otu.new(attribute('id'), :label => attribute('label'))
  cache otu

  # according to the schema an 'otu' may have no child element
  return otu if empty_element?

  # skip ahead to the next 'otu' node, which closes the current element
  while next_node
    break if local_name == 'otu'
  end

  otu
end
198
+
199
# Parse a 'trees' element. The cursor must be on that element when this
# method is called.
#
# The 'otus' attribute is resolved through the cache, so the referenced
# otus block has to be parsed before this one.
#
# Returns the NeXML::Trees object with every child 'tree' and 'network'
# appended.
def parse_trees
  otus = cache[attribute('otus')]
  trees = NeXML::Trees.new(attribute('id'), :otus => otus, :label => attribute('label'))

  # a 'trees' element *will* have child nodes
  while next_node
    tag = local_name
    # seeing 'trees' again means the current element has ended
    break if tag == "trees"

    if tag == "tree"
      trees << parse_tree
    elsif tag == "network"
      trees << parse_network
    end
  end

  trees
end
226
+
227
# Parse a 'tree' element. The cursor must be on that element when this
# method is called.
#
# The class to instantiate is taken from the 'xsi:type' attribute with its
# namespace prefix removed ("nex:FloatTree" -> NeXML::FloatTree). The
# prefix may have any length; earlier a three-letter prefix was assumed.
#
# Returns the tree object with its nodes and edges added; nodes reporting
# root? are also pushed onto tree.roots.
def parse_tree
  id = attribute('id')
  label = attribute('label')

  type = attribute('xsi:type').sub(/\A.*:/, '')
  tree = NeXML.const_get(type).new(id, :label => label)

  # a 'tree' element *will* have child nodes
  while next_node
    case local_name
    when "node"
      node = parse_node
      tree.add_node node
      tree.roots << node if node.root?
    when "rootedge"
      # The element is consumed but not attached to the tree.
      # XXX it looks like the super class(es) can only deal with edges
      # that have source and target.
      parse_rootedge
    when "edge"
      tree.add_edge parse_edge(type)
    when "tree"
      # end of current 'tree' element has been reached
      break
    end
  end

  tree
end
271
+
272
# Parse a 'network' element. The cursor must be on that element when this
# method is called.
#
# The class to instantiate is taken from the 'xsi:type' attribute with its
# namespace prefix removed ("nex:IntNetwork" -> NeXML::IntNetwork). The
# prefix may have any length; earlier a three-letter prefix was assumed.
#
# Returns the network object with its nodes and edges added; a node
# reporting root? is assigned to network.root.
def parse_network
  id = attribute('id')
  label = attribute('label')

  type = attribute('xsi:type').sub(/\A.*:/, '')
  network = NeXML.const_get(type).new(id, :label => label)

  # a 'network' element *will* have child nodes
  while next_node
    case local_name
    when "node"
      node = parse_node
      network.add_node node
      network.root = node if node.root?
    when "edge"
      network.add_edge parse_edge(type)
    when "network"
      # end of current 'network' element has been reached
      break
    end
  end

  network
end
308
+
309
# Parse a 'node' element. The cursor must be on that element when this
# method is called.
#
# The 'root' attribute is read as an XML Schema boolean: only "true" and
# "1" mark the node as a root. Earlier any value, including "false",
# counted as true simply because the attribute was present.
#
# If an 'otu' attribute is present the node is linked to the cached otu
# with that id. The new node is itself cached by id.
#
# Returns the NeXML::Node object.
def parse_node
  id = attribute('id')
  label = attribute('label')
  root = %w[true 1].include?(attribute('root').to_s.strip)

  # is this node taxon linked?
  otu_id = attribute('otu')
  otu = otu_id ? cache[otu_id] : nil

  node = NeXML::Node.new(id, :otu => otu, :root => root, :label => label)
  cache node

  # according to the schema a 'node' may have no child element
  return node if empty_element?

  # skip any children up to the closing 'node'
  while next_node
    break if local_name == 'node'
  end

  node
end
339
+
340
# Parse an 'edge' element. The cursor must be on that element when this
# method is called.
#
# type - class name of the enclosing tree or network (e.g. "FloatTree");
#        the edge class is derived by replacing "Tree"/"Network" with
#        "Edge". The argument itself is not modified (earlier it was
#        rewritten in place with sub!, which changed the caller's string
#        and failed on frozen strings).
#
# 'source' and 'target' are resolved through the cache; 'length' is passed
# on exactly as read from the attribute.
#
# Returns the edge object.
def parse_edge(type)
  id = attribute('id')
  source = cache[attribute('source')]
  target = cache[attribute('target')]
  length = attribute('length')

  klass = NeXML.const_get(type.sub(/Tree|Network/, "Edge"))
  edge = klass.new(id, :source => source, :target => target, :length => length)

  # according to the schema an 'edge' may have no child element
  return edge if empty_element?

  # skip any children up to the closing 'edge'
  while next_node
    break if local_name == 'edge'
  end

  edge
end
366
+
367
# Parse a 'rootedge' element. The cursor must be on that element when this
# method is called.
#
# 'target' is resolved through the cache; 'length' is passed on exactly as
# read from the attribute.
#
# Returns the RootEdge object.
def parse_rootedge
  id = attribute('id')
  target = cache[attribute('target')]
  rootedge = RootEdge.new(id, :target => target, :length => attribute('length'))

  # according to the schema a 'rootedge' may have no child element
  return rootedge if empty_element?

  # skip any children up to the closing 'rootedge'
  while next_node
    break if local_name == 'rootedge'
  end

  rootedge
end
388
+
389
# Parse a 'characters' element. The cursor must be on that element when
# this method is called.
#
# The class to instantiate is taken from the 'xsi:type' attribute with its
# namespace prefix removed ("nex:DnaCells" -> NeXML::DnaCells). The prefix
# may have any length; earlier a three-letter prefix was assumed.
#
# A type name containing "Cells" selects the verbose (cell by cell) matrix
# form, which is forwarded to #parse_matrix.
#
# Returns the characters object with its format and matrix added.
def parse_characters
  # taxon linkage
  otus = cache[attribute('otus')]

  id = attribute('id')
  label = attribute('label')

  type = attribute('xsi:type').sub(/\A.*:/, '')
  klass = NeXML.const_get(type)
  verbose = type =~ /Cells/ ? true : false

  characters = klass.new(id, :otus => otus, :label => label)

  # according to the schema a 'characters' will have a child
  while next_node
    case local_name
    when 'format'
      characters.add_format(parse_format(type))
    when 'matrix'
      characters.add_matrix(parse_matrix(type, verbose))
    when 'characters'
      break
    end
  end

  characters
end
420
+
421
# Parse a 'format' element. The cursor must be on that element when this
# method is called.
#
# type - type name of the enclosing characters block; forwarded unchanged
#        to #parse_states and #parse_char.
#
# Returns the Format object with its 'states' and 'char' children added.
def parse_format(type)
  format = Format.new

  # according to the schema a concrete characters type will have a child
  while next_node
    tag = local_name
    # seeing 'format' again means the current element has ended
    break if tag == 'format'

    if tag == 'states'
      format.add_states(parse_states(type))
    elsif tag == 'char'
      format.add_char(parse_char(type))
    end
  end

  format
end
441
+
442
# Parse a 'states' element. The cursor must be on that element when this
# method is called.
#
# type - type name of the enclosing characters block; forwarded to
#        #parse_state.
#
# 'state', 'polymorphic_state_set' and 'uncertain_state_set' children are
# all read with #parse_state; the two set kinds additionally get their
# ambiguity set to :polymorphic or :uncertain.
#
# The States object is cached by id after its children have been read.
#
# Returns the States object.
def parse_states(type)
  states = States.new(attribute('id'), :label => attribute('label'))

  ambiguity_of = {
    'polymorphic_state_set' => :polymorphic,
    'uncertain_state_set'   => :uncertain
  }

  while next_node
    tag = local_name
    # seeing 'states' again means the current element has ended
    break if tag == 'states'
    next unless tag == 'state' || ambiguity_of.key?(tag)

    state = parse_state(type)
    state.ambiguity = ambiguity_of[tag] if ambiguity_of.key?(tag)
    states.add_state(state)
  end

  cache states

  states
end
469
+
470
# Parse a 'state', 'polymorphic_state_set' or 'uncertain_state_set'
# element. The cursor must be on that element when this method is called.
#
# type - accepted for symmetry with the other parse_* methods; it is not
#        used here (the earlier unused slice of it has been dropped).
#
# The new State is cached by id. Each 'member' child is resolved with
# #parse_member and added to the state.
#
# Returns the State object.
def parse_state(type)
  state = State.new(attribute('id'), attribute('symbol'), :label => attribute('label'))
  cache state

  return state if empty_element?

  while next_node
    case local_name
    when 'state', 'polymorphic_state_set', 'uncertain_state_set'
      # a node with any of these names ends the current element
      break
    when 'member'
      state.add_member(parse_member)
    end
  end

  state
end
493
+
494
# Resolve a 'member' element: look up the object cached under the id
# given in its 'state' attribute.
#
# Returns the cached object, or nil when nothing is cached under that id.
def parse_member
  cache[attribute('state')]
end
498
+
499
# Parse a 'char' element. The cursor must be on that element when this
# method is called.
#
# type - accepted for symmetry with the other parse_* methods; it is not
#        used here (the earlier unused substitution on it has been dropped).
#
# The 'states' attribute is resolved through the cache. A 'codon'
# attribute is applied only when the Char object has a codon= writer.
# The new Char is cached by id.
#
# Returns the Char object.
def parse_char(type)
  id = attribute('id')
  label = attribute('label')
  states = cache[attribute('states')]

  char = Char.new(id, states, :label => label)

  if char.respond_to?(:codon=)
    codon = attribute('codon')
    char.codon = codon if codon
  end

  cache char

  return char if empty_element?

  # skip any children up to the closing 'char'
  while next_node
    break if local_name == 'char'
  end

  char
end
524
+
525
# Parse a 'matrix' element. The cursor must be on that element when this
# method is called.
#
# type    - type name of the enclosing characters block; its last
#           character is dropped and "Matrix" appended before it is
#           forwarded to #parse_row (e.g. "DnaSeqs" -> "DnaSeqMatrix").
# verbose - forwarded unchanged to #parse_row.
#
# Returns the Matrix object with every 'row' child added.
def parse_matrix(type, verbose)
  row_type = "#{type[0..-2]}Matrix"
  matrix = Matrix.new

  while next_node
    tag = local_name
    # seeing 'matrix' again means the current element has ended
    break if tag == 'matrix'
    matrix.add_row(parse_row(row_type, verbose)) if tag == 'row'
  end

  matrix
end
543
+
544
# Parse a 'row' element. The cursor must be on that element when this
# method is called.
#
# type    - matrix type name as built by #parse_matrix; "Matrix" is
#           replaced by "Row" before it is forwarded to #parse_seq and
#           #parse_cell.
# verbose - true selects CellRow, false selects SeqRow.
#
# The 'otu' attribute is resolved through the cache. Earlier the result
# was looked up and then thrown away; it is now attached to the row when
# an otu was found and the row class offers an otu= writer.
#
# Returns the row object with its 'seq' or 'cell' children added.
def parse_row(type, verbose)
  id = attribute('id')
  label = attribute('label')
  otu = cache[attribute('otu')]

  type = type.sub(/Matrix/, "Row")
  klass = verbose ? CellRow : SeqRow
  row = klass.new(id, :label => label)
  row.otu = otu if otu && row.respond_to?(:otu=)

  while next_node
    case local_name
    when 'seq'
      row.add_sequence(parse_seq(type))
    when 'cell'
      row.add_cell(parse_cell(type))
    when 'row'
      # end of current 'row' element has been reached
      break
    end
  end

  row
end
568
+
569
# Parse a 'seq' element. The cursor must be on that element when this
# method is called.
#
# type - accepted for symmetry with the other parse_* methods; it is not
#        used here (the earlier unused slice of it has been dropped).
#
# Returns a Sequence whose value is the text found inside the element;
# the value is left unset for an empty element.
def parse_seq(type)
  seq = Sequence.new

  return seq if empty_element?

  while next_node
    case local_name
    when '#text'
      # the name under which text content is matched here
      seq.value = value
    when 'seq'
      # end of current 'seq' element has been reached
      break
    end
  end

  seq
end
588
+
589
# Parse a 'cell' element. The cursor must be on that element when this
# method is called.
#
# type - row type name as built by #parse_row. Its last three characters
#        are dropped; if what remains contains "Continuous" the 'state'
#        attribute is stored as read, otherwise it is treated as an id
#        and resolved through the cache.
#
# The 'char' attribute is always resolved through the cache.
#
# Returns the Cell object with state and char assigned.
def parse_cell(type)
  kind = type[0..-4]

  cell = Cell.new

  char_id = attribute('char')
  state_id = attribute('state')

  char = cache[char_id]
  state = if kind =~ /Continuous/
            state_id
          else
            cache[state_id]
          end

  cell.state = state
  cell.char = char

  return cell if empty_element?

  # skip any children up to the closing 'cell'
  while next_node
    break if local_name == 'cell'
  end

  cell
end
614
+
615
+ end #end Parser class
616
+
617
+ end #end NeXML module
618
+
619
+ end #end Bio module
620
+
621
+ #n = Bio::NeXML.parse "examples/test.xml"
622
+ #Debugger.stop