biointerchange 0.2.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. data/Gemfile +1 -0
  2. data/README.md +269 -19
  3. data/VERSION +1 -1
  4. data/examples/bininda_emonds_mammals.new +1 -0
  5. data/examples/rdfization.rb +17 -0
  6. data/examples/tree1.new +1 -0
  7. data/examples/tree2.new +1 -0
  8. data/examples/vocabulary.rb +26 -5
  9. data/generators/javaify.rb +12 -18
  10. data/generators/make_supplement_releases.rb +2 -0
  11. data/generators/pythonify.rb +21 -8
  12. data/generators/rdfxml.rb +15 -1
  13. data/lib/biointerchange/cdao.rb +2014 -0
  14. data/lib/biointerchange/core.rb +70 -77
  15. data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +16 -0
  16. data/lib/biointerchange/genomics/gff3_reader.rb +18 -4
  17. data/lib/biointerchange/genomics/gvf_reader.rb +14 -0
  18. data/lib/biointerchange/phylogenetics/cdao_rdf_ntriples.rb +108 -0
  19. data/lib/biointerchange/phylogenetics/newick_reader.rb +81 -0
  20. data/lib/biointerchange/phylogenetics/tree_set.rb +50 -0
  21. data/lib/biointerchange/registry.rb +50 -8
  22. data/lib/biointerchange/so.rb +150 -0
  23. data/lib/biointerchange/textmining/pdfx_xml_reader.rb +21 -2
  24. data/lib/biointerchange/textmining/pubannos_json_reader.rb +24 -1
  25. data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +9 -0
  26. data/lib/biointerchange/textmining/text_mining_reader.rb +5 -5
  27. data/spec/phylogenetics_spec.rb +79 -0
  28. data/supplemental/java/biointerchange/pom.xml +1 -1
  29. data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/CDAO.java +2602 -0
  30. data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/FALDO.java +30 -28
  31. data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GFF3O.java +136 -104
  32. data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GVF1O.java +367 -278
  33. data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SIO.java +4388 -3127
  34. data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SO.java +5970 -4351
  35. data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SOFA.java +733 -544
  36. data/supplemental/java/biointerchange/src/test/java/org/biointerchange/AppTest.java +3 -1
  37. data/supplemental/python/biointerchange/cdao.py +2021 -0
  38. data/supplemental/python/biointerchange/faldo.py +37 -38
  39. data/supplemental/python/biointerchange/gff3o.py +156 -157
  40. data/supplemental/python/biointerchange/goxref.py +172 -172
  41. data/supplemental/python/biointerchange/gvf1o.py +428 -429
  42. data/supplemental/python/biointerchange/sio.py +3133 -3134
  43. data/supplemental/python/biointerchange/so.py +6626 -6527
  44. data/supplemental/python/biointerchange/sofa.py +790 -791
  45. data/supplemental/python/example.py +23 -5
  46. data/supplemental/python/setup.py +2 -2
  47. data/web/about.html +1 -0
  48. data/web/api.html +223 -15
  49. data/web/biointerchange.js +27 -6
  50. data/web/cli.html +8 -3
  51. data/web/index.html +6 -2
  52. data/web/ontologies.html +3 -0
  53. data/web/service/rdfizer.fcgi +7 -15
  54. data/web/webservices.html +6 -2
  55. metadata +30 -3
@@ -20,6 +20,7 @@ module BioInterchange
20
20
  require 'biointerchange/exceptions'
21
21
 
22
22
  # Ontologies (besides the ones from the 'rdf' gem)
23
+ require 'biointerchange/cdao'
23
24
  require 'biointerchange/faldo'
24
25
  require 'biointerchange/gff3o'
25
26
  require 'biointerchange/goxref'
@@ -28,6 +29,9 @@ module BioInterchange
28
29
  require 'biointerchange/so'
29
30
  require 'biointerchange/sofa'
30
31
 
32
+ # Registry for reader/writer management:
33
+ require 'biointerchange/registry'
34
+
31
35
  # Reader/writer interfaces
32
36
  require 'biointerchange/reader'
33
37
  require 'biointerchange/model'
@@ -81,6 +85,19 @@ module BioInterchange
81
85
  # Writer
82
86
  # ...same GFF3 writer
83
87
 
88
+ #
89
+ # PHYLOGENETICS
90
+ #
91
+
92
+ # Reader
93
+ require 'biointerchange/phylogenetics/newick_reader'
94
+
95
+ # Model
96
+ require 'biointerchange/phylogenetics/tree_set'
97
+
98
+ # Writer
99
+ require 'biointerchange/phylogenetics/cdao_rdf_ntriples'
100
+
84
101
  #
85
102
  # ACTUAL COMMAND LINE IMPLEMENTATION
86
103
  #
@@ -97,7 +114,7 @@ module BioInterchange
97
114
  ["--batchsize", "-b", Getopt::OPTIONAL], # batchsize for readers/writers that support +postpone?+
98
115
  ["--input", "-i", Getopt::REQUIRED], # input file format
99
116
  ["--rdf", "-r", Getopt::REQUIRED], # output file format
100
- ["--annotate_name", Getopt::OPTIONAL], # name of resourcce/tool/person
117
+ ["--annotate_name", Getopt::OPTIONAL], # name of resource/tool/person
101
118
  ["--annotate_name_id", Getopt::OPTIONAL], # uri of resource/tool/person
102
119
  ["--annotate_date", Getopt::OPTIONAL], # date of processing/annotation
103
120
  ["--annotate_version", Getopt::OPTIONAL], # version number of resource
@@ -110,47 +127,38 @@ module BioInterchange
110
127
  puts "Usage: ruby #{$0} -i <format> -r <format> [options]"
111
128
  puts ''
112
129
  puts 'Supported input formats (--input <format>/-i <format>):'
113
- puts ' biointerchange.gff3 : GFF3'
114
- puts ' biointerchange.gvf : GVF'
115
- puts ' dbcls.catanns.json : PubAnnotation JSON'
116
- puts ' uk.ac.man.pdfx : PDFx XML'
130
+ Registry.reader_descriptions.each_pair { |reader_id, description|
131
+ puts " #{reader_id}#{' ' * (34 - reader_id.length)} : #{description}"
132
+ }
117
133
  puts ''
118
134
  puts 'Supported output formats (--rdf <format>/-r <format>)'
119
- puts ' rdf.biointerchange.gff3 : RDF N-Triples for the following input'
120
- puts ' biointerchange.gff3'
121
- puts ' rdf.biointerchange.gvf : RDF N-Triples for the following input'
122
- puts ' biointerchange.gff3'
123
- puts ' biointerchange.gvf'
124
- puts ' rdf.bh12.sio : RDF N-Triples for the following inputs'
125
- puts ' dbcls.catanns.json'
126
- puts ' uk.ac.man.pdfx'
135
+ Registry.writer_descriptions.each_pair { |writer_id, description|
136
+ puts " #{writer_id}#{' ' * (34 - writer_id.length)} : #{description}"
137
+ }
127
138
  puts ''
128
139
  puts 'I/O options:'
140
+ puts ' -b <size>/--batchsize <size> : process input in batches of the given size'
141
+ puts ' (if supported, see below for valid input/rdf pairs)'
129
142
  puts ' -f <file>/--file <file> : file to read; STDIN used if not supplied'
130
143
  puts ' -o <file>/--out <file> : output file; STDOUT used if not supplied'
131
144
  puts ''
132
- puts 'Input-/RDF-format specific options:'
133
- puts ' Input: dbcls.catanns.json, uk.ac.man.pdfx'
134
- puts ' Output: rdf.bh12.sio'
135
- puts ' Options:'
136
- puts ' --annotate_date <date> : date of processing/annotation (optional)'
137
- puts ' --annotate_version <version> : version number of resource (optional)'
138
- puts ' --annotate_name <name> : name of resource/tool/person (required)'
139
- puts ' --annotate_name_id <id> : URI of resource/tool/person (required)'
140
- puts ''
141
- puts 'Input-/RDF-format specific options:'
142
- puts ' Input: biointerchange.gff3 or biointerchange.gvf'
143
- puts ' Output: rdf.biointerchange.gff3 or rdf.biointerchange.gvf'
144
- puts ' Options:'
145
- puts ' -b <size>/--batchsize <size> : process features in batches of the given size (optional)'
146
- puts ' -t <date>/--date <date> : date when the GFF3/GVF file was created (optional)'
147
- puts ' --name <name> : name of the GFF3/GVF file creator (optional)'
148
- puts ' --name_id <id> : email address of the GFF3/GVF file creator (optional)'
149
- puts ''
150
145
  puts 'Other options:'
151
146
  puts ' -v / --version : print the Gem\'s version number and exit'
152
147
  puts ' -d / --debug : turn on debugging output (for stacktraces)'
153
148
  puts ' -h --help : this message'
149
+ puts ''
150
+ puts 'Input-/RDF-format specific options:'
151
+ reader_writer_pairs = Registry.reader_writer_pairs
152
+ reader_writer_pairs.each_index { |reader_writer_pair_index|
153
+ reader_id, writer_id = reader_writer_pairs[reader_writer_pair_index]
154
+ puts " Input format : #{reader_id}"
155
+ puts " Output format : #{writer_id}"
156
+ Registry.options_help(reader_id).each { |option_description|
157
+ option, description = option_description
158
+ puts " --annotate_#{option}#{' ' * (21 - option.length)} : #{description}"
159
+ }
160
+ puts '' if reader_writer_pair_index + 1 < reader_writer_pairs.length
161
+ }
154
162
 
155
163
  exit 1
156
164
  end
@@ -166,62 +174,43 @@ module BioInterchange
166
174
  @@skip_rdf_graph = false if opt['no_rdf_graph_optimization']
167
175
 
168
176
  # Check if the input/rdf options are supported:
169
- if opt['input'] == 'dbcls.catanns.json' or opt['input'] == 'uk.ac.man.pdfx' then
170
- if opt['rdf'] == 'rdf.bh12.sio' then
171
- raise ArgumentError, 'Require --name and --name_id options to specify source of annotations (e.g., a manual annotators name, or software tool name) and their associated URI (e.g., email address, or webaddress).' unless opt['name'] and opt['name_id']
172
- else
173
- unsupported_combination
174
- end
175
- elsif opt['input'] == 'biointerchange.gff3' then
176
- if opt['rdf'] == 'rdf.biointerchange.gff3' then
177
- # Okay. No further arguments required.
178
- else
179
- unsupported_combination
180
- end
181
- elsif opt['input'] == 'biointerchange.gvf' then
182
- if opt['rdf'] == 'rdf.biointerchange.gff3' or opt['rdf'] == 'rdf.biointerchange.gvf' then
183
- # Okay. No further arguments required.
184
- else
185
- unsupported_combination
186
- end
187
- else
188
- unsupported_combination
189
- end
177
+ unsupported_combination unless Registry.is_supported?(opt['input'], opt['rdf'])
190
178
 
191
- wrong_type('batchsize', 'a positive integer') if opt['batchsize'] and not opt['batchsize'].match(/^[1-9][0-9]*$/)
179
+ if opt['batchsize'] then
180
+ batching_not_supported unless Registry.is_supporting_batch_processing?(opt['input'], opt['rdf'])
181
+ wrong_type('batchsize', 'a positive integer') unless opt['batchsize'].match(/^[1-9][0-9]*$/)
182
+ end
192
183
 
193
- opt['batchsize'] = opt['batchsize'].to_i if opt['batchsize']
184
+ # Create a parameter map that can be passed along to Reader implementations:
185
+ map = {
186
+ 'input' => opt['input'],
187
+ 'output' => opt['output']
188
+ }
189
+ map['batchsize'] = opt['batchsize'].to_i if opt['batchsize']
190
+ opt.each_key { |key|
191
+ map[key.sub(/^annotate_/, '')] = opt[key] if key.start_with?('annotate_')
192
+ }
194
193
 
195
194
  # Generate model from file (deserialization).
196
- # Note: if-clauses are lexicographically ordered.
197
- reader = nil
198
- if opt['input'] == 'biointerchange.gff3' then
199
- reader = BioInterchange::Genomics::GFF3Reader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], opt['batchsize'])
200
- elsif opt['input'] == 'biointerchange.gvf' then
201
- reader = BioInterchange::Genomics::GVFReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], opt['batchsize'])
202
- elsif opt['input'] == 'dbcls.catanns.json' then
203
- reader = BioInterchange::TextMining::PubAnnosJSONReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
204
- elsif opt['input'] == 'uk.ac.man.pdfx' then
205
- reader = BioInterchange::TextMining::PDFxXMLReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['annotate_version'])
206
- end
195
+ reader_class, *args = Registry.reader(opt['input'])
196
+ reader = reader_class.new(*BioInterchange::get_parameters(map, args))
207
197
 
208
- if opt["file"]
209
- input_source = File.new(opt["file"],'r')
198
+ input_source = nil
199
+ if opt['file'] then
200
+ input_source = File.new(opt['file'], 'r')
210
201
  else
211
202
  input_source = STDIN
212
203
  end
213
204
 
214
- # Generate rdf from model (serialization).
215
- # Note: if-clauses are lexicographically ordered.
216
- writer = nil
217
- if opt['rdf'] == 'rdf.bh12.sio' then
218
- writer = BioInterchange::TextMining::RDFWriter.new(File.new(opt['out'], 'w')) if opt['out']
219
- writer = BioInterchange::TextMining::RDFWriter.new(STDOUT) unless opt['out']
220
- end
221
- if opt['rdf'] == 'rdf.biointerchange.gff3' or opt['rdf'] == 'rdf.biointerchange.gvf' then
222
- writer = BioInterchange::Genomics::RDFWriter.new(File.new(opt['out'], 'w')) if opt['out']
223
- writer = BioInterchange::Genomics::RDFWriter.new(STDOUT) unless opt['out']
205
+ output_source = nil
206
+ if opt['out'] then
207
+ output_source = File.new(opt['out'], 'w')
208
+ else
209
+ output_source = STDOUT
224
210
  end
211
+
212
+ # Generate rdf from model (serialization).
213
+ writer = Registry.writer(opt['rdf']).new(output_source)
225
214
 
226
215
  begin
227
216
  model = reader.deserialize(input_source)
@@ -270,6 +259,10 @@ module BioInterchange
270
259
 
271
260
  private
272
261
 
262
+ def self.batching_not_supported
263
+ raise ArgumentError, 'Batching is not supported for this input/output format combination.'
264
+ end
265
+
273
266
  def self.unsupported_combination
274
267
  raise ArgumentError, 'This input/output format combination is not supported.'
275
268
  end
@@ -15,6 +15,22 @@ module BioInterchange::Genomics
15
15
  # - rdf.biointerchange.gvf
16
16
  class RDFWriter < BioInterchange::Writer
17
17
 
18
+ # Register writers:
19
+ BioInterchange::Registry.register_writer(
20
+ 'rdf.biointerchange.gff3',
21
+ BioInterchange::Genomics::RDFWriter,
22
+ [ 'biointerchange.gff3' ],
23
+ true,
24
+ 'Generic Feature Format Version 3 Ontology (GFF3O) based RDFization'
25
+ )
26
+ BioInterchange::Registry.register_writer(
27
+ 'rdf.biointerchange.gvf',
28
+ BioInterchange::Genomics::RDFWriter,
29
+ [ 'biointerchange.gvf' ],
30
+ true,
31
+ 'Genome Variation Format Version 1 Ontology (GVF1O) based RDFization'
32
+ )
33
+
18
34
  # Creates a new instance of a RDFWriter that will use the provided output stream to serialize RDF.
19
35
  #
20
36
  # +ostream+:: instance of an IO class or derivative that is used for RDF serialization
@@ -4,6 +4,20 @@ module BioInterchange::Genomics
4
4
 
5
5
  class GFF3Reader < BioInterchange::Reader
6
6
 
7
+ # Register reader:
8
+ BioInterchange::Registry.register_reader(
9
+ 'biointerchange.gff3',
10
+ GFF3Reader,
11
+ [ 'name', 'name_uri', 'date' ],
12
+ true,
13
+ 'Generic Feature Format Version 3 (GFF3) reader',
14
+ [
15
+ [ 'date <date>', 'date when the GFF3 file was created (optional)' ],
16
+ [ 'name <name>', 'name of the GFF3 file creator (optional)' ],
17
+ [ 'name_id <id>', 'email address of the GFF3 file creator (optional)' ]
18
+ ]
19
+ )
20
+
7
21
  # Creates a new instance of a Generic Feature Format Version 3 (GFF3) reader.
8
22
  #
9
23
  # The reader supports batch processing.
@@ -94,7 +108,7 @@ protected
94
108
  if type.match(/^SO:\d{7}$/) then
95
109
  type = RDF::URI.new("http://www.sequenceontology.org/miso/current_release/term/#{feature.type}")
96
110
  else
97
- type = BioInterchange::SOFA.send(BioInterchange.make_safe_label(type))
111
+ type = BioInterchange::SO.send(BioInterchange.make_safe_label(type))
98
112
  end
99
113
  rescue NoMethodError
100
114
  raise BioInterchange::Exceptions::InputFormatError, "Type of feature is set to an unknown SOFA term: \"#{type}\""
@@ -154,15 +168,15 @@ protected
154
168
  feature_set.set_pragma(name, { name => value.to_f })
155
169
  elsif name == 'sequence-region' then
156
170
  regions = feature_set.pragma(name)
157
- regions = {} unless regions
171
+ regions = { name => {} } unless regions
158
172
  seqid, start_coordinate, end_coordinate = value.split(/\s+/, 3)
159
- regions[seqid] = BioInterchange::Genomics::GFF3Landmark.new(seqid, start_coordinate.to_i, end_coordinate.to_i)
173
+ regions[name][seqid] = BioInterchange::Genomics::GFF3Landmark.new(seqid, start_coordinate.to_i, end_coordinate.to_i)
160
174
  feature_set.set_pragma(name, regions)
161
175
  elsif name == 'species' then
162
176
  feature_set.set_pragma(name, { name => value })
163
177
  else
164
178
  # Unhandled pragma. Just save the value in its string form.
165
- feature_set.set_pragma(name, value)
179
+ feature_set.set_pragma(name, { name => value })
166
180
  end
167
181
  end
168
182
 
@@ -2,6 +2,20 @@ module BioInterchange::Genomics
2
2
 
3
3
  class GVFReader < GFF3Reader
4
4
 
5
+ # Register reader:
6
+ BioInterchange::Registry.register_reader(
7
+ 'biointerchange.gvf',
8
+ GVFReader,
9
+ [ 'name', 'name_uri', 'date' ],
10
+ true,
11
+ 'Genome Variation Format Version 1 (GVF) reader',
12
+ [
13
+ [ 'date <date>', 'date when the GVF file was created (optional)' ],
14
+ [ 'name <name>', 'name of the GVF file creator (optional)' ],
15
+ [ 'name_id <id>', 'email address of the GVF file creator (optional)' ]
16
+ ]
17
+ )
18
+
5
19
  # Creates a new instance of a Genome Variation Format (GVF) reader.
6
20
  #
7
21
  # +name+:: Optional name of the person who generated the GVF file.
@@ -0,0 +1,108 @@
1
+ require 'rdf'
2
+ require 'rdf/ntriples'
3
+
4
+ module BioInterchange::Phylogenetics
5
+
6
 + # Serializes phylogenetic tree models based on BioRuby's phylogenetic tree implementation.
7
+ class CDAORDFWriter < BioInterchange::Writer
8
+
9
+ # Register writers:
10
+ BioInterchange::Registry.register_writer(
11
+ 'rdf.phylotastic.newick',
12
+ CDAORDFWriter,
13
+ [ 'phylotastic.newick' ],
14
+ true,
15
+ 'Comparative Data Analysis Ontology (CDAO) based RDFization'
16
+ )
17
+
18
+ # Creates a new instance of a CDAORDFWriter that will use the provided output stream to serialize RDF.
19
+ #
20
+ # +ostream+:: instance of an IO class or derivative that is used for RDF serialization
21
+ def initialize(ostream)
22
+ @ostream = ostream
23
+ end
24
+
25
+ # Serialize a model as RDF.
26
+ #
27
+ # +model+:: a generic representation of input data that is an instance of BioInterchange::Phylogenetics::TreeSet
28
+ def serialize(model)
29
+ model.contents.each { |tree|
30
+ serialize_model(model, tree)
31
+ }
32
+ end
33
+
34
+ protected
35
+
36
+ def serialize_model(model, tree)
37
+ graph = RDF::Graph.new
38
+ graph.fast_ostream(@ostream) if BioInterchange::skip_rdf_graph
39
+ tree_uri = RDF::URI.new(model.uri)
40
+ if model.date then
41
+ graph.insert(RDF::Statement.new(tree_uri, RDF::DC.date, RDF::Literal.new(model.date)))
42
+ end
43
+ serialize_tree(graph, tree, tree_uri, tree.root, true)
44
+ RDF::NTriples::Writer.dump(graph, @ostream)
45
+ end
46
+
47
+ def serialize_tree(graph, tree, tree_uri, node, is_root)
48
+ node_uri = RDF::URI.new("#{tree_uri.to_s}/node/#{node.object_id}")
49
+
50
+ if is_root then
51
+ graph.insert(RDF::Statement.new(tree_uri, RDF.type, BioInterchange::CDAO.NewickTree))
52
+ # Commented out some lines since it appears not to be determinable for Newick trees.
53
+ if tree.root then
54
+ # graph.insert(RDF::Statement.new(tree_uri, RDF.type, BioInterchange::CDAO.rootedtree))
55
+ else
56
+ # graph.insert(RDF::Statement.new(tree_uri, RDF.type, BioInterchange::CDAO.unrootedtree))
57
+ # Pick the first node available to permit serialization of the tree:
58
+ tree.root = node = tree.nodes.first
59
+ end
60
+ end
61
+
62
+ if node.name and not node.name.empty? then
63
+ taxonomic_unit_uri = RDF::URI.new("#{tree_uri.to_s}/taxonomic_unit/#{node.object_id}")
64
+ graph.insert(RDF::Statement.new(taxonomic_unit_uri, RDF.type, BioInterchange::CDAO.TU))
65
+ graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO::represents_TU, taxonomic_unit_uri))
66
+ graph.insert(RDF::Statement.new(taxonomic_unit_uri, RDF::RDFS.label, RDF::Literal.new(node.name.gsub('_', ' '))))
67
+ end
68
+
69
+ if tree.descendents(node).empty? then
70
+ graph.insert(RDF::Statement.new(node_uri, RDF.type, BioInterchange::CDAO.TerminalNode))
71
+ else
72
+ graph.insert(RDF::Statement.new(node_uri, RDF.type, BioInterchange::CDAO.AncestralNode))
73
+ end
74
+
75
+ if not tree.root == node and tree.parent(node) then
76
+ parent_uri = RDF::URI.new("#{tree_uri.to_s}/node/#{tree.parent(node).object_id}")
77
+ edge_uri = RDF::URI.new("#{tree_uri.to_s}/edge/#{tree.get_edge(tree.parent(node), node).object_id}")
78
+ annotation_uri = RDF::URI.new("#{tree_uri.to_s}/edge/#{tree.get_edge(tree.parent(node), node).object_id}/annotation")
79
+ graph.insert(RDF::Statement.new(edge_uri, RDF.type, BioInterchange::CDAO.DirectedEdge))
80
+ graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.belongs_to_Tree, tree_uri))
81
+ graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.has_Parent_Node, parent_uri))
82
+ graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.has_Child_Node, node_uri))
83
+ graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.belongs_to_Edge_as_Child, edge_uri))
84
+ graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.has_Parent, parent_uri))
85
+ graph.insert(RDF::Statement.new(parent_uri, BioInterchange::CDAO.belongs_to_Edge_as_Parent, edge_uri))
86
+
87
+ # if node.distance then
88
+ # graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.has_Support_Value, RDF::Literal.new(node.distance, :datatype => RDF::URI.new('http://www.w3.org/2001/XMLSchema#decimal'))))
89
+ # end
90
+
91
+ graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.has_Annotation, annotation_uri))
92
+ graph.insert(RDF::Statement.new(annotation_uri, RDF.type, BioInterchange::CDAO.EdgeLength))
93
+ graph.insert(RDF::Statement.new(annotation_uri, BioInterchange::CDAO.has_Value, RDF::Literal.new(tree.get_edge(tree.parent(node), node).distance, :datatype => RDF::URI.new('http://www.w3.org/2001/XMLSchema#decimal'))))
94
+ end
95
+
96
+ graph.insert(RDF::Statement.new(tree_uri, BioInterchange::CDAO.has_Root, node_uri))
97
+ graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.belongs_to_Tree, tree_uri))
98
+
99
+ # Now, continue traversing the tree by visiting the current node's descendents:
100
+ tree.descendents(node).each { |descendent_node|
101
+ serialize_tree(graph, tree, tree_uri, descendent_node, false)
102
+ }
103
+ end
104
+
105
+ end
106
+
107
+ end
108
+
@@ -0,0 +1,81 @@
1
+ require 'bio'
2
+ require 'date'
3
+
4
+ module BioInterchange::Phylogenetics
5
+
6
+ class NewickReader < BioInterchange::Reader
7
+
8
+ # Register reader:
9
+ BioInterchange::Registry.register_reader(
10
+ 'phylotastic.newick',
11
+ NewickReader,
12
+ [ 'date' ],
13
+ true,
14
+ 'Newick Tree File Format reader',
15
+ [
16
+ [ 'date <date>', 'date when the Newick file was created (optional)' ]
17
+ ]
18
+ )
19
+
20
+ # Creates a new instance of a Newick file format reader.
21
+ #
22
+ # The reader supports batch processing.
23
+ #
24
+ # +date+:: Optional date of when the Newick file was produced, annotated, etc.
25
+ # +batch_size+:: Optional integer that determines that number of features that
26
+ # should be processed in one go.
27
+ def initialize(date = nil, batch_size = nil)
28
+ @date = date
29
+ @batch_size = batch_size
30
+ end
31
+
32
+ # Reads a Newick file from the input stream and returns an associated model.
33
+ #
34
+ # If this method is called when +postponed?+ returns true, then the reading will
35
+ # continue from where it has been interrupted beforehand.
36
+ #
37
+ # +inputstream+:: an instance of class IO or String that holds the contents of a Newick file
38
+ def deserialize(inputstream)
39
+ if inputstream.kind_of?(IO)
40
+ create_model(inputstream)
41
+ elsif inputstream.kind_of?(String) then
42
+ create_model(StringIO.new(inputstream))
43
+ else
44
+ raise BioInterchange::Exceptions::ImplementationReaderError, 'The provided input stream needs to be either of type IO or String.'
45
+ end
46
+ end
47
+
48
+ # Returns true if the reading of the input was postponed due to a full batch.
49
+ def postponed?
50
+ @postponed
51
+ end
52
+
53
+ protected
54
+
55
+ def create_model(newick)
56
+ if @postponed then
57
+ @postponed = false
58
+ @trees.prune
59
+ else
60
+ @trees = BioInterchange::Phylogenetics::TreeSet.new()
61
+ @trees.set_date(Date.parse(@date)) if @date
62
+ end
63
+
64
+ tree_io = Bio::FlatFile.open(Bio::Newick, newick)
65
+ while newick_tree = tree_io.next_entry
66
+ newick_tree.options[:bootstrap_style] = :disabled
67
+ @trees.add(newick_tree.tree)
68
+
69
+ if @batch_size and feature_no >= @batch_size then
70
+ @postponed = true
71
+ break
72
+ end
73
+ end
74
+
75
+ @trees
76
+ end
77
+
78
+ end
79
+
80
+ end
81
+
@@ -0,0 +1,50 @@
1
+ require 'digest/sha1'
2
+
3
+ module BioInterchange::Phylogenetics
4
+
5
+ # A phylogenetic tree set that can contain multiple phylogenetic trees.
6
+ class TreeSet < BioInterchange::Model
7
+
8
+ # Create a new instance of a tree set. A tree set can contain multiple phylogenetic trees.
9
+ def initialize
10
+ # Trees are stored as the keys of a hash map to increase performance:
11
+ @set = {}
12
+ end
13
+
14
+ # Returns the contents of the tree set.
15
+ def contents
16
+ @set.keys
17
+ end
18
+
19
+ # If a date was provided, then this method returns its value.
20
+ def date
21
+ @date
22
+ end
23
+
24
+ # Sets a date that is associated with the trees in this model (e.g., annotation date, creation date, etc.).
25
+ #
26
+ # +date+:: an instance of Date that is associated with all trees in the model
27
+ def set_date(date)
28
+ @date = date
29
+ end
30
+
31
 + # Returns a URI for this particular tree set, which is a SHA1 hash that is not necessarily globally unique.
32
+ def uri
33
+ "biointerchange://phylogenetics/treeset/self/#{Digest::SHA1.hexdigest(Time.now.to_s)}"
34
+ end
35
+
36
+ # Add a tree to the tree set.
37
+ #
38
+ # +tree+:: BioRuby tree instance that is added to the contents of this tree set
39
+ def add(tree)
40
+ @set[tree] = true
41
+ end
42
+
43
+ # Removes all features from the set, but keeps additional data (e.g., the date).
44
+ def prune
45
+ @set.clear
46
+ end
47
+
48
+ end
49
+
50
+ end
@@ -1,27 +1,69 @@
1
1
  module BioInterchange
2
2
 
3
+ # A registry of Reader and Writer subclasses that also keeps track which
4
+ # Reader/Writer combinations can be used together. The registry makes it
5
+ # possible to implement readers and writers without the need to modify
6
+ # other BioInterchange framework code.
3
7
  class Registry
4
8
 
5
- def self.register_reader(uid, service)
6
- @@readers[uid] = service
9
+ def self.register_reader(reader_id, reader_class, parameters, supports_batch_processing, descriptive_name, options_help)
10
+ @@readers[reader_id] = [ reader_class ] + parameters
11
+ @@reader_batch_processors[reader_id] = true if supports_batch_processing
12
+ @@reader_descriptions[reader_id] = descriptive_name
13
+ @@reader_help_texts[reader_id] = options_help
7
14
  end
8
15
 
9
- def self.register_writer(uid, service)
10
- @@writers[uid] = service
16
+ def self.register_writer(writer_id, writer_class, compatible_reader_ids, supports_batch_processing, descriptive_name)
17
+ @@writers[writer_id] = writer_class
18
+ @@writer_batch_processors[writer_id] = true if supports_batch_processing
19
+ @@writer_descriptions[writer_id] = descriptive_name
20
+ compatible_reader_ids.each { |reader_id|
21
+ @@compatible_reader_writer_pairs["#{reader_id} #{writer_id}"] = true
22
+ }
11
23
  end
12
24
 
13
- def self.readers
14
- @@readers.clone.freeze
25
+ def self.is_supported?(reader_id, writer_id)
26
+ @@compatible_reader_writer_pairs["#{reader_id} #{writer_id}"] == true
15
27
  end
16
28
 
17
- def self.writers
18
- @@writers.clone.freeze
29
+ def self.is_supporting_batch_processing?(reader_id, writer_id)
30
+ @@reader_batch_processors[reader_id] and @@writer_batch_processors[writer_id]
31
+ end
32
+
33
+ def self.reader(reader_id)
34
+ @@readers[reader_id]
35
+ end
36
+
37
+ def self.writer(writer_id)
38
+ @@writers[writer_id]
39
+ end
40
+
41
+ def self.reader_descriptions
42
+ @@reader_descriptions.clone.freeze
43
+ end
44
+
45
+ def self.writer_descriptions
46
+ @@writer_descriptions.clone.freeze
47
+ end
48
+
49
+ def self.options_help(reader_id)
50
+ @@reader_help_texts[reader_id].clone.freeze
51
+ end
52
+
53
+ def self.reader_writer_pairs
54
+ @@compatible_reader_writer_pairs.keys.sort.map { |reader_writer_pair| reader_writer_pair.split(/ /, 2) }.freeze
19
55
  end
20
56
 
21
57
  private
22
58
 
23
59
  @@readers = {}
24
60
  @@writers = {}
61
+ @@reader_batch_processors = {}
62
+ @@writer_batch_processors = {}
63
+ @@reader_descriptions = {}
64
+ @@writer_descriptions = {}
65
+ @@reader_help_texts = {}
66
+ @@compatible_reader_writer_pairs = {}
25
67
 
26
68
  end
27
69