biointerchange 0.2.2 → 1.0.0

Files changed (55)
  1. data/Gemfile +1 -0
  2. data/README.md +269 -19
  3. data/VERSION +1 -1
  4. data/examples/bininda_emonds_mammals.new +1 -0
  5. data/examples/rdfization.rb +17 -0
  6. data/examples/tree1.new +1 -0
  7. data/examples/tree2.new +1 -0
  8. data/examples/vocabulary.rb +26 -5
  9. data/generators/javaify.rb +12 -18
  10. data/generators/make_supplement_releases.rb +2 -0
  11. data/generators/pythonify.rb +21 -8
  12. data/generators/rdfxml.rb +15 -1
  13. data/lib/biointerchange/cdao.rb +2014 -0
  14. data/lib/biointerchange/core.rb +70 -77
  15. data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +16 -0
  16. data/lib/biointerchange/genomics/gff3_reader.rb +18 -4
  17. data/lib/biointerchange/genomics/gvf_reader.rb +14 -0
  18. data/lib/biointerchange/phylogenetics/cdao_rdf_ntriples.rb +108 -0
  19. data/lib/biointerchange/phylogenetics/newick_reader.rb +81 -0
  20. data/lib/biointerchange/phylogenetics/tree_set.rb +50 -0
  21. data/lib/biointerchange/registry.rb +50 -8
  22. data/lib/biointerchange/so.rb +150 -0
  23. data/lib/biointerchange/textmining/pdfx_xml_reader.rb +21 -2
  24. data/lib/biointerchange/textmining/pubannos_json_reader.rb +24 -1
  25. data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +9 -0
  26. data/lib/biointerchange/textmining/text_mining_reader.rb +5 -5
  27. data/spec/phylogenetics_spec.rb +79 -0
  28. data/supplemental/java/biointerchange/pom.xml +1 -1
  29. data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/CDAO.java +2602 -0
  30. data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/FALDO.java +30 -28
  31. data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GFF3O.java +136 -104
  32. data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GVF1O.java +367 -278
  33. data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SIO.java +4388 -3127
  34. data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SO.java +5970 -4351
  35. data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SOFA.java +733 -544
  36. data/supplemental/java/biointerchange/src/test/java/org/biointerchange/AppTest.java +3 -1
  37. data/supplemental/python/biointerchange/cdao.py +2021 -0
  38. data/supplemental/python/biointerchange/faldo.py +37 -38
  39. data/supplemental/python/biointerchange/gff3o.py +156 -157
  40. data/supplemental/python/biointerchange/goxref.py +172 -172
  41. data/supplemental/python/biointerchange/gvf1o.py +428 -429
  42. data/supplemental/python/biointerchange/sio.py +3133 -3134
  43. data/supplemental/python/biointerchange/so.py +6626 -6527
  44. data/supplemental/python/biointerchange/sofa.py +790 -791
  45. data/supplemental/python/example.py +23 -5
  46. data/supplemental/python/setup.py +2 -2
  47. data/web/about.html +1 -0
  48. data/web/api.html +223 -15
  49. data/web/biointerchange.js +27 -6
  50. data/web/cli.html +8 -3
  51. data/web/index.html +6 -2
  52. data/web/ontologies.html +3 -0
  53. data/web/service/rdfizer.fcgi +7 -15
  54. data/web/webservices.html +6 -2
  55. metadata +30 -3
@@ -20,6 +20,7 @@ module BioInterchange
  require 'biointerchange/exceptions'

  # Ontologies (besides the ones from the 'rdf' gem)
+ require 'biointerchange/cdao'
  require 'biointerchange/faldo'
  require 'biointerchange/gff3o'
  require 'biointerchange/goxref'
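
The newly required 'biointerchange/cdao' file is the generated Ruby wrapper for the Comparative Data Analysis Ontology, which the phylogenetics writer later in this changeset relies on. A minimal sketch of how its vocabulary accessors are used (assuming the gem has been loaded, e.g. via require 'biointerchange/core'; the exact URIs returned are not shown in this diff):

    BioInterchange::CDAO.NewickTree   # => RDF::URI of the CDAO class used to type Newick trees
    BioInterchange::CDAO.has_Root     # => RDF::URI of the corresponding CDAO property
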
@@ -28,6 +29,9 @@ module BioInterchange
  require 'biointerchange/so'
  require 'biointerchange/sofa'

+ # Registry for reader/writer management:
+ require 'biointerchange/registry'
+
  # Reader/writer interfaces
  require 'biointerchange/reader'
  require 'biointerchange/model'
@@ -81,6 +85,19 @@ module BioInterchange
  # Writer
  # ...same GFF3 writer

+ #
+ # PHYLOGENETICS
+ #
+
+ # Reader
+ require 'biointerchange/phylogenetics/newick_reader'
+
+ # Model
+ require 'biointerchange/phylogenetics/tree_set'
+
+ # Writer
+ require 'biointerchange/phylogenetics/cdao_rdf_ntriples'
+
  #
  # ACTUAL COMMAND LINE IMPLEMENTATION
  #
@@ -97,7 +114,7 @@ module BioInterchange
  ["--batchsize", "-b", Getopt::OPTIONAL], # batchsize for readers/writers that support +postpone?+
  ["--input", "-i", Getopt::REQUIRED], # input file format
  ["--rdf", "-r", Getopt::REQUIRED], # output file format
- ["--annotate_name", Getopt::OPTIONAL], # name of resourcce/tool/person
+ ["--annotate_name", Getopt::OPTIONAL], # name of resource/tool/person
  ["--annotate_name_id", Getopt::OPTIONAL], # uri of resource/tool/person
  ["--annotate_date", Getopt::OPTIONAL], # date of processing/annotation
  ["--annotate_version", Getopt::OPTIONAL], # version number of resource
@@ -110,47 +127,38 @@ module BioInterchange
  puts "Usage: ruby #{$0} -i <format> -r <format> [options]"
  puts ''
  puts 'Supported input formats (--input <format>/-i <format>):'
- puts ' biointerchange.gff3 : GFF3'
- puts ' biointerchange.gvf : GVF'
- puts ' dbcls.catanns.json : PubAnnotation JSON'
- puts ' uk.ac.man.pdfx : PDFx XML'
+ Registry.reader_descriptions.each_pair { |reader_id, description|
+ puts " #{reader_id}#{' ' * (34 - reader_id.length)} : #{description}"
+ }
  puts ''
  puts 'Supported output formats (--rdf <format>/-r <format>)'
- puts ' rdf.biointerchange.gff3 : RDF N-Triples for the following input'
- puts ' biointerchange.gff3'
- puts ' rdf.biointerchange.gvf : RDF N-Triples for the following input'
- puts ' biointerchange.gff3'
- puts ' biointerchange.gvf'
- puts ' rdf.bh12.sio : RDF N-Triples for the following inputs'
- puts ' dbcls.catanns.json'
- puts ' uk.ac.man.pdfx'
+ Registry.writer_descriptions.each_pair { |writer_id, description|
+ puts " #{writer_id}#{' ' * (34 - writer_id.length)} : #{description}"
+ }
  puts ''
  puts 'I/O options:'
+ puts ' -b <size>/--batchsize <size> : process input in batches of the given size'
+ puts ' (if supported, see below for valid input/rdf pairs)'
  puts ' -f <file>/--file <file> : file to read; STDIN used if not supplied'
  puts ' -o <file>/--out <file> : output file; STDOUT used if not supplied'
  puts ''
- puts 'Input-/RDF-format specific options:'
- puts ' Input: dbcls.catanns.json, uk.ac.man.pdfx'
- puts ' Output: rdf.bh12.sio'
- puts ' Options:'
- puts ' --annotate_date <date> : date of processing/annotation (optional)'
- puts ' --annotate_version <version> : version number of resource (optional)'
- puts ' --annotate_name <name> : name of resource/tool/person (required)'
- puts ' --annotate_name_id <id> : URI of resource/tool/person (required)'
- puts ''
- puts 'Input-/RDF-format specific options:'
- puts ' Input: biointerchange.gff3 or biointerchange.gvf'
- puts ' Output: rdf.biointerchange.gff3 or rdf.biointerchange.gvf'
- puts ' Options:'
- puts ' -b <size>/--batchsize <size> : process features in batches of the given size (optional)'
- puts ' -t <date>/--date <date> : date when the GFF3/GVF file was created (optional)'
- puts ' --name <name> : name of the GFF3/GVF file creator (optional)'
- puts ' --name_id <id> : email address of the GFF3/GVF file creator (optional)'
- puts ''
  puts 'Other options:'
  puts ' -v / --version : print the Gem\'s version number and exit'
  puts ' -d / --debug : turn on debugging output (for stacktraces)'
  puts ' -h --help : this message'
+ puts ''
+ puts 'Input-/RDF-format specific options:'
+ reader_writer_pairs = Registry.reader_writer_pairs
+ reader_writer_pairs.each_index { |reader_writer_pair_index|
+ reader_id, writer_id = reader_writer_pairs[reader_writer_pair_index]
+ puts " Input format : #{reader_id}"
+ puts " Output format : #{writer_id}"
+ Registry.options_help(reader_id).each { |option_description|
+ option, description = option_description
+ puts " --annotate_#{option}#{' ' * (21 - option.length)} : #{description}"
+ }
+ puts '' if reader_writer_pair_index + 1 < reader_writer_pairs.length
+ }

  exit 1
  end
@@ -166,62 +174,43 @@ module BioInterchange
  @@skip_rdf_graph = false if opt['no_rdf_graph_optimization']

  # Check if the input/rdf options are supported:
- if opt['input'] == 'dbcls.catanns.json' or opt['input'] == 'uk.ac.man.pdfx' then
- if opt['rdf'] == 'rdf.bh12.sio' then
- raise ArgumentError, 'Require --name and --name_id options to specify source of annotations (e.g., a manual annotators name, or software tool name) and their associated URI (e.g., email address, or webaddress).' unless opt['name'] and opt['name_id']
- else
- unsupported_combination
- end
- elsif opt['input'] == 'biointerchange.gff3' then
- if opt['rdf'] == 'rdf.biointerchange.gff3' then
- # Okay. No further arguments required.
- else
- unsupported_combination
- end
- elsif opt['input'] == 'biointerchange.gvf' then
- if opt['rdf'] == 'rdf.biointerchange.gff3' or opt['rdf'] == 'rdf.biointerchange.gvf' then
- # Okay. No further arguments required.
- else
- unsupported_combination
- end
- else
- unsupported_combination
- end
+ unsupported_combination unless Registry.is_supported?(opt['input'], opt['rdf'])

- wrong_type('batchsize', 'a positive integer') if opt['batchsize'] and not opt['batchsize'].match(/^[1-9][0-9]*$/)
+ if opt['batchsize'] then
+ batching_not_supported unless Registry.is_supporting_batch_processing?(opt['input'], opt['rdf'])
+ wrong_type('batchsize', 'a positive integer') unless opt['batchsize'].match(/^[1-9][0-9]*$/)
+ end

- opt['batchsize'] = opt['batchsize'].to_i if opt['batchsize']
+ # Create a parameter map that can be passed along to Reader implementations:
+ map = {
+ 'input' => opt['input'],
+ 'output' => opt['output']
+ }
+ map['batchsize'] = opt['batchsize'].to_i if opt['batchsize']
+ opt.each_key { |key|
+ map[key.sub(/^annotate_/, '')] = opt[key] if key.start_with?('annotate_')
+ }

  # Generate model from file (deserialization).
- # Note: if-clauses are lexicographically ordered.
- reader = nil
- if opt['input'] == 'biointerchange.gff3' then
- reader = BioInterchange::Genomics::GFF3Reader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], opt['batchsize'])
- elsif opt['input'] == 'biointerchange.gvf' then
- reader = BioInterchange::Genomics::GVFReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], opt['batchsize'])
- elsif opt['input'] == 'dbcls.catanns.json' then
- reader = BioInterchange::TextMining::PubAnnosJSONReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
- elsif opt['input'] == 'uk.ac.man.pdfx' then
- reader = BioInterchange::TextMining::PDFxXMLReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['annotate_version'])
- end
+ reader_class, *args = Registry.reader(opt['input'])
+ reader = reader_class.new(*BioInterchange::get_parameters(map, args))

- if opt["file"]
- input_source = File.new(opt["file"],'r')
+ input_source = nil
+ if opt['file'] then
+ input_source = File.new(opt['file'], 'r')
  else
  input_source = STDIN
  end

- # Generate rdf from model (serialization).
- # Note: if-clauses are lexicographically ordered.
- writer = nil
- if opt['rdf'] == 'rdf.bh12.sio' then
- writer = BioInterchange::TextMining::RDFWriter.new(File.new(opt['out'], 'w')) if opt['out']
- writer = BioInterchange::TextMining::RDFWriter.new(STDOUT) unless opt['out']
- end
- if opt['rdf'] == 'rdf.biointerchange.gff3' or opt['rdf'] == 'rdf.biointerchange.gvf' then
- writer = BioInterchange::Genomics::RDFWriter.new(File.new(opt['out'], 'w')) if opt['out']
- writer = BioInterchange::Genomics::RDFWriter.new(STDOUT) unless opt['out']
+ output_source = nil
+ if opt['out'] then
+ output_source = File.new(opt['out'], 'w')
+ else
+ output_source = STDOUT
  end
+
+ # Generate rdf from model (serialization).
+ writer = Registry.writer(opt['rdf']).new(output_source)

  begin
  model = reader.deserialize(input_source)
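
Taken together, the rewritten block replaces the hard-coded format dispatch with Registry lookups. A condensed sketch of the new flow outside the CLI; BioInterchange::get_parameters is assumed to pick the registered parameter names out of the map (as the code above suggests), and the file name and annotation values are illustrative:

    # Parameter map as the CLI would build it from the --annotate_* options:
    map = { 'name' => 'Jane Doe', 'name_uri' => 'jane@example.org', 'date' => '2012-10-12' }

    # Look up the reader class plus the parameter names it was registered with:
    reader_class, *parameter_names = BioInterchange::Registry.reader('biointerchange.gff3')
    reader = reader_class.new(*BioInterchange::get_parameters(map, parameter_names))
    model  = reader.deserialize(File.new('features.gff3', 'r'))

    # Writers are registered as bare classes that wrap an output stream:
    writer = BioInterchange::Registry.writer('rdf.biointerchange.gff3').new(STDOUT)
    writer.serialize(model)
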
@@ -270,6 +259,10 @@ module BioInterchange

  private

+ def self.batching_not_supported
+ raise ArgumentError, 'Batching is not supported for this input/output format combination.'
+ end
+
  def self.unsupported_combination
  raise ArgumentError, 'This input/output format combination is not supported.'
  end
@@ -15,6 +15,22 @@ module BioInterchange::Genomics
  # - rdf.biointerchange.gvf
  class RDFWriter < BioInterchange::Writer

+ # Register writers:
+ BioInterchange::Registry.register_writer(
+ 'rdf.biointerchange.gff3',
+ BioInterchange::Genomics::RDFWriter,
+ [ 'biointerchange.gff3' ],
+ true,
+ 'Generic Feature Format Version 3 Ontology (GFF3O) based RDFization'
+ )
+ BioInterchange::Registry.register_writer(
+ 'rdf.biointerchange.gvf',
+ BioInterchange::Genomics::RDFWriter,
+ [ 'biointerchange.gvf' ],
+ true,
+ 'Genome Variation Format Version 1 Ontology (GVF1O) based RDFization'
+ )
+
  # Creates a new instance of a RDFWriter that will use the provided output stream to serialize RDF.
  #
  # +ostream+:: instance of an IO class or derivative that is used for RDF serialization
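
The registration calls above are what make RDFWriter discoverable by the CLI. For a third-party writer the same pattern would presumably apply; every identifier in this sketch is made up for illustration only:

    # Hypothetical writer class; a real implementation would emit RDF for the model.
    module MyNamespace
      class ExampleRDFWriter < BioInterchange::Writer
        def initialize(ostream)
          @ostream = ostream
        end

        def serialize(model)
          # ... write triples derived from the model to @ostream ...
        end
      end
    end

    BioInterchange::Registry.register_writer(
      'rdf.example.format',          # writer ID, selectable via --rdf on the CLI
      MyNamespace::ExampleRDFWriter, # the Writer subclass itself
      [ 'biointerchange.gff3' ],     # reader IDs whose models this writer accepts
      true,                          # supports batch processing
      'Example ontology based RDFization'
    )
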
@@ -4,6 +4,20 @@ module BioInterchange::Genomics

  class GFF3Reader < BioInterchange::Reader

+ # Register reader:
+ BioInterchange::Registry.register_reader(
+ 'biointerchange.gff3',
+ GFF3Reader,
+ [ 'name', 'name_uri', 'date' ],
+ true,
+ 'Generic Feature Format Version 3 (GFF3) reader',
+ [
+ [ 'date <date>', 'date when the GFF3 file was created (optional)' ],
+ [ 'name <name>', 'name of the GFF3 file creator (optional)' ],
+ [ 'name_id <id>', 'email address of the GFF3 file creator (optional)' ]
+ ]
+ )
+
  # Creates a new instance of a Generic Feature Format Version 3 (GFF3) reader.
  #
  # The reader supports batch processing.
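
Outside the CLI, the reader can still be driven directly. A minimal sketch using the constructor signature implied by the previous command line code; the creator name, e-mail, date and file name are illustrative:

    # name, name_id and date correspond to the --annotate_* options; the batch size is omitted.
    reader = BioInterchange::Genomics::GFF3Reader.new('Jane Doe', 'jane@example.org', '2012-10-12')
    model  = reader.deserialize(File.new('annotations.gff3', 'r'))
    BioInterchange::Genomics::RDFWriter.new(STDOUT).serialize(model)
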
@@ -94,7 +108,7 @@ protected
  if type.match(/^SO:\d{7}$/) then
  type = RDF::URI.new("http://www.sequenceontology.org/miso/current_release/term/#{feature.type}")
  else
- type = BioInterchange::SOFA.send(BioInterchange.make_safe_label(type))
+ type = BioInterchange::SO.send(BioInterchange.make_safe_label(type))
  end
  rescue NoMethodError
  raise BioInterchange::Exceptions::InputFormatError, "Type of feature is set to an unknown SOFA term: \"#{type}\""
@@ -154,15 +168,15 @@ protected
  feature_set.set_pragma(name, { name => value.to_f })
  elsif name == 'sequence-region' then
  regions = feature_set.pragma(name)
- regions = {} unless regions
+ regions = { name => {} } unless regions
  seqid, start_coordinate, end_coordinate = value.split(/\s+/, 3)
- regions[seqid] = BioInterchange::Genomics::GFF3Landmark.new(seqid, start_coordinate.to_i, end_coordinate.to_i)
+ regions[name][seqid] = BioInterchange::Genomics::GFF3Landmark.new(seqid, start_coordinate.to_i, end_coordinate.to_i)
  feature_set.set_pragma(name, regions)
  elsif name == 'species' then
  feature_set.set_pragma(name, { name => value })
  else
  # Unhandled pragma. Just save the value in its string form.
- feature_set.set_pragma(name, { name => value })
  end
  end

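
The change above wraps every pragma value in a hash keyed by the pragma name, so 'sequence-region' landmarks now nest one level deeper. Roughly, the stored structure becomes the following (the sequence identifier and coordinates are illustrative):

    # Conceptually, after parsing "##sequence-region chr1 1 248956422" the pragma map holds:
    {
      'sequence-region' => {
        'chr1' => BioInterchange::Genomics::GFF3Landmark.new('chr1', 1, 248956422)
      }
    }
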
@@ -2,6 +2,20 @@ module BioInterchange::Genomics

  class GVFReader < GFF3Reader

+ # Register reader:
+ BioInterchange::Registry.register_reader(
+ 'biointerchange.gvf',
+ GVFReader,
+ [ 'name', 'name_uri', 'date' ],
+ true,
+ 'Genome Variation Format Version 1 (GVF) reader',
+ [
+ [ 'date <date>', 'date when the GVF file was created (optional)' ],
+ [ 'name <name>', 'name of the GVF file creator (optional)' ],
+ [ 'name_id <id>', 'email address of the GVF file creator (optional)' ]
+ ]
+ )
+
  # Creates a new instance of a Genome Variation Format (GVF) reader.
  #
  # +name+:: Optional name of the person who generated the GVF file.
@@ -0,0 +1,108 @@
+ require 'rdf'
+ require 'rdf/ntriples'
+
+ module BioInterchange::Phylogenetics
+
+ # Serialized phylogenetic tree models based on BioRuby's phylogenetic tree implementation.
+ class CDAORDFWriter < BioInterchange::Writer
+
+ # Register writers:
+ BioInterchange::Registry.register_writer(
+ 'rdf.phylotastic.newick',
+ CDAORDFWriter,
+ [ 'phylotastic.newick' ],
+ true,
+ 'Comparative Data Analysis Ontology (CDAO) based RDFization'
+ )
+
+ # Creates a new instance of a CDAORDFWriter that will use the provided output stream to serialize RDF.
+ #
+ # +ostream+:: instance of an IO class or derivative that is used for RDF serialization
+ def initialize(ostream)
+ @ostream = ostream
+ end
+
+ # Serialize a model as RDF.
+ #
+ # +model+:: a generic representation of input data that is an instance of BioInterchange::Phylogenetics::TreeSet
+ def serialize(model)
+ model.contents.each { |tree|
+ serialize_model(model, tree)
+ }
+ end
+
+ protected
+
+ def serialize_model(model, tree)
+ graph = RDF::Graph.new
+ graph.fast_ostream(@ostream) if BioInterchange::skip_rdf_graph
+ tree_uri = RDF::URI.new(model.uri)
+ if model.date then
+ graph.insert(RDF::Statement.new(tree_uri, RDF::DC.date, RDF::Literal.new(model.date)))
+ end
+ serialize_tree(graph, tree, tree_uri, tree.root, true)
+ RDF::NTriples::Writer.dump(graph, @ostream)
+ end
+
+ def serialize_tree(graph, tree, tree_uri, node, is_root)
+ node_uri = RDF::URI.new("#{tree_uri.to_s}/node/#{node.object_id}")
+
+ if is_root then
+ graph.insert(RDF::Statement.new(tree_uri, RDF.type, BioInterchange::CDAO.NewickTree))
+ # Commented out some lines since it appears not to be determinable for Newick trees.
+ if tree.root then
+ # graph.insert(RDF::Statement.new(tree_uri, RDF.type, BioInterchange::CDAO.rootedtree))
+ else
+ # graph.insert(RDF::Statement.new(tree_uri, RDF.type, BioInterchange::CDAO.unrootedtree))
+ # Pick the first node available to permit serialization of the tree:
+ tree.root = node = tree.nodes.first
+ end
+ end
+
+ if node.name and not node.name.empty? then
+ taxonomic_unit_uri = RDF::URI.new("#{tree_uri.to_s}/taxonomic_unit/#{node.object_id}")
+ graph.insert(RDF::Statement.new(taxonomic_unit_uri, RDF.type, BioInterchange::CDAO.TU))
+ graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO::represents_TU, taxonomic_unit_uri))
+ graph.insert(RDF::Statement.new(taxonomic_unit_uri, RDF::RDFS.label, RDF::Literal.new(node.name.gsub('_', ' '))))
+ end
+
+ if tree.descendents(node).empty? then
+ graph.insert(RDF::Statement.new(node_uri, RDF.type, BioInterchange::CDAO.TerminalNode))
+ else
+ graph.insert(RDF::Statement.new(node_uri, RDF.type, BioInterchange::CDAO.AncestralNode))
+ end
+
+ if not tree.root == node and tree.parent(node) then
+ parent_uri = RDF::URI.new("#{tree_uri.to_s}/node/#{tree.parent(node).object_id}")
+ edge_uri = RDF::URI.new("#{tree_uri.to_s}/edge/#{tree.get_edge(tree.parent(node), node).object_id}")
+ annotation_uri = RDF::URI.new("#{tree_uri.to_s}/edge/#{tree.get_edge(tree.parent(node), node).object_id}/annotation")
+ graph.insert(RDF::Statement.new(edge_uri, RDF.type, BioInterchange::CDAO.DirectedEdge))
+ graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.belongs_to_Tree, tree_uri))
+ graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.has_Parent_Node, parent_uri))
+ graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.has_Child_Node, node_uri))
+ graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.belongs_to_Edge_as_Child, edge_uri))
+ graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.has_Parent, parent_uri))
+ graph.insert(RDF::Statement.new(parent_uri, BioInterchange::CDAO.belongs_to_Edge_as_Parent, edge_uri))
+
+ # if node.distance then
+ # graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.has_Support_Value, RDF::Literal.new(node.distance, :datatype => RDF::URI.new('http://www.w3.org/2001/XMLSchema#decimal'))))
+ # end
+
+ graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.has_Annotation, annotation_uri))
+ graph.insert(RDF::Statement.new(annotation_uri, RDF.type, BioInterchange::CDAO.EdgeLength))
+ graph.insert(RDF::Statement.new(annotation_uri, BioInterchange::CDAO.has_Value, RDF::Literal.new(tree.get_edge(tree.parent(node), node).distance, :datatype => RDF::URI.new('http://www.w3.org/2001/XMLSchema#decimal'))))
+ end
+
+ graph.insert(RDF::Statement.new(tree_uri, BioInterchange::CDAO.has_Root, node_uri))
+ graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.belongs_to_Tree, tree_uri))
+
+ # Now, continue traversing the tree by visiting the current node's descendents:
+ tree.descendents(node).each { |descendent_node|
+ serialize_tree(graph, tree, tree_uri, descendent_node, false)
+ }
+ end
+
+ end
+
+ end
+
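
A short usage sketch for the new writer, pairing it with the Newick reader that follows; the file name refers to one of the example trees shipped with this release:

    reader = BioInterchange::Phylogenetics::NewickReader.new
    model  = reader.deserialize(File.new('tree1.new', 'r'))
    BioInterchange::Phylogenetics::CDAORDFWriter.new(STDOUT).serialize(model)
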
@@ -0,0 +1,81 @@
+ require 'bio'
+ require 'date'
+
+ module BioInterchange::Phylogenetics
+
+ class NewickReader < BioInterchange::Reader
+
+ # Register reader:
+ BioInterchange::Registry.register_reader(
+ 'phylotastic.newick',
+ NewickReader,
+ [ 'date' ],
+ true,
+ 'Newick Tree File Format reader',
+ [
+ [ 'date <date>', 'date when the Newick file was created (optional)' ]
+ ]
+ )
+
+ # Creates a new instance of a Newick file format reader.
+ #
+ # The reader supports batch processing.
+ #
+ # +date+:: Optional date of when the Newick file was produced, annotated, etc.
+ # +batch_size+:: Optional integer that determines that number of features that
+ # should be processed in one go.
+ def initialize(date = nil, batch_size = nil)
+ @date = date
+ @batch_size = batch_size
+ end
+
+ # Reads a Newick file from the input stream and returns an associated model.
+ #
+ # If this method is called when +postponed?+ returns true, then the reading will
+ # continue from where it has been interrupted beforehand.
+ #
+ # +inputstream+:: an instance of class IO or String that holds the contents of a Newick file
+ def deserialize(inputstream)
+ if inputstream.kind_of?(IO)
+ create_model(inputstream)
+ elsif inputstream.kind_of?(String) then
+ create_model(StringIO.new(inputstream))
+ else
+ raise BioInterchange::Exceptions::ImplementationReaderError, 'The provided input stream needs to be either of type IO or String.'
+ end
+ end
+
+ # Returns true if the reading of the input was postponed due to a full batch.
+ def postponed?
+ @postponed
+ end
+
+ protected
+
+ def create_model(newick)
+ if @postponed then
+ @postponed = false
+ @trees.prune
+ else
+ @trees = BioInterchange::Phylogenetics::TreeSet.new()
+ @trees.set_date(Date.parse(@date)) if @date
+ end
+
+ tree_io = Bio::FlatFile.open(Bio::Newick, newick)
+ while newick_tree = tree_io.next_entry
+ newick_tree.options[:bootstrap_style] = :disabled
+ @trees.add(newick_tree.tree)
+
+ if @batch_size and feature_no >= @batch_size then
+ @postponed = true
+ break
+ end
+ end
+
+ @trees
+ end
+
+ end
+
+ end
+
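
Since deserialize accepts either an IO or a String, a quick in-memory sketch is possible (the Newick string and date are illustrative):

    reader = BioInterchange::Phylogenetics::NewickReader.new('2012-10-12')
    trees  = reader.deserialize('((A:0.1,B:0.2):0.05,C:0.3);')
    trees.contents.length   # => 1; a BioRuby Bio::Tree instance wrapped in a TreeSet
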
@@ -0,0 +1,50 @@
+ require 'digest/sha1'
+
+ module BioInterchange::Phylogenetics
+
+ # A phylogenetic tree set that can contain multiple phylogenetic trees.
+ class TreeSet < BioInterchange::Model
+
+ # Create a new instance of a tree set. A tree set can contain multiple phylogenetic trees.
+ def initialize
+ # Trees are stored as the keys of a hash map to increase performance:
+ @set = {}
+ end
+
+ # Returns the contents of the tree set.
+ def contents
+ @set.keys
+ end
+
+ # If a date was provided, then this method returns its value.
+ def date
+ @date
+ end
+
+ # Sets a date that is associated with the trees in this model (e.g., annotation date, creation date, etc.).
+ #
+ # +date+:: an instance of Date that is associated with all trees in the model
+ def set_date(date)
+ @date = date
+ end
+
+ # Returns an URI for this particular tree set, which is a not necessarily globally unique SHA1 hash.
+ def uri
+ "biointerchange://phylogenetics/treeset/self/#{Digest::SHA1.hexdigest(Time.now.to_s)}"
+ end
+
+ # Add a tree to the tree set.
+ #
+ # +tree+:: BioRuby tree instance that is added to the contents of this tree set
+ def add(tree)
+ @set[tree] = true
+ end
+
+ # Removes all features from the set, but keeps additional data (e.g., the date).
+ def prune
+ @set.clear
+ end
+
+ end
+
+ end
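
A minimal sketch of the model class in isolation, using a plain BioRuby tree:

    require 'bio'
    require 'date'

    set = BioInterchange::Phylogenetics::TreeSet.new
    set.set_date(Date.parse('2012-10-12'))
    set.add(Bio::Tree.new)   # any BioRuby tree instance
    set.contents.length      # => 1
    set.uri                  # => "biointerchange://phylogenetics/treeset/self/<sha1 hex digest>"
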
@@ -1,27 +1,69 @@
  module BioInterchange

+ # A registry of Reader and Writer subclasses that also keeps track which
+ # Reader/Writer combinations can be used together. The registry makes it
+ # possible to implement readers and writers without the need to modify
+ # other BioInterchange framework code.
  class Registry

- def self.register_reader(uid, service)
- @@readers[uid] = service
+ def self.register_reader(reader_id, reader_class, parameters, supports_batch_processing, descriptive_name, options_help)
+ @@readers[reader_id] = [ reader_class ] + parameters
+ @@reader_batch_processors[reader_id] = true if supports_batch_processing
+ @@reader_descriptions[reader_id] = descriptive_name
+ @@reader_help_texts[reader_id] = options_help
  end

- def self.register_writer(uid, service)
- @@writers[uid] = service
+ def self.register_writer(writer_id, writer_class, compatible_reader_ids, supports_batch_processing, descriptive_name)
+ @@writers[writer_id] = writer_class
+ @@writer_batch_processors[writer_id] = true if supports_batch_processing
+ @@writer_descriptions[writer_id] = descriptive_name
+ compatible_reader_ids.each { |reader_id|
+ @@compatible_reader_writer_pairs["#{reader_id} #{writer_id}"] = true
+ }
  end

- def self.readers
- @@readers.clone.freeze
+ def self.is_supported?(reader_id, writer_id)
+ @@compatible_reader_writer_pairs["#{reader_id} #{writer_id}"] == true
  end

- def self.writers
- @@writers.clone.freeze
+ def self.is_supporting_batch_processing?(reader_id, writer_id)
+ @@reader_batch_processors[reader_id] and @@writer_batch_processors[writer_id]
+ end
+
+ def self.reader(reader_id)
+ @@readers[reader_id]
+ end
+
+ def self.writer(writer_id)
+ @@writers[writer_id]
+ end
+
+ def self.reader_descriptions
+ @@reader_descriptions.clone.freeze
+ end
+
+ def self.writer_descriptions
+ @@writer_descriptions.clone.freeze
+ end
+
+ def self.options_help(reader_id)
+ @@reader_help_texts[reader_id].clone.freeze
+ end
+
+ def self.reader_writer_pairs
+ @@compatible_reader_writer_pairs.keys.sort.map { |reader_writer_pair| reader_writer_pair.split(/ /, 2) }.freeze
  end

  private

  @@readers = {}
  @@writers = {}
+ @@reader_batch_processors = {}
+ @@writer_batch_processors = {}
+ @@reader_descriptions = {}
+ @@writer_descriptions = {}
+ @@reader_help_texts = {}
+ @@compatible_reader_writer_pairs = {}

  end
  end
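
With readers and writers registering themselves at load time, the CLI (or any other caller) can interrogate the registry generically; a brief sketch using the formats registered elsewhere in this changeset:

    # List every valid input/output combination known to the registry:
    BioInterchange::Registry.reader_writer_pairs.each { |reader_id, writer_id|
      puts "#{reader_id} -> #{writer_id}"
    }

    # Query a specific combination:
    BioInterchange::Registry.is_supported?('phylotastic.newick', 'rdf.phylotastic.newick')
    BioInterchange::Registry.is_supporting_batch_processing?('phylotastic.newick', 'rdf.phylotastic.newick')
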