RubyGems - biointerchange - Versions diffs - 0.2.2 → 1.0.0 - Mend

biointerchange 0.2.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

data/Gemfile +1 -0
data/README.md +269 -19
data/VERSION +1 -1
data/examples/bininda_emonds_mammals.new +1 -0
data/examples/rdfization.rb +17 -0
data/examples/tree1.new +1 -0
data/examples/tree2.new +1 -0
data/examples/vocabulary.rb +26 -5
data/generators/javaify.rb +12 -18
data/generators/make_supplement_releases.rb +2 -0
data/generators/pythonify.rb +21 -8
data/generators/rdfxml.rb +15 -1
data/lib/biointerchange/cdao.rb +2014 -0
data/lib/biointerchange/core.rb +70 -77
data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +16 -0
data/lib/biointerchange/genomics/gff3_reader.rb +18 -4
data/lib/biointerchange/genomics/gvf_reader.rb +14 -0
data/lib/biointerchange/phylogenetics/cdao_rdf_ntriples.rb +108 -0
data/lib/biointerchange/phylogenetics/newick_reader.rb +81 -0
data/lib/biointerchange/phylogenetics/tree_set.rb +50 -0
data/lib/biointerchange/registry.rb +50 -8
data/lib/biointerchange/so.rb +150 -0
data/lib/biointerchange/textmining/pdfx_xml_reader.rb +21 -2
data/lib/biointerchange/textmining/pubannos_json_reader.rb +24 -1
data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +9 -0
data/lib/biointerchange/textmining/text_mining_reader.rb +5 -5
data/spec/phylogenetics_spec.rb +79 -0
data/supplemental/java/biointerchange/pom.xml +1 -1
data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/CDAO.java +2602 -0
data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/FALDO.java +30 -28
data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GFF3O.java +136 -104
data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GVF1O.java +367 -278
data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SIO.java +4388 -3127
data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SO.java +5970 -4351
data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SOFA.java +733 -544
data/supplemental/java/biointerchange/src/test/java/org/biointerchange/AppTest.java +3 -1
data/supplemental/python/biointerchange/cdao.py +2021 -0
data/supplemental/python/biointerchange/faldo.py +37 -38
data/supplemental/python/biointerchange/gff3o.py +156 -157
data/supplemental/python/biointerchange/goxref.py +172 -172
data/supplemental/python/biointerchange/gvf1o.py +428 -429
data/supplemental/python/biointerchange/sio.py +3133 -3134
data/supplemental/python/biointerchange/so.py +6626 -6527
data/supplemental/python/biointerchange/sofa.py +790 -791
data/supplemental/python/example.py +23 -5
data/supplemental/python/setup.py +2 -2
data/web/about.html +1 -0
data/web/api.html +223 -15
data/web/biointerchange.js +27 -6
data/web/cli.html +8 -3
data/web/index.html +6 -2
data/web/ontologies.html +3 -0
data/web/service/rdfizer.fcgi +7 -15
data/web/webservices.html +6 -2
metadata +30 -3

data/lib/biointerchange/core.rb CHANGED Viewed

@@ -20,6 +20,7 @@ module BioInterchange
   require 'biointerchange/exceptions'
   # Ontologies (besides the ones from the 'rdf' gem)
+  require 'biointerchange/cdao'
   require 'biointerchange/faldo'
   require 'biointerchange/gff3o'
   require 'biointerchange/goxref'
@@ -28,6 +29,9 @@ module BioInterchange
   require 'biointerchange/so'
   require 'biointerchange/sofa'
+  # Registry for reader/writer management:
+  require 'biointerchange/registry'
   # Reader/writer interfaces
   require 'biointerchange/reader'
   require 'biointerchange/model'
@@ -81,6 +85,19 @@ module BioInterchange
   # Writer
   # ...same GFF3 writer
+  #
+  # PHYLOGENETICS
+  #
+  # Reader
+  require 'biointerchange/phylogenetics/newick_reader'
+  # Model
+  require 'biointerchange/phylogenetics/tree_set'
+  # Writer
+  require 'biointerchange/phylogenetics/cdao_rdf_ntriples'
   #
   # ACTUAL COMMAND LINE IMPLEMENTATION
   #
@@ -97,7 +114,7 @@ module BioInterchange
         ["--batchsize", "-b", Getopt::OPTIONAL],                # batchsize for readers/writers that support +postpone?+
         ["--input", "-i", Getopt::REQUIRED],                    # input file format
         ["--rdf", "-r", Getopt::REQUIRED],                      # output file format
-        ["--annotate_name", Getopt::OPTIONAL],                  # name of resourcce/tool/person
+        ["--annotate_name", Getopt::OPTIONAL],                  # name of resource/tool/person
         ["--annotate_name_id", Getopt::OPTIONAL],               # uri of resource/tool/person
         ["--annotate_date", Getopt::OPTIONAL],                  # date of processing/annotation
         ["--annotate_version", Getopt::OPTIONAL],               # version number of resource
@@ -110,47 +127,38 @@ module BioInterchange
         puts "Usage: ruby #{$0} -i <format> -r <format> [options]"
         puts ''
         puts 'Supported input formats (--input <format>/-i <format>):'
-        puts '  biointerchange.gff3                : GFF3'
-        puts '  biointerchange.gvf                 : GVF'
-        puts '  dbcls.catanns.json                 : PubAnnotation JSON'
-        puts '  uk.ac.man.pdfx                     : PDFx XML'
+        Registry.reader_descriptions.each_pair { |reader_id, description|
+          puts "  #{reader_id}#{' ' * (34 - reader_id.length)} : #{description}"
+        }
         puts ''
         puts 'Supported output formats (--rdf <format>/-r <format>)'
-        puts '  rdf.biointerchange.gff3            : RDF N-Triples for the following input'
-        puts '    biointerchange.gff3'
-        puts '  rdf.biointerchange.gvf             : RDF N-Triples for the following input'
-        puts '    biointerchange.gff3'
-        puts '    biointerchange.gvf'
-        puts '  rdf.bh12.sio                       : RDF N-Triples for the following inputs'
-        puts '    dbcls.catanns.json'
-        puts '    uk.ac.man.pdfx'
+        Registry.writer_descriptions.each_pair { |writer_id, description|
+          puts "  #{writer_id}#{' ' * (34 - writer_id.length)} : #{description}"
+        }
         puts ''
         puts 'I/O options:'
+        puts '  -b <size>/--batchsize <size>       : process input in batches of the given size'
+        puts '                                      (if supported, see below for valid input/rdf pairs)'
         puts '  -f <file>/--file <file>            : file to read; STDIN used if not supplied'
         puts '  -o <file>/--out <file>             : output file; STDOUT used if not supplied'
         puts ''
-        puts 'Input-/RDF-format specific options:'
-        puts '  Input: dbcls.catanns.json, uk.ac.man.pdfx'
-        puts '  Output: rdf.bh12.sio'
-        puts '  Options:'
-        puts '    --annotate_date <date>           : date of processing/annotation (optional)'
-        puts '    --annotate_version <version>     : version number of resource (optional)'
-        puts '    --annotate_name <name>           : name of resource/tool/person (required)'
-        puts '    --annotate_name_id <id>          : URI of resource/tool/person (required)'
-        puts ''
-        puts 'Input-/RDF-format specific options:'
-        puts '  Input: biointerchange.gff3 or biointerchange.gvf'
-        puts '  Output: rdf.biointerchange.gff3 or rdf.biointerchange.gvf'
-        puts '  Options:'
-        puts '    -b <size>/--batchsize <size>     : process features in batches of the given size (optional)'
-        puts '    -t <date>/--date <date>          : date when the GFF3/GVF file was created (optional)'
-        puts '    --name <name>                    : name of the GFF3/GVF file creator (optional)'
-        puts '    --name_id <id>                   : email address of the GFF3/GVF file creator (optional)'
-        puts ''
         puts 'Other options:'
         puts '  -v / --version                     : print the Gem\'s version number and exit'
         puts '  -d / --debug                       : turn on debugging output (for stacktraces)'
         puts '  -h  --help                         : this message'
+        puts ''
+        puts 'Input-/RDF-format specific options:'
+        reader_writer_pairs = Registry.reader_writer_pairs
+        reader_writer_pairs.each_index { |reader_writer_pair_index|
+          reader_id, writer_id = reader_writer_pairs[reader_writer_pair_index]
+          puts "  Input format  : #{reader_id}"
+          puts "  Output format : #{writer_id}"
+          Registry.options_help(reader_id).each { |option_description|
+            option, description = option_description
+            puts "    --annotate_#{option}#{' ' * (21 - option.length)} : #{description}"
+          }
+          puts '' if reader_writer_pair_index + 1 < reader_writer_pairs.length
+        }
         exit 1
       end
@@ -166,62 +174,43 @@ module BioInterchange
       @@skip_rdf_graph = false if opt['no_rdf_graph_optimization']
       # Check if the input/rdf options are supported:
-      if opt['input'] == 'dbcls.catanns.json' or opt['input'] == 'uk.ac.man.pdfx' then
-        if opt['rdf'] == 'rdf.bh12.sio' then
-          raise ArgumentError, 'Require --name and --name_id options to specify source of annotations (e.g., a manual annotators name, or software tool name) and their associated URI (e.g., email address, or webaddress).' unless opt['name'] and opt['name_id']
-        else
-          unsupported_combination
-        end
-      elsif opt['input'] == 'biointerchange.gff3' then
-        if opt['rdf'] == 'rdf.biointerchange.gff3' then
-          # Okay. No further arguments required.
-        else
-          unsupported_combination
-        end
-      elsif opt['input'] == 'biointerchange.gvf' then
-        if opt['rdf'] == 'rdf.biointerchange.gff3' or opt['rdf'] == 'rdf.biointerchange.gvf' then
-          # Okay. No further arguments required.
-        else
-          unsupported_combination
-        end
-      else
-        unsupported_combination
-      end
+      unsupported_combination unless Registry.is_supported?(opt['input'], opt['rdf'])
-      wrong_type('batchsize', 'a positive integer') if opt['batchsize'] and not opt['batchsize'].match(/^[1-9][0-9]*$/)
+      if opt['batchsize'] then
+        batching_not_supported unless Registry.is_supporting_batch_processing?(opt['input'], opt['rdf'])
+        wrong_type('batchsize', 'a positive integer') unless opt['batchsize'].match(/^[1-9][0-9]*$/)
+      end
-      opt['batchsize'] = opt['batchsize'].to_i if opt['batchsize']
+      # Create a parameter map that can be passed along to Reader implementations:
+      map = {
+        'input'  => opt['input'],
+        'output' => opt['output']
+      }
+      map['batchsize'] = opt['batchsize'].to_i if opt['batchsize']
+      opt.each_key { |key|
+        map[key.sub(/^annotate_/, '')] = opt[key] if key.start_with?('annotate_')
+      }
       # Generate model from file (deserialization).
-      # Note: if-clauses are lexicographically ordered.
-      reader = nil
-      if opt['input'] == 'biointerchange.gff3' then
-        reader = BioInterchange::Genomics::GFF3Reader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], opt['batchsize'])
-      elsif opt['input'] == 'biointerchange.gvf' then
-        reader = BioInterchange::Genomics::GVFReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], opt['batchsize'])
-      elsif opt['input'] == 'dbcls.catanns.json' then
-        reader = BioInterchange::TextMining::PubAnnosJSONReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
-      elsif opt['input'] == 'uk.ac.man.pdfx' then
-        reader = BioInterchange::TextMining::PDFxXMLReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['annotate_version'])
-      end
+      reader_class, *args = Registry.reader(opt['input'])
+      reader = reader_class.new(*BioInterchange::get_parameters(map, args))
-      if opt["file"]
-        input_source = File.new(opt["file"],'r')
+      input_source = nil
+      if opt['file'] then
+        input_source = File.new(opt['file'], 'r')
       else
         input_source = STDIN
       end
-      # Generate rdf from model (serialization).
-      # Note: if-clauses are lexicographically ordered.
-      writer = nil
-      if opt['rdf'] == 'rdf.bh12.sio' then
-        writer = BioInterchange::TextMining::RDFWriter.new(File.new(opt['out'], 'w')) if opt['out']
-        writer = BioInterchange::TextMining::RDFWriter.new(STDOUT) unless opt['out']
-      end
-      if opt['rdf'] == 'rdf.biointerchange.gff3' or opt['rdf'] == 'rdf.biointerchange.gvf' then
-        writer = BioInterchange::Genomics::RDFWriter.new(File.new(opt['out'], 'w')) if opt['out']
-        writer = BioInterchange::Genomics::RDFWriter.new(STDOUT) unless opt['out']
+      output_source = nil
+      if opt['out'] then
+        output_source = File.new(opt['out'], 'w')
+      else
+        output_source = STDOUT
       end
+      # Generate rdf from model (serialization).
+      writer = Registry.writer(opt['rdf']).new(output_source)
       begin
         model = reader.deserialize(input_source)
@@ -270,6 +259,10 @@ module BioInterchange
 private
+  def self.batching_not_supported
+    raise ArgumentError, 'Batching is not supported for this input/output format combination.'
+  end
   def self.unsupported_combination
     raise ArgumentError, 'This input/output format combination is not supported.'
   end

data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb CHANGED Viewed

@@ -15,6 +15,22 @@ module BioInterchange::Genomics
 # - rdf.biointerchange.gvf
 class RDFWriter < BioInterchange::Writer
+  # Register writers:
+  BioInterchange::Registry.register_writer(
+    'rdf.biointerchange.gff3',
+    BioInterchange::Genomics::RDFWriter,
+    [ 'biointerchange.gff3' ],
+    true,
+    'Generic Feature Format Version 3 Ontology (GFF3O) based RDFization'
+  )
+  BioInterchange::Registry.register_writer(
+    'rdf.biointerchange.gvf',
+    BioInterchange::Genomics::RDFWriter,
+    [ 'biointerchange.gvf' ],
+    true,
+    'Genome Variation Format Version 1 Ontology (GVF1O) based RDFization'
+  )
   # Creates a new instance of a RDFWriter that will use the provided output stream to serialize RDF.
   #
   # +ostream+:: instance of an IO class or derivative that is used for RDF serialization

data/lib/biointerchange/genomics/gff3_reader.rb CHANGED Viewed

@@ -4,6 +4,20 @@ module BioInterchange::Genomics
 class GFF3Reader < BioInterchange::Reader
+  # Register reader:
+  BioInterchange::Registry.register_reader(
+    'biointerchange.gff3',
+    GFF3Reader,
+    [ 'name', 'name_uri', 'date' ],
+    true,
+    'Generic Feature Format Version 3 (GFF3) reader',
+    [
+      [ 'date <date>', 'date when the GFF3 file was created (optional)' ],
+      [ 'name <name>', 'name of the GFF3 file creator (optional)' ],
+      [ 'name_id <id>', 'email address of the GFF3 file creator (optional)' ]
+    ]
+  )
   # Creates a new instance of a Generic Feature Format Version 3 (GFF3) reader.
   #
   # The reader supports batch processing.
@@ -94,7 +108,7 @@ protected
       if type.match(/^SO:\d{7}$/) then
         type = RDF::URI.new("http://www.sequenceontology.org/miso/current_release/term/#{feature.type}")
       else
-        type = BioInterchange::SOFA.send(BioInterchange.make_safe_label(type))
+        type = BioInterchange::SO.send(BioInterchange.make_safe_label(type))
       end
     rescue NoMethodError
       raise BioInterchange::Exceptions::InputFormatError, "Type of feature is set to an unknown SOFA term: \"#{type}\""
@@ -154,15 +168,15 @@ protected
       feature_set.set_pragma(name, { name => value.to_f })
     elsif name == 'sequence-region' then
       regions = feature_set.pragma(name)
-      regions = {} unless regions
+      regions = { name => {} } unless regions
       seqid, start_coordinate, end_coordinate = value.split(/\s+/, 3)
-      regions[seqid] = BioInterchange::Genomics::GFF3Landmark.new(seqid, start_coordinate.to_i, end_coordinate.to_i)
+      regions[name][seqid] = BioInterchange::Genomics::GFF3Landmark.new(seqid, start_coordinate.to_i, end_coordinate.to_i)
       feature_set.set_pragma(name, regions)
     elsif name == 'species' then
       feature_set.set_pragma(name, { name => value })
     else
       # Unhandled pragma. Just save the value in its string form.
-      feature_set.set_pragma(name, value)
+      feature_set.set_pragma(name, { name => value })
     end
   end

data/lib/biointerchange/genomics/gvf_reader.rb CHANGED Viewed

@@ -2,6 +2,20 @@ module BioInterchange::Genomics
 class GVFReader < GFF3Reader
+  # Register reader:
+  BioInterchange::Registry.register_reader(
+    'biointerchange.gvf',
+    GVFReader,
+    [ 'name', 'name_uri', 'date' ],
+    true,
+    'Genome Variation Format Version 1 (GVF) reader',
+    [
+      [ 'date <date>', 'date when the GVF file was created (optional)' ],
+      [ 'name <name>', 'name of the GVF file creator (optional)' ],
+      [ 'name_id <id>', 'email address of the GVF file creator (optional)' ]
+    ]
+  )
   # Creates a new instance of a Genome Variation Format (GVF) reader.
   #
   # +name+:: Optional name of the person who generated the GVF file.

data/lib/biointerchange/phylogenetics/cdao_rdf_ntriples.rb ADDED Viewed

@@ -0,0 +1,108 @@
+require 'rdf'
+require 'rdf/ntriples'
+module BioInterchange::Phylogenetics
+# Serialized phylogenetic tree models based on BioRuby's phylogenetic tree implementation.
+class CDAORDFWriter < BioInterchange::Writer
+  # Register writers:
+  BioInterchange::Registry.register_writer(
+    'rdf.phylotastic.newick',
+    CDAORDFWriter,
+    [ 'phylotastic.newick' ],
+    true,
+    'Comparative Data Analysis Ontology (CDAO) based RDFization'
+  )
+  # Creates a new instance of a CDAORDFWriter that will use the provided output stream to serialize RDF.
+  #
+  # +ostream+:: instance of an IO class or derivative that is used for RDF serialization
+  def initialize(ostream)
+    @ostream = ostream
+  end
+  # Serialize a model as RDF.
+  #
+  # +model+:: a generic representation of input data that is an instance of BioInterchange::Phylogenetics::TreeSet
+  def serialize(model)
+    model.contents.each { |tree|
+      serialize_model(model, tree)
+    }
+  end
+protected
+  def serialize_model(model, tree)
+    graph = RDF::Graph.new
+    graph.fast_ostream(@ostream) if BioInterchange::skip_rdf_graph
+    tree_uri = RDF::URI.new(model.uri)
+    if model.date then
+      graph.insert(RDF::Statement.new(tree_uri, RDF::DC.date, RDF::Literal.new(model.date)))
+    end
+    serialize_tree(graph, tree, tree_uri, tree.root, true)
+    RDF::NTriples::Writer.dump(graph, @ostream)
+  end
+  def serialize_tree(graph, tree, tree_uri, node, is_root)
+    node_uri = RDF::URI.new("#{tree_uri.to_s}/node/#{node.object_id}")
+    if is_root then
+      graph.insert(RDF::Statement.new(tree_uri, RDF.type, BioInterchange::CDAO.NewickTree))
+      # Commented out some lines since it appears not to be determinable for Newick trees.
+      if tree.root then
+        # graph.insert(RDF::Statement.new(tree_uri, RDF.type, BioInterchange::CDAO.rootedtree))
+      else
+        # graph.insert(RDF::Statement.new(tree_uri, RDF.type, BioInterchange::CDAO.unrootedtree))
+        # Pick the first node available to permit serialization of the tree:
+        tree.root = node = tree.nodes.first
+      end
+    end
+    if node.name and not node.name.empty? then
+      taxonomic_unit_uri = RDF::URI.new("#{tree_uri.to_s}/taxonomic_unit/#{node.object_id}")
+      graph.insert(RDF::Statement.new(taxonomic_unit_uri, RDF.type, BioInterchange::CDAO.TU))
+      graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO::represents_TU, taxonomic_unit_uri))
+      graph.insert(RDF::Statement.new(taxonomic_unit_uri, RDF::RDFS.label, RDF::Literal.new(node.name.gsub('_', ' '))))
+    end
+    if tree.descendents(node).empty? then
+      graph.insert(RDF::Statement.new(node_uri, RDF.type, BioInterchange::CDAO.TerminalNode))
+    else
+      graph.insert(RDF::Statement.new(node_uri, RDF.type, BioInterchange::CDAO.AncestralNode))
+    end
+    if not tree.root == node and tree.parent(node) then
+      parent_uri = RDF::URI.new("#{tree_uri.to_s}/node/#{tree.parent(node).object_id}")
+      edge_uri = RDF::URI.new("#{tree_uri.to_s}/edge/#{tree.get_edge(tree.parent(node), node).object_id}")
+      annotation_uri = RDF::URI.new("#{tree_uri.to_s}/edge/#{tree.get_edge(tree.parent(node), node).object_id}/annotation")
+      graph.insert(RDF::Statement.new(edge_uri, RDF.type, BioInterchange::CDAO.DirectedEdge))
+      graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.belongs_to_Tree, tree_uri))
+      graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.has_Parent_Node, parent_uri))
+      graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.has_Child_Node, node_uri))
+      graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.belongs_to_Edge_as_Child, edge_uri))
+      graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.has_Parent, parent_uri))
+      graph.insert(RDF::Statement.new(parent_uri, BioInterchange::CDAO.belongs_to_Edge_as_Parent, edge_uri))
+      # if node.distance then
+      #  graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.has_Support_Value, RDF::Literal.new(node.distance, :datatype => RDF::URI.new('http://www.w3.org/2001/XMLSchema#decimal'))))
+      # end
+      graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.has_Annotation, annotation_uri))
+      graph.insert(RDF::Statement.new(annotation_uri, RDF.type, BioInterchange::CDAO.EdgeLength))
+      graph.insert(RDF::Statement.new(annotation_uri, BioInterchange::CDAO.has_Value, RDF::Literal.new(tree.get_edge(tree.parent(node), node).distance, :datatype => RDF::URI.new('http://www.w3.org/2001/XMLSchema#decimal'))))
+    end
+    graph.insert(RDF::Statement.new(tree_uri, BioInterchange::CDAO.has_Root, node_uri))
+    graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.belongs_to_Tree, tree_uri))
+    # Now, continue traversing the tree by visiting the current node's descendents:
+    tree.descendents(node).each { |descendent_node|
+      serialize_tree(graph, tree, tree_uri, descendent_node, false)
+    }
+  end
+end
+end

data/lib/biointerchange/phylogenetics/newick_reader.rb ADDED Viewed

@@ -0,0 +1,81 @@
+require 'bio'
+require 'date'
+module BioInterchange::Phylogenetics
+class NewickReader < BioInterchange::Reader
+  # Register reader:
+  BioInterchange::Registry.register_reader(
+    'phylotastic.newick',
+    NewickReader,
+    [ 'date' ],
+    true,
+    'Newick Tree File Format reader',
+    [
+      [ 'date <date>', 'date when the Newick file was created (optional)' ]
+    ]
+  )
+  # Creates a new instance of a Newick file format reader.
+  #
+  # The reader supports batch processing.
+  #
+  # +date+:: Optional date of when the Newick file was produced, annotated, etc.
+  # +batch_size+:: Optional integer that determines that number of features that
+  # should be processed in one go.
+  def initialize(date = nil, batch_size = nil)
+    @date = date
+    @batch_size = batch_size
+  end
+  # Reads a Newick file from the input stream and returns an associated model.
+  #
+  # If this method is called when +postponed?+ returns true, then the reading will
+  # continue from where it has been interrupted beforehand.
+  #
+  # +inputstream+:: an instance of class IO or String that holds the contents of a Newick file
+  def deserialize(inputstream)
+    if inputstream.kind_of?(IO)
+      create_model(inputstream)
+    elsif inputstream.kind_of?(String) then
+      create_model(StringIO.new(inputstream))
+    else
+      raise BioInterchange::Exceptions::ImplementationReaderError, 'The provided input stream needs to be either of type IO or String.'
+    end
+  end
+  # Returns true if the reading of the input was postponed due to a full batch.
+  def postponed?
+    @postponed
+  end
+protected
+  def create_model(newick)
+    if @postponed then
+      @postponed = false
+      @trees.prune
+    else
+      @trees = BioInterchange::Phylogenetics::TreeSet.new()
+      @trees.set_date(Date.parse(@date)) if @date
+    end
+    tree_io = Bio::FlatFile.open(Bio::Newick, newick)
+    while newick_tree = tree_io.next_entry
+      newick_tree.options[:bootstrap_style] = :disabled
+      @trees.add(newick_tree.tree)
+      if @batch_size and feature_no >= @batch_size then
+        @postponed = true
+        break
+      end
+    end
+    @trees
+  end
+end
+end

data/lib/biointerchange/phylogenetics/tree_set.rb ADDED Viewed

@@ -0,0 +1,50 @@
+require 'digest/sha1'
+module BioInterchange::Phylogenetics
+# A phylogenetic tree set that can contain multiple phylogenetic trees.
+class TreeSet < BioInterchange::Model
+  # Create a new instance of a tree set. A tree set can contain multiple phylogenetic trees.
+  def initialize
+    # Trees are stored as the keys of a hash map to increase performance:
+    @set = {}
+  end
+  # Returns the contents of the tree set.
+  def contents
+    @set.keys
+  end
+  # If a date was provided, then this method returns its value.
+  def date
+    @date
+  end
+  # Sets a date that is associated with the trees in this model (e.g., annotation date, creation date, etc.).
+  #
+  # +date+:: an instance of Date that is associated with all trees in the model
+  def set_date(date)
+    @date = date
+  end
+  # Returns an URI for this particular tree set, which is a not necessarily globally unique SHA1 hash.
+  def uri
+    "biointerchange://phylogenetics/treeset/self/#{Digest::SHA1.hexdigest(Time.now.to_s)}"
+  end
+  # Add a tree to the tree set.
+  #
+  # +tree+:: BioRuby tree instance that is added to the contents of this tree set
+  def add(tree)
+    @set[tree] = true
+  end
+  # Removes all features from the set, but keeps additional data (e.g., the date).
+  def prune
+    @set.clear
+  end
+end
+end

data/lib/biointerchange/registry.rb CHANGED Viewed

@@ -1,27 +1,69 @@
 module BioInterchange
+# A registry of Reader and Writer subclasses that also keeps track which
+# Reader/Writer combinations can be used together. The registry makes it
+# possible to implement readers and writers without the need to modify
+# other BioInterchange framework code.
 class Registry
-  def self.register_reader(uid, service)
-    @@readers[uid] = service
+  def self.register_reader(reader_id, reader_class, parameters, supports_batch_processing, descriptive_name, options_help)
+    @@readers[reader_id] = [ reader_class ] + parameters
+    @@reader_batch_processors[reader_id] = true if supports_batch_processing
+    @@reader_descriptions[reader_id] = descriptive_name
+    @@reader_help_texts[reader_id] = options_help
   end
-  def self.register_writer(uid, service)
-    @@writers[uid] = service
+  def self.register_writer(writer_id, writer_class, compatible_reader_ids, supports_batch_processing, descriptive_name)
+    @@writers[writer_id] = writer_class
+    @@writer_batch_processors[writer_id] = true if supports_batch_processing
+    @@writer_descriptions[writer_id] = descriptive_name
+    compatible_reader_ids.each { |reader_id|
+      @@compatible_reader_writer_pairs["#{reader_id} #{writer_id}"] = true
+    }
   end
-  def self.readers
-    @@readers.clone.freeze
+  def self.is_supported?(reader_id, writer_id)
+    @@compatible_reader_writer_pairs["#{reader_id} #{writer_id}"] == true
   end
-  def self.writers
-    @@writers.clone.freeze
+  def self.is_supporting_batch_processing?(reader_id, writer_id)
+    @@reader_batch_processors[reader_id] and @@writer_batch_processors[writer_id]
+  end
+  def self.reader(reader_id)
+    @@readers[reader_id]
+  end
+  def self.writer(writer_id)
+    @@writers[writer_id]
+  end
+  def self.reader_descriptions
+    @@reader_descriptions.clone.freeze
+  end
+  def self.writer_descriptions
+    @@writer_descriptions.clone.freeze
+  end
+  def self.options_help(reader_id)
+    @@reader_help_texts[reader_id].clone.freeze
+  end
+  def self.reader_writer_pairs
+    @@compatible_reader_writer_pairs.keys.sort.map { |reader_writer_pair| reader_writer_pair.split(/ /, 2) }.freeze
   end
 private
   @@readers = {}
   @@writers = {}
+  @@reader_batch_processors = {}
+  @@writer_batch_processors = {}
+  @@reader_descriptions = {}
+  @@writer_descriptions = {}
+  @@reader_help_texts = {}
+  @@compatible_reader_writer_pairs = {}
 end