RubyGems - biointerchange - Versions diffs - 0.1.2 → 0.1.3 - Mend

biointerchange 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

data/README.md +49 -4
data/VERSION +1 -1
data/examples/chromosome_BF.gff +1701 -0
data/examples/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf +4326 -0
data/examples/pubannotation.10096561.json +1 -0
data/examples/{pubannotation.json → pubannotation.10096561.json.old} +0 -0
data/examples/pubannotation.2626671.json +1 -0
data/lib/biointerchange/core.rb +58 -16
data/lib/biointerchange/genomics/gff3_feature.rb +1 -0
data/lib/biointerchange/genomics/gff3_feature_set.rb +31 -1
data/lib/biointerchange/genomics/gff3_pragmas.rb +35 -0
data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +60 -23
data/lib/biointerchange/genomics/gff3_reader.rb +74 -40
data/lib/biointerchange/genomics/gvf_feature.rb +24 -0
data/lib/biointerchange/genomics/gvf_feature_set.rb +14 -0
data/lib/biointerchange/genomics/gvf_pragmas.rb +6 -0
data/lib/biointerchange/genomics/gvf_reader.rb +37 -0
data/lib/biointerchange/gff3o.rb +1 -1
data/lib/biointerchange/gvf1o.rb +145 -17
data/lib/biointerchange/textmining/content.rb +1 -0
data/lib/biointerchange/textmining/content_connection.rb +74 -0
data/lib/biointerchange/textmining/document.rb +3 -1
data/lib/biointerchange/textmining/pubannos_json_reader.rb +87 -9
data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +58 -2
data/spec/gff3_rdfwriter_spec.rb +9 -1
data/spec/gvf_rdfwriter_spec.rb +81 -0
data/spec/text_mining_pubannos_json_reader_spec.rb +82 -10
data/spec/text_mining_rdfwriter_spec.rb +11 -0
data/web/api.html +30 -23
metadata +156 -138

data/lib/biointerchange/textmining/document.rb CHANGED

@@ -21,7 +21,9 @@ class Document
   #
   # +content+:: content of type +BioInterchange::TextMining::Content+ that should be added to the document
   def add(content)
-    raise BioInterchange::Exceptions::ImplementationModelError, 'Content has to be of kind BioInterchange::TextMining::Content' unless content.kind_of?(BioInterchange::TextMining::Content)
+    if ( (! content.kind_of?(BioInterchange::TextMining::Content)) && (! content.kind_of?(BioInterchange::TextMining::ContentConnection)) )
+      raise BioInterchange::Exceptions::ImplementationModelError, 'Content has to be of kind BioInterchange::TextMining::Content or kind BioInterchange::TextMining::ContentConnection'
+    end
     @content << content
   end

data/lib/biointerchange/textmining/pubannos_json_reader.rb CHANGED

@@ -27,7 +27,7 @@ private
       raise BioInterchange::Exceptions::InputFormatError, 'Error parsing the JSON input file: #{result["Error"]}'
     end
     text = result['text']
     #doc_uri = "http://pubannotation.dbcls.jp/pmdocs/" + result['pmid'].to_s
     doc_uri = result['docurl']
@@ -39,18 +39,25 @@ private
     #so our document requires content of type document or abstract
     #should it hold the content string?
+    #hash to remember annotation in case they are needed for building upon based on ids later
+    contents = {}
     if result['catanns']
       result['catanns'].each do |annot|
-        start_offset = annot['begin']
-        end_offset = annot['end']
+        start_offset = 0
+        end_offset = 0
+        if annot['span']
+          start_offset = annot['span']['begin']
+          end_offset = annot['span']['end']
+        elsif annot['begin'] and annot['end']
+          start_offset = annot['begin']
+          end_offset = annot['end']
+        end
         length = end_offset - start_offset
-        created_time = annot['created_at']
-        updated_time = annot['updated_at']
         category = annot['category']
-        #annset_id = annot['annset_id']
-        #doc_id = annot['doc_id']
-        #id = annot['id']
+        id = annot['id']
         entity = text.slice(start_offset..end_offset)
@@ -58,11 +65,82 @@ private
         con = Content.new(start_offset, length, Content::PHRASE, @process)
         con.setContext(doc)
         doc.add(con)
+        contents[id] = con
         #set process.date = updated_time?
       end
     end
+    if result['insanns']
+      result['insanns'].each do |annot|
+        #unsure what to do about this (con1), 'E1' is the ID of something not created yet.
+        #it is perhaps a case of making a new content, but with what params...?
+        #need to confirm what this is referring to with JDK
+        con1 = nil
+        con2 = contents[annot['object']]
+        #get annotation type
+        type = ContentConnection::UNSPECIFIED
+        case annot['type']
+        when 'subClassOf'
+          type = ContentConnection::SUBCLASS
+        end
+        connection = ContentConnection.new(con1, con2, type, @process)
+        connection.setContext(doc)
+        doc.add(connection)
+        contents[annot['id']] = connection
+      end
+    end
+    if result['relanns']
+      result['relanns'].each do |annot|
+        con1 = contents[annot['subject']]
+        con2 = contents[annot['object']]
+        #get annotation type
+        type = ContentConnection::UNSPECIFIED
+        case annot['type']
+        when 'equivalentTo'
+          type = ContentConnection::EQUIVALENCE
+        when 'themeOf'
+          type = ContentConnection::THEME
+        end
+        connection = ContentConnection.new(con1, con2, type, @process)
+        connection.setContext(doc)
+        doc.add(connection)
+        contents[annot['id']] = connection
+      end
+    end
+    if result['modanns']
+      result['modanns'].each do |annot|
+        #in this case, it is a modification of an already existing content object (speculation/negation).
+        con = contents[annot['object']]
+        #get annotation type
+        type = ContentConnection::UNSPECIFIED
+        case annot['type']
+        when 'Speculation'
+          type = ContentConnection::SPECULATION
+        when 'Negation'
+          type = ContentConnection::NEGATION
+        end
+        connection = ContentConnection.new(con, nil, type, @process)
+        connection.setContext(doc)
+        doc.add(connection)
+        contents[annot['id']] = connection
+      end
+    end
     doc
   end

data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb CHANGED

@@ -10,7 +10,7 @@ class RDFWriter < BioInterchange::Writer
   #
   # +ostream+:: instance of an IO class or derivative that is used for RDF serialization
   def initialize(ostream)
-    raise BioInterchange::Exceptions::ImplementationWriterError, 'The output stream is not an instance of IO or its subclasses.' unless ostream.kind_of?(IO)
+    raise BioInterchange::Exceptions::ImplementationWriterError, 'The output stream is not an instance of IO or its subclasses.' unless ostream.kind_of?(IO) || ostream.kind_of?(StringIO)
     @ostream = ostream
   end
@@ -68,6 +68,22 @@ private
     end
   end
+  # Generates a URI for a given content connection.
+  #
+  # +contentcon+:: content connection instance; its own URI, with the leading scheme ("...://") stripped, is embedded in the generated URI
+  # +kind+:: kind of the URI that should be generated; only +:start+ and +:stop+ are implemented, any other value raises an ImplementationWriterError
+  def content_connection_uri(contentcon, kind)
+    base_uri = 'biointerchange://textmining/content_connection'
+    case kind
+    when :start
+      RDF::URI.new("#{base_uri}/start/#{contentcon.uri.sub(/^.*?:\/\//, '')}")
+    when :stop
+      RDF::URI.new("#{base_uri}/stop/#{contentcon.uri.sub(/^.*?:\/\//, '')}")
+    else
+      raise BioInterchange::Exceptions::ImplementationWriterError, "There is no implementation for serializing a content as #{kind}."
+    end
+  end
   # Serializes RDF for a textual document representation using the Semanticscience Integrated Ontology
   # (http://code.google.com/p/semanticscience/wiki/SIO).
   #
@@ -77,7 +93,13 @@ private
     document_uri = RDF::URI.new(model.uri)
     graph.insert(RDF::Statement.new(document_uri, RDF.type, BioInterchange::SIO.document))
     model.contents.each { |content|
-      serialize_content(graph, document_uri, content)
+      if content.kind_of?(BioInterchange::TextMining::Content)
+        serialize_content(graph, document_uri, content)
+      elsif content.kind_of?(BioInterchange::TextMining::ContentConnection)
+        serialize_contentconnection(graph, document_uri, content)
+      else
+        raise BioInterchange::Exceptions::ImplementationWriterError, "Can only serialize Content and ContentConnection from a Document."
+      end
     }
     RDF::NTriples::Writer.dump(graph, @ostream)
   end
@@ -128,6 +150,40 @@ private
   end
+  # Serializes a ContentConnection object for a given document URI.
+  #
+  # +graph+:: RDF graph to which the content connection is added
+  # +document_uri+:: the document URI to which the added content connection belongs
+  # +contentcon+:: the content connection instance that is serialized
+  def serialize_contentconnection(graph, document_uri, contentcon)
+    contentcon_uri = RDF::URI.new(contentcon.uri)
+    graph.insert(RDF::Statement.new(document_uri, BioInterchange::SIO.has_attribute, contentcon_uri))
+    serialize_process(graph, document_uri, contentcon_uri, contentcon.process) if contentcon.process
+    #TODO: these SIO terms need confirming - they are here as an initial proof of concept.
+    #Next issue: some of these are relations and some are labels; they need to be separated out.
+    #Possibly only "has_attribute" and "RDF::type" should be used as relationship types, in which case the statements below need adjusting (e.g. via a "has_attribute" link between content1 and the content connection).
+    #NOTE(review): content1/content2 are dereferenced below without nil checks, while the PubAnnotation reader creates connections with one side set to nil - confirm that no branch can reach a nil.
+    case contentcon.type
+    when ContentConnection::UNSPECIFIED
+      graph.insert(RDF::Statement.new(contentcon.content1.uri, BioInterchange::SIO.has_attribute, BioInterchange::SIO.language_entity))
+    when ContentConnection::EQUIVALENCE
+      graph.insert(RDF::Statement.new(contentcon.content1.uri, BioInterchange::SIO.is_equal_to, contentcon.content2.uri))
+    when ContentConnection::SUBCLASS
+      #TODO: this case needs more information; the relationship is between contentcon.content2 and 'something' that has yet to be worked out.
+      graph.insert(RDF::Statement.new(contentcon.content2.uri, BioInterchange::SIO.has_attribute, BioInterchange::SIO.in_relation_to))
+    when ContentConnection::THEME
+      #TODO: there are other, more specific terms for this that need investigating as options.
+      graph.insert(RDF::Statement.new(contentcon.content1.uri, BioInterchange::SIO.has_target, contentcon.content2.uri))
+    when ContentConnection::SPECULATION
+      graph.insert(RDF::Statement.new(contentcon.content1.uri, BioInterchange::SIO.has_attribute, BioInterchange::SIO.speculation))
+    when ContentConnection::NEGATION
+      graph.insert(RDF::Statement.new(contentcon.content1.uri, BioInterchange::SIO.denotes, BioInterchange::SIO.negative_regulation))
+    end
+  end
   # Serializes a process object for a specific document uri
   #
   #

data/spec/gff3_rdfwriter_spec.rb CHANGED

@@ -66,7 +66,15 @@ describe BioInterchange::Genomics::RDFWriter do
       set.add(feature)
       BioInterchange::Genomics::RDFWriter.new(ostream).serialize(set)
       ostream.close
-      istream.read.lines.count.should be == 43
+      lines = istream.read.lines
+      feature_no = 0
+      lines.each { |line|
+        subject, predicate, object = line.chomp.split(/\s/, 3)
+        object.sub!(/\s+\.$/, '')
+        feature_no += 1 if predicate == "<#{RDF.type}>" and object == "<#{BioInterchange::GFF3O.Feature}>"
+      }
+      lines.count.should be == 43
+      feature_no.should be == 3
     end
   end
 end

data/spec/gvf_rdfwriter_spec.rb ADDED

@@ -0,0 +1,81 @@
+require 'rubygems'
+require 'rspec'
+# Turn off verbose reporting here, since class definitions may be loaded multiple
+# times. Otherwise Ruby warns that constants have already been initialized, which
+# is true, but they are only "re-initialized" with the very same values.
+v, $VERBOSE = $VERBOSE, nil
+load 'lib/biointerchange/core.rb'
+load 'lib/biointerchange/gvf1o.rb'
+load 'lib/biointerchange/sofa.rb'
+load 'lib/biointerchange/reader.rb'
+load 'lib/biointerchange/writer.rb'
+load 'lib/biointerchange/genomics/gvf_feature_set.rb'
+load 'lib/biointerchange/genomics/gvf_feature.rb'
+# The GVF implementation extends the GFF3 implementation, so load those classes too:
+load 'lib/biointerchange/genomics/gff3_rdf_ntriples.rb'
+load 'lib/biointerchange/genomics/gff3_feature_set.rb'
+load 'lib/biointerchange/genomics/gff3_feature.rb'
+$VERBOSE = v
+describe BioInterchange::Genomics::RDFWriter do
+  describe 'serialization of GVF models' do
+    it 'empty document' do
+      istream, ostream = IO.pipe
+      BioInterchange::Genomics::RDFWriter.new(ostream).serialize(BioInterchange::Genomics::GVFFeatureSet.new())
+      ostream.close
+      istream.read.lines.count.should eq(1)
+    end
+    it 'model with three features' do
+      istream, ostream = IO.pipe
+      set = BioInterchange::Genomics::GVFFeatureSet.new()
+      feature = BioInterchange::Genomics::GVFFeature.new(
+          'GRCh37.1',
+          'NCBI',
+          BioInterchange::SOFA.CDS,
+          32890598,
+          32890664,
+          0.1,
+          BioInterchange::Genomics::GFF3Feature::POSITIVE,
+          { 'ID' => [ 'BRCA2' ], 'annotation' => [ 'manual' ] }
+        )
+      set.add(feature)
+      feature = BioInterchange::Genomics::GVFFeature.new(
+          'GRCh37.1',
+          'NCBI',
+          BioInterchange::SOFA.modified_base,
+          32890599,
+          32890599,
+          0.8,
+          BioInterchange::Genomics::GFF3Feature::POSITIVE,
+          { 'ID' => [ 'aModifiedBase' ], 'Parent' => [ 'BRCA2' ] }
+        )
+      set.add(feature)
+      feature = BioInterchange::Genomics::GVFFeature.new(
+          'GRCh37.1',
+          'NCBI',
+          BioInterchange::SOFA.modified_base,
+          32890599,
+          32890599,
+          0.8,
+          BioInterchange::Genomics::GFF3Feature::POSITIVE,
+          { 'Parent' => [ 'BRCA2', 'aModifiedBase' ] }
+        )
+      set.add(feature)
+      BioInterchange::Genomics::RDFWriter.new(ostream).serialize(set)
+      ostream.close
+      lines = istream.read.lines
+      feature_no = 0
+      lines.each { |line|
+        subject, predicate, object = line.chomp.split(/\s/, 3)
+        object.sub!(/\s+\.$/, '')
+        feature_no += 1 if predicate == "<#{RDF.type}>" and object == "<#{BioInterchange::GVF1O.Feature}>"
+      }
+      lines.count.should be == 43
+      feature_no.should be == 3
+    end
+  end
+end

data/spec/text_mining_pubannos_json_reader_spec.rb CHANGED

@@ -12,6 +12,7 @@ load 'lib/biointerchange/textmining/text_mining_reader.rb'
 load 'lib/biointerchange/textmining/pubannos_json_reader.rb'
 load 'lib/biointerchange/textmining/document.rb'
 load 'lib/biointerchange/textmining/content.rb'
+load 'lib/biointerchange/textmining/content_connection.rb'
 load 'lib/biointerchange/textmining/process.rb'
 $VERBOSE = v
@@ -28,23 +29,55 @@ describe BioInterchange::TextMining::PubannosJsonReader do
         model.should be_an_instance_of BioInterchange::TextMining::Document
       end
       it 'read json from file' do
-        model = @reader.deserialize(File.new('examples/pubannotation.json'))
+        model = @reader.deserialize(File.new('examples/pubannotation.10096561.json'))
         model.should be_an_instance_of BioInterchange::TextMining::Document
       end
+	  it 'read old json from file' do
+        model = @reader.deserialize(File.new('examples/pubannotation.10096561.json.old'))
+        model.should be_an_instance_of BioInterchange::TextMining::Document
+      end
+    end
+    describe 'old json generated model checks' do
+      before :all do
+        reader = BioInterchange::TextMining::PubannosJsonReader.new("TestOld", "http://test.com", "00-00-0000", BioInterchange::TextMining::Process::UNSPECIFIED, "0.0")
+        @model = reader.deserialize('{ "name": "Peter Smith", "name_id": "<peter.smith@example.json>", "date": "2012-08-12", "version": "3", "docurl":"http://example.org/example_json", "text":"Some document text. With two annotations of type protein.\n", "catanns":[{"annset_id":1,"begin":0,"category":"Protein","doc_id":9,"end":10,"id":139},{"annset_id":1,"begin":20,"category":"Protein","doc_id":9,"end":42,"id":138}]}')
+      end
+      it 'model is of type document' do
+        @model.should be_an_instance_of BioInterchange::TextMining::Document
+      end
+      it 'document uri (job id read)' do
+        @model.uri.should eql "http://example.org/example_json"
+      end
+      it 'document has content' do
+        @model.contents.size.should eql 3
+      end
+      it 'document document' do
+        @model.contents[0].type.should eql BioInterchange::TextMining::Content::DOCUMENT and @model.contents[0].offset.should eql 0 and @model.contents[0].length.should eql 58
+      end
+      it 'document phrase' do
+        @model.contents[1].type.should eql BioInterchange::TextMining::Content::PHRASE and @model.contents[1].offset.should eql 0 and @model.contents[1].length.should eql 10 and
+        @model.contents[2].type.should eql BioInterchange::TextMining::Content::PHRASE and @model.contents[2].offset.should eql 20 and @model.contents[2].length.should eql 22
+      end
     end
-    describe 'generated model check' do
+    describe 'basic generated model checks' do
       before :all do
         reader = BioInterchange::TextMining::PubannosJsonReader.new("Test", "http://test.com", "00-00-0000", BioInterchange::TextMining::Process::UNSPECIFIED, "0.0")
-        @model = reader.deserialize('{ "name": "Peter Smith", "name_id": "<peter.smith@example.json>", "date": "2012-08-12", "version": "3", "docurl":"http://example.org/example_json", "text":"Some document text. With two annotations of type protein.\n", "catanns":[{"annset_id":1,"begin":0,"category":"Protein","doc_id":9,"end":10,"id":139},{"annset_id":1,"begin":20,"category":"Protein","doc_id":9,"end":42,"id":138}]}')
-        #puts "Document Model: #{@model.uri}"
-        #  @model.contents.each do |c|
-        #  puts "\tContent: #{c.type}, #{c.offset}, #{c.length}"
-        #end
+        @model = reader.deserialize('{ "name": "Peter Smith", "name_id": "<peter.smith@example.json>", "date": "2012-12-08", "version": "3", "docurl":"http://example.org/example_json", "text":"Some document text. With two annotations of type protein.\n", 	"catanns":[{"id":"T1","span":{"begin":0,"end":10},"category":"NP"},{"id":"T2","span":{"begin":20,"end":42},"category":"NP"}]}')
       end
       it 'model is of type document' do
@@ -65,10 +98,49 @@ describe BioInterchange::TextMining::PubannosJsonReader do
       it 'document phrase' do
         @model.contents[1].type.should eql BioInterchange::TextMining::Content::PHRASE and @model.contents[1].offset.should eql 0 and @model.contents[1].length.should eql 10 and
         @model.contents[2].type.should eql BioInterchange::TextMining::Content::PHRASE and @model.contents[2].offset.should eql 20 and @model.contents[2].length.should eql 22
       end
+    end
+    describe 'advanced generated model checks' do
+      before :all do
+        reader = BioInterchange::TextMining::PubannosJsonReader.new("Test", "http://test.com", "00-00-0000", BioInterchange::TextMining::Process::UNSPECIFIED, "0.0")
+        @model = reader.deserialize(File.new('examples/pubannotation.2626671.json'))
+      end
+      it 'model is of type document' do
+        @model.should be_an_instance_of BioInterchange::TextMining::Document
+      end
+      it 'document uri (job id read)' do
+        @model.uri.should eql "http://www.ncbi.nlm.nih.gov/pubmed/2626671"
+      end
+      it 'document has content' do
+        @model.contents.size.should eql 91
+      end
+      it 'document document' do
+        # The exact length seems to depend on the encoding used, so it is checked against a range (both bounds must hold).
+        @model.contents[0].type.should eql BioInterchange::TextMining::Content::DOCUMENT and @model.contents[0].offset.should eql 0 and ( @model.contents[0].length.should > 2350 and @model.contents[0].length.should < 2360 )
+      end
+      it 'document content types and interconnections' do
+        doc = 1
+        sub = 39
+        eq = 62
+        th = 64
+        spec = 87
+        neg = 89
+        @model.contents[doc].type.should eql BioInterchange::TextMining::Content::PHRASE and @model.contents[doc].offset.should eql 9 and @model.contents[doc].length.should eql 10 and
+        @model.contents[sub].type.should eql BioInterchange::TextMining::ContentConnection::SUBCLASS and @model.contents[sub].content1.should eql nil and @model.contents[sub].content2.offset.should eql 9 and
+        @model.contents[eq].type.should eql BioInterchange::TextMining::ContentConnection::EQUIVALENCE and @model.contents[eq].content1.offset.should eql 396 and @model.contents[eq].content2.offset.should eql 386 and
+        @model.contents[th].type.should eql BioInterchange::TextMining::ContentConnection::THEME and @model.contents[th].content1.offset.should eql 32 and @model.contents[th].content2.content2.offset.should eql 9 and
+        @model.contents[spec].type.should eql BioInterchange::TextMining::ContentConnection::SPECULATION and @model.contents[spec].content1.content2.offset.should eql 9 and @model.contents[spec].content2.should eql nil and
+        @model.contents[neg].type.should eql BioInterchange::TextMining::ContentConnection::NEGATION and @model.contents[neg].content1.content2.offset.should eql 426 and @model.contents[neg].content2.should eql nil
+      end
     end
   end

data/spec/text_mining_rdfwriter_spec.rb CHANGED

@@ -58,6 +58,17 @@ describe BioInterchange::TextMining::RDFWriter do
       ostream.close
       istream.read.lines.count.should be > 1
     end
+    it 'full advanced json document' do
+      ostream = StringIO.new
+      reader = BioInterchange::TextMining::PubannosJsonReader.new("Test", "http://test.com", "2012-12-09", BioInterchange::TextMining::Process::UNSPECIFIED, "0.0")
+      model = reader.deserialize(File.new('examples/pubannotation.2626671.json'))
+      BioInterchange::TextMining::RDFWriter.new(ostream).serialize(model)
+      ostream.close_write
+      ostream.string.lines.count.should > 100
+    end
   end
 end