RubyGems - obo_parser - Versions diffs - 0.3.3 → 0.3.4 - Mend

obo_parser 0.3.3 → 0.3.4

Files changed (9) hide show

data/README.rdoc CHANGED Viewed

@@ -30,14 +30,18 @@ A simple Ruby gem for parsing OBO 1.2 formatted ontology files.  Useful for repo
     first_typdef.id.value                               # => 'Some typedef id'
     first_typdef.name.value                             # => 'Some typedef name'
-    foo.terms.first.tags_named('is_a')                  # => [OboParser#Tag, ... ]
-    foo.terms.first.tags_named('is_a').first.tag        # => 'is_a'
-    foo.terms.first.tags_named('is_a').first.value      # => 'Some Term id'
+    foo.terms.first.tags_named('synonym')               # => [OboParser#Tag, ... ]
+    foo.terms.first.tags_named('synonym').first.tag     # => 'synonym'
+    foo.terms.first.tags_named('synonym').first.value   # => 'Some label'
+    foo.terms.first.relationships                       # => [['relation_ship', 'FOO:123'], ['other_relationship', 'FOO:456'] ...] An array of [relation, related term id], includes 'is_a', 'disjoint_from' and Typedefs
 See also /test/test_obo_parser.rb
 == Utilities
+!! UTILITIES ARE PRESENTLY BORKED !!
 A small set of methods (e.g. comparing OBO ontologies) utilizing the gem are included in utilities.rb. See /lib/utilities.rb.  For example, shared labels across sets of ontologies can be found and returned.
 == Copyright

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.3.3
1	+ 0.3.4

data/lib/obo_parser.rb CHANGED Viewed

@@ -34,6 +34,12 @@ module OboParser
       @terms.inject({}) {|sum, t| sum.update(t.id.value => t.name.value)}
     end
+    # A single line in a Stanza within an OBO file
+    class Tag
+      attr_accessor :tag, :value, :xrefs, :comment, :qualifier, :related_term, :relation
+    end
+    # A collection of single lines (Tags)
     class Stanza
       # Make special reference to several specific types of tags (:name, :id); subclasses will remove additional special types from :other_tags
       attr_accessor :name, :id, :def, :other_tags
@@ -45,10 +51,11 @@ module OboParser
           t = tags.shift
           new_tag = OboParser::Tag.new
           new_tag.tag = t.tag
           new_tag.value = t.value
           new_tag.comment = t.comment
-          new_tag.xrefs = t.xrefs
+          new_tag.xrefs = t.xrefs
           case new_tag.tag
           when 'id'
@@ -58,6 +65,11 @@ module OboParser
           when 'def'
             @def = new_tag
           else
+            if new_tag.tag == 'relationship'
+              new_tag.related_term = t.related_term
+              new_tag.relation = t.relation
+            end
             @other_tags.push(new_tag)
           end
         end
@@ -78,21 +90,23 @@ module OboParser
     # TODO: likely deprecate and run with one model (Stanza)
     class Term < Stanza
-  #   attr_accessor :some_term_specific_def
+     attr_accessor :relationships
       def initialize(tags)
-        super
-  #     anonymous_tags = []
-  #     # Loop through "unclaimed" tags and reference those specific to Term
-  #     while @other_tags.size != 0
-  #       t = @other_tags.shift
-  #       case t.tag
-  #       when 'def'
-  #         @def = t
-  #       else
-  #         anonymous_tags.push(t)
-  #       end
-  #     end
-  #     @other_tags = anonymous_tags
+       super
+       @relationships = []
+       anonymous_tags = []
+       # Loop through "unclaimed" tags and reference those specific to Term
+       while @other_tags.size != 0
+         t = @other_tags.shift
+         case t.tag
+         when 'relationship'
+           @relationships.push([t.relation, t.related_term])
+         else
+           anonymous_tags.push(t)
+         end
+       end
+       @other_tags = anonymous_tags
       end
     end
@@ -114,10 +128,6 @@ module OboParser
       end
     end
-    class Tag
-      attr_accessor :tag, :value, :xrefs, :comment, :qualifier
-    end
   end
   class OboParserBuilder

data/lib/parser.rb CHANGED Viewed

@@ -6,23 +6,24 @@ class OboParser::Parser
   def parse_file
     # At present we ignore the header lines
-    while !@lexer.peek(OboParser::Tokens::Term)
+    while !@lexer.peek(OboParser::Tokens::Term) && !@lexer.peek(OboParser::Tokens::Typedef)
       @lexer.pop(OboParser::Tokens::TagValuePair)
     end
     i = 0
     while !@lexer.peek(OboParser::Tokens::Typedef) && !@lexer.peek(OboParser::Tokens::EndOfFile)
-      raise OboParser::ParseError, "infinite loop in Terms" if i > 10000000 # there aren't that many words!
+      raise OboParser::ParseError, "infinite loop in Terms?" if i > 20000 # there aren't that many words!
       parse_term
       i += 1
     end
     i = 0
     while @lexer.peek(OboParser::Tokens::Typedef)
-      raise OboParser::ParseError,"infinite loop in Typedefs" if i > 1000000
+      raise OboParser::ParseError,"infinite loop in Typedefs?" if i > 20000
       parse_typedef
       i += 1
     end
   end
   def parse_term
@@ -30,8 +31,18 @@ class OboParser::Parser
     tags = []
     while !@lexer.peek(OboParser::Tokens::Term) && !@lexer.peek(OboParser::Tokens::Typedef) && !@lexer.peek(OboParser::Tokens::EndOfFile)
       begin
-        t = @lexer.pop(OboParser::Tokens::TagValuePair)
+        if @lexer.peek(OboParser::Tokens::IsATag)
+          t = @lexer.pop(OboParser::Tokens::IsATag)
+        elsif @lexer.peek(OboParser::Tokens::DisjointFromTag)
+          t = @lexer.pop(OboParser::Tokens::DisjointFromTag)
+        elsif @lexer.peek(OboParser::Tokens::RelationshipTag)
+          t = @lexer.pop(OboParser::Tokens::RelationshipTag)
+        else
+          t = @lexer.pop(OboParser::Tokens::TagValuePair)
+        end
         tags.push(t)
       rescue
         raise
       end

data/lib/tokens.rb CHANGED Viewed

@@ -17,45 +17,101 @@ module OboParser::Tokens
     @regexp = Regexp.new(/\A\s*(\[typedef\])\s*/i)
   end
+  # Token needs simplification, likely through creating additional tokens for quoted qualifiers, optional modifiers ({}), and the creation of individual
+  # tokens for individual tags that don't conform to the pattern used for def: tags.
+  # The code can't presently handle escaped characters (like \,), as bizarrely found in some OBO files.
   class TagValuePair < Token
-    attr_reader :tag, :comment, :xrefs, :qualifier
+    attr_reader :tag, :comment, :xrefs, :qualifier, :description
     @regexp = Regexp.new(/\A\s*([^:]+:.+)\s*\n*/i)
     def initialize(str)
       str.strip!
       tag, value = str.split(':',2)
       value.strip!
-      # Handle comments
-      if value =~ /(!\s*.+)\Z/i
+      if tag == 'comment'
+        @tag = tag.strip
+        @value = value.strip
+        return
+      end
+      @xrefs = []
+      # Handle inline comments
+      if value =~ /(\s+!\s*.+)\s*\n*\z/i
         @comment = $1
         value.gsub!(@comment, '')
-        @comment.gsub!(/\A!\s*/, '')
         @comment.strip!
+        @comment.gsub!(/\A!\s*/, '')
+      end
+      value.strip!
+      # Qualifier for the whole tag
+      if value =~ /(\{[^{]*?\})\s*\n*\z/
+        @qualifier = $1
+        value.gsub!(@qualifier, '')
+        @qualifier.strip!
       end
-      # Break out the xrefs, could be made made robust
-      # Assumes non-quoted comma delimited in format 'foo:bar, stuff:things'
-      if value =~ /(\s*\[.*\]\s*)/i
+      value.strip!
+      # Handle an xref list. TODO: Tokenize
+      if value =~ /(\[.*\])/i
         xref_list = $1
         value.gsub!(xref_list, '')
         xref_list.strip!
-        xref_list = xref_list[1..-2] # strip []
-        @xrefs = xref_list.split(",")
+        xref_list = xref_list[1..-2] # [] off
+        qq = 0 # some failsafes
+        while xref_list.length > 0
+          qq += 1
+          raise "#{xref_list}" if qq > 500
+          xref_list.gsub!(/\A\s*,\s*/, '')
+          xref_list =~ /\A(.+?:[^\"|\{|\,]+)/i
+          v = $1
+          if !(v == "") && !v.nil?
+            v.strip!
+            r = Regexp.escape v
+            xref_list.gsub!(/\A#{r}\s*/, '')
+            @xrefs.push(v) if !v.nil?
+          end
+          xref_list.strip!
+          # A description
+          if xref_list =~ /\A(\s*".*?")/i
+            d = $1
+            r = Regexp.escape d
+            xref_list.gsub!(/\A#{r}/, '')
+            xref_list.strip!
+          end
+          # An optional modifier
+          if xref_list =~ /\A(\s*\{[^\}]*?\})/
+            m = $1
+            r = Regexp.escape m
+            xref_list.gsub!(/\A#{r}/, '')
+            xref_list.strip!
+          end
+          xref_list.strip!
+        end
       end
-      if value =~ /\A\"/
-        value =~ /(".*")/
-        @value = $1
-        value.gsub!(@value, '')
-        @qualifier = value.strip
+      value.strip!
+      # At this point we still might have a '"foo" QUALIFIER' combination
+      if value =~ /\A(\"[^\"]*\")\s+(.*)/
+        @value = $1.strip
+        @qualifier = $2.strip if !$2.nil?
       else
         @value = value.strip
-        @qualifier = nil
       end
-      @value = @value[1..-2].strip if @value[0..0] == "\"" # get rid of quote marks
-      @value = @value[1..-2].strip if @value[0..0] == "'"  # get rid of quote marks
+      @value = @value[1..-2].strip if @value[0..0] == "\""
       @tag = tag.strip
       @value.strip!
     end
@@ -73,6 +129,51 @@ module OboParser::Tokens
     end
   end
+  class RelationshipTag < Token
+    attr_reader :tag, :related_term, :relation, :comment, :xrefs #, :qualifier
+    @regexp = Regexp.new(/\A\s*relationship:\s*(.+)\s*\n*/i) #  returns key => value hash for tokens like 'foo=bar' or foo = 'b a ar'
+    def initialize(str)
+      @tag = 'relationship'
+      @xrefs = []
+      @relation, @related_term = str.split(/\s/,3)
+      str =~ /\s+!\s+(.*)\s*\n*/i
+      @comment = $1
+      @comment ||= ""
+      [@relation, @related_term, @comment].map(&:strip!)
+    end
+  end
+  class IsATag < Token
+    attr_reader :tag, :related_term, :relation, :comment, :xrefs #, :qualifier
+    @regexp = Regexp.new(/\A\s*is_a:\s*(.+)\s*\n*/i) #  returns key => value hash for tokens like 'foo=bar' or foo = 'b a ar'
+    def initialize(str)
+      @tag = 'relationship'
+      @relation = 'is_a'
+      @related_term, @comment = str.split(/\s/,2)
+      @comment ||= ""
+      @comment.gsub!(/\A!\s*/, '')
+      [@relation, @related_term, @comment].map(&:strip!)
+      @xrefs = []
+    end
+  end
+  class DisjointFromTag < Token
+    attr_reader :tag, :related_term, :relation, :comment, :xrefs #, :qualifier
+    @regexp = Regexp.new(/\A\s*disjoint_from:\s*(.+)\s*\n*/i) #  returns key => value hash for tokens like 'foo=bar' or foo = 'b a ar'
+    def initialize(str)
+      @tag = 'relationship'
+      @relation = 'disjoint_from'
+      @related_term, @comment = str.split(/\s/,2)
+      @comment ||= ""
+      @comment.gsub!(/\A!\s*/, '')
+      [@relation, @related_term, @comment].map(&:strip!)
+      @xrefs = []
+    end
+  end
   class NameValuePair < Token
     @regexp = Regexp.new('fail')
   end
@@ -167,6 +268,9 @@ module OboParser::Tokens
       OboParser::Tokens::Term,
       OboParser::Tokens::Typedef,
       OboParser::Tokens::LBracket,
+      OboParser::Tokens::DisjointFromTag,
+      OboParser::Tokens::IsATag,
+      OboParser::Tokens::RelationshipTag,
       OboParser::Tokens::TagValuePair,
       OboParser::Tokens::XrefList,
       OboParser::Tokens::EndOfFile

data/obo_parser.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = %q{obo_parser}
-  s.version = "0.3.3"
+  s.version = "0.3.4"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["mjy"]
-  s.date = %q{2011-04-07}
+  s.date = %q{2011-04-11}
   s.description = %q{Provides all-in-one object containing the contents of an OBO formatted file.  OBO version 1.2 is targeted, though this should work for 1.0. }
   s.email = %q{diapriid@gmail.com}
   s.extra_rdoc_files = [

data/test/cell.obo CHANGED Viewed

@@ -2979,7 +2979,7 @@ is_a: CL:0000412 ! polyploid cell
 [Term]
 id: CL:0000418
 name: arcade cell
-def: "An epithelial cell found in C. elegans that firmly hold the outer body wall and the lips to the inner cylinder of the pharynx in a manner that keeps these organs from breaking apart, while still giving each organ freedom of movement during feeding." [GOC:tf\,, http://www.wormatlas.org/ver1/handbook/hypodermis/hypsupportother.htm#arcadecells]
+def: "An epithelial cell found in C. elegans that firmly hold the outer body wall and the lips to the inner cylinder of the pharynx in a manner that keeps these organs from breaking apart, while still giving each organ freedom of movement during feeding." [GOC:tf, http://www.wormatlas.org/ver1/handbook/hypodermis/hypsupportother.htm#arcadecells]
 is_a: CL:0000066 ! epithelial cell
 [Term]

data/test/test_obo_parser.rb CHANGED Viewed

@@ -25,18 +25,18 @@ class Test_Regex < Test::Unit::TestCase
 end
 class Test_Lexer < Test::Unit::TestCase
   def test_term
-     lexer = OboParser::Lexer.new("[Term]")
-     assert lexer.pop(OboParser::Tokens::Term)
+    lexer = OboParser::Lexer.new("[Term]")
+    assert lexer.pop(OboParser::Tokens::Term)
   end
   def test_end_of_file
-     lexer = OboParser::Lexer.new("    \n\n")
-     assert lexer.pop(OboParser::Tokens::EndOfFile)
-     lexer = OboParser::Lexer.new("\n")
-     assert lexer.pop(OboParser::Tokens::EndOfFile)
+    lexer = OboParser::Lexer.new("    \n\n")
+    assert lexer.pop(OboParser::Tokens::EndOfFile)
+    lexer = OboParser::Lexer.new("\n")
+    assert lexer.pop(OboParser::Tokens::EndOfFile)
   end
   def test_parse_term_stanza
@@ -69,8 +69,18 @@ class Test_Lexer < Test::Unit::TestCase
     assert_equal 'PATO:0001301', t.value
   end
+  def test_typdef
+     input = '[Typedef]
+     id: part_of
+     name: part of
+     is_transitive: true'
+     assert foo = parse_obo_file(input)
+     assert_equal 1, foo.typedefs.size
+     assert_equal 'part_of', foo.typedefs.first.id.value
+  end
   def test_parse_term_stanza2
-   input = '[Term]
+    input = '[Term]
       id: CL:0000009
       name: fusiform initial
       alt_id: CL:0000274
@@ -85,23 +95,62 @@ class Test_Lexer < Test::Unit::TestCase
     assert_equal 'xylem initial', foo.terms.first.tags_named('synonym').first.value
     assert_equal 'xylem mother cell', foo.terms.first.tags_named('synonym')[1].value
     assert_equal 'CL:0000274', foo.terms.first.tags_named('alt_id').first.value
+    assert_equal 2, foo.terms.first.relationships.size
+    assert_equal(['CL:0000272', 'CL:0000610'], foo.terms.first.relationships.collect{|r| r[1]}.sort)
+    assert_equal(['is_a', 'is_a'], foo.terms.first.relationships.collect{|r| r[0]}.sort)
   end
   def test_parse_term
-     lexer = OboParser::Lexer.new("[Term]")
-     assert lexer.pop(OboParser::Tokens::Term)
+    lexer = OboParser::Lexer.new("[Term]")
+    assert lexer.pop(OboParser::Tokens::Term)
   end
   def test_xref_list
-     lexer = OboParser::Lexer.new("[foo:bar, stuff:things]")
-     assert t = lexer.pop(OboParser::Tokens::XrefList)
-     hsh = {'foo' => 'bar', 'stuff' => 'things'}
-     assert_equal hsh, t.value
+    lexer = OboParser::Lexer.new("[foo:bar, stuff:things]")
+    assert t = lexer.pop(OboParser::Tokens::XrefList)
+    assert_equal( {'foo' => 'bar', 'stuff' => 'things'} , t.value)
+  end
+  def test_relationship_tag
+    lexer = OboParser::Lexer.new("relationship: develops_from CL:0000333 ! neural crest cell")
+    assert t = lexer.pop(OboParser::Tokens::RelationshipTag)
+    assert_equal 'develops_from', t.relation
+    assert_equal 'CL:0000333', t.related_term
+    assert_equal 'relationship', t.tag
+    lexer = OboParser::Lexer.new("relationship: develops_from CL:0000333")
+    assert t = lexer.pop(OboParser::Tokens::RelationshipTag)
+    assert_equal 'develops_from', t.relation
+    assert_equal 'CL:0000333', t.related_term
+    assert_equal 'relationship', t.tag
+    lexer = OboParser::Lexer.new("is_a: CL:0000333 ! Foo")
+    assert t = lexer.pop(OboParser::Tokens::IsATag)
+    assert_equal 'is_a', t.relation
+    assert_equal 'CL:0000333', t.related_term
+    assert_equal 'Foo', t.comment
+    lexer = OboParser::Lexer.new("disjoint_from: CL:0000333")
+    assert t = lexer.pop(OboParser::Tokens::DisjointFromTag)
+    assert_equal 'disjoint_from', t.relation
+    assert_equal 'CL:0000333', t.related_term
+    assert_equal "", t.comment
+    lexer = OboParser::Lexer.new("relationship: part_of CL:0000333 ! Foo")
+    assert t = lexer.pop(OboParser::Tokens::RelationshipTag)
+    assert_equal 'part_of', t.relation
+    assert_equal 'CL:0000333', t.related_term
+    assert_equal 'Foo', t.comment
   end
   def test_tagvaluepair
-     lexer = OboParser::Lexer.new("id: PATO:0000179")
-     assert lexer.pop(OboParser::Tokens::TagValuePair)
+    lexer = OboParser::Lexer.new("id: PATO:0000179")
+    assert lexer.pop(OboParser::Tokens::TagValuePair)
   end
   def test_tagvaluepair_with_comments_and_xrefs
@@ -123,6 +172,22 @@ class Test_Lexer < Test::Unit::TestCase
     assert_equal([], t.xrefs)
   end
+  def test_that_xref_lists_parse_as_part_of_tagvalue_pair
+    lexer = OboParser::Lexer.new('def: "Foo and the bar, and stuff, and things.  More stuff, and things!" [GO_REF:0000031 "Foo!" , GOC:msz {some=trailingmodifier}, GOC:tfm, ISBN:9780781765190 "Fundamental Immunology!, 6ed (Paul,ed), 2003", PMID:16014527] {qualifier=foo} ! and a comment')
+    assert t = lexer.pop(OboParser::Tokens::TagValuePair)
+    assert_equal 'def', t.tag
+    assert_equal 'Foo and the bar, and stuff, and things.  More stuff, and things!', t.value
+    assert_equal(['GO_REF:0000031', 'GOC:msz', 'GOC:tfm', 'ISBN:9780781765190', 'PMID:16014527'], t.xrefs)
+  end
+  def test_crummy_space_filled_xrefs
+    lexer = OboParser::Lexer.new('def: "A quality inhering in a bearer by virtue of emitting light during exposure to radiation from an external source." [The Free Online dictionary:The Free Online dictionary "www.thefreedictionary.com/ -"]')
+    assert t = lexer.pop(OboParser::Tokens::TagValuePair)
+    assert_equal 'def', t.tag
+    assert_equal 'A quality inhering in a bearer by virtue of emitting light during exposure to radiation from an external source.', t.value
+    assert_equal(['The Free Online dictionary:The Free Online dictionary'], t.xrefs)
+  end
 end
 class Test_Parser < Test::Unit::TestCase
@@ -153,13 +218,7 @@ class Test_Parser < Test::Unit::TestCase
     assert_equal 'xylem mother cell', tmp[1].value
     assert_equal([], tmp[1].xrefs)
-    assert_equal 2, foo.terms[9].tags_named('is_a').size
-  end
-  def teardown
-    @of = nil
+    assert_equal 2, foo.terms[9].relationships.size
   end
   def test_file_completes_without_typedefs
@@ -167,5 +226,9 @@ class Test_Parser < Test::Unit::TestCase
     assert foo = parse_obo_file(@of2)
   end
+  def teardown
+    @of = nil
+  end
 end

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: obo_parser
 version: !ruby/object:Gem::Version
-  hash: 21
+  hash: 27
   prerelease:
   segments:
   - 0
   - 3
-  - 3
-  version: 0.3.3
+  - 4
+  version: 0.3.4
 platform: ruby
 authors:
 - mjy
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-04-07 00:00:00 -04:00
+date: 2011-04-11 00:00:00 -04:00
 default_executable:
 dependencies: []