RubyGems - rbbt-text - Versions diffs - 1.3.0 → 1.3.5 - Mend

rbbt-text 1.3.0 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

checksums.yaml +4 -4
data/lib/rbbt/bow/bow.rb +5 -2
data/lib/rbbt/bow/dictionary.rb +27 -23
data/lib/rbbt/document.rb +20 -5
data/lib/rbbt/document/annotation.rb +7 -4
data/lib/rbbt/document/corpus.rb +30 -3
data/lib/rbbt/document/corpus/pubmed.rb +2 -1
data/lib/rbbt/ner/abner.rb +3 -2
data/lib/rbbt/ner/banner.rb +3 -1
data/lib/rbbt/ner/brat.rb +1 -1
data/lib/rbbt/ner/g_norm_plus.rb +7 -1
data/lib/rbbt/ner/linnaeus.rb +2 -1
data/lib/rbbt/ner/patterns.rb +0 -1
data/lib/rbbt/ner/rner.rb +229 -0
data/lib/rbbt/ner/token_trieNER.rb +32 -18
data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
data/lib/rbbt/nlp/spaCy.rb +195 -0
data/lib/rbbt/relationship.rb +24 -0
data/lib/rbbt/segment.rb +9 -4
data/lib/rbbt/segment/annotation.rb +3 -3
data/lib/rbbt/segment/named_entity.rb +7 -0
data/lib/rbbt/segment/range_index.rb +1 -1
data/lib/rbbt/segment/relationship.rb +7 -0
data/lib/rbbt/segment/transformed.rb +5 -1
data/share/install/software/OpenNLP +1 -1
data/share/rner/config.rb +51 -0
data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
data/test/rbbt/document/test_annotation.rb +15 -6
data/test/rbbt/document/test_corpus.rb +15 -1
data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
data/test/rbbt/ner/test_rner.rb +132 -0
data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
data/test/rbbt/segment/test_annotation.rb +3 -4
data/test/rbbt/segment/test_encoding.rb +1 -1
data/test/rbbt/segment/test_named_entity.rb +7 -5
data/test/rbbt/segment/test_range_index.rb +1 -2
data/test/rbbt/segment/test_transformed.rb +33 -4
data/test/rbbt/test_segment.rb +5 -10
data/test/test_spaCy.rb +144 -0
metadata +12 -3

data/lib/rbbt/segment/range_index.rb CHANGED Viewed

@@ -6,7 +6,7 @@ module Segment::RangeIndex
     SegID.setup(res, :corpus => corpus)
   end
-  def self.index(segments, corpus, persist_file = :memory)
+  def self.index(segments, corpus = nil, persist_file = :memory)
     segments = segments.values.flatten if Hash === segments
     annotation_index =

data/lib/rbbt/segment/relationship.rb ADDED Viewed

@@ -0,0 +1,7 @@
+module Relationship
+  extend Entity
+  self.annotation :segments
+  self.annotation :type
+end

data/lib/rbbt/segment/transformed.rb CHANGED Viewed

@@ -68,6 +68,10 @@ module Transformed
     segments = [segments] unless Array === segments
     orig_length = self.length
+    offset = self.respond_to?(:offset) ? self.offset.to_i : 0
+    segments = segments.select{|s| s.offset.to_i >= offset && s.offset.to_i <= offset + self.length - 1 }
     Segment.clean_sort(segments).each do |segment|
       next if segment.offset.nil?
@@ -86,7 +90,7 @@ module Transformed
       updated_text = self[updated_begin..updated_end]
       if updated_text.nil?
-        Log.warn "Range outside of segment: #{self.length} #{segment.locus} (#{updated_range})"
+        Log.warn "Range outside of segment: #{self.length} #{segment.range} (#{updated_range})"
         next
       end

data/share/install/software/OpenNLP CHANGED Viewed

@@ -1,7 +1,7 @@
 #!/bin/bash
 name='OpenNLP'
-url="http://apache.rediris.es/opennlp/opennlp-1.9.1/apache-opennlp-1.9.1-bin.tar.gz"
+url="http://apache.rediris.es/opennlp/opennlp-1.9.2/apache-opennlp-1.9.2-bin.tar.gz"
 get_src "$name" "$url"
 move_opt "$name"

data/share/rner/config.rb ADDED Viewed

@@ -0,0 +1,51 @@
+isLetters     /^[A-Z]+$/i
+isUpper       /^[A-Z]+$/
+isLower       /^[a-z]+$/
+isDigits      /^[0-9]+$/i
+isRoman       /^[IVX]+$/
+isGreek       /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
+isPunctuation /^[,.;]$/
+isDelim       /^[\/()\[\]{}\-]$/
+isNonWord     /^[^\w]+$/
+isConjunction /^and|or|&|,$/
+hasLetters    /[A-Z]/i
+hasUpper      /.[A-Z]/
+hasLower      /[a-z]/
+hasDigits     /[0-9]/i
+hasGreek      /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
+hasPunctuation /[,.;]/
+hasDelim      /[\/()\[\]{}\-]/
+hasNonWord    /[^\w]/
+caspMix       /[a-z].[A-Z]/
+keywords      /(?:protein|gene|domain|ase)s?$/
+hasSuffix     /[a-z][A-Z0-9]$/
+numLetters    do |w| w.scan(/[A-Z]/i).length end
+numDigits     do |w| w.scan(/[0-9]/).length end
+#
+prefix_3      /^(...)/
+prefix_4      /^(....)/
+suffix_3      /(...)$/
+suffix_4      /(....)$/
+token1        do |w|
+                 w.sub(/[A-Z]/,'A').
+                   sub(/[a-z]/,'a').
+                   sub(/[0-9]/,'0').
+                   sub(/[^0-9a-z]/i,'x')
+              end
+token2        do  |w|
+                 w.sub(/[A-Z]+/,'A').
+                   sub(/[a-z]+/,'a').
+                   sub(/[0-9]+/,'0').
+                   sub(/[^0-9a-z]+/i,'x')
+               end
+token3         do |w| w.downcase end
+special        do |w| w.is_special? end
+context   %w(special token2 isPunctuation isDelim)
+window     %w(1 2 3 -1 -2 -3)
+#direction :reverse

data/test/rbbt/document/corpus/test_pubmed.rb CHANGED Viewed

@@ -7,7 +7,8 @@ class TestCorpusPubmed < Test::Unit::TestCase
   def test_add_pmid
     corpus = Document::Corpus.setup({})
-    document = corpus.add_pmid("32299157", :abstract).first
+    document = corpus.add_pmid("33359141", :abstract).first
+    iii document.docid
     title = document.to(:title)
     assert title.include?("COVID-19")
   end

data/test/rbbt/document/test_annotation.rb CHANGED Viewed

@@ -4,6 +4,7 @@ require 'rbbt/document/corpus'
 require 'rbbt/segment'
 require 'rbbt/document/annotation'
 require 'rbbt/segment/named_entity'
+require 'rbbt/ner/abner'
 class TestAnnotation < Test::Unit::TestCase
   class CalledOnce < Exception; end
@@ -28,6 +29,12 @@ class TestAnnotation < Test::Unit::TestCase
       self.split(" ").collect{|e| NamedEntity.setup(e, :code => Misc.digest(e)) }
     end
+    Document.define :abner do
+      $called_once = true
+      Abner.new.match(self)
+    end
     Document.persist :ner
   end
@@ -36,7 +43,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text, "TEST", "test_doc1", nil)
     corpus = {}
-    corpus.extend Document::Corpus
+    Document::Corpus.setup corpus
     corpus.add_document(text)
@@ -50,7 +57,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text2, "TEST", "test_doc2", nil)
     corpus = {}
-    corpus.extend Document::Corpus
+    Document::Corpus.setup corpus
     corpus.add_document(text1)
     corpus.add_document(text2)
@@ -68,7 +75,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text, "TEST", "test_doc1", nil)
     corpus = {}
-    corpus.extend Document::Corpus
+    Document::Corpus.setup corpus
     corpus.add_document(text)
@@ -95,7 +102,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text, "TEST", "test_doc1", nil)
     corpus = {}
-    corpus.extend Document::Corpus
+    Document::Corpus.setup corpus
     corpus.add_document(text)
@@ -122,7 +129,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text, "TEST", "test_doc1", nil)
     corpus = {}
-    corpus.extend Document::Corpus
+    Document::Corpus.setup corpus
     corpus.add_document(text)
@@ -133,7 +140,9 @@ class TestAnnotation < Test::Unit::TestCase
     text.ner
     assert ! $called_once
+    assert_equal text.abner.first.docid, text.docid
     assert  text.ner.first.segid.include?("TEST:")
   end
 end

data/test/rbbt/document/test_corpus.rb CHANGED Viewed

@@ -26,7 +26,21 @@ class TestDocumentCorpus < Test::Unit::TestCase
       corpus.add_document(text)
-      assert corpus.prefix("TEST:").include?(text.docid)
+      assert corpus.docids("TEST:").include?(text.docid)
+    end
+  end
+  def test_load
+    text = "This is a document"
+    Document.setup(text, "TEST", "test_doc1", nil)
+    TmpFile.with_file do |path|
+      corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
+      corpus.extend Document::Corpus
+      corpus.add_document(text)
+      assert corpus.docids("TEST:").include?(text.docid)
     end
   end
 end

data/test/rbbt/ner/test_g_norm_plus.rb CHANGED Viewed

@@ -5,12 +5,17 @@ Log.severity = 0
 class TestGNormPlus < Test::Unit::TestCase
   def test_match
     text =<<-EOF
-We found that TP53 is regulated by MDM2 in Homo sapiens
+Introduction
+We found that TP53 is regulated by MDM2 in Homo
+sapiens
     EOF
     mentions = GNormPlus.process({:file => text})
     assert_equal 1, mentions.length
-    assert_equal 2, mentions["file"].length
+    assert_equal 3, mentions["file"].length
   end
   def test_entities
@@ -19,7 +24,10 @@ We found that TP53 is regulated by MDM2 in Homo sapiens
     EOF
     mentions = GNormPlus.entities({:file => text})
-    mentions["file"].include? "TP53"
+    assert mentions["file"].include?("TP53")
+    mentions["file"].each do |mention|
+      assert_equal mention, text[mention.range].sub("\n", ' ')
+    end
   end
 end

data/test/rbbt/ner/test_rner.rb ADDED Viewed

@@ -0,0 +1,132 @@
+require File.dirname(__FILE__) + '/../../test_helper'
+require 'rbbt'
+require 'rbbt/ner/rner'
+require 'test/unit'
+class TestRNer < Test::Unit::TestCase
+  def setup
+    @parser = NERFeatures.new() do
+      isLetters     /^[A-Z]+$/i
+      context prefix_3      /^(...)/
+      downcase do |w| w.downcase end
+      context %w(downcase)
+    end
+  end
+  def test_config
+    config = <<-EOC
+      isLetters     /^[A-Z]+$/i
+      context prefix_3      /^(...)/
+      downcase do |w| w.downcase end
+      context %w(downcase)
+    EOC
+    assert_equal config.strip, @parser.config.strip
+  end
+  def test_reverse
+    assert_equal("protein P53", NERFeatures.reverse("P53 protein"))
+    assert_equal(
+       ". LH of assay - radioimmuno serum the with compared was LH urinary for ) GONAVIS - HI ( test hemagglutination direct new A",
+     NERFeatures.reverse(
+       "A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH."
+      ))
+  end
+  def test_features
+    assert_equal  @parser.features("abCdE"), ["abCdE",true,'abC','abcde']
+  end
+  def test_template
+    template =<<-EOT
+UisLetters: %x[0,1]
+Uprefix_3: %x[0,2]
+Uprefix_3#1: %x[1,2]
+Uprefix_3#-1: %x[-1,2]
+Udowncase: %x[0,3]
+Udowncase#1: %x[1,3]
+Udowncase#-1: %x[-1,3]
+B
+    EOT
+    assert(@parser.template == template)
+  end
+  def test_tokens
+    assert( NERFeatures.tokens("A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH.")==
+           ["A", "new", "direct", "hemagglutination", "test", "(", "HI", "-", "GONAVIS", ")", "for", "urinary", "LH", "was", "compared", "with", "the", "serum", "radioimmuno", "-", "assay", "of", "LH", "."])
+  end
+  def test_text_features
+    assert(@parser.text_features("abCdE 1234") == [["abCdE",true, "abC", "abcde"], ["1234",false, "123", "1234"]])
+    assert(@parser.text_features("abCdE 1234",true) == [["abCdE",true, "abC", "abcde",1], ["1234",false, "123", "1234",2]])
+    assert(@parser.text_features("abCdE 1234",false) == [["abCdE",true, "abC", "abcde",0], ["1234",false, "123", "1234",0]])
+  end
+  def test_tagged_features
+    assert_equal(
+      [["phosphorilation",true, "pho", "phosphorilation", 0],
+        ["of",true, false, "of", 0],
+        ["GENE1",false, "GEN", "gene1", 1],
+        [".", false, false, ".", 0]],
+      @parser.tagged_features("phosphorilation of GENE1.",['GENE1']))
+      assert_equal(
+        [["GENE1",false, "GEN", "gene1", 1],
+          ["phosphorilation",true, "pho", "phosphorilation", 0]],
+      @parser.tagged_features("GENE1 phosphorilation",['GENE1']))
+    assert_equal(
+           [["phosphorilation",true, "pho", "phosphorilation", 0],
+            ["of",true, false, "of", 0],
+            ["GENE",true, "GEN", "gene", 1],
+            ["1",false, false, "1", 2],
+            [".", false, false, ".", 0]],
+      @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
+  end
+  def test_tagged_features_reverse
+    @parser.reverse = true
+    assert_equal(
+      [
+        ["GENE1",false, "GEN", "gene1", 1],
+        ["of",true, false, "of", 0],
+        ["phosphorilation",true, "pho", "phosphorilation", 0]
+    ],
+    @parser.tagged_features("phosphorilation of GENE1",['GENE1']))
+    assert_equal(
+          [
+            [".", false, false, ".", 0],
+            ["1",false, false, "1", 1],
+            ["GENE",true, "GEN", "gene", 2],
+            ["of",true, false, "of", 0],
+            ["phosphorilation",true, "pho", "phosphorilation", 0]
+        ],
+    @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
+  end
+  def test_default_config
+    require 'rbbt/bow/misc'
+    text =<<-EOF
+This text explains how MDM2 interacts with TP53.
+    EOF
+    @parser = NERFeatures.new Rbbt.share.rner["config.rb"].find
+    features = @parser.tagged_features text, %w(TP53 MDM2)
+    assert features.first.first == "This"
+  end
+  def __test_CRFPP_install
+    assert(require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP'))
+  end
+end

data/test/rbbt/nlp/genia/test_sentence_splitter.rb CHANGED Viewed

@@ -7,13 +7,37 @@ class TestNLP < Test::Unit::TestCase
 This is a sentence.
 A funky character ™ in a sentence.
 This is a sentence.
-This is a
+This is a broken
 sentence. This is
-another sentence.
+another broken sentence.
     EOF
-    assert_equal "This is a \nsentence.", NLP.geniass_sentence_splitter(text)[3]
+    iii NLP.geniass_sentence_splitter(text)
+    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
   end
+  def test_sentences_2
+    text =<<-EOF
+This is a sentence.
+This is a sentence.
+This is a broken
+sentence. This is
+another broken sentence.
+    EOF
+    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
+  end
+  def test_sentences_ext
+    text =<<-EOF
+This is a sentence.
+This is a sentence.
+This is a broken
+sentence. This is
+another broken sentence.
+    EOF
+    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
+  end
 end

data/test/rbbt/segment/test_annotation.rb CHANGED Viewed

@@ -12,18 +12,17 @@ class TestAnnotation < Test::Unit::TestCase
     segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
     annotation = SegmentAnnotation.setup(segment, :type => :verb)
-    assert_equal 'verb', annotation.annotid.split(":").last
+    assert_equal 'verb', annotation.annotid.split(":")[5]
     annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
-    assert_equal 'verb', annotation.annotid.split(":").last
+    assert_equal 'verb', annotation.annotid.split(":")[5]
   end
   def test_annotid
     text = "This is a document"
     Document.setup(text, "TEST", "test_doc1", nil)
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
     corpus.add_document(text)

data/test/rbbt/segment/test_encoding.rb CHANGED Viewed

@@ -2,7 +2,7 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helpe
 require 'rbbt/segment/encoding'
 class TestEncoding < Test::Unit::TestCase
-  def _test_bad_chars
+  def test_bad_chars
     text = "A funky character ™ in a sentence."
     assert_equal ["™"], Segment.bad_chars(text)