RubyGems - rbbt-text - Versions diffs - 1.1.9 → 1.3.3 - Mend

rbbt-text 1.1.9 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76)

checksums.yaml +4 -4
data/lib/rbbt/bow/bow.rb +5 -2
data/lib/rbbt/bow/dictionary.rb +27 -23
data/lib/rbbt/document.rb +56 -0
data/lib/rbbt/document/annotation.rb +45 -0
data/lib/rbbt/document/corpus.rb +61 -0
data/lib/rbbt/document/corpus/pubmed.rb +33 -0
data/lib/rbbt/ner/NER.rb +3 -3
data/lib/rbbt/ner/abner.rb +1 -1
data/lib/rbbt/ner/banner.rb +1 -1
data/lib/rbbt/ner/brat.rb +1 -1
data/lib/rbbt/ner/chemical_tagger.rb +1 -2
data/lib/rbbt/ner/g_norm_plus.rb +42 -12
data/lib/rbbt/ner/linnaeus.rb +3 -3
data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
data/lib/rbbt/ner/oscar3.rb +1 -2
data/lib/rbbt/ner/oscar4.rb +3 -3
data/lib/rbbt/ner/patterns.rb +5 -5
data/lib/rbbt/ner/regexpNER.rb +1 -2
data/lib/rbbt/ner/token_trieNER.rb +35 -22
data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
data/lib/rbbt/nlp/nlp.rb +5 -5
data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
data/lib/rbbt/nlp/spaCy.rb +52 -0
data/lib/rbbt/segment.rb +179 -0
data/lib/rbbt/segment/annotation.rb +58 -0
data/lib/rbbt/segment/encoding.rb +18 -0
data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -10
data/lib/rbbt/segment/overlaps.rb +63 -0
data/lib/rbbt/segment/range_index.rb +35 -0
data/lib/rbbt/segment/relationship.rb +7 -0
data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
data/lib/rbbt/segment/token.rb +23 -0
data/lib/rbbt/{text/segment → segment}/transformed.rb +10 -8
data/lib/rbbt/segment/tsv.rb +41 -0
data/share/install/software/Linnaeus +1 -1
data/share/install/software/OpenNLP +1 -1
data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
data/test/rbbt/document/test_annotation.rb +140 -0
data/test/rbbt/document/test_corpus.rb +33 -0
data/test/rbbt/ner/test_finder.rb +3 -3
data/test/rbbt/ner/test_g_norm_plus.rb +20 -3
data/test/rbbt/ner/test_patterns.rb +9 -9
data/test/rbbt/ner/test_regexpNER.rb +14 -14
data/test/rbbt/ner/test_rnorm.rb +3 -4
data/test/rbbt/ner/test_token_trieNER.rb +1 -0
data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
data/test/rbbt/segment/test_annotation.rb +39 -0
data/test/rbbt/segment/test_corpus.rb +36 -0
data/test/rbbt/segment/test_encoding.rb +24 -0
data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
data/test/rbbt/segment/test_overlaps.rb +69 -0
data/test/rbbt/segment/test_range_index.rb +42 -0
data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
data/test/rbbt/test_document.rb +14 -0
data/test/rbbt/test_segment.rb +182 -0
data/test/test_helper.rb +5 -3
data/test/test_spaCy.rb +32 -0
metadata +44 -32
data/lib/rbbt/text/corpus.rb +0 -106
data/lib/rbbt/text/corpus/document.rb +0 -361
data/lib/rbbt/text/corpus/document_repo.rb +0 -68
data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
data/lib/rbbt/text/document.rb +0 -39
data/lib/rbbt/text/segment.rb +0 -355
data/lib/rbbt/text/segment/docid.rb +0 -46
data/lib/rbbt/text/segment/relationship.rb +0 -24
data/lib/rbbt/text/segment/token.rb +0 -49
data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
data/test/rbbt/text/corpus/test_document.rb +0 -52
data/test/rbbt/text/segment/test_relationship.rb +0 -0
data/test/rbbt/text/segment/test_segmented.rb +0 -23
data/test/rbbt/text/test_corpus.rb +0 -34
data/test/rbbt/text/test_document.rb +0 -58
data/test/rbbt/text/test_segment.rb +0 -100

data/lib/rbbt/{text/segment → segment}/transformed.rb RENAMED

@@ -1,6 +1,3 @@
-require 'rbbt/util/misc'
-require 'rbbt/text/segment'
 module Transformed
   def self.transform(text, segments, replacement = nil, &block)
@@ -71,6 +68,10 @@ module Transformed
     segments = [segments] unless Array === segments
     orig_length = self.length
+    offset = self.respond_to?(:offset) ? self.offset.to_i : 0
+    segments = segments.select{|s| s.offset.to_i >= offset && s.offset.to_i <= offset + self.length - 1 }
     Segment.clean_sort(segments).each do |segment|
       next if segment.offset.nil?
@@ -89,7 +90,7 @@ module Transformed
       updated_text = self[updated_begin..updated_end]
       if updated_text.nil?
-        Log.warn "Range outside of segment: #{self.length} #{segment.locus} (#{updated_range})"
+        Log.warn "Range outside of segment: #{self.length} #{segment.range} (#{updated_range})"
         next
       end
@@ -122,13 +123,13 @@ module Transformed
   def fix_segment(segment, range, diff)
     case
       # Before
-    when segment.end < range.begin
+    when segment.eend < range.begin
       # After
     when segment.offset.to_i > range.end + diff
       segment.offset = segment.offset.to_i - diff
       # Includes
-    when (segment.offset.to_i <= range.begin and segment.end >= range.end + diff)
-      segment.replace self[segment.offset.to_i..segment.end - diff]
+    when (segment.offset.to_i <= range.begin and segment.eend >= range.end + diff)
+      segment.replace self[segment.offset.to_i..segment.eend - diff]
     else
       raise "Segment Overlaps"
     end
@@ -141,7 +142,8 @@ module Transformed
     if first_only
       @transformation_stack.pop.reverse.each do |id|
-        orig_range, diff, text, range = @transformed_segments.delete id
+        segment_info = @transformed_segments.delete id
+        orig_range, diff, text, range = segment_info
         new_range = (range.begin..range.last + diff)
         self[new_range] = text

data/lib/rbbt/segment/tsv.rb ADDED

@@ -0,0 +1,41 @@
+#module Segment
+#
+#  def self.set_tsv_fields(fields, segments)
+#    tsv_fields = []
+#    add_types = ! (fields.delete(:no_types) || fields.delete("no_types") || fields.include?(:JSON) || fields.include?("JSON"))
+#    literal = (fields.delete(:literal) || fields.delete("literal"))
+#    tsv_fields << "Start" << "End"
+#    tsv_fields << :annotation_types if add_types
+#    tsv_fields << :literal if literal
+#
+#    if fields.any? and not (fields == [:all] or fields == ["all"])
+#      tsv_fields.concat fields
+#    else
+#      tsv_fields.concat segments.first.annotations if segments.any?
+#    end
+#    tsv_fields
+#    tsv_fields.collect!{|f| f.to_s}
+#    tsv_fields.delete "offset"
+#    tsv_fields
+#  end
+#
+#  def self.tsv(segments, *fields)
+#    fields = set_tsv_fields fields, segments
+#    tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
+#
+#    segments.each do |segment|
+#      tsv[segment.segment_id] = self.tsv_values_for_segment(segment, fields)
+#    end
+#
+#    tsv
+#  end
+#
+#  def self.load_tsv(tsv)
+#    fields = tsv.fields
+#    tsv.with_unnamed do
+#      tsv.collect do |id, values|
+#        Annotated.load_tsv_values(id, values, fields)
+#      end
+#    end
+#  end
+#end

data/share/install/software/Linnaeus CHANGED

@@ -12,7 +12,7 @@ pkg_dir="`opt_dir \"$name\"`"
 build_dir=`build_dir`
 mv "$build_dir" "$pkg_dir"
 tmp_file="~/.rbbt/tmp/species-proxy-properties.tmp"
-mkdir -p $(basename "$tmp_file")
+mkdir -p $(dirname "$tmp_file")
 cat  "$pkg_dir/species-proxy/properties.conf" |grep -v "^.dir =" >> $tmp_file
 echo "\$dir = $pkg_dir/species-proxy/" > "$pkg_dir/species-proxy/properties.conf"
 cat $tmp_file | grep -v "^#" >>  "$pkg_dir/species-proxy/properties.conf"

data/share/install/software/OpenNLP CHANGED

@@ -1,7 +1,7 @@
 #!/bin/bash
 name='OpenNLP'
-url="http://apache.rediris.es/opennlp/opennlp-1.9.1/apache-opennlp-1.9.1-bin.tar.gz"
+url="http://apache.rediris.es/opennlp/opennlp-1.9.2/apache-opennlp-1.9.2-bin.tar.gz"
 get_src "$name" "$url"
 move_opt "$name"

data/test/rbbt/document/corpus/test_pubmed.rb ADDED

@@ -0,0 +1,15 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
+require 'rbbt/document'
+require 'rbbt/document/corpus'
+require 'rbbt/document/corpus/pubmed'
+class TestCorpusPubmed < Test::Unit::TestCase
+  def test_add_pmid
+    corpus = Document::Corpus.setup({})
+    document = corpus.add_pmid("32299157", :abstract).first
+    title = document.to(:title)
+    assert title.include?("COVID-19")
+  end
+end

data/test/rbbt/document/test_annotation.rb ADDED

@@ -0,0 +1,140 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
+require 'rbbt/document'
+require 'rbbt/document/corpus'
+require 'rbbt/segment'
+require 'rbbt/document/annotation'
+require 'rbbt/segment/named_entity'
+class TestAnnotation < Test::Unit::TestCase
+  class CalledOnce < Exception; end
+  def setup
+    Document.define :words do
+      self.split(" ")
+    end
+    $called_once = false
+    Document.define :persisted_words do
+      raise CalledOnce if $called_once
+      $called_once = true
+      self.split(" ")
+    end
+    Document.define_multiple :multiple_words do |list|
+      list.collect{|doc| doc.words}
+    end
+    Document.define :ner do
+      $called_once = true
+      self.split(" ").collect{|e| NamedEntity.setup(e, :code => Misc.digest(e)) }
+    end
+    Document.persist :ner
+  end
+  def test_define
+    text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
+    Document.setup(text, "TEST", "test_doc1", nil)
+    corpus = {}
+    Document::Corpus.setup corpus
+    corpus.add_document(text)
+    assert_equal text[text.words[1].range], text.words[1]
+  end
+  def test_define_multiple
+    text1 = "This sentence mentions the TP53 gene and the CDK5R1 protein"
+    text2 = "This is another sentence"
+    Document.setup(text1, "TEST", "test_doc1", nil)
+    Document.setup(text2, "TEST", "test_doc2", nil)
+    corpus = {}
+    Document::Corpus.setup corpus
+    corpus.add_document(text1)
+    corpus.add_document(text2)
+    assert_equal 2, Document.setup([text1, text2]).multiple_words.length
+    assert_equal text1.split(" "), text1.multiple_words
+    #Document.persist :multiple_words, :annotations, :annotation_repo => Rbbt.tmp.test.multiple_words
+    #assert_equal 2, Document.setup([text1, text2]).multiple_words.length
+    #assert_equal text1.split(" "), text1.multiple_words
+  end
+  def test_persist
+    text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
+    Document.setup(text, "TEST", "test_doc1", nil)
+    corpus = {}
+    Document::Corpus.setup corpus
+    corpus.add_document(text)
+    assert_equal "persisted_words", text.persisted_words.first.type
+    assert_raise CalledOnce do
+      assert_equal text[text.persisted_words[1].range], text.persisted_words[1]
+    end
+    Log.severity = 0
+    Document.persist :persisted_words, :annotations, :file => Rbbt.tmp.test.persisted_words.find(:user)
+    $called_once = false
+    text.persisted_words
+    assert $called_once
+    assert_nothing_raised  do
+      assert_equal text[text.persisted_words[1].range], text.persisted_words[1]
+    end
+  end
+  def test_persist_annotation_repo
+    text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
+    Document.setup(text, "TEST", "test_doc1", nil)
+    corpus = {}
+    Document::Corpus.setup corpus
+    corpus.add_document(text)
+    assert_equal "persisted_words", text.persisted_words.first.type
+    assert_raise CalledOnce do
+      assert_equal text[text.persisted_words[1].range], text.persisted_words[1]
+    end
+    Log.severity = 0
+    Document.persist :persisted_words, :annotations, :annotation_repo => Rbbt.tmp.test.persisted_words_repo.find(:user)
+    $called_once = false
+    text.persisted_words
+    assert $called_once
+    assert_nothing_raised  do
+      assert_equal text[text.persisted_words[1].range], text.persisted_words[1]
+    end
+  end
+  def test_persist_ner
+    text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
+    Document.setup(text, "TEST", "test_doc1", nil)
+    corpus = {}
+    Document::Corpus.setup corpus
+    corpus.add_document(text)
+    text.ner
+    $called_once = false
+    text.ner
+    assert ! $called_once
+    assert  text.ner.first.segid.include?("TEST:")
+  end
+end

data/test/rbbt/document/test_corpus.rb ADDED

@@ -0,0 +1,33 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
+require 'rbbt/document'
+require 'rbbt/document/corpus'
+class TestDocumentCorpus < Test::Unit::TestCase
+  def test_corpus
+    text = "This is a document"
+    Document.setup(text, "TEST", "test_doc1", nil)
+    corpus = Document::Corpus.setup({})
+    corpus.add_document(text)
+    docid = text.docid(corpus)
+    assert_equal docid.document, text
+  end
+  def test_find
+    text = "This is a document"
+    Document.setup(text, "TEST", "test_doc1", nil)
+    TmpFile.with_file do |path|
+      corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
+      corpus.extend Document::Corpus
+      corpus.add_document(text)
+      assert corpus.docids("TEST:").include?(text.docid)
+    end
+  end
+end

data/test/rbbt/ner/test_finder.rb CHANGED

@@ -8,13 +8,13 @@ require 'rbbt/sources/NCI'
 class TestFinder < Test::Unit::TestCase
-  def test_namespace_and_format
+  def _test_namespace_and_format
     f = Finder.new(CMD.cmd("head -n 1000", :in => Open.open(Organism.identifiers(Organism.default_code("Hsa")).produce.find)))
     assert_equal Organism.default_code("Hsa"), f.instances.first.namespace
     assert_equal "Ensembl Gene ID", f.instances.first.format
   end
-  def test_find
+  def _test_find
     f = Finder.new(Organism.lexicon(Organism.default_code("Hsa")), :grep => ["SF3B1"])
     assert_equal "ENSG00000115524", f.find("SF3B1").first
@@ -23,7 +23,7 @@ class TestFinder < Test::Unit::TestCase
     end
   end
-  def test_find2
+  def _test_find2
     f = Finder.new(Organism.lexicon(Organism.default_code("Hsa")), :grep => ["RASGRF2"])
     m = f.find("RAS").first

data/test/rbbt/ner/test_g_norm_plus.rb CHANGED

@@ -5,12 +5,29 @@ Log.severity = 0
 class TestGNormPlus < Test::Unit::TestCase
   def test_match
     text =<<-EOF
-We found that TP53 is regulated by MDM2 in Homo sapiens
-    EOF
+Introduction
+We found that TP53 is regulated by MDM2 in Homo
+sapiens
+    EOF
     mentions = GNormPlus.process({:file => text})
-    Log.tsv mentions
+    assert_equal 1, mentions.length
+    assert_equal 3, mentions["file"].length
+  end
+  def test_entities
+    text =<<-EOF
+We found that TP53 is regulated by MDM2 in Homo sapiens
+    EOF
+    mentions = GNormPlus.entities({:file => text})
+    assert mentions["file"].include?("TP53")
+    mentions["file"].each do |mention|
+      assert_equal mention, text[mention.range].sub("\n", ' ')
+    end
   end
 end

data/test/rbbt/ner/test_patterns.rb CHANGED

@@ -2,17 +2,17 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.r
 require 'rbbt/ner/patterns'
 class TestPatternRelExt < Test::Unit::TestCase
-  def test_simple_pattern
+  def _test_simple_pattern
     text = "Experiments have shown that TP53 interacts with CDK5 under certain conditions"
     gene1 = "TP53"
-    NamedEntity.setup(gene1, text.index(gene1), "Gene")
+    NamedEntity.setup(gene1, :offset => text.index(gene1), :entity_type => "Gene")
     gene2 = "CDK5"
-    NamedEntity.setup(gene2, text.index(gene2), "Gene")
+    NamedEntity.setup(gene2, :offset => text.index(gene2), :entity_type => "Gene")
     interaction = "interacts"
-    NamedEntity.setup(interaction, text.index(interaction), "Interaction")
+    NamedEntity.setup(interaction, :offset => text.index(interaction), :entity_type => "Interaction")
     Segmented.setup(text, [gene1, gene2, interaction])
@@ -23,13 +23,13 @@ class TestPatternRelExt < Test::Unit::TestCase
     text = "Experiments have shown that TP53 found in cultivated cells interacts with CDK5 under certain conditions"
     gene1 = "TP53"
-    NamedEntity.setup(gene1, text.index(gene1), "Gene")
+    NamedEntity.setup(gene1, :offset => text.index(gene1), :entity_type => "Gene")
     gene2 = "CDK5"
-    NamedEntity.setup(gene2, text.index(gene2), "Gene")
+    NamedEntity.setup(gene2, :offset => text.index(gene2), :entity_type => "Gene")
     interaction = "interacts"
-    NamedEntity.setup(interaction, text.index(interaction), "Interaction")
+    NamedEntity.setup(interaction, :offset => text.index(interaction), :entity_type => "Interaction")
     Segmented.setup(text, {:entities => [gene1, gene2, interaction]})
@@ -40,7 +40,7 @@ class TestPatternRelExt < Test::Unit::TestCase
       PatternRelExt.new(["NP[entity:Gene] VP[stem:interacts] with NP[entity:Gene]"]).match_sentences([text]).first.first
   end
-  def test_chunk_pattern
+  def _test_chunk_pattern
     text = "There is a concern with the use of thiazolidinediones in patients with an increased risk of colon cancer (e.g., familial colon polyposis)."
     drug = "thiazolidinediones"
@@ -57,7 +57,7 @@ class TestPatternRelExt < Test::Unit::TestCase
   end
-  def test_entities_with_spaces
+  def _test_entities_with_spaces
     PatternRelExt.new("NP[entity:Gene Name]").token_trie
   end

data/test/rbbt/ner/test_regexpNER.rb CHANGED

@@ -23,9 +23,9 @@ class TestRegExpNER < Test::Unit::TestCase
     matches = RegExpNER.match_regexp_hash(sentence, regexp_hash)
     assert_equal ["this", "this", "that"].sort, matches.sort
-    assert_equal "In ".length, matches.select{|m| m.type == :this}[0].offset
-    assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this}[1].offset
-    assert_equal :this, matches.select{|m| m.type == :this}[0].type
+    assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
+    assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
+    assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
   end
   def test_define_regexps
@@ -39,9 +39,9 @@ class TestRegExpNER < Test::Unit::TestCase
     matches = ner.entities(sentence)
     assert_equal ["this", "this", "that"].sort, matches.sort
-    assert_equal "In ".length, matches.select{|m| m.type == :this }[0].offset
-    assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this }[1].offset
-    assert_equal :this, matches.select{|m| m.type == :this }[0].type
+    assert_equal "In ".length, matches.select{|m| m.entity_type == :this }[0].offset
+    assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this }[1].offset
+    assert_equal :this, matches.select{|m| m.entity_type == :this }[0].entity_type
   end
@@ -51,9 +51,9 @@ class TestRegExpNER < Test::Unit::TestCase
     ner = RegExpNER.new({:this => /this/, :that => /that/})
     matches = ner.entities(sentence)
     assert_equal ["this", "this", "that"].sort, matches.sort
-    assert_equal "In ".length, matches.select{|m| m.type == :this}[0].offset
-    assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this}[1].offset
-    assert_equal :this, matches.select{|m| m.type == :this}[0].type
+    assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
+    assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
+    assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
     Segmented.setup(sentence)
     ner_this = RegExpNER.new({:this => /this/})
@@ -64,9 +64,9 @@ class TestRegExpNER < Test::Unit::TestCase
     matches = sentence.segments
     assert_equal ["this", "this", "that"].sort, matches.sort
-    assert_equal "In ".length, matches.select{|m| m.type == :this}[0].offset
-    assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this}[1].offset
-    assert_equal :this, matches.select{|m| m.type == :this}[0].type
+    assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
+    assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
+    assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
   end
   def test_entities_captures
@@ -75,8 +75,8 @@ class TestRegExpNER < Test::Unit::TestCase
     ner = RegExpNER.new({:this => /this/, :that => /that/, :should => /I (should)/})
     matches = ner.entities(sentence)
     assert_equal ["this", "this", "that", "should"].sort, matches.sort
-    assert_equal "In this sentence I ".length, matches.select{|m| m.type == :should}[0].offset
-    assert_equal :should, matches.select{|m| m.type == :should}[0].type
+    assert_equal "In this sentence I ".length, matches.select{|m| m.entity_type == :should}[0].offset
+    assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
   end