RubyGems - rbbt-text - Versions diffs - 1.2.0 → 1.3.4 - Mend

rbbt-text 1.2.0 → 1.3.4

Files changed (76)

checksums.yaml +4 -4
data/lib/rbbt/bow/bow.rb +5 -2
data/lib/rbbt/bow/dictionary.rb +27 -23
data/lib/rbbt/document.rb +55 -0
data/lib/rbbt/document/annotation.rb +45 -0
data/lib/rbbt/document/corpus.rb +63 -0
data/lib/rbbt/document/corpus/pubmed.rb +33 -0
data/lib/rbbt/ner/NER.rb +3 -3
data/lib/rbbt/ner/abner.rb +1 -1
data/lib/rbbt/ner/banner.rb +1 -1
data/lib/rbbt/ner/brat.rb +1 -1
data/lib/rbbt/ner/chemical_tagger.rb +1 -2
data/lib/rbbt/ner/g_norm_plus.rb +26 -3
data/lib/rbbt/ner/linnaeus.rb +3 -3
data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
data/lib/rbbt/ner/oscar3.rb +1 -2
data/lib/rbbt/ner/oscar4.rb +3 -3
data/lib/rbbt/ner/patterns.rb +5 -5
data/lib/rbbt/ner/regexpNER.rb +1 -2
data/lib/rbbt/ner/token_trieNER.rb +35 -22
data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
data/lib/rbbt/nlp/nlp.rb +5 -5
data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
data/lib/rbbt/nlp/spaCy.rb +52 -0
data/lib/rbbt/segment.rb +179 -0
data/lib/rbbt/segment/annotation.rb +58 -0
data/lib/rbbt/segment/encoding.rb +18 -0
data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
data/lib/rbbt/segment/overlaps.rb +63 -0
data/lib/rbbt/segment/range_index.rb +35 -0
data/lib/rbbt/segment/relationship.rb +7 -0
data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
data/lib/rbbt/segment/token.rb +23 -0
data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
data/lib/rbbt/segment/tsv.rb +41 -0
data/share/install/software/Linnaeus +1 -1
data/share/install/software/OpenNLP +1 -1
data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
data/test/rbbt/document/test_annotation.rb +140 -0
data/test/rbbt/document/test_corpus.rb +33 -0
data/test/rbbt/ner/test_finder.rb +3 -3
data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
data/test/rbbt/ner/test_patterns.rb +9 -9
data/test/rbbt/ner/test_regexpNER.rb +14 -14
data/test/rbbt/ner/test_rnorm.rb +3 -4
data/test/rbbt/ner/test_token_trieNER.rb +1 -0
data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
data/test/rbbt/segment/test_annotation.rb +39 -0
data/test/rbbt/segment/test_corpus.rb +36 -0
data/test/rbbt/segment/test_encoding.rb +24 -0
data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
data/test/rbbt/segment/test_overlaps.rb +69 -0
data/test/rbbt/segment/test_range_index.rb +42 -0
data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
data/test/rbbt/test_document.rb +14 -0
data/test/rbbt/test_segment.rb +182 -0
data/test/test_helper.rb +5 -3
data/test/test_spaCy.rb +32 -0
metadata +44 -32
data/lib/rbbt/text/corpus.rb +0 -106
data/lib/rbbt/text/corpus/document.rb +0 -383
data/lib/rbbt/text/corpus/document_repo.rb +0 -68
data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
data/lib/rbbt/text/document.rb +0 -39
data/lib/rbbt/text/segment.rb +0 -363
data/lib/rbbt/text/segment/docid.rb +0 -46
data/lib/rbbt/text/segment/relationship.rb +0 -24
data/lib/rbbt/text/segment/token.rb +0 -49
data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
data/test/rbbt/text/corpus/test_document.rb +0 -82
data/test/rbbt/text/segment/test_relationship.rb +0 -0
data/test/rbbt/text/segment/test_segmented.rb +0 -23
data/test/rbbt/text/test_corpus.rb +0 -34
data/test/rbbt/text/test_document.rb +0 -58
data/test/rbbt/text/test_segment.rb +0 -100

data/test/rbbt/document/test_corpus.rb ADDED

@@ -0,0 +1,33 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
+require 'rbbt/document'
+require 'rbbt/document/corpus'
+class TestDocumentCorpus < Test::Unit::TestCase
+  def test_corpus
+    text = "This is a document"
+    Document.setup(text, "TEST", "test_doc1", nil)
+    corpus = Document::Corpus.setup({})
+    corpus.add_document(text)
+    docid = text.docid(corpus)
+    assert_equal docid.document, text
+  end
+  def test_find
+    text = "This is a document"
+    Document.setup(text, "TEST", "test_doc1", nil)
+    TmpFile.with_file do |path|
+      corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
+      corpus.extend Document::Corpus
+      corpus.add_document(text)
+      assert corpus.docids("TEST:").include?(text.docid)
+    end
+  end
+end

data/test/rbbt/ner/test_finder.rb CHANGED

@@ -8,13 +8,13 @@ require 'rbbt/sources/NCI'
 class TestFinder < Test::Unit::TestCase
-  def test_namespace_and_format
+  def _test_namespace_and_format
     f = Finder.new(CMD.cmd("head -n 1000", :in => Open.open(Organism.identifiers(Organism.default_code("Hsa")).produce.find)))
     assert_equal Organism.default_code("Hsa"), f.instances.first.namespace
     assert_equal "Ensembl Gene ID", f.instances.first.format
   end
-  def test_find
+  def _test_find
     f = Finder.new(Organism.lexicon(Organism.default_code("Hsa")), :grep => ["SF3B1"])
     assert_equal "ENSG00000115524", f.find("SF3B1").first
@@ -23,7 +23,7 @@ class TestFinder < Test::Unit::TestCase
     end
   end
-  def test_find2
+  def _test_find2
     f = Finder.new(Organism.lexicon(Organism.default_code("Hsa")), :grep => ["RASGRF2"])
     m = f.find("RAS").first

data/test/rbbt/ner/test_g_norm_plus.rb CHANGED

@@ -5,11 +5,29 @@ Log.severity = 0
 class TestGNormPlus < Test::Unit::TestCase
   def test_match
     text =<<-EOF
-We found that TP53 is regulated by MDM2 in Homo sapiens
+Introduction
+We found that TP53 is regulated by MDM2 in Homo
+sapiens
     EOF
     mentions = GNormPlus.process({:file => text})
-    Log.tsv mentions
+    assert_equal 1, mentions.length
+    assert_equal 3, mentions["file"].length
+  end
+  def test_entities
+    text =<<-EOF
+We found that TP53 is regulated by MDM2 in Homo sapiens
+    EOF
+    mentions = GNormPlus.entities({:file => text})
+    assert mentions["file"].include?("TP53")
+    mentions["file"].each do |mention|
+      assert_equal mention, text[mention.range].sub("\n", ' ')
+    end
   end
 end

data/test/rbbt/ner/test_patterns.rb CHANGED

@@ -2,17 +2,17 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.r
 require 'rbbt/ner/patterns'
 class TestPatternRelExt < Test::Unit::TestCase
-  def test_simple_pattern
+  def _test_simple_pattern
     text = "Experiments have shown that TP53 interacts with CDK5 under certain conditions"
     gene1 = "TP53"
-    NamedEntity.setup(gene1, text.index(gene1), "Gene")
+    NamedEntity.setup(gene1, :offset => text.index(gene1), :entity_type => "Gene")
     gene2 = "CDK5"
-    NamedEntity.setup(gene2, text.index(gene2), "Gene")
+    NamedEntity.setup(gene2, :offset => text.index(gene2), :entity_type => "Gene")
     interaction = "interacts"
-    NamedEntity.setup(interaction, text.index(interaction), "Interaction")
+    NamedEntity.setup(interaction, :offset => text.index(interaction), :entity_type => "Interaction")
     Segmented.setup(text, [gene1, gene2, interaction])
@@ -23,13 +23,13 @@ class TestPatternRelExt < Test::Unit::TestCase
     text = "Experiments have shown that TP53 found in cultivated cells interacts with CDK5 under certain conditions"
     gene1 = "TP53"
-    NamedEntity.setup(gene1, text.index(gene1), "Gene")
+    NamedEntity.setup(gene1, :offset => text.index(gene1), :entity_type => "Gene")
     gene2 = "CDK5"
-    NamedEntity.setup(gene2, text.index(gene2), "Gene")
+    NamedEntity.setup(gene2, :offset => text.index(gene2), :entity_type => "Gene")
     interaction = "interacts"
-    NamedEntity.setup(interaction, text.index(interaction), "Interaction")
+    NamedEntity.setup(interaction, :offset => text.index(interaction), :entity_type => "Interaction")
     Segmented.setup(text, {:entities => [gene1, gene2, interaction]})
@@ -40,7 +40,7 @@ class TestPatternRelExt < Test::Unit::TestCase
       PatternRelExt.new(["NP[entity:Gene] VP[stem:interacts] with NP[entity:Gene]"]).match_sentences([text]).first.first
   end
-  def test_chunk_pattern
+  def _test_chunk_pattern
     text = "There is a concern with the use of thiazolidinediones in patients with an increased risk of colon cancer (e.g., familial colon polyposis)."
     drug = "thiazolidinediones"
@@ -57,7 +57,7 @@ class TestPatternRelExt < Test::Unit::TestCase
   end
-  def test_entities_with_spaces
+  def _test_entities_with_spaces
     PatternRelExt.new("NP[entity:Gene Name]").token_trie
   end

data/test/rbbt/ner/test_regexpNER.rb CHANGED

@@ -23,9 +23,9 @@ class TestRegExpNER < Test::Unit::TestCase
     matches = RegExpNER.match_regexp_hash(sentence, regexp_hash)
     assert_equal ["this", "this", "that"].sort, matches.sort
-    assert_equal "In ".length, matches.select{|m| m.type == :this}[0].offset
-    assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this}[1].offset
-    assert_equal :this, matches.select{|m| m.type == :this}[0].type
+    assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
+    assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
+    assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
   end
   def test_define_regexps
@@ -39,9 +39,9 @@ class TestRegExpNER < Test::Unit::TestCase
     matches = ner.entities(sentence)
     assert_equal ["this", "this", "that"].sort, matches.sort
-    assert_equal "In ".length, matches.select{|m| m.type == :this }[0].offset
-    assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this }[1].offset
-    assert_equal :this, matches.select{|m| m.type == :this }[0].type
+    assert_equal "In ".length, matches.select{|m| m.entity_type == :this }[0].offset
+    assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this }[1].offset
+    assert_equal :this, matches.select{|m| m.entity_type == :this }[0].entity_type
   end
@@ -51,9 +51,9 @@ class TestRegExpNER < Test::Unit::TestCase
     ner = RegExpNER.new({:this => /this/, :that => /that/})
     matches = ner.entities(sentence)
     assert_equal ["this", "this", "that"].sort, matches.sort
-    assert_equal "In ".length, matches.select{|m| m.type == :this}[0].offset
-    assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this}[1].offset
-    assert_equal :this, matches.select{|m| m.type == :this}[0].type
+    assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
+    assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
+    assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
     Segmented.setup(sentence)
     ner_this = RegExpNER.new({:this => /this/})
@@ -64,9 +64,9 @@ class TestRegExpNER < Test::Unit::TestCase
     matches = sentence.segments
     assert_equal ["this", "this", "that"].sort, matches.sort
-    assert_equal "In ".length, matches.select{|m| m.type == :this}[0].offset
-    assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this}[1].offset
-    assert_equal :this, matches.select{|m| m.type == :this}[0].type
+    assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
+    assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
+    assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
   end
   def test_entities_captures
@@ -75,8 +75,8 @@ class TestRegExpNER < Test::Unit::TestCase
     ner = RegExpNER.new({:this => /this/, :that => /that/, :should => /I (should)/})
     matches = ner.entities(sentence)
     assert_equal ["this", "this", "that", "should"].sort, matches.sort
-    assert_equal "In this sentence I ".length, matches.select{|m| m.type == :should}[0].offset
-    assert_equal :should, matches.select{|m| m.type == :should}[0].type
+    assert_equal "In this sentence I ".length, matches.select{|m| m.entity_type == :should}[0].offset
+    assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
   end

data/test/rbbt/ner/test_rnorm.rb CHANGED

@@ -27,10 +27,9 @@ S000000376	AAA	GENE1	DDD
      assert_equal(["S000000029"], @norm.match("FUN21"))
      assert_equal(["S000000030", "S000000029", "S000000031"].sort, @norm.match("FUN").sort)
      assert_equal(["S000000030", "S000000029", "S000000031"].sort, @norm.match("FUN 2").sort)
-     assert_equal(["S000000030", "S000000029", "S000000031"].sort, @norm.match("FUN 21").sort)
-     assert_equal([], @norm.match("GER4"))
-     @norm.match("FUN21")
+     assert_equal(["S000000029"].sort, @norm.match("FUN 21").sort)
+     assert_equal([], @norm.match("Non-sense"))
+     assert_equal(["S000000029", "S000000374"], @norm.match("GER4"))
   end
   def test_select

data/test/rbbt/ner/test_token_trieNER.rb CHANGED

@@ -74,6 +74,7 @@ C2;11;22;3 3;bb
       index = TokenTrieNER.new("test", TSV.open(file, :flat, :sep => ';'))
       assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
+      assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
     end
   end

data/test/rbbt/nlp/genia/test_sentence_splitter.rb CHANGED

@@ -1,9 +1,43 @@
 require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
 require 'rbbt/nlp/genia/sentence_splitter'
-class TestClass < Test::Unit::TestCase
-  def test_true
-    assert true
+class TestNLP < Test::Unit::TestCase
+  def test_sentences
+    text =<<-EOF
+This is a sentence.
+A funky character ™ in a sentence.
+This is a sentence.
+This is a broken
+sentence. This is
+another broken sentence.
+    EOF
+    iii NLP.geniass_sentence_splitter(text)
+    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
+  end
+  def test_sentences_2
+    text =<<-EOF
+This is a sentence.
+This is a sentence.
+This is a broken
+sentence. This is
+another broken sentence.
+    EOF
+    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
+  end
+  def test_sentences_ext
+    text =<<-EOF
+This is a sentence.
+This is a sentence.
+This is a broken
+sentence. This is
+another broken sentence.
+    EOF
+    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
   end
 end

data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb CHANGED

@@ -1,6 +1,6 @@
 require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
 require 'rbbt/nlp/open_nlp/sentence_splitter'
-require 'rbbt/ner/segment'
+require 'rbbt/segment'
 $text=<<-EOF
 Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors
@@ -22,6 +22,22 @@ class TestClass < Test::Unit::TestCase
   def test_sentences
     text =<<-EOF
 This is a sentence.
+No funky character in this sentence.
+This is a sentence.
+This is a
+sentence. This is
+another sentence.
+    EOF
+    assert_equal 5, OpenNLP.sentence_split_detector.sentDetect(text).length
+    assert_equal 5, OpenNLP.sentence_splitter(text).length
+    assert_equal "This is a \nsentence.", OpenNLP.sentence_splitter(text)[3]
+  end
+  def test_sentences_fix_utf8
+    text =<<-EOF
+This is a sentence.
 A funky character ™ in a sentence.
 This is a sentence.
 This is a
@@ -35,12 +51,12 @@ another sentence.
     assert_equal "This is a \nsentence.", OpenNLP.sentence_splitter(text)[3]
   end
-  def _test_text_sentences
+  def test_text_sentences
     Misc.benchmark(100) do
-      OpenNLP.sentence_splitter($text).include? "Our
+      assert OpenNLP.sentence_splitter($text).include?("Our
 findings highlight the role of SMARCA4 in the pathogenesis of SMARCB1-positive
 AT/RT and the usefulness of antibodies directed against SMARCA4 in this
-diagnostic setting."
+diagnostic setting.")
     end
   end
 end

data/test/rbbt/segment/test_annotation.rb ADDED

@@ -0,0 +1,39 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
+require 'rbbt/document'
+require 'rbbt/document/corpus'
+require 'rbbt/segment'
+require 'rbbt/segment/annotation'
+class TestAnnotation < Test::Unit::TestCase
+  def test_annotation
+    text = "This is a document"
+    Document.setup(text, "TEST", "test_doc1", nil)
+    segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
+    annotation = SegmentAnnotation.setup(segment, :type => :verb)
+    assert_equal 'verb', annotation.annotid.split(":")[5]
+    annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
+    assert_equal 'verb', annotation.annotid.split(":")[5]
+  end
+  def test_annotid
+    text = "This is a document"
+    Document.setup(text, "TEST", "test_doc1", nil)
+    corpus = Document::Corpus.setup({})
+    corpus.add_document(text)
+    segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
+    annotation = SegmentAnnotation.setup(segment, :type => :verb)
+    annotid = annotation.annotid(corpus)
+    assert_equal 'verb', annotid.type
+    assert_equal 'verb', annotid.annotation.type
+    assert_equal 'is', annotid.annotation
+  end
+end

data/test/rbbt/segment/test_corpus.rb ADDED

@@ -0,0 +1,36 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
+require 'rbbt/document'
+require 'rbbt/document/corpus'
+require 'rbbt/segment'
+require 'rbbt/segment/corpus'
+class TestSegmentCorpus < Test::Unit::TestCase
+  def test_corpus
+    text = "This is a document"
+    Document.setup(text, "TEST", "test_doc1", nil)
+    corpus = {}
+    corpus.extend Document::Corpus
+    corpus.add_document(text)
+    docid = text.docid(corpus)
+    assert_equal docid.document, text
+  end
+  def test_find
+    text = "This is a document"
+    Document.setup(text, "TEST", "test_doc1", nil)
+    TmpFile.with_file do |path|
+      corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
+      corpus.extend Document::Corpus
+      corpus.add_document(text)
+      assert corpus.prefix("TEST:").include?(text.docid)
+    end
+  end
+end

data/test/rbbt/segment/test_encoding.rb ADDED

@@ -0,0 +1,24 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
+require 'rbbt/segment/encoding'
+class TestEncoding < Test::Unit::TestCase
+  def test_bad_chars
+    text = "A funky character ™ in a sentence."
+    assert_equal ["™"], Segment.bad_chars(text)
+  end
+  def test_ascii
+    text = "A funky character ™ in a sentence."
+    Segment.ascii(text) do
+      assert_equal "A funky character ? in a sentence.",  text
+    end
+    Segment.ascii(text, "NONASCII") do
+      assert_equal "A funky character NONASCII in a sentence.",  text
+    end
+    assert_equal "A funky character ™ in a sentence.",  text
+  end
+end

data/test/rbbt/{text/segment → segment}/test_named_entity.rb RENAMED

@@ -1,6 +1,6 @@
-require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
-require 'rbbt/text/segment'
-require 'rbbt/text/segment/named_entity'
+require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
+require 'rbbt/segment'
+require 'rbbt/segment/named_entity'
 class TestClass < Test::Unit::TestCase
   def test_info
@@ -15,35 +15,39 @@ class TestClass < Test::Unit::TestCase
   def test_all_args
     a = "test"
-    NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
+    NamedEntity.setup a, 10, "TEST:doc1:test_type:hash", "NamedEntity", "TYPE", "CODE", "SCORE"
     assert_equal 10, a.offset
+    assert_equal "NamedEntity", a.type
+    assert_equal "TYPE", a.entity_type
+    assert_equal "SCORE", a.score
   end
   def test_tsv
     a = "test"
     NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
-    assert Segment.tsv([a]).fields.include? "code"
-    assert Segment.tsv([a], nil).fields.include? "code"
-    assert Segment.tsv([a], "literal").fields.include? "code"
+    assert Annotated.tsv([a]).fields.include? "code"
+    assert Annotated.tsv([a], nil).fields.include? "code"
+    assert Annotated.tsv([a], :all).fields.include? "code"
+    assert Annotated.tsv([a], :all).fields.include? "literal"
   end
-  def test_segment_brat
+  def __test_segment_brat
     a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     gene1 = "TP53"
     gene1.extend NamedEntity
     gene1.offset = a.index gene1
-    gene1.type = "Gene"
+    gene1.entity_type = "Gene"
     gene2 = "CDK5R1"
     gene2.extend NamedEntity
     gene2.offset = a.index gene2
-    gene2.type = "Gene"
+    gene2.entity_type = "Gene"
     gene3 = "TP53 gene"
     gene3.extend NamedEntity
     gene3.offset = a.index gene3
-    gene3.type = "Gene"
+    gene3.entity_type = "Gene"
     segments = [gene1, gene2, gene3]
     assert segments.collect{|s| s.to_brat}.include? "Gene 27 35"