RubyGems - rbbt-text - Versions diffs - 1.3.0 → 1.3.5 - Mend

rbbt-text 1.3.0 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

checksums.yaml +4 -4
data/lib/rbbt/bow/bow.rb +5 -2
data/lib/rbbt/bow/dictionary.rb +27 -23
data/lib/rbbt/document.rb +20 -5
data/lib/rbbt/document/annotation.rb +7 -4
data/lib/rbbt/document/corpus.rb +30 -3
data/lib/rbbt/document/corpus/pubmed.rb +2 -1
data/lib/rbbt/ner/abner.rb +3 -2
data/lib/rbbt/ner/banner.rb +3 -1
data/lib/rbbt/ner/brat.rb +1 -1
data/lib/rbbt/ner/g_norm_plus.rb +7 -1
data/lib/rbbt/ner/linnaeus.rb +2 -1
data/lib/rbbt/ner/patterns.rb +0 -1
data/lib/rbbt/ner/rner.rb +229 -0
data/lib/rbbt/ner/token_trieNER.rb +32 -18
data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
data/lib/rbbt/nlp/spaCy.rb +195 -0
data/lib/rbbt/relationship.rb +24 -0
data/lib/rbbt/segment.rb +9 -4
data/lib/rbbt/segment/annotation.rb +3 -3
data/lib/rbbt/segment/named_entity.rb +7 -0
data/lib/rbbt/segment/range_index.rb +1 -1
data/lib/rbbt/segment/relationship.rb +7 -0
data/lib/rbbt/segment/transformed.rb +5 -1
data/share/install/software/OpenNLP +1 -1
data/share/rner/config.rb +51 -0
data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
data/test/rbbt/document/test_annotation.rb +15 -6
data/test/rbbt/document/test_corpus.rb +15 -1
data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
data/test/rbbt/ner/test_rner.rb +132 -0
data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
data/test/rbbt/segment/test_annotation.rb +3 -4
data/test/rbbt/segment/test_encoding.rb +1 -1
data/test/rbbt/segment/test_named_entity.rb +7 -5
data/test/rbbt/segment/test_range_index.rb +1 -2
data/test/rbbt/segment/test_transformed.rb +33 -4
data/test/rbbt/test_segment.rb +5 -10
data/test/test_spaCy.rb +144 -0
metadata +12 -3

data/test/rbbt/segment/test_named_entity.rb CHANGED

@@ -22,12 +22,14 @@ class TestClass < Test::Unit::TestCase
     assert_equal "SCORE", a.score
   end
-  def __test_tsv
+  def test_tsv
     a = "test"
-    NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
-    assert Segment.tsv([a]).fields.include? "code"
-    assert Segment.tsv([a], nil).fields.include? "code"
-    assert Segment.tsv([a], "literal").fields.include? "code"
+    NamedEntity.setup a, 10, "DocID", "TYPE", "CODE", "SCORE"
+    ppp Annotated.tsv([a,a])
+    assert Annotated.tsv([a]).fields.include? "code"
+    assert Annotated.tsv([a], nil).fields.include? "code"
+    assert Annotated.tsv([a], :all).fields.include? "code"
+    assert Annotated.tsv([a], :all).fields.include? "literal"
   end
   def __test_segment_brat

data/test/rbbt/segment/test_range_index.rb CHANGED

@@ -9,8 +9,7 @@ class TestRangeIndex < Test::Unit::TestCase
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
     corpus.add_document(text)

data/test/rbbt/segment/test_transformed.rb CHANGED

@@ -101,6 +101,35 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
     assert_equal "CDK5R1 protein", exp2
   end
+  def test_with_transform_sentences
+    a = "This first sentence mentions Bread. This sentence mentions the TP53 gene and the CDK5R1 protein"
+    original = a.dup
+    gene1 = "TP53"
+    gene1.extend NamedEntity
+    gene1.offset = a.index gene1
+    gene2 = "CDK5R1"
+    gene2.extend NamedEntity
+    gene2.offset = a.index gene2
+    bread = "Bread"
+    bread.extend NamedEntity
+    bread.offset = a.index bread
+    sentences = Segment.align(a, a.split(". "))
+    Transformed.with_transform(sentences[1], [gene1, gene2, bread], "GN") do
+      assert sentences[1].include?("GN gene and the GN protein")
+    end
+    Transformed.with_transform(sentences[0], [gene1, gene2, bread], "BR") do
+      assert sentences[0].include?("first sentence mentions BR")
+    end
+  end
   def test_html
     a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
@@ -115,7 +144,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
     gene2.entity_type = "Protein"
     Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
-      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
+      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
     end
   end
@@ -136,7 +165,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
     gene2.entity_type = "Protein"
     Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
-      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
+      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
     end
   end
@@ -156,9 +185,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
     assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
     Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
-      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the CDK5R1 protein", a
+      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
       Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
-        assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene'><span class='Entity' attr-entity-type='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
+        assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
       end
     end
   end

data/test/rbbt/test_segment.rb CHANGED

@@ -17,8 +17,7 @@ class TestSegment < Test::Unit::TestCase
     text = "This is a document"
     Document.setup(text, "TEST", "test_doc1", nil)
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
     corpus.add_document(text)
@@ -41,8 +40,7 @@ class TestSegment < Test::Unit::TestCase
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
     corpus.add_document(text)
@@ -65,8 +63,7 @@ class TestSegment < Test::Unit::TestCase
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
     corpus.add_document(text)
@@ -94,8 +91,7 @@ class TestSegment < Test::Unit::TestCase
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
     corpus.add_document(text)
@@ -142,8 +138,7 @@ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
     corpus.add_document(text)

data/test/test_spaCy.rb ADDED

@@ -0,0 +1,144 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '', 'test_helper.rb')
+require 'rbbt/nlp/spaCy'
+require 'rbbt/document/corpus'
+class TestSpaCy < Test::Unit::TestCase
+  def test_tokens
+    text = "I tell a story"
+    tokens = SpaCy.tokens(text)
+    assert_equal 4, tokens.length
+    assert_equal "tell", tokens[1].to_s
+  end
+  def test_chunks
+    text = "Miguel Vazquez tell a good story"
+    tokens = SpaCy.chunks(text)
+    assert_equal 2, tokens.length
+    assert_equal "Miguel Vazquez", tokens[0].to_s
+  end
+  def test_segments
+    text = "I tell a story. It's a very good story."
+    corpus = Document::Corpus.setup({})
+    Document.setup(text, "TEST", "test_doc1", "simple_sentence")
+    corpus.add_document text
+    text.corpus = corpus
+    segments = SpaCy.segments(text)
+    segments.each do |segment|
+      assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
+    end
+  end
+  def test_chunk_segments
+    text = "I tell a story. It's a very good story."
+    corpus = Document::Corpus.setup({})
+    Document.setup(text, "TEST", "test_doc1", "simple_sentence")
+    corpus.add_document text
+    text.corpus = corpus
+    segments = SpaCy.chunk_segments(text)
+    segments.each do |segment|
+      assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
+    end
+  end
+  def test_dep_graph
+    text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
+    graph = SpaCy.dep_graph(text, true)
+    tokens = SpaCy.segments(text)
+    index = Segment.index tokens
+    tf_s = tokens.select{|t| t == "TF" }.first
+    tg_s = tokens.select{|t| t == "TG" }.first
+    require 'rbbt/network/paths'
+    path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
+    path_tokens = path.collect do |segid|
+      range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
+      text[range]
+    end
+    assert path_tokens.include? 'increase'
+  end
+  def test_chunk_dep_graph
+    text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
+    graph = SpaCy.chunk_dep_graph(text, true)
+    tokens = SpaCy.chunk_segments(text)
+    index = Segment.index tokens
+    tf_s = tokens.select{|t| t.include? "TF" }.first
+    tg_s = tokens.select{|t| t.include? "TG" }.first
+    require 'rbbt/network/paths'
+    path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
+    path_tokens = path.collect do |segid|
+      range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
+      text[range]
+    end
+    assert path_tokens.include? 'increase'
+  end
+  def test_paths
+    text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
+    path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset =>  text.index("TG")))
+    path_tokens = path.collect do |segid|
+      range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
+      text[range]
+    end
+    ppp text
+    iii path_tokens
+    assert path_tokens.include? 'increase'
+  end
+  def test_paths2
+    text = "Deletion and domain swap experiments identified small, discreet positive and negative elements in A-Myb and TF that were required for the regulation of specific genes, such as DHRS2, TG, and mim-1"
+    path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset =>  text.index("TG")))
+    path_tokens = path.collect do |segid|
+      range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
+      text[range]
+    end
+    iii path_tokens
+    assert path_tokens.include? 'regulation'
+  end
+  def test_paths3
+    text = "Therefore, we speculate that PEA3 factors may contribute to the up-regulation of COX-2 expression resulting from both APC mutation and Wnt1 expression"
+    path = SpaCy.paths(text, *Segment.align(text,["PEA3", "Wnt1"]))
+    path_tokens = path.collect do |segid|
+      range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
+      text[range]
+    end
+  end
+end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-text
 version: !ruby/object:Gem::Version
-  version: 1.3.0
+  version: 1.3.5
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-04-18 00:00:00.000000000 Z
+date: 2021-06-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
@@ -95,6 +95,7 @@ files:
 - lib/rbbt/ner/oscar4.rb
 - lib/rbbt/ner/patterns.rb
 - lib/rbbt/ner/regexpNER.rb
+- lib/rbbt/ner/rner.rb
 - lib/rbbt/ner/rnorm.rb
 - lib/rbbt/ner/rnorm/cue_index.rb
 - lib/rbbt/ner/rnorm/tokens.rb
@@ -102,12 +103,15 @@ files:
 - lib/rbbt/nlp/genia/sentence_splitter.rb
 - lib/rbbt/nlp/nlp.rb
 - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
+- lib/rbbt/nlp/spaCy.rb
+- lib/rbbt/relationship.rb
 - lib/rbbt/segment.rb
 - lib/rbbt/segment/annotation.rb
 - lib/rbbt/segment/encoding.rb
 - lib/rbbt/segment/named_entity.rb
 - lib/rbbt/segment/overlaps.rb
 - lib/rbbt/segment/range_index.rb
+- lib/rbbt/segment/relationship.rb
 - lib/rbbt/segment/segmented.rb
 - lib/rbbt/segment/token.rb
 - lib/rbbt/segment/transformed.rb
@@ -124,6 +128,7 @@ files:
 - share/install/software/OpenNLP
 - share/install/software/StanfordParser
 - share/patterns/drug_induce_disease
+- share/rner/config.rb
 - share/rnorm/cue_default
 - share/rnorm/tokens_default
 - share/wordlists/stopwords
@@ -146,6 +151,7 @@ files:
 - test/rbbt/ner/test_oscar4.rb
 - test/rbbt/ner/test_patterns.rb
 - test/rbbt/ner/test_regexpNER.rb
+- test/rbbt/ner/test_rner.rb
 - test/rbbt/ner/test_rnorm.rb
 - test/rbbt/ner/test_token_trieNER.rb
 - test/rbbt/nlp/genia/test_sentence_splitter.rb
@@ -161,6 +167,7 @@ files:
 - test/rbbt/test_document.rb
 - test/rbbt/test_segment.rb
 - test/test_helper.rb
+- test/test_spaCy.rb
 homepage: http://github.com/mikisvaz/rbbt-util
 licenses: []
 metadata: {}
@@ -179,7 +186,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.0.6
+rubygems_version: 3.1.4
 signing_key:
 specification_version: 4
 summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
@@ -207,6 +214,7 @@ test_files:
 - test/rbbt/ner/test_banner.rb
 - test/rbbt/ner/test_token_trieNER.rb
 - test/rbbt/ner/test_finder.rb
+- test/rbbt/ner/test_rner.rb
 - test/rbbt/ner/test_linnaeus.rb
 - test/rbbt/ner/test_oscar4.rb
 - test/rbbt/test_segment.rb
@@ -217,4 +225,5 @@ test_files:
 - test/rbbt/segment/test_encoding.rb
 - test/rbbt/segment/test_range_index.rb
 - test/rbbt/segment/test_corpus.rb
+- test/test_spaCy.rb
 - test/test_helper.rb