RubyGems - rbbt-text - Versions diffs - 1.3.4 → 1.3.5 - Mend

rbbt-text 1.3.4 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

checksums.yaml +4 -4
data/lib/rbbt/document.rb +6 -0
data/lib/rbbt/document/annotation.rb +2 -2
data/lib/rbbt/document/corpus.rb +5 -3
data/lib/rbbt/document/corpus/pubmed.rb +1 -0
data/lib/rbbt/ner/abner.rb +3 -2
data/lib/rbbt/ner/banner.rb +3 -1
data/lib/rbbt/ner/brat.rb +1 -1
data/lib/rbbt/ner/linnaeus.rb +2 -1
data/lib/rbbt/ner/rner.rb +229 -0
data/lib/rbbt/ner/token_trieNER.rb +2 -1
data/lib/rbbt/nlp/spaCy.rb +158 -15
data/lib/rbbt/relationship.rb +24 -0
data/lib/rbbt/segment.rb +3 -0
data/lib/rbbt/segment/named_entity.rb +4 -0
data/lib/rbbt/segment/range_index.rb +1 -1
data/share/rner/config.rb +51 -0
data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
data/test/rbbt/document/test_annotation.rb +10 -1
data/test/rbbt/document/test_corpus.rb +14 -0
data/test/rbbt/ner/test_rner.rb +132 -0
data/test/rbbt/segment/test_named_entity.rb +2 -1
data/test/rbbt/segment/test_transformed.rb +4 -4
data/test/test_spaCy.rb +113 -1
metadata +8 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 496288d7d3ff1215ded1fd210192d5887a6a071eea5f322295a669a5d648d77b
-  data.tar.gz: 47996496009cbcdaab38a9dc9bf6efbbe7fc0145f315b0a48bfab0f543742f94
+  metadata.gz: '0846f900d745dd27df8006eecbc9d294f9f38a23dd76001de2a5dc0313db7e22'
+  data.tar.gz: 675985882a6c8b9813f620d7ef0a555efa5c148c7c2fe36e0030f84f3fd88cf0
 SHA512:
-  metadata.gz: 36e7415ad06207066844a30001c8541865f066d1e83a4a2ddc5182c54b704cd3d442cbccce219bd2114717a83656d07558c42725eca75597fea239b6e13244ab
-  data.tar.gz: 988eff4d242d0425910b96fac4188df079c8c53c3abea2825cc97d5af5118841680705fa33461a5b4cfa7b8d6b32a486465e44b75f20fad324e4623c6c8083d8
+  metadata.gz: dfd9c333b94181496134b825c63d6e93a0390f81d426526f79c00cf12556021b60004b29b57ca9b0b274141937027f7bc780552a60de007e5f790b19910354c0
+  data.tar.gz: 205beeb8829c8358fd29c0a18351522e566106e24220af3d7bec3676694d37d682b92243e4fd4cd495b542f9945a28cf8585e587342672d31779d0b21b53ae4e

data/lib/rbbt/document.rb CHANGED Viewed

@@ -53,3 +53,9 @@ module Document
   alias id docid
 end
+#class String
+#  def docid
+#    digest = Misc.digest(self)
+#    ["STRING", digest, nil, nil] * ":"
+#  end
+#end

data/lib/rbbt/document/annotation.rb CHANGED Viewed

@@ -13,7 +13,7 @@ module Document
       end
       docid = self.docid
-      segments.each{|s| s.docid = docid if s.docid.nil? }
+      segments.each{|s| s.docid = docid }
       segments
     end
@@ -36,7 +36,7 @@ module Document
         docid = document.docid
-        segments.each{|s| s.docid = docid if s.docid.nil? }
+        segments.each{|s| s.docid = docid }
         segments
       end

data/lib/rbbt/document/corpus.rb CHANGED Viewed

@@ -3,6 +3,7 @@ require 'rbbt-util'
 module Document::Corpus
   def self.setup(corpus)
+    corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
     corpus.extend Document::Corpus unless Document::Corpus === corpus
     corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
     corpus
@@ -16,7 +17,8 @@ module Document::Corpus
     end
   end
-  def docids(prefix)
+  def docids(*prefix)
+    prefix = prefix * ":"
     prefix += ":" unless prefix == :all || prefix[-1] == ":"
     docids = self.read_and_close do
       prefix == :all ? self.keys : self.prefix(prefix)
@@ -24,8 +26,8 @@ module Document::Corpus
     DocID.setup(docids, :corpus => self)
   end
-  def documents(prefix)
-    self.docids(prefix).document
+  def documents(*prefix)
+    self.docids(*prefix).document
   end
   def [](*args)

data/lib/rbbt/document/corpus/pubmed.rb CHANGED Viewed

@@ -16,6 +16,7 @@ module Document::Corpus
                  end
       Log.debug "Loading pmid #{pmid}"
       add_document(document)
+      document
     end
     Document.setup(res)

data/lib/rbbt/ner/abner.rb CHANGED Viewed

@@ -39,14 +39,15 @@ class Abner < NER
     types = res[1]
     strings = res[0]
+    docid = Misc.digest(text)
     global_offset = 0
     strings.zip(types).collect do |mention, type|
       mention = mention.to_s;
       offset = text.index(mention)
       if offset.nil?
-        NamedEntity.setup(mention, nil, type.to_s)
+        NamedEntity.setup(mention, :docid => docid, :entity_type => type)
       else
-        NamedEntity.setup(mention, offset + global_offset, type.to_s)
+        NamedEntity.setup(mention, :offset => offset + global_offset, :docid => docid, :entity_type => type.to_s)
         text = text[offset + mention.length..-1]
         global_offset += offset + mention.length
       end

data/lib/rbbt/ner/banner.rb CHANGED Viewed

@@ -55,6 +55,7 @@ class Banner < NER
   # text.
   def match(text)
     return [] if text.nil?
+    text = text.dup if text.frozen?
     text.gsub!(/\n/,' ')
     text.gsub!(/\|/,'/') # Character | gives an error
     return [] if text.strip.empty?
@@ -66,6 +67,7 @@ class Banner < NER
     @parenPP.postProcess(sentence)
     tagged = sentence.getSGML
+    docid = Misc.digest text
     res = tagged.scan(/<GENE>.*?<\/GENE>/).
       collect{|r|
       r.match(/<GENE>(.*?)<\/GENE>/)
@@ -73,7 +75,7 @@ class Banner < NER
       mention.sub!(/^\s*/,'')
       mention.sub!(/\s*$/,'')
       offset = text.index(mention)
-      NamedEntity.setup(mention, offset, 'GENE')
+      NamedEntity.setup(mention, :offset => offset, :docid => docid, :entity_type => 'GENE')
       mention
     }
     res

data/lib/rbbt/ner/brat.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 require 'rbbt/segment/named_entity'
-require 'rbbt/text/segment/relationship'
+require 'rbbt/relationship'
 module Brat
   Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"

data/lib/rbbt/ner/linnaeus.rb CHANGED Viewed

@@ -31,7 +31,8 @@ module Linnaeus
     init unless defined? @@Matcher
     @@Matcher.match(text).toArray().collect do |mention|
-      NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => mention.ids(), :score => mention.probabilities())
+      best_id, best_prob = mention.ids().zip(mention.probabilities()).sort_by{|i,p| p.to_f }.last
+      NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => best_id, :score => best_prob)
     end
   end
 end

data/lib/rbbt/ner/rner.rb ADDED Viewed

@@ -0,0 +1,229 @@
+require 'rbbt'
+require 'rbbt/util/open'
+require 'rbbt/util/misc'
+require 'rbbt/util/simpleDSL'
+class NERFeatures
+  include SimpleDSL
+  def self.tokens(text)
+    text.scan(/
+              \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
+              \w-\w*|
+              \w+-[A-Z](?!\w)|
+              \w+|
+              [.,()\/\[\]{}'"+-]
+              /x)
+  end
+  def self.reverse(text)
+    tokens(text).reverse.join(" ")
+  end
+  def define(name, *args, &block)
+    action = args[0] || block ||  /#{name.to_s}s?/i
+    raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
+    @types[name.to_s] = action
+    @order.push name.to_s
+    name.to_s
+  end
+  attr_accessor :reverse
+  def initialize(file = nil, reverse = false, &block)
+    @types   = {}
+    @order   = []
+    @context = []
+    @reverse = reverse
+    file ||= Rbbt.share.ner['config.rb'].find if !file && !block
+    parse(:define, file, &block)
+  end
+  def config
+    @config[:define]
+  end
+  def window(positions)
+    @window = positions
+  end
+  def context(name, &block)
+    if name.is_a? Array
+      @context += name
+    else
+      @context.push name
+      # The block might be wrongly assigned to this function
+      # instead of the actual definition, fix that.
+      if block
+        @types[name] = block
+      end
+    end
+  end
+  def direction(dir)
+    if dir.to_sym == :reverse
+      @reverse = true
+    end
+  end
+  def features(word)
+    values = [word]
+    @order.each{|features|
+      action = @types[features]
+      if action.is_a?(Proc)
+        values.push(action.call(word))
+      else
+        m = action.match(word)
+        if m
+          if m[1]
+            values.push(m[1])
+          else
+            values.push(m != nil)
+          end
+        else
+          values.push(false)
+        end
+      end
+    }
+    values
+  end
+  def template(window=nil)
+    window ||= @window || [1,-1]
+    template = ""
+    i = 1
+    @order.each{|feat|
+      template += "U#{ feat }: %x[0,#{ i }]\n"
+      if @context.include?(feat)
+        window.each{|p|
+          template += "U#{ feat }##{ p}: %x[#{ p },#{ i }]\n"
+        }
+      end
+      i += 1
+    }
+    template += "B\n"
+    template
+  end
+  def text_features(text, positive = nil)
+    text = self.class.reverse(text) if @reverse
+    initial = true
+    self.class.tokens(text).collect{|token|
+      features = features(token)
+      if !positive.nil?
+        features << (positive ? (initial ? 1 : 2) : 0)
+        initial = false
+      end
+      features
+    }
+  end
+  def tagged_features(text, mentions)
+    mentions ||= []
+    mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?
+    re = mentions.collect{|mention|
+      Regexp.quote(mention.gsub(/\s+/,' ')).sub(/\\s/,'\s+')
+    }.join("|")
+    positive = false
+    features = []
+    chunks = text.split(/(#{re})/)
+    chunks.each{|t|
+      chunk_features = text_features(t, positive)
+      positive = !positive
+      if @reverse
+        features = chunk_features + features
+      else
+        features = features + chunk_features
+      end
+    }
+    features
+  end
+  def train(features, model)
+    tmp_template = TmpFile.tmp_file("template-")
+    Open.write(tmp_template,template)
+    cmd = "#{File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')} '#{tmp_template}'  '#{features}' '#{model}'"
+    system cmd
+    Open.write(model + '.config',config)
+    FileUtils.rm tmp_template
+  end
+end
+class NER
+  def initialize(model = nil)
+    begin
+      require 'CRFPP'
+    rescue Exception
+      require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
+    end
+    model ||= File.join(Rbbt.datadir, + 'ner/model/BC2')
+    @parser = NERFeatures.new(model + '.config')
+    @reverse = @parser.reverse
+    @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
+  end
+  def extract(text)
+    features = @parser.text_features(text)
+    @tagger.clear
+    features.each{|feats|
+      @tagger.add(feats.join(" "))
+    }
+    @tagger.parse
+    found = []
+    mention = []
+    @tagger.size.times{|i|
+      label = @tagger.y(i)
+      word  = @tagger.x(i,0)
+      if word == ')'
+        mention.push(')') if mention.join =~ /\(/
+        next
+      end
+      case label
+      when 1
+        if mention.any? && ( mention.join(" ").is_special? || mention.select{|m| m.is_special?}.any?)
+          found.push(mention)
+          mention = []
+        end
+        mention.push(word)
+      when 2
+        mention.push(word)
+      when 0
+        found.push(mention) if mention.any?
+        mention = []
+      end
+    }
+    found << mention if mention.any?
+    found.collect{|list|
+      list = list.reverse if @reverse
+      list.join(" ")
+    }
+  end
+end

data/lib/rbbt/ner/token_trieNER.rb CHANGED Viewed

@@ -249,7 +249,8 @@ class TokenTrieNER < NER
       match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
     }
-    NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
+    type = type.first
+    NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes, :type => type)
   end
   attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem

data/lib/rbbt/nlp/spaCy.rb CHANGED Viewed

@@ -2,30 +2,55 @@ require 'rbbt/segment'
 require 'rbbt/document'
 require 'rbbt/segment/annotation'
 require 'rbbt/util/python'
+require 'rbbt/network/paths'
 module SpaCy
-  PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
+  TOKEN_PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
+  CHUNK_PROPERTIES = %w(lemma_)
-  def self.tokens(text, lang = 'en')
+  def self.nlp(lang = 'en_core_web_md')
+    @@nlp ||= {}
+    @@nlp[lang] ||= RbbtPython.run :spacy do
+      spacy.load(lang)
+    end
+  end
+  def self.tokens(text, lang = 'en_core_web_sm')
     tokens = []
-    RbbtPython.run 'spacy' do
-      nlp = spacy.load(lang)
-      doc = nlp.call(text)
-      doc.__len__.times do |i|
-        tokens << doc.__getitem__(i)
-      end
+    nlp = nlp(lang)
+    doc = nlp.call(text)
+    doc.__len__.times do |i|
+      tokens << doc.__getitem__(i)
+    end
+    tokens
+  end
+  def self.chunks(text, lang = 'en_core_web_sm')
+    tokens = []
+    nlp = nlp(lang)
+    doc = nlp.call(text)
+    chunks = doc.noun_chunks.__iter__
+    RbbtPython.iterate chunks do |item|
+      tokens << item
     end
     tokens
   end
-  def self.segments(text, lang = 'en')
-    docid = text.docid if Document === text
+  def self.segments(text, lang = 'en_core_web_sm')
+    docid = text.docid if Document === text
     corpus = text.corpus if Document === text
     tokens = self.tokens(text, lang).collect do |token|
       info = {}
-      PROPERTIES.each do |p|
+      TOKEN_PROPERTIES.each do |p|
         info[p] = token.instance_eval(p.to_s)
       end
       info[:type] = "SpaCy"
@@ -35,7 +60,120 @@ module SpaCy
       info[:corpus] = corpus if corpus
       SpaCyToken.setup(token.text, info)
     end
-    SpaCyToken.setup(tokens, :corpus => corpus)
+    tokens
+  end
+  def self.chunk_segments(text, lang = 'en_core_web_sm')
+    docid = text.docid if Document === text
+    corpus = text.corpus if Document === text
+    chunks = self.chunks(text, lang).collect do |chunk|
+      info = {}
+      CHUNK_PROPERTIES.each do |p|
+        info[p] = chunk.instance_eval(p.to_s)
+      end
+      start = eend =  nil
+      deps = []
+      RbbtPython.iterate chunk.__iter__ do |token|
+        start = token.idx if start.nil?
+        eend = start + chunk.text.length if eend.nil?
+        deps << token.idx.to_s + ":" + token.dep_ + "->" + token.head.idx.to_s if token.head.idx < start || token.head.idx > eend
+      end
+      info[:type] = "SpaCy"
+      info[:offset] = chunk.__iter__.__next__.idx
+      info[:dep] = deps * ";"
+      info[:docid] = docid if docid
+      info[:corpus] = corpus if corpus
+      SpaCySpan.setup(chunk.text, info)
+    end
+    chunks
+  end
+  def self.dep_graph(text, reverse = false, lang = 'en_core_web_md')
+    tokens = self.segments(text, lang)
+    index = Segment.index(tokens)
+    associations = {}
+    tokens.each do |token|
+      type, target_pos = token.dep.split("->")
+      target_tokens = index[target_pos.to_i]
+      associations[token.segid] = target_tokens
+    end
+    if reverse
+      old = associations.dup
+      old.each do |s,ts|
+        ts.each do |t|
+          associations[t] ||= []
+          associations[t] += [s] unless associations[t].include?(s)
+        end
+      end
+    end
+    associations
+  end
+  def self.chunk_dep_graph(text, reverse = false, lang = 'en_core_web_md')
+    associations = dep_graph(text, false, lang)
+    chunks = self.chunk_segments(text, lang)
+    tokens = self.segments(text, lang)
+    index = Segment.index(tokens + chunks)
+    chunks.each do |chunk|
+      target_token_ids = chunk.dep.split(";").collect do|dep|
+        type, target_pos = dep.split("->")
+        index[target_pos.to_i]
+      end.flatten
+      target_tokens = target_token_ids.collect do |target_token_id|
+        range = Range.new(*target_token_id.split(":").last.split("..").map(&:to_i))
+        range.collect do |pos|
+          index[pos]
+        end.uniq
+      end.flatten
+      associations[chunk.segid] = target_tokens
+    end
+    if reverse
+      old = associations.dup
+      old.each do |s,ts|
+        ts.each do |t|
+          associations[t] ||= []
+          associations[t] += [s] unless associations[t].include?(s)
+        end
+      end
+    end
+    associations
+  end
+  def self.paths(text, source, target, reverse = true, lang = 'en_core_web_md')
+    graph = SpaCy.chunk_dep_graph(text, reverse, lang)
+    chunk_index = Segment.index(SpaCy.chunk_segments(text, lang))
+    source_id = chunk_index[source.offset].first || source.segid
+    target_id = chunk_index[target.offset].first || target.segid
+    path = Paths.dijkstra(graph, source_id, [target_id])
+    return nil if path.nil?
+    path.reverse
+  end
+  def self.config(base, target = nil)
+    TmpFile.with_file(base) do |baseconfig|
+      if target
+        CMD.cmd(:spacy, "init fill-config #{baseconfig} #{target}")
+      else
+        TmpFile.with_file do |tmptarget|
+          CMD.cmd(:spacy, "init fill-config #{baseconfig} #{tmptarget}")
+          Open.read(targetconfig)
+        end
+      end
+    end
   end
 end
@@ -43,10 +181,15 @@ module SpaCyToken
   extend Entity
   include SegmentAnnotation
-  self.annotation *SpaCy::PROPERTIES
+  self.annotation *SpaCy::TOKEN_PROPERTIES
   self.annotation :dep
 end
-if __FILE__ == $0
-  ppp Annotated.tsv(SpaCy.segments("I tell a story"), :all)
+module SpaCySpan
+  extend Entity
+  include SegmentAnnotation
+  self.annotation *SpaCy::CHUNK_PROPERTIES
+  self.annotation :dep
 end

data/lib/rbbt/relationship.rb ADDED Viewed

@@ -0,0 +1,24 @@
+require 'rbbt/segment'
+module Relationship
+  extend Annotation
+  self.annotation :segment
+  self.annotation :terms
+  self.annotation :type
+  def text
+    if segment
+      segment
+    else
+      type + ": " + terms * ", "
+    end
+  end
+  def html
+    text = <<-EOF
+<span class='Relationship'\
+>#{ self.text }</span>
+    EOF
+    text.chomp
+  end
+end

data/lib/rbbt/segment.rb CHANGED Viewed

@@ -49,10 +49,13 @@ module Segment
     length
   end
   def eend
     offset.to_i + length - 1
   end
+  alias end eend
   def range
     (offset.to_i..eend)
   end

data/lib/rbbt/segment/named_entity.rb CHANGED Viewed

@@ -8,6 +8,10 @@ module NamedEntity
   self.annotation :entity_type, :code, :score
+  def entity_type
+    annotation_values[:entity_type] || annotation_values[:type]
+  end
   def report
     <<-EOF
 String: #{ self }

data/lib/rbbt/segment/range_index.rb CHANGED Viewed

@@ -6,7 +6,7 @@ module Segment::RangeIndex
     SegID.setup(res, :corpus => corpus)
   end
-  def self.index(segments, corpus, persist_file = :memory)
+  def self.index(segments, corpus = nil, persist_file = :memory)
     segments = segments.values.flatten if Hash === segments
     annotation_index =

data/share/rner/config.rb ADDED Viewed

@@ -0,0 +1,51 @@
+isLetters     /^[A-Z]+$/i
+isUpper       /^[A-Z]+$/
+isLower       /^[a-z]+$/
+isDigits      /^[0-9]+$/i
+isRoman       /^[IVX]+$/
+isGreek       /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
+isPunctuation /^[,.;]$/
+isDelim       /^[\/()\[\]{}\-]$/
+isNonWord     /^[^\w]+$/
+isConjunction /^and|or|&|,$/
+hasLetters    /[A-Z]/i
+hasUpper      /.[A-Z]/
+hasLower      /[a-z]/
+hasDigits     /[0-9]/i
+hasGreek      /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
+hasPunctuation /[,.;]/
+hasDelim      /[\/()\[\]{}\-]/
+hasNonWord    /[^\w]/
+caspMix       /[a-z].[A-Z]/
+keywords      /(?:protein|gene|domain|ase)s?$/
+hasSuffix     /[a-z][A-Z0-9]$/
+numLetters    do |w| w.scan(/[A-Z]/i).length end
+numDigits     do |w| w.scan(/[0-9]/).length end
+#
+prefix_3      /^(...)/
+prefix_4      /^(....)/
+suffix_3      /(...)$/
+suffix_4      /(....)$/
+token1        do |w|
+                 w.sub(/[A-Z]/,'A').
+                   sub(/[a-z]/,'a').
+                   sub(/[0-9]/,'0').
+                   sub(/[^0-9a-z]/i,'x')
+              end
+token2        do  |w|
+                 w.sub(/[A-Z]+/,'A').
+                   sub(/[a-z]+/,'a').
+                   sub(/[0-9]+/,'0').
+                   sub(/[^0-9a-z]+/i,'x')
+               end
+token3         do |w| w.downcase end
+special        do |w| w.is_special? end
+context   %w(special token2 isPunctuation isDelim)
+window     %w(1 2 3 -1 -2 -3)
+#direction :reverse

data/test/rbbt/document/corpus/test_pubmed.rb CHANGED Viewed

@@ -7,7 +7,8 @@ class TestCorpusPubmed < Test::Unit::TestCase
   def test_add_pmid
     corpus = Document::Corpus.setup({})
-    document = corpus.add_pmid("32299157", :abstract).first
+    document = corpus.add_pmid("33359141", :abstract).first
+    iii document.docid
     title = document.to(:title)
     assert title.include?("COVID-19")
   end

data/test/rbbt/document/test_annotation.rb CHANGED Viewed

@@ -4,6 +4,7 @@ require 'rbbt/document/corpus'
 require 'rbbt/segment'
 require 'rbbt/document/annotation'
 require 'rbbt/segment/named_entity'
+require 'rbbt/ner/abner'
 class TestAnnotation < Test::Unit::TestCase
   class CalledOnce < Exception; end
@@ -28,6 +29,12 @@ class TestAnnotation < Test::Unit::TestCase
       self.split(" ").collect{|e| NamedEntity.setup(e, :code => Misc.digest(e)) }
     end
+    Document.define :abner do
+      $called_once = true
+      Abner.new.match(self)
+    end
     Document.persist :ner
   end
@@ -133,7 +140,9 @@ class TestAnnotation < Test::Unit::TestCase
     text.ner
     assert ! $called_once
+    assert_equal text.abner.first.docid, text.docid
     assert  text.ner.first.segid.include?("TEST:")
   end
 end

data/test/rbbt/document/test_corpus.rb CHANGED Viewed

@@ -29,5 +29,19 @@ class TestDocumentCorpus < Test::Unit::TestCase
       assert corpus.docids("TEST:").include?(text.docid)
     end
   end
+  def test_load
+    text = "This is a document"
+    Document.setup(text, "TEST", "test_doc1", nil)
+    TmpFile.with_file do |path|
+      corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
+      corpus.extend Document::Corpus
+      corpus.add_document(text)
+      assert corpus.docids("TEST:").include?(text.docid)
+    end
+  end
 end

data/test/rbbt/ner/test_rner.rb ADDED Viewed

@@ -0,0 +1,132 @@
+require File.dirname(__FILE__) + '/../../test_helper'
+require 'rbbt'
+require 'rbbt/ner/rner'
+require 'test/unit'
+class TestRNer < Test::Unit::TestCase
+  def setup
+    @parser = NERFeatures.new() do
+      isLetters     /^[A-Z]+$/i
+      context prefix_3      /^(...)/
+      downcase do |w| w.downcase end
+      context %w(downcase)
+    end
+  end
+  def test_config
+    config = <<-EOC
+      isLetters     /^[A-Z]+$/i
+      context prefix_3      /^(...)/
+      downcase do |w| w.downcase end
+      context %w(downcase)
+    EOC
+    assert_equal config.strip, @parser.config.strip
+  end
+  def test_reverse
+    assert_equal("protein P53", NERFeatures.reverse("P53 protein"))
+    assert_equal(
+       ". LH of assay - radioimmuno serum the with compared was LH urinary for ) GONAVIS - HI ( test hemagglutination direct new A",
+     NERFeatures.reverse(
+       "A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH."
+      ))
+  end
+  def test_features
+    assert_equal  @parser.features("abCdE"), ["abCdE",true,'abC','abcde']
+  end
+  def test_template
+    template =<<-EOT
+UisLetters: %x[0,1]
+Uprefix_3: %x[0,2]
+Uprefix_3#1: %x[1,2]
+Uprefix_3#-1: %x[-1,2]
+Udowncase: %x[0,3]
+Udowncase#1: %x[1,3]
+Udowncase#-1: %x[-1,3]
+B
+    EOT
+    assert(@parser.template == template)
+  end
+  def test_tokens
+    assert( NERFeatures.tokens("A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH.")==
+           ["A", "new", "direct", "hemagglutination", "test", "(", "HI", "-", "GONAVIS", ")", "for", "urinary", "LH", "was", "compared", "with", "the", "serum", "radioimmuno", "-", "assay", "of", "LH", "."])
+  end
+  def test_text_features
+    assert(@parser.text_features("abCdE 1234") == [["abCdE",true, "abC", "abcde"], ["1234",false, "123", "1234"]])
+    assert(@parser.text_features("abCdE 1234",true) == [["abCdE",true, "abC", "abcde",1], ["1234",false, "123", "1234",2]])
+    assert(@parser.text_features("abCdE 1234",false) == [["abCdE",true, "abC", "abcde",0], ["1234",false, "123", "1234",0]])
+  end
+  def test_tagged_features
+    assert_equal(
+      [["phosphorilation",true, "pho", "phosphorilation", 0],
+        ["of",true, false, "of", 0],
+        ["GENE1",false, "GEN", "gene1", 1],
+        [".", false, false, ".", 0]],
+      @parser.tagged_features("phosphorilation of GENE1.",['GENE1']))
+      assert_equal(
+        [["GENE1",false, "GEN", "gene1", 1],
+          ["phosphorilation",true, "pho", "phosphorilation", 0]],
+      @parser.tagged_features("GENE1 phosphorilation",['GENE1']))
+    assert_equal(
+           [["phosphorilation",true, "pho", "phosphorilation", 0],
+            ["of",true, false, "of", 0],
+            ["GENE",true, "GEN", "gene", 1],
+            ["1",false, false, "1", 2],
+            [".", false, false, ".", 0]],
+      @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
+  end
+  def test_tagged_features_reverse
+    @parser.reverse = true
+    assert_equal(
+      [
+        ["GENE1",false, "GEN", "gene1", 1],
+        ["of",true, false, "of", 0],
+        ["phosphorilation",true, "pho", "phosphorilation", 0]
+    ],
+    @parser.tagged_features("phosphorilation of GENE1",['GENE1']))
+    assert_equal(
+          [
+            [".", false, false, ".", 0],
+            ["1",false, false, "1", 1],
+            ["GENE",true, "GEN", "gene", 2],
+            ["of",true, false, "of", 0],
+            ["phosphorilation",true, "pho", "phosphorilation", 0]
+        ],
+    @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
+  end
+  def test_default_config
+    require 'rbbt/bow/misc'
+    text =<<-EOF
+This text explains how MDM2 interacts with TP53.
+    EOF
+    @parser = NERFeatures.new Rbbt.share.rner["config.rb"].find
+    features = @parser.tagged_features text, %w(TP53 MDM2)
+    assert features.first.first == "This"
+  end
+  def __test_CRFPP_install
+    assert(require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP'))
+  end
+end

data/test/rbbt/segment/test_named_entity.rb CHANGED Viewed

@@ -24,7 +24,8 @@ class TestClass < Test::Unit::TestCase
   def test_tsv
     a = "test"
-    NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
+    NamedEntity.setup a, 10, "DocID", "TYPE", "CODE", "SCORE"
+    ppp Annotated.tsv([a,a])
     assert Annotated.tsv([a]).fields.include? "code"
     assert Annotated.tsv([a], nil).fields.include? "code"
     assert Annotated.tsv([a], :all).fields.include? "code"

data/test/rbbt/segment/test_transformed.rb CHANGED Viewed

@@ -144,7 +144,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
     gene2.entity_type = "Protein"
     Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
-      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
+      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
     end
   end
@@ -165,7 +165,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
     gene2.entity_type = "Protein"
     Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
-      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
+      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
     end
   end
@@ -185,9 +185,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
     assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
     Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
-      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the CDK5R1 protein", a
+      assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
       Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
-        assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene'><span class='Entity' attr-entity-type='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
+        assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
       end
     end
   end

data/test/test_spaCy.rb CHANGED Viewed

@@ -3,7 +3,7 @@ require 'rbbt/nlp/spaCy'
 require 'rbbt/document/corpus'
 class TestSpaCy < Test::Unit::TestCase
-  def _test_tokens
+  def test_tokens
     text = "I tell a story"
     tokens = SpaCy.tokens(text)
@@ -12,6 +12,16 @@ class TestSpaCy < Test::Unit::TestCase
     assert_equal "tell", tokens[1].to_s
   end
+  def test_chunks
+    text = "Miguel Vazquez tell a good story"
+    tokens = SpaCy.chunks(text)
+    assert_equal 2, tokens.length
+    assert_equal "Miguel Vazquez", tokens[0].to_s
+  end
   def test_segments
     text = "I tell a story. It's a very good story."
@@ -28,5 +38,107 @@ class TestSpaCy < Test::Unit::TestCase
       assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
     end
   end
+  def test_chunk_segments
+    text = "I tell a story. It's a very good story."
+    corpus = Document::Corpus.setup({})
+    Document.setup(text, "TEST", "test_doc1", "simple_sentence")
+    corpus.add_document text
+    text.corpus = corpus
+    segments = SpaCy.chunk_segments(text)
+    segments.each do |segment|
+      assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
+    end
+  end
+  def test_dep_graph
+    text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
+    graph = SpaCy.dep_graph(text, true)
+    tokens = SpaCy.segments(text)
+    index = Segment.index tokens
+    tf_s = tokens.select{|t| t == "TF" }.first
+    tg_s = tokens.select{|t| t == "TG" }.first
+    require 'rbbt/network/paths'
+    path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
+    path_tokens = path.collect do |segid|
+      range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
+      text[range]
+    end
+    assert path_tokens.include? 'increase'
+  end
+  def test_chunk_dep_graph
+    text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
+    graph = SpaCy.chunk_dep_graph(text, true)
+    tokens = SpaCy.chunk_segments(text)
+    index = Segment.index tokens
+    tf_s = tokens.select{|t| t.include? "TF" }.first
+    tg_s = tokens.select{|t| t.include? "TG" }.first
+    require 'rbbt/network/paths'
+    path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
+    path_tokens = path.collect do |segid|
+      range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
+      text[range]
+    end
+    assert path_tokens.include? 'increase'
+  end
+  def test_paths
+    text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
+    path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset =>  text.index("TG")))
+    path_tokens = path.collect do |segid|
+      range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
+      text[range]
+    end
+    ppp text
+    iii path_tokens
+    assert path_tokens.include? 'increase'
+  end
+  def test_paths2
+    text = "Deletion and domain swap experiments identified small, discreet positive and negative elements in A-Myb and TF that were required for the regulation of specific genes, such as DHRS2, TG, and mim-1"
+    path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset =>  text.index("TG")))
+    path_tokens = path.collect do |segid|
+      range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
+      text[range]
+    end
+    iii path_tokens
+    assert path_tokens.include? 'regulation'
+  end
+  def test_paths3
+    text = "Therefore, we speculate that PEA3 factors may contribute to the up-regulation of COX-2 expression resulting from both APC mutation and Wnt1 expression"
+    path = SpaCy.paths(text, *Segment.align(text,["PEA3", "Wnt1"]))
+    path_tokens = path.collect do |segid|
+      range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
+      text[range]
+    end
+  end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-text
 version: !ruby/object:Gem::Version
-  version: 1.3.4
+  version: 1.3.5
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-23 00:00:00.000000000 Z
+date: 2021-06-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
@@ -95,6 +95,7 @@ files:
 - lib/rbbt/ner/oscar4.rb
 - lib/rbbt/ner/patterns.rb
 - lib/rbbt/ner/regexpNER.rb
+- lib/rbbt/ner/rner.rb
 - lib/rbbt/ner/rnorm.rb
 - lib/rbbt/ner/rnorm/cue_index.rb
 - lib/rbbt/ner/rnorm/tokens.rb
@@ -103,6 +104,7 @@ files:
 - lib/rbbt/nlp/nlp.rb
 - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
 - lib/rbbt/nlp/spaCy.rb
+- lib/rbbt/relationship.rb
 - lib/rbbt/segment.rb
 - lib/rbbt/segment/annotation.rb
 - lib/rbbt/segment/encoding.rb
@@ -126,6 +128,7 @@ files:
 - share/install/software/OpenNLP
 - share/install/software/StanfordParser
 - share/patterns/drug_induce_disease
+- share/rner/config.rb
 - share/rnorm/cue_default
 - share/rnorm/tokens_default
 - share/wordlists/stopwords
@@ -148,6 +151,7 @@ files:
 - test/rbbt/ner/test_oscar4.rb
 - test/rbbt/ner/test_patterns.rb
 - test/rbbt/ner/test_regexpNER.rb
+- test/rbbt/ner/test_rner.rb
 - test/rbbt/ner/test_rnorm.rb
 - test/rbbt/ner/test_token_trieNER.rb
 - test/rbbt/nlp/genia/test_sentence_splitter.rb
@@ -182,7 +186,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.0.6
+rubygems_version: 3.1.4
 signing_key:
 specification_version: 4
 summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
@@ -210,6 +214,7 @@ test_files:
 - test/rbbt/ner/test_banner.rb
 - test/rbbt/ner/test_token_trieNER.rb
 - test/rbbt/ner/test_finder.rb
+- test/rbbt/ner/test_rner.rb
 - test/rbbt/ner/test_linnaeus.rb
 - test/rbbt/ner/test_oscar4.rb
 - test/rbbt/test_segment.rb