RubyGems - rbbt-text - Versions diffs - 1.3.0 → 1.3.5 - Mend

rbbt-text 1.3.0 → 1.3.5

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (40)

checksums.yaml +4 -4
data/lib/rbbt/bow/bow.rb +5 -2
data/lib/rbbt/bow/dictionary.rb +27 -23
data/lib/rbbt/document.rb +20 -5
data/lib/rbbt/document/annotation.rb +7 -4
data/lib/rbbt/document/corpus.rb +30 -3
data/lib/rbbt/document/corpus/pubmed.rb +2 -1
data/lib/rbbt/ner/abner.rb +3 -2
data/lib/rbbt/ner/banner.rb +3 -1
data/lib/rbbt/ner/brat.rb +1 -1
data/lib/rbbt/ner/g_norm_plus.rb +7 -1
data/lib/rbbt/ner/linnaeus.rb +2 -1
data/lib/rbbt/ner/patterns.rb +0 -1
data/lib/rbbt/ner/rner.rb +229 -0
data/lib/rbbt/ner/token_trieNER.rb +32 -18
data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
data/lib/rbbt/nlp/spaCy.rb +195 -0
data/lib/rbbt/relationship.rb +24 -0
data/lib/rbbt/segment.rb +9 -4
data/lib/rbbt/segment/annotation.rb +3 -3
data/lib/rbbt/segment/named_entity.rb +7 -0
data/lib/rbbt/segment/range_index.rb +1 -1
data/lib/rbbt/segment/relationship.rb +7 -0
data/lib/rbbt/segment/transformed.rb +5 -1
data/share/install/software/OpenNLP +1 -1
data/share/rner/config.rb +51 -0
data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
data/test/rbbt/document/test_annotation.rb +15 -6
data/test/rbbt/document/test_corpus.rb +15 -1
data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
data/test/rbbt/ner/test_rner.rb +132 -0
data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
data/test/rbbt/segment/test_annotation.rb +3 -4
data/test/rbbt/segment/test_encoding.rb +1 -1
data/test/rbbt/segment/test_named_entity.rb +7 -5
data/test/rbbt/segment/test_range_index.rb +1 -2
data/test/rbbt/segment/test_transformed.rb +33 -4
data/test/rbbt/test_segment.rb +5 -10
data/test/test_spaCy.rb +144 -0
metadata +12 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c2a24d8e7faf30d53e41a00a27f6145e8e9f18f0c10af57cdddaea0ee18c35d6
-  data.tar.gz: 3475006965110391e35151cd1b5368028dacf467aa276f8eb68fce3320be1122
+  metadata.gz: '0846f900d745dd27df8006eecbc9d294f9f38a23dd76001de2a5dc0313db7e22'
+  data.tar.gz: 675985882a6c8b9813f620d7ef0a555efa5c148c7c2fe36e0030f84f3fd88cf0
 SHA512:
-  metadata.gz: da40a039a4792eb5e7fa00270870279221c74dcbf51df1b5278b247496fefbfa888a87b7ab19f05676644c51a01177eb49e229cb0156fe7f0190dd4933d41e24
-  data.tar.gz: a32fca5f21a987dcbb6b5541015cc33879330e6f1ef7c4a28e75debe5bdd1dc8bf7b98bfc91d828e605f29868aa972b55cd59bb4f86e66d2fb0cfea31fac2ae0
+  metadata.gz: dfd9c333b94181496134b825c63d6e93a0390f81d426526f79c00cf12556021b60004b29b57ca9b0b274141937027f7bc780552a60de007e5f790b19910354c0
+  data.tar.gz: 205beeb8829c8358fd29c0a18351522e566106e24220af3d7bec3676694d37d682b92243e4fd4cd495b542f9945a28cf8585e587342672d31779d0b21b53ae4e

data/lib/rbbt/bow/bow.rb CHANGED Viewed

@@ -69,6 +69,11 @@ module BagOfWords
     count = bigrams ? count(bigrams(text)) : count(words(text))
     count.values_at(*terms)
   end
+  def self.weighted_features(text, weights)
+    features = features(text, weights.keys)
+    features.zip(weights.values).collect{|f,w| f * w }
+  end
 end
 class String
@@ -82,5 +87,3 @@ class String
     BagOfWords.bigrams(self)
   end
 end

data/lib/rbbt/bow/dictionary.rb CHANGED Viewed

@@ -74,28 +74,32 @@ class Dictionary::TF_IDF
   end
   def best(options = {})
-    high, low, limit = {
-      :low   => 0,
-      :high    => 1,
-    }.merge(options).
-    values_at(:high, :low, :limit)
-    num_docs = @num_docs.to_f
-    best = df.select{|term, value|
-      value >= low && value <= high
-    }.collect{|p|
-      term     = p.first
-      df_value = p.last
-      [term,
-       @terms[term].to_f / num_docs * Math::log(1.0/df_value)
-      ]
-    }
-    if limit
-      Hash[*best.sort{|a,b| b[1] <=>  a[1]}.slice(0, limit).flatten]
-    else
-      Hash[*best.flatten]
-    end
+    key = Misc.obj2digest(options)
+    @best ||= {}
+    @best[key] ||= begin
+                     high, low, limit = {
+                       :low   => 0,
+                       :high    => 1,
+                     }.merge(options).
+                     values_at(:high, :low, :limit)
+                     num_docs = @num_docs.to_f
+                     best = df.select{|term, value|
+                       value >= low && value <= high
+                     }.collect{|p|
+                       term     = p.first
+                       df_value = p.last
+                       [term,
+                        @terms[term].to_f / num_docs * Math::log(1.0/df_value)
+                       ]
+                     }
+                     if limit
+                       Hash[*best.sort{|a,b| b[1] <=>  a[1]}.slice(0, limit-1).flatten]
+                     else
+                       Hash[*best.flatten]
+                     end
+                   end
   end
   def weights(options = {})
@@ -173,7 +177,7 @@ class Dictionary::KL
       best[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
     }
     if limit
-      Hash[*best.sort{|a,b| b[1] <=>  a[1]}.slice(0, limit).flatten]
+      Hash[*best.sort{|a,b| b[1] <=>  a[1]}.slice(0, limit-1).flatten]
     else
       best
     end

data/lib/rbbt/document.rb CHANGED Viewed

@@ -1,6 +1,5 @@
 require 'rbbt-util'
 require 'rbbt/entity'
-require 'rbbt/document/annotation'
 module DocID
   extend Entity
@@ -19,10 +18,20 @@ module DocID
     DocID.setup([namespace, code, "title"] * ":", :corpus => corpus)
   end
-  def document
-    text = self.corpus[self]
-    namespace, id, type = self.split(":")
-    Document.setup(text, namespace, id, type, :corpus => corpus)
+  property :document => :both do
+    if Array === self
+      namespace, id, type = nil, nil, nil
+      docs = self.collect do |docid|
+        text = self.corpus[docid]
+        namespace, id, type = docid.split(":")
+        text
+      end
+      Document.setup(docs, :corpus => corpus)
+    else
+      text = self.corpus[self]
+      namespace, id, type = self.split(":")
+      Document.setup(text, :namespace => namespace, :code => id, :type => type, :corpus => corpus)
+    end
   end
 end
@@ -44,3 +53,9 @@ module Document
   alias id docid
 end
+#class String
+#  def docid
+#    digest = Misc.digest(self)
+#    ["STRING", digest, nil, nil] * ":"
+#  end
+#end

data/lib/rbbt/document/annotation.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+require 'rbbt/segment'
 require 'rbbt/segment/annotation'
 module Document
@@ -12,7 +13,7 @@ module Document
       end
       docid = self.docid
-      segments.each{|s| s.docid = docid if s.docid.nil? }
+      segments.each{|s| s.docid = docid }
       segments
     end
@@ -22,18 +23,20 @@ module Document
     send :property, type => :multiple do |list|
       doc_segments = self.instance_exec list, &block
-      doc_segments = doc_segments.chunked_values_at(self) if Hash === doc_segments
+      doc_segments = doc_segments.chunked_values_at(list) if Hash === doc_segments
       doc_segments.each_with_index do |segments,i|
+        next if segments.nil?
         document = list[i]
-        Segment.align(document, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
+        Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
         segments.each do |segment|
           SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
         end
         docid = document.docid
-        segments.each{|s| s.docid = docid if s.docid.nil? }
+        segments.each{|s| s.docid = docid }
         segments
       end

data/lib/rbbt/document/corpus.rb CHANGED Viewed

@@ -3,17 +3,43 @@ require 'rbbt-util'
 module Document::Corpus
   def self.setup(corpus)
-    corpus.extend Document::Corpus
+    corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
+    corpus.extend Document::Corpus unless Document::Corpus === corpus
+    corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
+    corpus
   end
   def add_document(document)
-    self[document.docid] = document
+    docid = document.docid
+    return self[docid] if self.include?(docid)
+    self.write_and_close do
+      self[docid] = document
+    end
+  end
+  def docids(*prefix)
+    prefix = prefix * ":"
+    prefix += ":" unless prefix == :all || prefix[-1] == ":"
+    docids = self.read_and_close do
+      prefix == :all ? self.keys : self.prefix(prefix)
+    end
+    DocID.setup(docids, :corpus => self)
+  end
+  def documents(*prefix)
+    self.docids(*prefix).document
   end
   def [](*args)
     docid, *rest = args
-    res = super(*args)
+    res = self.read_and_close do
+      super(*args)
+    end
+    res.force_encoding(Encoding.default_external) if res
     return res if args.length > 1
     namespace, id, type  = docid.split(":")
     if res.nil?
@@ -22,6 +48,7 @@ module Document::Corpus
       end
     end
+    res.force_encoding(Encoding.default_external) if res
     Document.setup(res, namespace, id, type, self) unless res.nil?
     res

data/lib/rbbt/document/corpus/pubmed.rb CHANGED Viewed

@@ -6,7 +6,6 @@ module Document::Corpus
     type = nil if String === type and type.empty?
     res = PubMed.get_article(pmids).collect do |pmid, article|
-      Log.debug "Loading pmid #{pmid}"
       document = if type.nil? || type.to_sym == :abstract
                    Document.setup(article.abstract || "", "PMID", pmid, :abstract, self, :corpus => self)
                  elsif type.to_sym == :title
@@ -15,7 +14,9 @@ module Document::Corpus
                    raise "No FullText available for #{ pmid }" if article.full_text.nil?
                    Document.setup(article.full_text, :PMID, pmid, :fulltext, self, :corpus => self)
                  end
+      Log.debug "Loading pmid #{pmid}"
       add_document(document)
+      document
     end
     Document.setup(res)

data/lib/rbbt/ner/abner.rb CHANGED Viewed

@@ -39,14 +39,15 @@ class Abner < NER
     types = res[1]
     strings = res[0]
+    docid = Misc.digest(text)
     global_offset = 0
     strings.zip(types).collect do |mention, type|
       mention = mention.to_s;
       offset = text.index(mention)
       if offset.nil?
-        NamedEntity.setup(mention, nil, type.to_s)
+        NamedEntity.setup(mention, :docid => docid, :entity_type => type)
       else
-        NamedEntity.setup(mention, offset + global_offset, type.to_s)
+        NamedEntity.setup(mention, :offset => offset + global_offset, :docid => docid, :entity_type => type.to_s)
         text = text[offset + mention.length..-1]
         global_offset += offset + mention.length
       end

data/lib/rbbt/ner/banner.rb CHANGED Viewed

@@ -55,6 +55,7 @@ class Banner < NER
   # text.
   def match(text)
     return [] if text.nil?
+    text = text.dup if text.frozen?
     text.gsub!(/\n/,' ')
     text.gsub!(/\|/,'/') # Character | gives an error
     return [] if text.strip.empty?
@@ -66,6 +67,7 @@ class Banner < NER
     @parenPP.postProcess(sentence)
     tagged = sentence.getSGML
+    docid = Misc.digest text
     res = tagged.scan(/<GENE>.*?<\/GENE>/).
       collect{|r|
       r.match(/<GENE>(.*?)<\/GENE>/)
@@ -73,7 +75,7 @@ class Banner < NER
       mention.sub!(/^\s*/,'')
       mention.sub!(/\s*$/,'')
       offset = text.index(mention)
-      NamedEntity.setup(mention, offset, 'GENE')
+      NamedEntity.setup(mention, :offset => offset, :docid => docid, :entity_type => 'GENE')
       mention
     }
     res

data/lib/rbbt/ner/brat.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 require 'rbbt/segment/named_entity'
-require 'rbbt/text/segment/relationship'
+require 'rbbt/relationship'
 module Brat
   Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"

data/lib/rbbt/ner/g_norm_plus.rb CHANGED Viewed

@@ -55,11 +55,16 @@ EOF
         Open.mkdir 'tmp'
         texts.each do |name,text|
+          text = Misc.fixutf8(text)
+          text = text.gsub('|', '#').gsub("\n", " ").gsub(/\t/,' ')
           Open.write("input/#{name}.txt") do |f|
-            f.puts "#{name}|a|" << text.gsub("\n\n", "\n·")
+            f.puts "#{name}|a|" << text
             f.puts
           end
         end
         Open.write('config', CONFIG)
         CMD.cmd_log("java -Xmx20G -Xms20G  -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
@@ -95,6 +100,7 @@ EOF
       res[name] = segments
     end
+    res
   end
 end

data/lib/rbbt/ner/linnaeus.rb CHANGED Viewed

@@ -31,7 +31,8 @@ module Linnaeus
     init unless defined? @@Matcher
     @@Matcher.match(text).toArray().collect do |mention|
-      NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => mention.ids(), :score => mention.probabilities())
+      best_id, best_prob = mention.ids().zip(mention.probabilities()).sort_by{|i,p| p.to_f }.last
+      NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => best_id, :score => best_prob)
     end
   end
 end

data/lib/rbbt/ner/patterns.rb CHANGED Viewed

@@ -15,7 +15,6 @@ class PatternRelExt
     segments = sentence.segments
     segments = segments.values.flatten if Hash === segments
     Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
-      ppp sentence
       regexpNER.entities(sentence)
     end
   end

data/lib/rbbt/ner/rner.rb ADDED Viewed

@@ -0,0 +1,229 @@
+require 'rbbt'
+require 'rbbt/util/open'
+require 'rbbt/util/misc'
+require 'rbbt/util/simpleDSL'
+class NERFeatures
+  include SimpleDSL
+  def self.tokens(text)
+    text.scan(/
+              \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
+              \w-\w*|
+              \w+-[A-Z](?!\w)|
+              \w+|
+              [.,()\/\[\]{}'"+-]
+              /x)
+  end
+  def self.reverse(text)
+    tokens(text).reverse.join(" ")
+  end
+  def define(name, *args, &block)
+    action = args[0] || block ||  /#{name.to_s}s?/i
+    raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
+    @types[name.to_s] = action
+    @order.push name.to_s
+    name.to_s
+  end
+  attr_accessor :reverse
+  def initialize(file = nil, reverse = false, &block)
+    @types   = {}
+    @order   = []
+    @context = []
+    @reverse = reverse
+    file ||= Rbbt.share.ner['config.rb'].find if !file && !block
+    parse(:define, file, &block)
+  end
+  def config
+    @config[:define]
+  end
+  def window(positions)
+    @window = positions
+  end
+  def context(name, &block)
+    if name.is_a? Array
+      @context += name
+    else
+      @context.push name
+      # The block might be wrongly assigned to this function
+      # instead of the actual definition, fix that.
+      if block
+        @types[name] = block
+      end
+    end
+  end
+  def direction(dir)
+    if dir.to_sym == :reverse
+      @reverse = true
+    end
+  end
+  def features(word)
+    values = [word]
+    @order.each{|features|
+      action = @types[features]
+      if action.is_a?(Proc)
+        values.push(action.call(word))
+      else
+        m = action.match(word)
+        if m
+          if m[1]
+            values.push(m[1])
+          else
+            values.push(m != nil)
+          end
+        else
+          values.push(false)
+        end
+      end
+    }
+    values
+  end
+  def template(window=nil)
+    window ||= @window || [1,-1]
+    template = ""
+    i = 1
+    @order.each{|feat|
+      template += "U#{ feat }: %x[0,#{ i }]\n"
+      if @context.include?(feat)
+        window.each{|p|
+          template += "U#{ feat }##{ p}: %x[#{ p },#{ i }]\n"
+        }
+      end
+      i += 1
+    }
+    template += "B\n"
+    template
+  end
+  def text_features(text, positive = nil)
+    text = self.class.reverse(text) if @reverse
+    initial = true
+    self.class.tokens(text).collect{|token|
+      features = features(token)
+      if !positive.nil?
+        features << (positive ? (initial ? 1 : 2) : 0)
+        initial = false
+      end
+      features
+    }
+  end
+  def tagged_features(text, mentions)
+    mentions ||= []
+    mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?
+    re = mentions.collect{|mention|
+      Regexp.quote(mention.gsub(/\s+/,' ')).sub(/\\s/,'\s+')
+    }.join("|")
+    positive = false
+    features = []
+    chunks = text.split(/(#{re})/)
+    chunks.each{|t|
+      chunk_features = text_features(t, positive)
+      positive = !positive
+      if @reverse
+        features = chunk_features + features
+      else
+        features = features + chunk_features
+      end
+    }
+    features
+  end
+  def train(features, model)
+    tmp_template = TmpFile.tmp_file("template-")
+    Open.write(tmp_template,template)
+    cmd = "#{File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')} '#{tmp_template}'  '#{features}' '#{model}'"
+    system cmd
+    Open.write(model + '.config',config)
+    FileUtils.rm tmp_template
+  end
+end
+class NER
+  def initialize(model = nil)
+    begin
+      require 'CRFPP'
+    rescue Exception
+      require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
+    end
+    model ||= File.join(Rbbt.datadir, + 'ner/model/BC2')
+    @parser = NERFeatures.new(model + '.config')
+    @reverse = @parser.reverse
+    @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
+  end
+  def extract(text)
+    features = @parser.text_features(text)
+    @tagger.clear
+    features.each{|feats|
+      @tagger.add(feats.join(" "))
+    }
+    @tagger.parse
+    found = []
+    mention = []
+    @tagger.size.times{|i|
+      label = @tagger.y(i)
+      word  = @tagger.x(i,0)
+      if word == ')'
+        mention.push(')') if mention.join =~ /\(/
+        next
+      end
+      case label
+      when 1
+        if mention.any? && ( mention.join(" ").is_special? || mention.select{|m| m.is_special?}.any?)
+          found.push(mention)
+          mention = []
+        end
+        mention.push(word)
+      when 2
+        mention.push(word)
+      when 0
+        found.push(mention) if mention.any?
+        mention = []
+      end
+    }
+    found << mention if mention.any?
+    found.collect{|list|
+      list = list.reverse if @reverse
+      list.join(" ")
+    }
+  end
+end