RubyGems - rbbt-text - Versions diffs - 0.2.0 → 0.2.1 - Mend

rbbt-text 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

data/lib/rbbt/bow/dictionary.rb +1 -1
data/lib/rbbt/bow/misc.rb +2 -2
data/lib/rbbt/ner/NER.rb +22 -0
data/lib/rbbt/ner/abner.rb +8 -4
data/lib/rbbt/ner/annotations.rb +123 -0
data/lib/rbbt/ner/banner.rb +6 -4
data/lib/rbbt/ner/oscar3.rb +29 -13
data/lib/rbbt/ner/regexpNER.rb +69 -45
data/lib/rbbt/ner/token_trieNER.rb +168 -0
data/test/rbbt/ner/test_NER.rb +10 -0
data/test/rbbt/ner/test_abner.rb +2 -2
data/test/rbbt/ner/test_annotations.rb +8 -0
data/test/rbbt/ner/test_banner.rb +2 -2
data/test/rbbt/ner/test_oscar3.rb +35 -2
data/test/rbbt/ner/test_regexpNER.rb +83 -35
data/test/rbbt/ner/test_token_trieNER.rb +112 -0
metadata +15 -12
data/lib/rbbt/ner/named_entity.rb +0 -11
data/lib/rbbt/ner/tokenNER.rb +0 -237
data/test/rbbt/ner/test_named_entity.rb +0 -16
data/test/rbbt/ner/test_tokenNER.rb +0 -239

data/lib/rbbt/bow/dictionary.rb CHANGED Viewed

@@ -174,7 +174,7 @@ class Dictionary::KL
     if limit
       Hash[*best.sort{|a,b| b[1] <=>  a[1]}.slice(0, limit).flatten]
     else
-      Hash[*best.flatten]
+      best
     end
   end

data/lib/rbbt/bow/misc.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 require 'rbbt'
 require 'rbbt/util/open'
-Rbbt.add_datafiles 'stopwords' => ['wordlists', 'stopwords']
+Rbbt.claim 'stopwords', 'stopwords', 'wordlist'
-$stopwords = Open.read(Rbbt.find_datafile 'stopwords').scan(/\w+/) if File.exists?(Rbbt.find_datafile 'stopwords')
+$stopwords = Rbbt.files.wordlists.stopwords.read.scan(/\w+/)

data/lib/rbbt/ner/NER.rb ADDED Viewed

@@ -0,0 +1,22 @@
+require 'rbbt/ner/annotations'
+class NER
+  def entities(text, overlap = true, *args)
+    case
+    when Array === text
+      text.collect do |element|
+        matches = entities(element, overlap, *args)
+        matches.each{|match|
+          match.offset += element.offset if match.offset
+        }
+        matches
+      end.flatten
+    when (Annotated === text and not overlap)
+      entities(text.split, overlap, *args)
+    else
+      match(text, *args)
+    end
+  end
+end

data/lib/rbbt/ner/abner.rb CHANGED Viewed

@@ -1,10 +1,11 @@
 require 'rbbt'
 require 'rjb'
-require 'rbbt/ner/named_entity'
+require 'rbbt/ner/annotations'
+require 'rbbt/ner/NER'
 # Offers a Ruby interface to the Abner Named Entity Recognition Package
 # in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
-class Abner
+class Abner < NER
   Rbbt.add_software "ABNER" => ['','']
@@ -25,13 +26,16 @@ class Abner
   # Given a chunk of text, it finds all the mentions appearing in it. It
   # returns all the mentions found, regardless of type, to be coherent
   # with the rest of NER packages in Rbbt.
-  def extract(text)
+  def match(text)
     res = @tagger.getEntities(text)
     types = res[1]
     strings = res[0]
-    strings.zip(types).collect{|mention, type| mention = mention.to_s; NamedEntity mention, types.to_s; mention}
+    strings.zip(types).collect do |mention, type|
+      mention = mention.to_s;
+      NamedEntity.annotate(mention, nil, type.to_s)
+    end
   end
 end

data/lib/rbbt/ner/annotations.rb ADDED Viewed

@@ -0,0 +1,123 @@
+module Segment
+  attr_accessor :offset
+  def self.sort(segments, inline = true)
+    if inline
+      segments.sort do |a,b|
+        case
+        when ((a.nil? and b.nil?) or (a.offset.nil? and b.offset.nil?))
+          0
+        when (a.nil? or a.offset.nil?)
+          -1
+        when (b.nil? or b.offset.nil?)
+          +1
+        when (not a.range.include? b.offset and not b.range.include? a.offset)
+          a.offset <=> b.offset
+        else
+          b.length <=> a.length
+        end
+      end.reverse
+    else
+      segments.sort_by do |segment| segment.offset || 0 end
+    end
+  end
+  def self.split(text, segments)
+    sorted_segments = sort segments
+    chunks      = []
+    segment_end = 0
+    text_offset = 0
+    sorted_segments.each do |segment|
+      return chunks if text.nil? or text.empty?
+      next if segment.offset.nil?
+      offset = segment.offset - text_offset
+      # Consider segment offset. Save pre, or skip if overlap
+      case
+      when offset < 0 # Overlap, skip
+        next
+      when offset > 0 # Save pre
+        chunk = text[0..offset - 1]
+        Segment.annotate(chunk, text_offset)
+        chunks << chunk
+      end
+      segment_end = offset + segment.length - 1
+      chunk = text[offset..segment_end]
+      Segment.annotate(chunk, text_offset + offset)
+      chunks << chunk
+      text_offset += segment_end + 1
+      text = text[segment_end + 1..-1]
+    end
+    if not text.nil? and text.any?
+      chunk = text.dup
+      Segment.annotate(chunk, text_offset)
+      chunks << chunk
+    end
+    chunks
+  end
+  def self.annotate(string, offset = nil)
+    string.extend Segment
+    string.offset = offset
+    string
+  end
+  def range
+    (offset..offset + length - 1)
+  end
+end
+module Annotated
+  attr_accessor :annotations
+  def self.annotate(string)
+    string.extend Annotated
+    string.annotations = []
+    string
+  end
+  def split
+    Segment.split(self, @annotations)
+  end
+end
+module NamedEntity
+  include Segment
+  attr_accessor :type, :code, :score
+  def self.annotate(string, offset = nil, type = nil, code = nil, score = nil)
+    string.extend NamedEntity
+    string.offset = offset
+    string.type  = type
+    string.code  = code
+    string.score = score
+    string
+  end
+  def to_s
+    <<-EOF
+String: #{ self }
+Offset: #{ offset.inspect }
+Type: #{type.inspect}
+Code: #{code.inspect}
+Score: #{score.inspect}
+    EOF
+  end
+end
+module Token
+  include Segment
+  attr_accessor :original
+  def self.annotate(string, offset = nil, original = nil)
+    string.extend Token
+    string.offset   = offset
+    string.original = original
+    string
+  end
+end

data/lib/rbbt/ner/banner.rb CHANGED Viewed

@@ -1,10 +1,11 @@
 require 'rbbt'
 require 'rjb'
-require 'rbbt/ner/named_entity'
+require 'rbbt/ner/annotations'
+require 'rbbt/ner/NER'
 # Offers a Ruby interface to the Banner Named Entity Recognition Package
 # in Java. Banner[http://banner.sourceforge.net/].
-class Banner
+class Banner < NER
   Rbbt.add_software "BANNER" => ['','']
@@ -48,10 +49,11 @@ class Banner
   # Returns an array with the mention found in the provided piece of
   # text.
-  def extract(text)
+  def match(text)
     text.gsub!(/\n/,' ')
     text.gsub!(/\|/,'/') # Character | gives an error
     sentence = @@Sentence.new(text)
     @tokenizer.tokenize(sentence)
     @tagger.tag(sentence)
     @parenPP.postProcess(sentence)
@@ -63,7 +65,7 @@ class Banner
       mention = $1
       mention.sub!(/^\s*/,'')
       mention.sub!(/\s*$/,'')
-      NamedEntity.annotate mention
+      NamedEntity.annotate mention, nil, 'GENE'
       mention
     }
     res

data/lib/rbbt/ner/oscar3.rb CHANGED Viewed

@@ -1,10 +1,11 @@
 require 'rbbt'
 require 'rjb'
 require 'libxml'
-require 'rbbt/ner/named_entity'
+require 'rbbt/ner/annotations'
+require 'rbbt/ner/NER'
 require 'rbbt/util/log'
-class OSCAR3
+class OSCAR3 < NER
   Rbbt.add_software "OSCAR3" => ['','']
   @@TextToSciXML   = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
@@ -14,37 +15,52 @@ class OSCAR3
   @@MEMM = @@MEMMSingleton.getInstance();
   @@DFA  = @@DFANEFinder.getInstance();
-  def self.extract(text,  type = nil, memm =  true)
+  def self.match(text,  type = nil, memm =  false)
     doc  = @@ProcessingDocumentFactory.getInstance().makeTokenisedDocument(@@TextToSciXML.textToSciXML(text), true, false, false);
     mentions = []
     it = doc.getTokenSequences().iterator
-    reconizer = memm ? @@MEMM : @@DFA
     type = [type] unless type.nil? or Array === type
-    pos = 0
     while it.hasNext do
-      Log.debug "OSCAR3: Finding mentions in sequence #{pos += 1}"
       sequence = it.next
-      entities = @@MEMM.findNEs(sequence, text)
-      keys = entities.keySet.iterator
+      # Fix sequence offset
+      sequence_str = sequence.getSourceString.to_s
+      sequence_offset = sequence.offset.to_i
+      offset = 0
+      while text[(sequence_offset + offset)..(sequence_offset + offset + sequence_str.length - 1)] != sequence_str and
+        not offset + sequence_offset + sequence_str.length > text.length
+        offset += 1
+      end
+      next if offset + sequence_offset + sequence_str.length > text.length
+      if memm
+        entities = @@MEMM.findNEs(sequence, text)
+        keys = entities.keySet.iterator
+      else
+        entities = @@DFA.getNEs(sequence)
+        keys = entities.iterator
+      end
       while keys.hasNext do
         key = keys.next
         mention_type, rstart, rend, mention = key.to_string.match(/\[NE:(.*):(.*):(.*):(.*)\]/).values_at(1,2,3,4)
         next unless type.nil? or type.include? mention_type
-        score  = entities.get(key)
+        score  = memm ? entities.get(key).to_string.to_f : nil
-        NamedEntity.annotate mention, mention_type, score.to_string.to_f, (rstart..rend)
+        NamedEntity.annotate mention, rstart.to_i + offset, mention_type, nil, score
-        mentions << mention
+        mentions << mention unless mentions.collect{|m| m.to_s}.include? mention.to_s
       end
     end
     mentions
   end
-  def extract(*args)
-    OSCAR3.extract *args
+  def match(*args)
+    OSCAR3.match *args
   end
 end

data/lib/rbbt/ner/regexpNER.rb CHANGED Viewed

@@ -1,67 +1,91 @@
-require 'rbbt-util'
-require 'rbbt/bow/misc'
+require 'rbbt/ner/annotations'
+require 'rbbt/ner/NER'
+require 'rbbt/util/simpleDSL'
-class RegExpNER
-  def initialize(lexicon, options = {})
-    options = Misc.add_defaults options, :flatten => true, :case_insensitive => true, :stopwords => nil
+class RegExpNER < NER
+  include SimpleDSL
-    if $stopwords and  (options[:stopwords].nil? || options[:stopwords] == true)
-      options[:stopwords] = $stopwords
-    else
-      options[:stopwords] = []
+  def self.match_regexp(text, regexp, type = nil)
+    matches = []
+    start = 0
+    while matchdata = text.match(regexp)
+      pre   = matchdata.pre_match
+      post  = matchdata.post_match
+      match = matchdata[0]
+      if matchdata.captures.any?
+        capture = matchdata.captures.first
+        more_pre, more_post = match.split(/#{capture}/)
+        match = capture
+        pre << more_pre if more_pre
+        post = more_post << post if more_post
+      end
+      if match and not match.empty?
+        NamedEntity.annotate(match, start + pre.length, type)
+        matches << match
+      end
+      start += pre.length + match.length
+      text = post
     end
-    data = TSV.new(lexicon, options)
-    @index = {}
-    data.each{|code, names|
-      next if code.nil? || code == ""
-      names << code if names.empty?
-      if options[:stopwords].any?
-        names = names.select{|n|
-          ! options[:stopwords].include?(options[:case_insensitive] ? n.downcase : n)
-        }
+    matches
+  end
+  def self.match_regexp_list(text, regexp_list, type = nil)
+    matches = []
+    regexp_list.each do |regexp|
+      chunks = Segment.split(text, matches)
+      chunks.each do |chunk|
+        new_matches = match_regexp(chunk, regexp, type)
+        new_matches.each do |match| match.offset += chunk.offset; matches << match end
       end
-      @index[code] = RegExpNER.build_re(names, options[:case_insensitive])
-   }
+    end
+    matches
   end
+  def self.match_regexp_hash(text, regexp_hash)
+    matches = []
-  def self.build_re(names, ignorecase=true)
-    res = names.compact.reject{|n| n.empty? or n.length < 3}.
-      sort_by{|a| a.length }.reverse.collect{|n| Regexp.quote(n) }
+    regexp_hash.each do |type, regexp_list|
+      regexp_list = [regexp_list] unless Array === regexp_list
+      chunks = Segment.split(text, matches)
+      chunks.each do |chunk|
+        chunk_offset = chunk.offset
+        match_regexp_list(chunk, regexp_list, type).collect do |match|
+          match.offset += chunk_offset;
+          matches << match
+        end
+      end
+    end
-    return nil if res.empty?
+    matches
+  end
-    /\b(#{ res.join("|").gsub(/\\?\s/,'\s+') })\b/i
+  attr_accessor :regexps
+  def initialize(regexps = {})
+    @regexps = regexps.collect
   end
-  def self.match_re(text, res)
-    res = [res] unless Array === res
-    res.collect{|re|
-      text.scan(re)
-    }.flatten
+  def __define_regexp_hook(name, regexp, *args)
+    @regexps << [name, regexp]
   end
+  def define_regexp(*args, &block)
+    load_config("__define_regexp_hook", *args, &block)
+  end
-  def match_hash(text)
-    return {} if text.nil? or text.empty?
-    matches = {}
-    @index.each{|code, re|
-      next if re.nil?
-      RegExpNER.match_re(text, re).each{|match|
-         matches[code] ||= []
-         matches[code] << match
-      }
-    }
-    matches
+  def add_regexp(list = {})
+    @regexps.concat list.collect
   end
   def match(text)
-    match_hash(text)
+    matches = RegExpNER.match_regexp_hash(text, @regexps)
   end
 end