rbbt-text 0.2.1 → 0.5.0
- data/bin/get_ppis.rb +52 -0
- data/lib/rbbt/bow/dictionary.rb +9 -9
- data/lib/rbbt/bow/misc.rb +86 -2
- data/lib/rbbt/corpus/corpus.rb +55 -0
- data/lib/rbbt/corpus/document.rb +289 -0
- data/lib/rbbt/corpus/document_repo.rb +115 -0
- data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
- data/lib/rbbt/ner/NER.rb +7 -5
- data/lib/rbbt/ner/abner.rb +13 -2
- data/lib/rbbt/ner/annotations.rb +182 -51
- data/lib/rbbt/ner/annotations/annotated.rb +15 -0
- data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
- data/lib/rbbt/ner/annotations/relations.rb +25 -0
- data/lib/rbbt/ner/annotations/token.rb +28 -0
- data/lib/rbbt/ner/annotations/transformed.rb +170 -0
- data/lib/rbbt/ner/banner.rb +8 -5
- data/lib/rbbt/ner/chemical_tagger.rb +34 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
- data/lib/rbbt/ner/oscar3.rb +1 -1
- data/lib/rbbt/ner/oscar4.rb +41 -0
- data/lib/rbbt/ner/patterns.rb +132 -0
- data/lib/rbbt/ner/rnorm.rb +141 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
- data/lib/rbbt/ner/token_trieNER.rb +185 -51
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
- data/lib/rbbt/nlp/nlp.rb +235 -0
- data/share/install/software/ABNER +0 -4
- data/share/install/software/ChemicalTagger +81 -0
- data/share/install/software/Gdep +115 -0
- data/share/install/software/Geniass +118 -0
- data/share/install/software/OSCAR4 +16 -0
- data/share/install/software/StanfordParser +15 -0
- data/share/patterns/drug_induce_disease +22 -0
- data/share/rnorm/cue_default +10 -0
- data/share/rnorm/tokens_default +86 -0
- data/share/{stopwords → wordlists/stopwords} +0 -0
- data/test/rbbt/bow/test_bow.rb +1 -1
- data/test/rbbt/bow/test_dictionary.rb +1 -1
- data/test/rbbt/bow/test_misc.rb +1 -1
- data/test/rbbt/corpus/test_corpus.rb +99 -0
- data/test/rbbt/corpus/test_document.rb +222 -0
- data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
- data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
- data/test/rbbt/ner/test_abner.rb +1 -1
- data/test/rbbt/ner/test_annotations.rb +64 -2
- data/test/rbbt/ner/test_banner.rb +1 -1
- data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
- data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
- data/test/rbbt/ner/test_patterns.rb +66 -0
- data/test/rbbt/ner/test_regexpNER.rb +1 -1
- data/test/rbbt/ner/test_rnorm.rb +47 -0
- data/test/rbbt/ner/test_token_trieNER.rb +60 -35
- data/test/rbbt/nlp/test_nlp.rb +88 -0
- data/test/test_helper.rb +20 -0
- metadata +93 -20
data/lib/rbbt/ner/oscar3.rb CHANGED
@@ -6,7 +6,7 @@ require 'rbbt/ner/NER'
 require 'rbbt/util/log'

 class OSCAR3 < NER
-  Rbbt.
+  Rbbt.software.opt.OSCAR3.define_as_install Rbbt.share.install.software.OSCAR3.find

   @@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
   @@ProcessingDocumentFactory = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
data/lib/rbbt/ner/oscar4.rb ADDED
@@ -0,0 +1,41 @@
+require 'rbbt'
+require 'rjb'
+require 'libxml'
+require 'rbbt/ner/annotations'
+require 'rbbt/ner/NER'
+require 'rbbt/util/log'
+
+class OSCAR4 < NER
+  Rbbt.software.opt.OSCAR4.define_as_install Rbbt.share.install.software.OSCAR4.find
+
+  Rjb::load(nil, jvmargs = ['-Xms128m','-Xmx2048m'])
+  @@OSCAR = Rjb::import('uk.ac.cam.ch.wwmm.oscar.Oscar')
+
+  def self.match(text, type = nil, memm = false)
+
+    return [] if text.nil? or text.strip.empty?
+
+    oscar = @@OSCAR.new();
+    entities = oscar.findAndResolveNamedEntities(text);
+    it = entities.iterator
+
+    result = []
+
+    while it.hasNext
+      entity = it.next
+      mention = entity.getSurface
+      result << mention
+
+      NamedEntity.annotate mention, entity.getStart, entity.getType, nil, entity.getNamedEntity.getConfidence
+    end
+
+    result
+  end
+
+  def match(*args)
+    OSCAR4.match *args
+  end
+end
+
+
+
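For reference, a minimal usage sketch of the new OSCAR4 interface (the sentence is illustrative, and it assumes the OSCAR4 jars have been installed through the accompanying share/install/software/OSCAR4 recipe):

  require 'rbbt/ner/oscar4'

  text = "The samples were treated with 2-acetoxybenzoic acid in ethanol."
  OSCAR4.match(text).each do |mention|
    # each mention is a String extended as a NamedEntity, carrying the
    # offset, type and confidence passed to NamedEntity.annotate above
    puts mention
  end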
data/lib/rbbt/ner/patterns.rb ADDED
@@ -0,0 +1,132 @@
+require 'rbbt/ner/annotations/named_entity'
+require 'rbbt/ner/annotations/annotated'
+require 'rbbt/ner/annotations/transformed'
+require 'rbbt/ner/annotations/relations'
+require 'rbbt/ner/regexpNER'
+require 'rbbt/ner/token_trieNER'
+require 'rbbt/nlp/nlp'
+require 'stemmer'
+
+class PatternRelExt
+  def self.simple_pattern(sentence, patterns, type = nil)
+    patterns = Array === patterns ? patterns : [patterns]
+    type ||= "Simple Pattern"
+    regexpNER = RegExpNER.new type => patterns.collect{|p| /#{p}/}
+    Transformed.with_transform(sentence, sentence.annotations, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
+      regexpNER.entities(sentence)
+    end
+  end
+
+
+  def self.transform_key(key)
+    case
+    when key =~ /(.*)\[entity:(.*)\]/
+      chunk_type, chunk_value = $1, $2
+      annotation_types = chunk_value.split(",")
+      Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
+        ((Hash === chunk.annotations ? chunk.annotations.values.flatten : chunk.annotations).flatten.select{|a| NamedEntity === a}.collect{|a| a.type.to_s}.flatten & annotation_types).any? }
+
+    when key =~ /(.*)\[code:(.*)\]/
+      chunk_type, chunk_value = $1, $2
+      annotation_codes = chunk_value.split(",")
+      Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
+        ((Hash === chunk.annotations ? chunk.annotations.values.flatten : chunk.annotations).select{|a| NamedEntity === a}.collect{|a| a.code}.flatten & annotation_codes).any? }
+
+    when key =~ /(.*)\[stem:(.*)\]/
+      chunk_type, chunk_value = $1, $2
+      Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
+        chunk.split(/\s+/).select{|w| w.stem == chunk_value.stem}.any?}
+
+    when key =~ /(.*)\[(.*)\]/
+      chunk_type, chunk_value = $1, $2
+      Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
+        chunk.parts.values.select{|a| a == chunk_value}.any?}
+
+    else
+      key
+    end
+  end
+
+  def self.transform_index(index)
+    new = {}
+
+    index.each do |key,next_index|
+      if Hash === next_index
+        new_key = transform_key(key)
+        if Proc === new_key
+          new[:PROCS] ||= {}
+          new[:PROCS][new_key] = transform_index(next_index)
+        else
+          new[new_key] = transform_index(next_index)
+        end
+      else
+        new[transform_key(key)] = next_index
+      end
+    end
+
+    new
+  end
+
+  def self.prepare_chunk_patterns(token_trie, patterns, type = nil)
+    token_trie.merge(transform_index(TokenTrieNER.process({}, patterns)), type)
+  end
+
+  attr_accessor :token_trie, :type
+  def new_token_trie
+    @token_trie = TokenTrieNER.new({})
+  end
+
+  def token_trie
+    @token_trie || new_token_trie
+  end
+
+
+  def slack(slack)
+    @token_trie.slack = slack
+  end
+
+
+  def initialize(patterns, slack = nil, type = nil)
+    patterns = case
+               when (Hash === patterns or TSV === patterns)
+                 patterns
+               when Array === patterns
+                 {:Relation => patterns}
+               when String === patterns
+                 {:Relation => [patterns]}
+               end
+
+    @type = type
+
+    tokenized_patterns = {}
+
+    patterns.each do |key, values|
+      tokenized_patterns[key] = values.collect do |v|
+        Token.tokenize(v, /(NP\[[^\]]+\])|\s+/)
+      end
+    end
+
+    PatternRelExt.prepare_chunk_patterns(new_token_trie, tokenized_patterns, type)
+    token_trie.slack = slack || Proc.new{|t| t.type != 'O'}
+  end
+
+  def match_chunks(chunks)
+    token_trie.match(chunks).each do |match|
+      match.extend Relationship
+    end
+  end
+
+  def match_sentences(sentences)
+    sentence_chunks = NLP.gdep_chunk_sentences(sentences)
+
+    sentences.zip(sentence_chunks).collect do |sentence, chunks|
+      annotation_index = Segment.index(sentence.annotations)
+      chunks.each do |chunk|
+        Annotated.annotate(chunk, annotation_index[chunk.range])
+      end
+
+      match_chunks(chunks)
+    end
+  end
+
+end
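A hedged sketch of how the new PatternRelExt might be driven; the pattern string and the pre-annotated sentences are hypothetical, and the NP[entity:...] key syntax follows the transform_key cases above:

  require 'rbbt/ner/patterns'

  # Chunk-level pattern: NP chunks whose annotations include the given
  # entity types, joined by a literal verb token.
  relext = PatternRelExt.new("NP[entity:Compound] induces NP[entity:Disease]")

  # `sentences` must already carry NamedEntity annotations, and chunking
  # requires a working Gdep install (share/install/software/Gdep).
  relations = relext.match_sentences(sentences)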
data/lib/rbbt/ner/rnorm.rb ADDED
@@ -0,0 +1,141 @@
+require 'rbbt/ner/rnorm/cue_index'
+require 'rbbt/ner/rnorm/tokens'
+require 'rbbt/util/open'
+require 'rbbt/util/tsv'
+require 'rbbt/sources/entrez'
+require 'rbbt/bow/bow.rb'
+
+class Normalizer
+
+  # Given a list of candidate/score pairs in +values+ and a minimum
+  # value for the scores, returns the pairs of candidates that score
+  # the highest, provided they score above the minimum; otherwise it
+  # returns an empty list.
+  def self.get_best(values, min)
+    return [] if values.empty?
+    best = values.collect{|p| p[1]}.max
+    return [] if best < min
+    values.select{|p| p[1] == best}
+  end
+
+  # Compares the tokens and gives each candidate a score based on the
+  # commonalities and differences amongst the tokens.
+  def token_score(candidates, mention)
+    candidates.collect{|code|
+      next if @synonyms[code].nil?
+      value = @synonyms[code].select{|name| name =~ /\w/}.collect{|name|
+        case
+        when mention == name
+          100
+        when mention.downcase == name.downcase
+          90
+        when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
+          80
+        else
+          @tokens.evaluate(mention, name)
+        end
+      }.max
+      [code, value]
+    }.compact
+  end
+
+  # Scores candidates by the number of words their Entrez Gene entry
+  # has in common with the text passed as parameter. Because candidate
+  # genes might be in some format other than Entrez Gene ids, the
+  # +to_entrez+ parameter can hold the translation between them,
+  # either as a Proc or a Hash.
+  def entrez_score(candidates, text, to_entrez = nil)
+    code2entrez = {}
+    candidates.each{|code|
+      if to_entrez.is_a? Proc
+        entrez = to_entrez.call(code)
+      elsif to_entrez.is_a? Hash
+        entrez = to_entrez[code]
+      else
+        entrez = code
+      end
+      code2entrez[code] = entrez unless entrez.nil?
+    }
+
+    # Get all at once, better performance
+    genes = Entrez.get_gene(code2entrez.values)
+
+    code2entrez_genes = code2entrez.collect{|key, value| [key, genes[value]]}
+
+    code2entrez_genes.collect{|p|
+      [p[0], Entrez.gene_text_similarity(p[1], text)]
+    }
+  end
+
+  # Takes a list of candidate codes and selects the ones that have the
+  # mention explicitly in their list of synonyms, and in the earliest
+  # positions. This is based on the idea that synonym lists order their
+  # synonyms by importance.
+  def appearence_order(candidates, mention)
+    positions = candidates.collect{|code|
+      next unless @synonyms[code]
+      pos = nil
+      @synonyms[code].each_with_index{|list,i|
+        next if pos
+        pos = i if list.include? mention
+      }
+      pos
+    }
+    return nil if positions.compact.empty?
+    best = candidates.zip(positions).sort{|a,b| a[1] <=> b[1]}.first[1]
+    candidates.zip(positions).select{|p| p[1] == best}.collect{|p| p[0]}
+  end
+
+
+
+  def initialize(lexicon, options = {})
+    @synonyms = TSV.new(lexicon, :flat)
+
+    @index = CueIndex.new
+    @index.load(lexicon, options[:max_candidates])
+
+    @to_entrez = options[:to_entrez]
+    @tokens = Tokenizer.new(options[:file])
+  end
+
+  def match(mention)
+    @index.match(mention)
+  end
+
+  def select(candidates, mention, text = nil, options = {})
+    threshold = options[:threshold] || 0
+    max_candidates = options[:max_candidates] || 200
+    max_entrez = options[:max_entrez] || 10
+
+    # Abort if too ambiguous
+    return [] if candidates.empty?
+    return [] if candidates.length > max_candidates
+
+    scores = token_score(candidates, mention)
+    best_codes = Normalizer::get_best(scores, threshold).collect{|p| p[0]}
+
+    # Abort if too ambiguous
+    return [] if best_codes.length > max_entrez
+
+    if best_codes.length > 1 and text
+      scores = entrez_score(best_codes, text, @to_entrez)
+
+      Normalizer::get_best(scores, 0).collect{|p| p[0]}
+    else
+      orders = appearence_order(best_codes, mention)
+      if orders
+        orders
+      else
+        best_codes
+      end
+    end
+
+  end
+
+  def resolve(mention, text = nil, options = {})
+    candidates = match(mention)
+    select(candidates, mention, text, options)
+  end
+
+end
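A minimal sketch of the Normalizer entry point (lexicon.tsv is a hypothetical flat TSV mapping each code to its synonyms, most important first):

  require 'rbbt/ner/rnorm'

  normalizer = Normalizer.new("lexicon.tsv")

  # resolve first matches the mention against the cue indexes, then
  # ranks candidates by token score; the optional surrounding text
  # (hypothetical here) breaks ties via Entrez gene-text similarity.
  codes = normalizer.resolve("p53", document_text)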
data/lib/rbbt/ner/rnorm/cue_index.rb ADDED
@@ -0,0 +1,80 @@
+require 'rbbt-util'
+require 'rbbt/util/misc'
+require 'rbbt/util/simpleDSL'
+
+class CueIndex
+  include SimpleDSL
+
+  class LexiconMissingError < StandardError; end
+
+  def define(name, *args, &block)
+    @rules << [name,block]
+    nil
+  end
+
+  def initialize(file = nil, &block)
+    @rules = []
+
+    file ||= Rbbt.share.rnorm.cue_default.produce if !file && !block
+
+    load_config(:define, file, &block)
+  end
+
+  def config
+    @config[:define]
+  end
+
+
+  def cues(word)
+    @rules.collect{|rule|
+      c = rule[1].call(word)
+      c = [c] unless c.is_a? Array
+      c
+    }
+  end
+
+  def clean(max)
+    @indexes.each{|index|
+      remove = []
+      index.each{|key,values|
+        remove << key if values.length > max
+      }
+      remove.each{|key|
+        index.delete(key)
+      }
+    }
+  end
+
+  def load(file, max_candidates = 50)
+    @indexes = Array.new(@rules.size){Hash.new}
+    data = TSV.new(file, :flat)
+    data.each{|code, values|
+      values.each{|value|
+        cues(value).each_with_index{|cue_list,i|
+          cue_list.each{|cue|
+            @indexes[i][cue] ||= []
+            @indexes[i][cue] << code unless @indexes[i][cue].include? code
+          }
+        }
+      }
+    }
+    clean(max_candidates) if max_candidates
+    nil
+  end
+
+  def match(name)
+    raise LexiconMissingError, "Load Lexicon before matching" unless @indexes
+
+    cues = cues(name)
+    @indexes.each_with_index{|index,i|
+      best = []
+      cues[i].each{|cue|
+        best << index[cue] if index[cue]
+      }
+      return best.flatten if best.any?
+    }
+
+    return []
+  end
+
+end
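A sketch of building a CueIndex with inline rules instead of the default share/rnorm/cue_default file; the rule names are hypothetical, and it assumes SimpleDSL routes each call in the block to the define hook above, storing one rule per index level:

  require 'rbbt/ner/rnorm/cue_index'

  index = CueIndex.new do
    # Rules are tried in order; match returns hits from the first
    # index level that produces any.
    exact   { |word| [word] }
    relaxed { |word| [word.downcase.gsub(/[\s\-]/, '')] }
  end

  index.load("lexicon.tsv")  # flat TSV: code followed by its synonyms
  index.match("SOD-1")       # => candidate codes from the relaxed level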
data/lib/rbbt/ner/rnorm/tokens.rb ADDED
@@ -0,0 +1,218 @@
+require 'rbbt/util/simpleDSL'
+require 'rbbt/util/misc'
+require 'rbbt/bow/misc'
+require 'set'
+
+
+class Tokenizer
+  include SimpleDSL
+  #{{{ Classes for Comparisons
+
+  @@ignore_case = true
+
+  def self.ignore_case(ignore = nil)
+    if ignore.nil?
+      return @@ignore_case
+    else
+      @@ignore_case = ignore
+    end
+  end
+
+
+  class Operation
+
+    def initialize(comparison)
+      @comparison = comparison
+      @ignore_case = Tokenizer::ignore_case
+    end
+
+    def ignore_case(ignore = true)
+      @ignore_case = ignore
+      self
+    end
+
+    def method_missing(name, *args, &block)
+      @token = name.to_sym
+      @value = *args.first
+      self
+    end
+
+    def eval(list1, list2)
+      toks1 = list1.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+      toks2 = list2.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+
+      value = 0
+      case @comparison.to_s
+      when 'same'
+        if toks1 == toks2 && toks1.any?
+          value = @value
+        end
+      when 'diff'
+        if toks1 != toks2
+          value = @value
+        end
+      when 'common'
+        if toks1.to_set.intersection(toks2.to_set).length > 0
+          value = @value
+        end
+      when 'distinct'
+        if toks1.to_set.intersection(toks2.to_set).length == 0
+          value = @value
+        end
+      when 'miss'
+        missing = (toks1 - toks2)
+        if missing.length > 0
+          value = @value * missing.length
+        end
+      when 'extr'
+        extr = (toks2 - toks1)
+        if extr.length > 0
+          value = @value * extr.length
+        end
+      end
+
+      return value
+    end
+  end
+
+  class Custom
+    def initialize
+      @ignore_case = Tokenizer::ignore_case
+    end
+
+    def ignore_case(ignore = true)
+      @ignore_case = ignore
+      self
+    end
+
+    def method_missing(name, *args, &block)
+      @token = name.to_sym
+      @block = block
+    end
+
+    def eval(list1, list2)
+      toks1 = list1.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+      toks2 = list2.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+
+      @block.call(toks1, toks2)
+    end
+  end
+
+  class Transform
+    def initialize
+    end
+    def method_missing(name, *args, &block)
+      @token = name.to_sym
+      if block_given?
+        @block = block
+      else
+        @block = args.first
+      end
+      self
+    end
+
+    def transform(token)
+      if token[1] == @token
+        token = @block.call(token[0])
+      else
+        token
+      end
+    end
+  end
+
+
+  #{{{ Metaprogramming hooks
+  def define_tokens(name, *args, &block)
+    action = *args[0] || block || /#{name.to_s}s?/i
+    raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
+
+    @types[name.to_sym] = action
+    @order.push name.to_sym
+
+    name.to_sym
+  end
+
+  def define_comparisons(name, *args, &block)
+    o = nil
+    case name.to_sym
+    when :compare
+      o = Custom.new
+      @operations << o
+    when :transform
+      o = Transform.new
+      @transforms << o
+    else
+      o = Operation.new(name)
+      @operations << o
+    end
+    o
+  end
+
+  def main(name, *args, &block)
+    parse("define_" + name.to_s, block)
+  end
+
+  #{{{ Initialize
+  def initialize(file = nil, &block)
+    @types = {}
+    @order = []
+    @operations = []
+    @transforms = []
+
+    file ||= Rbbt.share.rnorm.tokens_default.produce if !file && !block
+    load_config :main, file, &block
+  end
+
+
+  #{{{ Token Types
+  GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
+  def tokenize(word)
+    return word.
+      gsub(/([^IVX])I$/,'\1|I|').       # Separate last roman numeral
+      gsub(/(\d+[,.]?\d+|\d+)/,'|\1|'). # Separate number
+      gsub(/([a-z])([A-Z])/,'\1-\2').
+      gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
+      gsub(/^(#{GREEK_RE})/,'\1-').
+      gsub(/(#{GREEK_RE})$/,'-\1').
+      split(/[^\w.]+/).                 # Split by separator char
+      select{|t| !t.empty? }
+  end
+
+
+  def type(token)
+    @order.each{|type|
+      action = @types[type]
+      if action.is_a? Proc
+        return type if action.call(token)
+      else
+        return type if action.match(token)
+      end
+    }
+    return :unknown
+  end
+
+  def token_types(word)
+    tokenize(word).collect{|token|
+      [token, type(token)]
+    }
+  end
+
+  #{{{ Comparisons
+
+  def evaluate_tokens(list1, list2)
+    @operations.inject(0){|acc, o|
+      acc + o.eval(list1, list2)
+    }
+  end
+
+  def evaluate(mention, name)
+    mention_tokens, name_tokens = [mention, name].collect{|n|
+      token_types(n).collect{|t|
+        @transforms.inject(t){|t,o|
+          t = o.transform(t)
+        }
+      }
+    }
+    evaluate_tokens(mention_tokens, name_tokens)
+  end
+end
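And a sketch of the Tokenizer scorer on its own, using the default token types and comparisons from share/rnorm/tokens_default (the words are illustrative):

  require 'rbbt/ner/rnorm/tokens'

  tokens = Tokenizer.new

  # token_types splits a word and tags each token with its first
  # matching type; evaluate sums the configured comparison operations
  # over the two token lists to score mention/synonym similarity.
  p tokens.token_types("IL-2 receptor")
  p tokens.evaluate("IL-2 receptor", "interleukin 2 receptor")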