rbbt 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. data/LICENSE +20 -0
  2. data/README.rdoc +17 -0
  3. data/bin/rbbt_config +180 -0
  4. data/install_scripts/classifier/R/classify.R +36 -0
  5. data/install_scripts/classifier/Rakefile +140 -0
  6. data/install_scripts/get_abner.sh +2 -0
  7. data/install_scripts/get_banner.sh +25 -0
  8. data/install_scripts/get_biocreative.sh +72 -0
  9. data/install_scripts/get_crf++.sh +26 -0
  10. data/install_scripts/get_entrez.sh +4 -0
  11. data/install_scripts/get_go.sh +4 -0
  12. data/install_scripts/get_polysearch.sh +8 -0
  13. data/install_scripts/ner/Rakefile +206 -0
  14. data/install_scripts/ner/config/default.rb +52 -0
  15. data/install_scripts/norm/Rakefile +218 -0
  16. data/install_scripts/norm/config/cue_default.rb +10 -0
  17. data/install_scripts/norm/config/tokens_default.rb +79 -0
  18. data/install_scripts/norm/functions.sh +21 -0
  19. data/install_scripts/organisms/Rakefile +25 -0
  20. data/install_scripts/organisms/cgd.Rakefile +84 -0
  21. data/install_scripts/organisms/human.Rakefile +145 -0
  22. data/install_scripts/organisms/mgi.Rakefile +77 -0
  23. data/install_scripts/organisms/pombe.Rakefile +40 -0
  24. data/install_scripts/organisms/rake-include.rb +258 -0
  25. data/install_scripts/organisms/rgd.Rakefile +88 -0
  26. data/install_scripts/organisms/sgd.Rakefile +66 -0
  27. data/install_scripts/organisms/tair.Rakefile +54 -0
  28. data/install_scripts/organisms/worm.Rakefile +109 -0
  29. data/install_scripts/stopwords +1 -0
  30. data/install_scripts/wordlists/consonants +897 -0
  31. data/install_scripts/wordlists/stopwords +1 -0
  32. data/lib/rbbt/bow/bow.rb +87 -0
  33. data/lib/rbbt/bow/classifier.rb +118 -0
  34. data/lib/rbbt/bow/dictionary.rb +218 -0
  35. data/lib/rbbt/ner/abner.rb +34 -0
  36. data/lib/rbbt/ner/banner.rb +73 -0
  37. data/lib/rbbt/ner/regexpNER.rb +62 -0
  38. data/lib/rbbt/ner/rner.rb +227 -0
  39. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  40. data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
  41. data/lib/rbbt/ner/rnorm.rb +142 -0
  42. data/lib/rbbt/sources/biocreative.rb +75 -0
  43. data/lib/rbbt/sources/biomart.rb +106 -0
  44. data/lib/rbbt/sources/entrez.rb +211 -0
  45. data/lib/rbbt/sources/go.rb +40 -0
  46. data/lib/rbbt/sources/organism.rb +197 -0
  47. data/lib/rbbt/sources/polysearch.rb +88 -0
  48. data/lib/rbbt/sources/pubmed.rb +111 -0
  49. data/lib/rbbt/util/arrayHash.rb +255 -0
  50. data/lib/rbbt/util/filecache.rb +72 -0
  51. data/lib/rbbt/util/index.rb +69 -0
  52. data/lib/rbbt/util/misc.rb +101 -0
  53. data/lib/rbbt/util/open.rb +207 -0
  54. data/lib/rbbt/util/simpleDSL.rb +87 -0
  55. data/lib/rbbt/util/tmpfile.rb +19 -0
  56. data/lib/rbbt/version.rb +10 -0
  57. data/lib/rbbt.rb +86 -0
  58. data/tasks/install.rake +123 -0
  59. metadata +114 -0
data/lib/rbbt/ner/rner.rb
@@ -0,0 +1,227 @@
+ require 'rbbt'
+ require 'rbbt/util/open'
+ require 'rbbt/util/misc'
+ require 'rbbt/util/simpleDSL'
+
+ class NERFeatures < SimpleDSL
+   def self.tokens(text)
+     text.scan(/
+       \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
+       \w-\w*|
+       \w+-[A-Z](?!\w)|
+       \w+|
+       [.,()\/\[\]{}'"+-]
+     /x)
+   end
+
+   def self.reverse(text)
+     tokens(text).reverse.join(" ")
+   end
+
+   def define(name, *args, &block)
+     action = *args[0] || block || /#{name.to_s}s?/i
+     raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
+
+     @types[name.to_s] = action
+     @order.push name.to_s
+
+     name.to_s
+   end
+
+   attr_accessor :reverse
+   def initialize(file = nil, reverse = false, &block)
+     @types = {}
+     @order = []
+     @context = []
+     @reverse = reverse
+
+     file ||= File.join(Rbbt.datadir,'ner/config/default.rb') if !file && !block
+
+     super(:define, file, &block)
+   end
+
+   def config
+     @config[:define]
+   end
+
+   def window(positions)
+     @window = positions
+   end
+
+   def context(name, &block)
+     if name.is_a? Array
+       @context += name
+     else
+       @context.push name
+
+       # The block might be wrongly assigned to this function
+       # instead of the actual definition, fix that.
+       if block
+         @types[name] = block
+       end
+     end
+   end
+
+   def direction(dir)
+     if dir.to_sym == :reverse
+       @reverse = true
+     end
+   end
+
+   def features(word)
+     values = [word]
+
+     @order.each{|features|
+       action = @types[features]
+       if action.is_a?(Proc)
+         values.push(action.call(word))
+       else
+         m = action.match(word)
+         if m
+           if m[1]
+             values.push(m[1])
+           else
+             values.push(m != nil)
+           end
+         else
+           values.push(false)
+         end
+       end
+     }
+     values
+   end
+
+   def template(window = nil)
+     window ||= @window || [1,-1]
+     template = ""
+
+     i = 1
+     @order.each{|feat|
+       template += "U#{ feat }: %x[0,#{ i }]\n"
+
+       if @context.include?(feat)
+         window.each{|p|
+           template += "U#{ feat }##{ p }: %x[#{ p },#{ i }]\n"
+         }
+       end
+       i += 1
+     }
+
+     template += "B\n"
+
+     template
+   end
+
+
+   def text_features(text, positive = nil)
+     text = self.class.reverse(text) if @reverse
+     initial = true
+     self.class.tokens(text).collect{|token|
+       features = features(token)
+       if !positive.nil?
+         features << (positive ? (initial ? 1 : 2) : 0)
+         initial = false
+       end
+       features
+     }
+   end
+
+   def tagged_features(text, mentions)
+     mentions ||= []
+     mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?
+     re = mentions.collect{|mention|
+       Regexp.quote(mention.gsub(/\s+/,' ')).sub(/\\s/,'\s+')
+     }.join("|")
+
+     positive = false
+     features = []
+     chunks = text.split(/(#{re})/)
+     chunks.each{|t|
+       chunk_features = text_features(t, positive)
+       positive = !positive
+       if @reverse
+         features = chunk_features + features
+       else
+         features = features + chunk_features
+       end
+     }
+     features
+   end
+
+   def train(features, model)
+     tmp_template = TmpFile.tmp_file("template-")
+     Open.write(tmp_template, template)
+
+     cmd = "#{File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')} '#{tmp_template}' '#{features}' '#{model}'"
+     system cmd
+     Open.write(model + '.config', config)
+     FileUtils.rm tmp_template
+   end
+
+ end
+
+ class NER
+
+   def initialize(model = nil)
+     begin
+       require 'CRFPP'
+     rescue Exception
+       require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
+     end
+
+     model ||= File.join(Rbbt.datadir, 'ner/model/BC2')
+
+     @parser = NERFeatures.new(model + '.config')
+     @reverse = @parser.reverse
+     @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
+   end
+
+   def extract(text)
+     features = @parser.text_features(text)
+
+     @tagger.clear
+     features.each{|feats|
+       @tagger.add(feats.join(" "))
+     }
+
+     @tagger.parse
+
+     found = []
+     mention = []
+
+     @tagger.size.times{|i|
+       label = @tagger.y(i)
+       word = @tagger.x(i,0)
+
+       if word == ')'
+         mention.push(')') if mention.join =~ /\(/
+         next
+       end
+
+       case label
+       when 1
+         if mention.any? && (mention.join(" ").is_special? || mention.select{|m| m.is_special?}.any?)
+           found.push(mention)
+           mention = []
+         end
+         mention.push(word)
+       when 2
+         mention.push(word)
+       when 0
+         found.push(mention) if mention.any?
+         mention = []
+       end
+     }
+
+     found << mention if mention.any?
+
+     found.collect{|list|
+       list = list.reverse if @reverse
+       list.join(" ")
+     }
+   end
+
+ end
+
+
+
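For orientation, here is a minimal usage sketch of the two classes above (not part of the released files). It assumes an installed rbbt data directory with CRF++ under third_party/ and a trained model at the default ner/model/BC2 path; the input sentence is illustrative:

    require 'rbbt/ner/rner'

    # Load the default model and its feature configuration
    ner = NER.new
    mentions = ner.extract("The expression of p53 and BRCA1 was measured.")
    # => an array of mention strings, e.g. ["p53", "BRCA1"], depending on the model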
data/lib/rbbt/ner/rnorm/cue_index.rb
@@ -0,0 +1,80 @@
+ require 'rbbt/util/misc'
+ require 'rbbt/util/simpleDSL'
+
+ class CueIndex < SimpleDSL
+
+   class LexiconMissingError < StandardError; end
+
+
+   def define(name, *args, &block)
+     @rules << [name, block]
+     nil
+   end
+
+   def initialize(file = nil, &block)
+     @rules = []
+
+     file ||= File.join(Rbbt.datadir,'norm/config/cue_default.rb') if !file && !block
+
+     super(:define, file, &block)
+   end
+
+   def config
+     @config[:define]
+   end
+
+
+   def cues(word)
+     @rules.collect{|rule|
+       c = rule[1].call(word)
+       c = [c] unless c.is_a? Array
+       c
+     }
+   end
+
+   def clean(max)
+     @indexes.each{|index|
+       remove = []
+       index.each{|key,values|
+         remove << key if values.length > max
+       }
+       remove.each{|key|
+         index.delete(key)
+       }
+     }
+   end
+
+   def load(file, max_candidates = 50)
+     @indexes = Array.new(@rules.size){Hash.new}
+     data = Open.to_hash(file, :sep => "\t|\\|")
+     data.each{|code, values_lists|
+       values = values_lists.flatten.compact.uniq
+       values.each{|value|
+         cues(value).each_with_index{|cue_list,i|
+           cue_list.each{|cue|
+             @indexes[i][cue] ||= []
+             @indexes[i][cue] << code unless @indexes[i][cue].include? code
+           }
+         }
+       }
+     }
+     clean(max_candidates) if max_candidates
+     nil
+   end
+
+   def match(name)
+     raise LexiconMissingError, "Load Lexicon before matching" unless @indexes
+
+     cues = cues(name)
+     @indexes.each_with_index{|index,i|
+       best = []
+       cues[i].each{|cue|
+         best << index[cue] if index[cue]
+       }
+       return best.flatten if best.any?
+     }
+
+     return []
+   end
+
+ end
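A short sketch of how CueIndex is driven (not part of the released files; the lexicon path is hypothetical). Per the :sep pattern passed to Open.to_hash, load expects a tab-separated lexicon whose value fields may additionally be '|'-separated:

    require 'rbbt/ner/rnorm/cue_index'

    index = CueIndex.new               # cue rules from norm/config/cue_default.rb
    index.load('/path/to/lexicon')     # builds one index per cue rule
    index.match('p53')                 # => codes from the first rule with hits, else []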
data/lib/rbbt/ner/rnorm/tokens.rb
@@ -0,0 +1,213 @@
+ require 'rbbt'
+ require 'rbbt/util/simpleDSL'
+ require 'rbbt/util/misc'
+ require 'set'
+
+
+ class Tokenizer < SimpleDSL
+   #{{{ Classes for Comparisons
+
+   @@ignore_case = true
+
+   def self.ignore_case(ignore = nil)
+     if ignore.nil?
+       return @@ignore_case
+     else
+       @@ignore_case = ignore
+     end
+   end
+
+
+   class Operation
+
+     def initialize(comparison)
+       @comparison = comparison
+       @ignore_case = Tokenizer::ignore_case
+     end
+
+     def ignore_case(ignore = true)
+       @ignore_case = ignore
+       self
+     end
+
+     def method_missing(name, *args, &bloc)
+       @token = name.to_sym
+       @value = *args.first
+       self
+     end
+
+     def eval(list1, list2)
+       toks1 = list1.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+       toks2 = list2.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+
+       value = 0
+       case @comparison.to_s
+       when 'same'
+         if toks1 == toks2 && toks1.any?
+           value = @value
+         end
+       when 'diff'
+         if toks1 != toks2
+           value = @value
+         end
+       when 'common'
+         if toks1.to_set.intersection(toks2.to_set).length > 0
+           value = @value
+         end
+       when 'distinct'
+         if toks1.to_set.intersection(toks2.to_set).length == 0
+           value = @value
+         end
+       when 'miss'
+         missing = (toks1 - toks2)
+         if missing.length > 0
+           value = @value * missing.length
+         end
+       when 'extr'
+         extr = (toks2 - toks1)
+         if extr.length > 0
+           value = @value * extr.length
+         end
+       end
+
+       return value
+     end
+   end
+
+   class Custom
+     def initialize
+       @ignore_case = Tokenizer::ignore_case
+     end
+
+     def ignore_case(ignore = true)
+       @ignore_case = ignore
+       self
+     end
+
+     def method_missing(name, *args, &block)
+       @token = name.to_sym
+       @block = block
+     end
+
+     def eval(list1, list2)
+       toks1 = list1.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+       toks2 = list2.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+
+       @block.call(toks1, toks2)
+     end
+   end
+
+   class Transform
+     def initialize
+     end
+     def method_missing(name, *args, &block)
+       @token = name.to_sym
+       @block = block
+       self
+     end
+
+     def transform(token)
+       if token[1] == @token
+         token = @block.call(token[0])
+       else
+         token
+       end
+     end
+   end
+
+
+   #{{{ Metaprogramming hooks
+   def define_tokens(name, *args, &block)
+     action = *args[0] || block || /#{name.to_s}s?/i
+     raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
+
+     @types[name.to_sym] = action
+     @order.push name.to_sym
+
+     name.to_sym
+   end
+
+   def define_comparisons(name, *args, &block)
+     o = nil
+     case name.to_sym
+     when :compare
+       o = Custom.new
+       @operations << o
+     when :transform
+       o = Transform.new
+       @transforms << o
+     else
+       o = Operation.new(name)
+       @operations << o
+     end
+     o
+   end
+
+   def main(name, *args, &block)
+     parse("define_" + name.to_s, block)
+   end
+
+   #{{{ Initialize
+   def initialize(file = nil, &block)
+     @types = {}
+     @order = []
+     @operations = []
+     @transforms = []
+
+     file ||= File.join(Rbbt.datadir,'norm/config/tokens_default.rb') if !file && !block
+     super(:main, file, &block)
+   end
+
+
+   #{{{ Token Types
+   GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
+   def tokenize(word)
+     return word.
+       gsub(/([^IVX])I$/,'\1|I|').         # Separate last roman number
+       gsub(/(\d+[,.]?\d+|\d+)/,'|\1|').   # Separate number
+       gsub(/([a-z])([A-Z])/,'\1-\2').
+       gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
+       gsub(/^(#{GREEK_RE})/,'\1-').
+       gsub(/(#{GREEK_RE})$/,'-\1').
+       split(/[^\w.]+/).                   # Split by separator char
+       select{|t| !t.empty? }
+   end
+
+
+   def type(token)
+     @order.each{|type|
+       action = @types[type]
+       if action.is_a? Proc
+         return type if action.call(token)
+       else
+         return type if action.match(token)
+       end
+     }
+     return :unknown
+   end
+
+   def token_types(word)
+     tokenize(word).collect{|token|
+       [token, type(token)]
+     }
+   end
+
+   #{{{ Comparisons
+
+   def evaluate_tokens(list1, list2)
+     @operations.inject(0){|acc, o|
+       acc + o.eval(list1, list2)
+     }
+   end
+
+   def evaluate(mention, name)
+     mention_tokens, name_tokens = [mention, name].collect{|n|
+       token_types(n).collect{|t|
+         @transforms.inject(t){|t,o|
+           t = o.transform(t)
+         }
+       }
+     }
+     evaluate_tokens(mention_tokens, name_tokens)
+   end
+ end
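A brief sketch of the Tokenizer API above (not part of the released files; the example strings are illustrative, and the resulting types and scores depend on the rules in norm/config/tokens_default.rb):

    require 'rbbt/ner/rnorm/tokens'

    tok = Tokenizer.new
    tok.token_types("IL-2R alpha")            # => [[token, type], ...] pairs
    tok.evaluate("IL-2R alpha", "IL2R alpha") # => summed score of the configured comparisons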
data/lib/rbbt/ner/rnorm.rb
@@ -0,0 +1,142 @@
+ require 'rbbt'
+ require 'rbbt/ner/rnorm/cue_index'
+ require 'rbbt/ner/rnorm/tokens'
+ require 'rbbt/util/index'
+ require 'rbbt/util/open'
+ require 'rbbt/sources/entrez'
+
+ class Normalizer
+
+
+   # Given a list of [candidate, score] pairs in +values+ and a minimum
+   # score +min+, returns the pairs that share the highest score, provided
+   # that score is above the minimum; otherwise returns an empty list.
+   def self.get_best(values, min)
+     return [] if values.empty?
+     best = values.collect{|p| p[1]}.max
+     return [] if best < min
+     values.select{|p| p[1] == best}
+   end
+
+   # Compares the tokens and gives each candidate a score based on the
+   # commonalities and differences among the tokens.
+   def token_score(candidates, mention)
+     candidates.collect{|code|
+       next if @synonyms[code].nil?
+       value = @synonyms[code].select{|name| name =~ /\w/}.collect{|name|
+         case
+         when mention == name
+           100
+         when mention.downcase == name.downcase
+           90
+         when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
+           80
+         else
+           @tokens.evaluate(mention, name)
+         end
+       }.max
+       [code, value]
+     }.compact
+   end
+
+   # Orders candidates by the number of words shared between the text of
+   # their Entrez Gene entry and the text passed as a parameter. Because
+   # candidate genes might use a format other than Entrez Gene ids, the
+   # +to_entrez+ parameter can hold the translation between them, either
+   # as a Proc or as a Hash.
+   def entrez_score(candidates, text, to_entrez = nil)
+     code2entrez = {}
+     candidates.each{|code|
+       if to_entrez.is_a? Proc
+         entrez = to_entrez.call(code)
+       elsif to_entrez.is_a? Hash
+         entrez = to_entrez[code]
+       else
+         entrez = code
+       end
+       code2entrez[code] = entrez unless entrez.nil?
+     }
+
+     # Get all at once, better performance
+     genes = Entrez.get_gene(code2entrez.values)
+     code2entrez_genes = code2entrez.collect{|p| [p[0], genes[p[1]]]}
+
+     code2entrez_genes.collect{|p|
+       [p[0], Entrez.gene_text_similarity(p[1], text)]
+     }
+   end
+
+   # Takes a list of candidate codes and selects the ones that contain the
+   # mention explicitly in their list of synonyms, at the earliest
+   # positions. This is based on the idea that synonym lists order their
+   # entries by importance.
+   def appearence_order(candidates, mention)
+     positions = candidates.collect{|code|
+       next unless @synonyms[code]
+       pos = nil
+       @synonyms[code].each_with_index{|list,i|
+         next if pos
+         pos = i if list.include? mention
+       }
+       pos
+     }
+     return nil if positions.compact.empty?
+     best = candidates.zip(positions).reject{|p| p[1].nil?}.sort{|a,b| a[1] <=> b[1]}.first[1] # nil positions cannot be sorted
+     candidates.zip(positions).select{|p| p[1] == best}.collect{|p| p[0]}
+   end
+
+
+
+   def initialize(lexicon, options = {})
+     @synonyms = Open.to_hash(lexicon, :sep => "\t|\\|", :flatten => true)
+
+     @index = CueIndex.new
+     @index.load(lexicon, options[:max_candidates])
+
+     @to_entrez = options[:to_entrez]
+     @tokens = Tokenizer.new(options[:file])
+   end
+
+   def match(mention)
+     @index.match(mention)
+   end
+
+   def select(candidates, mention, text = nil, options = {})
+     threshold = options[:threshold] || 0
+     max_candidates = options[:max_candidates] || 200
+     max_entrez = options[:max_entrez] || 10
+
+     # Abort if too ambiguous
+     return [] if candidates.empty?
+     return [] if candidates.length > max_candidates
+
+     scores = token_score(candidates, mention)
+     best_codes = Normalizer::get_best(scores, threshold).collect{|p| p[0]}
+
+     # Abort if too ambiguous
+     return [] if best_codes.length > max_entrez
+
+     if best_codes.length > 1 and text
+       scores = entrez_score(best_codes, text, @to_entrez)
+
+       Normalizer::get_best(scores, 0).collect{|p| p[0]}
+     else
+       orders = appearence_order(best_codes, mention)
+       if orders
+         orders
+       else
+         best_codes
+       end
+     end
+
+   end
+
+   def resolve(mention, text = nil, options = {})
+     candidates = match(mention)
+     select(candidates, mention, text, options)
+   end
+
+ end
+
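Finally, a minimal end-to-end sketch tying the pieces together (not part of the released files; the lexicon path is hypothetical, and some_abstract_text stands for any String of context, used only to break ties through Entrez):

    require 'rbbt/ner/rnorm'

    norm = Normalizer.new('/path/to/lexicon')
    norm.resolve('p53')                     # match + select with default options
    norm.resolve('p53', some_abstract_text) # ties broken by entrez_score
    # => candidate codes, or [] when the mention is unmatched or too ambiguous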