rbbt 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88):
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,143 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/ner/rnorm/cue_index'
3
- require 'rbbt/ner/rnorm/tokens'
4
- require 'rbbt/util/index'
5
- require 'rbbt/util/open'
6
- require 'rbbt/sources/entrez'
7
- require 'rbbt/bow/bow.rb'
8
-
9
- class Normalizer
10
-
11
-
12
- # Given a list of pairs of candidates along with their scores as
13
- # parameter +values+, and a minimum value for the scores. It returns
14
- # a list of pairs of the candidates that score the highest and that
15
- # score above the minimum. Otherwise it return an empty list.
16
- def self.get_best(values, min)
17
- return [] if values.empty?
18
- best = values.collect{|p| p[1]}.max
19
- return [] if best < min
20
- values.select{|p| p[1] == best}
21
- end
22
-
23
- # Compares the tokens and gives each candidate a score based on the
24
- # commonalities and differences amongst the tokens.
25
- def token_score(candidates, mention)
26
- candidates.collect{|code|
27
- next if @synonyms[code].nil?
28
- value = @synonyms[code].select{|name| name =~ /\w/}.collect{|name|
29
- case
30
- when mention == name
31
- 100
32
- when mention.downcase == name.downcase
33
- 90
34
- when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
35
- 80
36
- else
37
- @tokens.evaluate(mention, name)
38
- end
39
- }.max
40
- [code, value]
41
- }.compact
42
- end
43
-
44
- # Order candidates with the number of words in common between the text
45
- # in their Entrez Gene entry and the text passed as parameter. Because
46
- # candidate genes might be in some other format than Entrez Gene Ids,
47
- # the +to_entrez+ variable can hold the way to translate between them,
48
- # been a Proc or a Hash.
49
- def entrez_score(candidates, text, to_entrez = nil)
50
- code2entrez = {}
51
- candidates.each{|code|
52
- if to_entrez.is_a? Proc
53
- entrez = to_entrez.call(code)
54
- elsif to_entrez.is_a? Hash
55
- entrez = @to_entrez[code]
56
- else
57
- entrez = code
58
- end
59
- code2entrez[code] = entrez unless entrez.nil?
60
- }
61
-
62
- # Get all at once, better performance
63
- genes = Entrez.get_gene(code2entrez.values)
64
-
65
- code2entrez_genes = code2entrez.collect{|key, value| [key, genes[value]]}
66
-
67
- code2entrez_genes.collect{|p|
68
- [p[0], Entrez.gene_text_similarity(p[1], text)]
69
- }
70
- end
71
-
72
- # Takes a list of candidate codes and selects the ones that have the
73
- # mention explicitly in their list of synonyms, and in the earliest
74
- # positions. This is based on the idea that synonym list order their
75
- # synonyms by importance.
76
- def appearence_order(candidates, mention)
77
- positions = candidates.collect{|code|
78
- next unless @synonyms[code]
79
- pos = nil
80
- @synonyms[code].each_with_index{|list,i|
81
- next if pos
82
- pos = i if list.include? mention
83
- }
84
- pos
85
- }
86
- return nil if positions.compact.empty?
87
- best = candidates.zip(positions).sort{|a,b| a[1] <=> b[1]}.first[1]
88
- candidates.zip(positions).select{|p| p[1] == best}.collect{|p| p[0]}
89
- end
90
-
91
-
92
-
93
- def initialize(lexicon, options = {})
94
- @synonyms = Open.to_hash(lexicon, :sep => "\t|\\|", :flatten => true)
95
-
96
- @index = CueIndex.new
97
- @index.load(lexicon, options[:max_candidates])
98
-
99
- @to_entrez = options[:to_entrez]
100
- @tokens = Tokenizer.new(options[:file])
101
- end
102
-
103
- def match(mention)
104
- @index.match(mention)
105
- end
106
-
107
- def select(candidates, mention, text = nil, options = {})
108
- threshold = options[:threshold] || 0
109
- max_candidates = options[:max_candidates] || 200
110
- max_entrez = options[:max_entrez] || 10
111
-
112
- # Abort if too ambigous
113
- return [] if candidates.empty?
114
- return [] if candidates.length > max_candidates
115
-
116
- scores = token_score(candidates, mention)
117
- best_codes = Normalizer::get_best(scores, threshold).collect{|p| p[0]}
118
-
119
- # Abort if too ambigous
120
- return [] if best_codes.length > max_entrez
121
-
122
- if best_codes.length > 1 and text
123
- scores = entrez_score(best_codes, text, @to_entrez)
124
-
125
- Normalizer::get_best(scores, 0).collect{|p| p[0]}
126
- else
127
- orders = appearence_order(best_codes, mention)
128
- if orders
129
- orders
130
- else
131
- best_codes
132
- end
133
- end
134
-
135
- end
136
-
137
- def resolve(mention, text = nil, options = {})
138
- candidates = match(mention)
139
- select(candidates, mention, text, options)
140
- end
141
-
142
- end
143
-
@@ -1,80 +0,0 @@
1
- require 'rbbt/util/misc'
2
- require 'rbbt/util/simpleDSL'
3
-
4
- class CueIndex < SimpleDSL
5
-
6
- class LexiconMissingError < StandardError; end
7
-
8
-
9
- def define(name, *args, &block)
10
- @rules << [name,block]
11
- nil
12
- end
13
-
14
- def initialize(file = nil, &block)
15
- @rules = []
16
-
17
- file ||= File.join(Rbbt.datadir,'norm/config/cue_default.rb') if !file && !block
18
-
19
- super(:define, file, &block)
20
- end
21
-
22
- def config
23
- @config[:define]
24
- end
25
-
26
-
27
- def cues(word)
28
- @rules.collect{|rule|
29
- c = rule[1].call(word)
30
- c = [c] unless c.is_a? Array
31
- c
32
- }
33
- end
34
-
35
- def clean(max)
36
- @indexes.each{|index|
37
- remove = []
38
- index.each{|key,values|
39
- remove << key if values.length > max
40
- }
41
- remove.each{|key|
42
- index.delete(key)
43
- }
44
- }
45
- end
46
-
47
- def load(file, max_candidates = 50)
48
- @indexes = Array.new(@rules.size){Hash.new}
49
- data = Open.to_hash(file, :sep => "\t|\\|")
50
- data.each{|code, values_lists|
51
- values = values_lists.flatten.compact.uniq
52
- values.each{|value|
53
- cues(value).each_with_index{|cue_list,i|
54
- cue_list.each{|cue|
55
- @indexes[i][cue] ||= []
56
- @indexes[i][cue] << code unless @indexes[i][cue].include? code
57
- }
58
- }
59
- }
60
- }
61
- clean(max_candidates) if max_candidates
62
- nil
63
- end
64
-
65
- def match(name)
66
- raise LexiconMissingError, "Load Lexicon before matching" unless @indexes
67
-
68
- cues = cues(name)
69
- @indexes.each_with_index{|index,i|
70
- best = []
71
- cues[i].each{|cue|
72
- best << index[cue] if index[cue]
73
- }
74
- return best.flatten if best.any?
75
- }
76
-
77
- return []
78
- end
79
-
80
- end
@@ -1,217 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/simpleDSL'
3
- require 'rbbt/util/misc'
4
- require 'set'
5
-
6
-
7
- class Tokenizer < SimpleDSL
8
- #{{{ Classes for Comparisons
9
-
10
- @@ignore_case = true
11
-
12
- def self.ignore_case(ignore = nil)
13
- if ignore.nil?
14
- return @@ignore_case
15
- else
16
- @@ignore_case = ignore
17
- end
18
- end
19
-
20
-
21
- class Operation
22
-
23
- def initialize(comparison)
24
- @comparison = comparison
25
- @ignore_case = Tokenizer::ignore_case
26
- end
27
-
28
- def ignore_case(ignore = true)
29
- @ignore_case = ignore
30
- self
31
- end
32
-
33
- def method_missing(name, *args, &bloc)
34
- @token = name.to_sym
35
- @value = *args.first
36
- self
37
- end
38
-
39
- def eval(list1, list2)
40
- toks1 = list1.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
41
- toks2 = list2.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
42
-
43
- value = 0
44
- case @comparison.to_s
45
- when 'same':
46
- if toks1 == toks2 && toks1.any?
47
- value = @value
48
- end
49
- when 'diff':
50
- if toks1 != toks2
51
- value = @value
52
- end
53
- when 'common':
54
- if toks1.to_set.intersection(toks2.to_set).length > 0
55
- value = @value
56
- end
57
- when 'distinct':
58
- if toks1.to_set.intersection(toks2.to_set).length == 0
59
- value = @value
60
- end
61
- when 'miss':
62
- missing = (toks1 - toks2)
63
- if missing.length > 0
64
- value = @value * missing.length
65
- end
66
- when 'extr':
67
- extr = (toks2 - toks1)
68
- if extr.length > 0
69
- value = @value * extr.length
70
- end
71
- end
72
-
73
- return value
74
- end
75
- end
76
-
77
- class Custom
78
- def initialize
79
- @ignore_case = Tokenizer::ignore_case
80
- end
81
-
82
- def ignore_case(ignore = true)
83
- @ignore_case = ignore
84
- self
85
- end
86
-
87
- def method_missing(name, *args, &block)
88
- @token = name.to_sym
89
- @block = block
90
- end
91
-
92
- def eval(list1, list2)
93
- toks1 = list1.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
94
- toks2 = list2.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
95
-
96
- @block.call(toks1, toks2)
97
- end
98
- end
99
-
100
- class Transform
101
- def initialize
102
- end
103
- def method_missing(name, *args, &block)
104
- @token = name.to_sym
105
- if block_given?
106
- @block = block
107
- else
108
- @block = args.first
109
- end
110
- self
111
- end
112
-
113
- def transform(token)
114
- if token[1] == @token
115
- token = @block.call(token[0])
116
- else
117
- token
118
- end
119
- end
120
- end
121
-
122
-
123
- #{{{ Metaprogramming hooks
124
- def define_tokens(name, *args, &block)
125
- action = *args[0] || block || /#{name.to_s}s?/i
126
- raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
127
-
128
- @types[name.to_sym] = action
129
- @order.push name.to_sym
130
-
131
- name.to_sym
132
- end
133
-
134
- def define_comparisons(name, *args, &block)
135
- o = nil
136
- case name.to_sym
137
- when :compare
138
- o = Custom.new
139
- @operations << o
140
- when :transform
141
- o = Transform.new
142
- @transforms << o
143
- else
144
- o = Operation.new(name)
145
- @operations << o
146
- end
147
- o
148
- end
149
-
150
- def main(name, *args, &block)
151
- parse("define_" + name.to_s,block)
152
- end
153
-
154
- #{{{ Initialize
155
- def initialize(file=nil, &block)
156
- @types = {}
157
- @order = []
158
- @operations = []
159
- @transforms = []
160
-
161
- file ||= File.join(Rbbt.datadir,'norm/config/tokens_default.rb') if !file && !block
162
- super(:main, file, &block)
163
- end
164
-
165
-
166
- #{{{ Token Types
167
- GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
168
- def tokenize(word)
169
- return word.
170
- gsub(/([^IVX])I$/,'\1|I|'). # Separate last roman number
171
- gsub(/(\d+[,.]?\d+|\d+)/,'|\1|'). # Separate number
172
- gsub(/([a-z])([A-Z])/,'\1-\2').
173
- gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
174
- gsub(/^(#{GREEK_RE})/,'\1-').
175
- gsub(/(#{GREEK_RE})$/,'-\1').
176
- split( /[^\w.]+/). # Split by separator char
177
- select{|t| !t.empty? }
178
- end
179
-
180
-
181
- def type(token)
182
- @order.each{|type|
183
- action = @types[type]
184
- if action.is_a? Proc
185
- return type if action.call(token)
186
- else
187
- return type if action.match(token)
188
- end
189
- }
190
- return :unknown
191
- end
192
-
193
- def token_types(word)
194
- tokenize(word).collect{|token|
195
- [token, type(token)]
196
- }
197
- end
198
-
199
- #{{{ Comparisons
200
-
201
- def evaluate_tokens(list1, list2)
202
- @operations.inject(0){| acc, o|
203
- acc + o.eval(list1, list2)
204
- }
205
- end
206
-
207
- def evaluate(mention, name)
208
- mention_tokens, name_tokens = [mention, name].collect{|n|
209
- token_types(n).collect{|t|
210
- @transforms.inject(t){|t,o|
211
- t = o.transform(t)
212
- }
213
- }
214
- }
215
- evaluate_tokens(mention_tokens, name_tokens)
216
- end
217
- end
@@ -1,75 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
-
4
-
5
- # Offers methods to help deal with the files distributed for the BioCreative
6
- # competition related to Gene Mention and Normalization.
7
- module Biocreative
8
-
9
- # Read the files regarding the dataset and return a hash with the entry codes
10
- # as keys and as values a hash with :text and the :mentions for that entry
11
- def self.BC2GM(dataset)
12
-
13
- data = {}
14
-
15
- Open.read(File.join(Rbbt.datadir,"biocreative/BC2GM/#{dataset}/#{dataset}.in")).each_line{|l|
16
- code, text = l.chomp.match(/(.*?) (.*)/).values_at(1,2)
17
- data[code] ={ :text => text }
18
- }
19
-
20
- Open.read(File.join(Rbbt.datadir,"biocreative/BC2GM/#{dataset}/GENE.eval")).each_line{|l|
21
- code, pos, mention = l.chomp.split(/\|/)
22
- data[code] ||= {}
23
- data[code][:mentions] ||= []
24
- data[code][:mentions].push(mention)
25
- }
26
-
27
-
28
- data
29
-
30
- end
31
-
32
- # Given a string of text and a string with a mention, return positions for
33
- # that mention in the format used in the evaluation.
34
- def self.position(text, mention)
35
-
36
- re = mention.gsub(/\W+/,' ')
37
- re = Regexp.quote(re)
38
- re = re.gsub(/\\ /,'\W*')
39
- re = '\(?' + re if mention =~ /\)/
40
- re = re + '\)?' if mention =~ /\(/
41
- re = "'?" + re + "'?" if mention =~ /'/
42
-
43
- positions = []
44
-
45
- offset = 0
46
- while text.match(/(.*?)(#{re})(.*)/s)
47
- pre, mention, post = text.match(/(.*?)(#{re})(.*)/s).values_at(1,2,3)
48
-
49
- start = offset + pre.gsub(/\s/,'').length
50
- last = offset + pre.gsub(/\s/,'').length + mention.gsub(/\s/,'').length - 1
51
-
52
- positions << [start, last]
53
-
54
- offset = last + 1
55
- text = post
56
- end
57
-
58
- return positions
59
- end
60
-
61
- # Run the evaluation perl script
62
- def self.BC2GM_eval(results, dataset, outfile)
63
-
64
-
65
- cmd = "/usr/bin/perl #{File.join(Rbbt.datadir, 'biocreative/BC2GM/alt_eval.perl')}\
66
- -gene #{File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}/GENE.eval")}\
67
- -altgene #{File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}/ALTGENE.eval")}\
68
- #{results} > #{outfile}"
69
- system cmd
70
-
71
- end
72
-
73
- end
74
-
75
-