RubyGems - rbbt - Versions diffs - 1.2.5 → 2.0.0 - Mend

rbbt 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (88) hide show

checksums.yaml +7 -0
data/README.rdoc +2 -138
metadata +69 -214
data/LICENSE +0 -20
data/bin/rbbt_config +0 -245
data/install_scripts/classifier/R/classify.R +0 -36
data/install_scripts/classifier/Rakefile +0 -140
data/install_scripts/get_abner.sh +0 -2
data/install_scripts/get_banner.sh +0 -25
data/install_scripts/get_biocreative.sh +0 -72
data/install_scripts/get_crf++.sh +0 -26
data/install_scripts/get_entrez.sh +0 -4
data/install_scripts/get_go.sh +0 -4
data/install_scripts/get_polysearch.sh +0 -8
data/install_scripts/ner/Rakefile +0 -206
data/install_scripts/ner/config/default.rb +0 -52
data/install_scripts/norm/Rakefile +0 -219
data/install_scripts/norm/config/cue_default.rb +0 -10
data/install_scripts/norm/config/tokens_default.rb +0 -86
data/install_scripts/norm/functions.sh +0 -23
data/install_scripts/organisms/Ath.Rakefile +0 -55
data/install_scripts/organisms/Cal.Rakefile +0 -84
data/install_scripts/organisms/Cel.Rakefile +0 -109
data/install_scripts/organisms/Hsa.Rakefile +0 -140
data/install_scripts/organisms/Mmu.Rakefile +0 -77
data/install_scripts/organisms/Rakefile +0 -43
data/install_scripts/organisms/Rno.Rakefile +0 -88
data/install_scripts/organisms/Sce.Rakefile +0 -66
data/install_scripts/organisms/Spo.Rakefile +0 -40
data/install_scripts/organisms/rake-include.rb +0 -252
data/install_scripts/wordlists/consonants +0 -897
data/install_scripts/wordlists/stopwords +0 -1
data/lib/rbbt.rb +0 -83
data/lib/rbbt/bow/bow.rb +0 -88
data/lib/rbbt/bow/classifier.rb +0 -116
data/lib/rbbt/bow/dictionary.rb +0 -187
data/lib/rbbt/ner/abner.rb +0 -34
data/lib/rbbt/ner/banner.rb +0 -73
data/lib/rbbt/ner/dictionaryNER.rb +0 -98
data/lib/rbbt/ner/regexpNER.rb +0 -70
data/lib/rbbt/ner/rner.rb +0 -227
data/lib/rbbt/ner/rnorm.rb +0 -143
data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
data/lib/rbbt/sources/biocreative.rb +0 -75
data/lib/rbbt/sources/biomart.rb +0 -105
data/lib/rbbt/sources/entrez.rb +0 -211
data/lib/rbbt/sources/go.rb +0 -85
data/lib/rbbt/sources/gscholar.rb +0 -74
data/lib/rbbt/sources/organism.rb +0 -241
data/lib/rbbt/sources/polysearch.rb +0 -117
data/lib/rbbt/sources/pubmed.rb +0 -248
data/lib/rbbt/util/arrayHash.rb +0 -266
data/lib/rbbt/util/filecache.rb +0 -72
data/lib/rbbt/util/index.rb +0 -47
data/lib/rbbt/util/misc.rb +0 -106
data/lib/rbbt/util/open.rb +0 -251
data/lib/rbbt/util/rake.rb +0 -183
data/lib/rbbt/util/simpleDSL.rb +0 -87
data/lib/rbbt/util/tmpfile.rb +0 -35
data/tasks/install.rake +0 -124
data/test/rbbt/bow/test_bow.rb +0 -33
data/test/rbbt/bow/test_classifier.rb +0 -72
data/test/rbbt/bow/test_dictionary.rb +0 -91
data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
data/test/rbbt/ner/test_abner.rb +0 -17
data/test/rbbt/ner/test_banner.rb +0 -17
data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
data/test/rbbt/ner/test_regexpNER.rb +0 -33
data/test/rbbt/ner/test_rner.rb +0 -126
data/test/rbbt/ner/test_rnorm.rb +0 -47
data/test/rbbt/sources/test_biocreative.rb +0 -38
data/test/rbbt/sources/test_biomart.rb +0 -31
data/test/rbbt/sources/test_entrez.rb +0 -49
data/test/rbbt/sources/test_go.rb +0 -24
data/test/rbbt/sources/test_organism.rb +0 -59
data/test/rbbt/sources/test_polysearch.rb +0 -27
data/test/rbbt/sources/test_pubmed.rb +0 -39
data/test/rbbt/util/test_arrayHash.rb +0 -257
data/test/rbbt/util/test_filecache.rb +0 -37
data/test/rbbt/util/test_index.rb +0 -31
data/test/rbbt/util/test_misc.rb +0 -20
data/test/rbbt/util/test_open.rb +0 -110
data/test/rbbt/util/test_simpleDSL.rb +0 -57
data/test/rbbt/util/test_tmpfile.rb +0 -21
data/test/test_helper.rb +0 -4
data/test/test_rbbt.rb +0 -11

data/lib/rbbt/util/arrayHash.rb DELETED

@@ -1,266 +0,0 @@
-class ArrayHash
-  def self.make_case_insensitive(hash)
-    new = {}
-    hash.each{|k,v|
-      new[k.to_s.downcase] = v
-    }
-    class << new; self; end.instance_eval{
-      alias_method :old_get, :[]
-      define_method(:[], proc{|key| old_get(key.to_s.downcase)})
-    }
-    new
-  end
-  # Take two strings of elements separated by the character sep_char and join them
-  # into one, removing repetitions.
-  def self.merge_values_string(list1, list2, sep_char ='|')
-    elem1 = list1.to_s.split(sep_char)
-    elem2 = list2.to_s.split(sep_char)
-    (elem1 + elem2).select{|e| e.to_s != ""}.uniq.join(sep_char)
-  end
-  # Merge two lists of elements. Elements could be strings of elements
-  # separated by the character sep_char, or arrays of lists of such strings.
-  def self.merge_values(list1, list2, sep_char = "|")
-    if String === list1 || String === list2
-      return merge_values_string(list1, list2)
-    end
-    if list1.nil?
-      list1 = [''] * list2.length
-    end
-    if list2.nil?
-      list2 = [''] * list1.length
-    end
-    new = []
-    list1.each_with_index{|elem, i|
-      new << merge_values_string(elem, list2[i], sep_char)
-    }
-    new
-  end
-  # Take an hash of arrays and a position and use the value at that position
-  # of the arrays to build a new hash with that value as key, and the original
-  # key prepended to the arrays. The options hash accepts the following keys
-  # :case_insensitive, which defaults to true, and :index, which indicates that
-  # the original key should be the value of the hash entry, instead of the
-  # complete array of values.
-  def self.pullout(hash, pos, options = {})
-    index = options[:index]; index = false if index.nil?
-    case_insensitive = options[:case_insensitive]; case_insensitive = true if case_insensitive.nil?
-    new = {}
-    hash.each{|key,values|
-      code = values[pos].to_s
-      next if code == ""
-      if index
-        list = key
-      else
-        list = [key] + values
-        list.delete_at(pos + 1)
-      end
-      code.split("|").each{|c|
-        c = c.downcase if case_insensitive
-        new[c] = merge_values(new[c], list)
-      }
-    }
-    new = make_case_insensitive new if case_insensitive
-    new
-  end
-  # Merge one hash of arrays into another. Each hash contains a number of fields for each
-  # entry. The pos1 and pos2 indicate what fields should be used to match
-  # entries, the values for pos1 and pos2 can be an integer indicating the
-  # position in the array or the symbol :main to refer to the key of the hash.
-  # The options hash accepts the key :case_insensitive, which defaults to true.
-  def self.merge(hash1, hash2, pos1 = :main, pos2 = :main, options = {})
-    case_insensitive = options[:case_insensitive]; case_insensitive = true if case_insensitive.nil?
-    raise "Key #{ pos1 } should be an Interger or :main" unless Fixnum === pos1 || pos1.to_s.downcase == 'main'
-    raise "Key #{ pos2 } should be an Interger or :main" unless Fixnum === pos2 || pos2.to_s.downcase == 'main'
-    # Pullout if pos2 is not :main
-    hash2 = pullout(hash2, pos2) unless pos2.to_s.downcase == 'main'
-    # Translate if pos1 is not :main
-    if pos1.to_s.downcase != 'main'
-      index = pullout(hash1, pos1, options.merge(:index => true))
-      new = {}
-      hash2.each do |key, list|
-        next unless index[key]
-        new[index[key]] = list
-      end
-      hash2 = new
-    end
-    # Get the lengths of the arrays on each hash (they should
-    # be the same for every entry)
-    length1 = hash1.values.first.length
-    length2 = hash2.values.first.length
-    if case_insensitive
-      hash1 = make_case_insensitive hash1
-      hash2 = make_case_insensitive hash2
-    end
-    new = {}
-    (hash1.keys + hash2.keys).uniq.each do |key|
-      if hash2[key].nil?
-        list2 = [''] * length2
-      else
-        list2 = hash2[key]
-      end
-      if hash1[key].nil?
-        list1 = [''] * length1
-      else
-        list1 = hash1[key]
-      end
-      new[key] = list1 + list2
-    end
-    new
-  end
-  # For a given hash of arrays, filter the position pos of each array with the
-  # block of code.
-  def self.process(hash, pos, &block)
-    new = {}
-    hash.each{|key, values|
-      v = values
-      v[pos] = v[pos].to_s.split("|").collect{|n| block.call(n)}.join("|")
-      new[key] = v
-    }
-    new
-  end
-  # Clean structure for repeated values. If the same value appears two times
-  # eliminate the one that appears latter on the values list (columns of the
-  # ArrayHash are assumed to be sorted for importance) if the appear on the
-  # same position, remove the one with the smaller vale of the code after
-  # turning it into integer.
-  def self.clean(hash, options = {})
-    case_sensitive = options[:case_sensitive]
-    found = {}
-    hash.each{|k, list|
-      list.each_with_index{|values,i|
-        (String === values ? values.split("|") : values).each{|v|
-          v = v.downcase if case_sensitive
-          if found[v].nil?
-            found[v] = [k,i]
-          else
-            last_k, last_i = found[v].values_at(0,1)
-            if last_i > i || (last_i == i && last_k.to_i > k.to_i)
-              found[v] = [k,i]
-            end
-          end
-        }
-      }
-    }
-    new_hash = {}
-    hash.each{|k,list|
-      new_list = []
-      list.each_with_index{|values,i|
-        new_values = []
-        (String === values ? values.split("|") : values).each{|v|
-          found_k, found_i = found[(case_sensitive ? v.downcase : v )].values_at(0,1)
-          if found_i == i && found_k == k
-            new_values << v
-          end
-        }
-        new_list << (String === values ? new_values.join("|") : values)
-      }
-      new_hash[k] = new_list
-    }
-    new_hash
-  end
-  attr_reader :main, :fields, :data
-  def initialize(hash, main, fields = nil)
-    @data = hash
-    @main = main.to_s
-    if fields.nil? || fields.empty?
-      l = hash.values.first.length
-      fields = []
-      l.times{|i| fields << "F#{i}"}
-    end
-    @fields = fields.collect{|f| f.to_s}
-  end
-  # Wrapper
-  def process(field, &block)
-    pos = self.field_pos(field)
-    @data = ArrayHash.process(self.data, pos, &block)
-    self
-  end
-  # Returns the position of a given field in the value arrays
-  def field_pos(field)
-    return :main if field == :main
-    if field.to_s.downcase == self.main.to_s.downcase
-      return :main
-    else
-      @fields.collect{|f| f.downcase }.index(field.to_s.downcase)
-    end
-  end
-  # Merge two ArrayHashes using the specified field
-  def merge(other, field = :main, options = {} )
-    field = self.main  if field == :main
-    pos1 = self.field_pos(field)
-    pos2 = other.field_pos(field)
-    raise "Field #{ field } not found in target hash" if pos1.nil?
-    raise "Field #{ field } not found in added hash" if pos2.nil?
-    new = ArrayHash.merge(self.data, other.data, pos1, pos2, options)
-    @data = new
-    if pos2 == :main
-      new_fields = other.fields
-    else
-      new_fields = other.fields
-      new_fields.delete_at(pos2)
-      new_fields.unshift(other.main)
-    end
-    @fields += new_fields
-    self
-  end
-  # Remove a field from the ArrayHash
-  def remove(field)
-    pos = self.field_pos(field)
-    return if pos.nil?
-    @data = self.data.each{|key,values| values.delete_at(pos)}
-    @fields.delete_at(pos)
-    self
-  end
-  def clean
-    @data = ArrayHash.clean(@data)
-    self
-  end
-end

data/lib/rbbt/util/filecache.rb DELETED

@@ -1,72 +0,0 @@
-require 'fileutils'
-require 'rbbt'
-# Provides caching functionality for files downloaded from the internet
-module FileCache
-  class BadPathError    < StandardError; end
-  class FileExistsError < StandardError; end
-  private
-  # Remove slash characters from filename.
-  def self.clean_path(filename)
-    filename.gsub(/\//,'_SLASH_')
-  end
-  # Check that the file name is safe and is in the correct format
-  def self.sanity_check(filename)
-    if filename =~ /\//
-      raise FileCache::BadPathError, "Character / not allowed in name: #{ filename }"
-    end
-    if filename !~ /.+\..+/
-      raise FileCache::BadPathError, "Filename '#{filename}' must have name and extension: name.ext"
-    end
-  end
-  public
-  # Find the path that a particular file would have in the cache
-  def self.path(filename)
-    sanity_check(filename)
-    name, extension = filename.match(/(.+)\.(.+)/).values_at(1,2)
-    dirs = name.scan(/./).reverse.values_at(0,1,2,3,4).reverse.compact.join('/')
-    return File.join(File.join(Rbbt.cachedir,dirs),filename)
-  end
-  # Add a file in the cache. Raise exception if exists, unless force is
-  # used.
-  def self.add_file(filename, content, options = {})
-    sanity_check(filename)
-    path = path(filename)
-    FileUtils.makedirs(File.dirname(path), :mode => 0777)
-    if File.exist?(path) and ! (options[:force] || options['force'])
-      raise FileCache::FileExistsError, "File #{filename} already in cache"
-    end
-    File.open(path,'w'){|f|
-      f.write(content)
-    }
-    FileUtils.chmod 0666, path
-    nil
-  end
-  # Removes the file from cache
-  def self.del_file(filename)
-    sanity_check(filename)
-    path = path(filename)
-    if File.exist? path
-      FileUtils.rm path
-    end
-    nil
-  end
-end

data/lib/rbbt/util/index.rb DELETED

@@ -1,47 +0,0 @@
-require 'rbbt/util/open'
-require 'rbbt/util/arrayHash'
-module Index
-  # Creates an inverse index. Takes a file with rows of elements
-  # separated by a given pattern (specified by +sep+) and returns a hash
-  # where each element points to the first element in the row. +lexicon+
-  # is the file containing the data.
-  def self.index(lexicon, options = {})
-    options = {:sep => "\t", :sep2 => '\|', :case_sensitive => true}.merge(options)
-    data = Open.to_hash(lexicon, options)
-    if options[:clean]
-      data = ArrayHash.clean(data)
-    end
-    index = {}
-    data.each{|code, id_lists|
-      next if code.nil? || code == ""
-      id_lists.flatten.compact.uniq.each{|id|
-        id = id.downcase unless options[:case_sensitive]
-        index[id] = code
-      }
-    }
-    data.each{|code, id_lists|
-      next if code.nil? || code == ""
-      id = code
-      id = id.downcase unless options[:case_sensitive]
-      index[id] = code
-    }
-    if !options[:case_sensitive]
-      class << index; self; end.instance_eval{
-        alias_method :old_get, :[]
-        define_method(:[], proc{|key| old_get(key.to_s.downcase)})
-        alias_method :old_values_at, :values_at
-        define_method(:values_at, proc{|*keys| old_values_at(*keys.collect{|key| key.to_s.downcase }) })
-      }
-    end
-    index
-  end
-end

data/lib/rbbt/util/misc.rb DELETED

@@ -1,106 +0,0 @@
-require 'rbbt'
-require 'rbbt/util/open'
-class String
-  CONSONANTS = []
-  if File.exists? File.join(Rbbt.datadir, 'wordlists/consonants')
-    Object::Open.read(File.join(Rbbt.datadir, 'wordlists/consonants')).each_line{|l| CONSONANTS << l.chomp}
-  end
-  # Uses heuristics to checks if a string seems like a special word, like a gene name.
-  def is_special?
-    # Only consonants
-    return true if self =~ /^[bcdfghjklmnpqrstvwxz]+$/i
-    # Not a word
-    return false if self =~ /[^\s]\s[^\s]/;
-    return false if self.length < 3;
-    # Alphanumeric
-    return true if self =~ /[0-9]/ &&  self =~ /[a-z]/i
-    # All Caps
-    return true if self =~ /[A-Z]{2,}/;
-    # Caps Mix
-    return true if self =~ /[a-z][A-Z]/;
-    # All consonants
-    return true if self =~ /^[a-z]$/i && self !~ /[aeiou]/i
-    # Dashed word
-    return true if self =~ /(^\w-|-\w$)/
-    # To many consonants (very heuristic)
-    if self =~ /([^aeiouy]{3,})/i && !CONSONANTS.include?($1.downcase)
-      return true
-    end
-    return false
-  end
-  # Turns the first letter to lowercase
-  def downcase_first
-    return "" if self == ""
-    letters = self.scan(/./)
-    letters[0].downcase!
-    letters.join("")
-  end
-  # Turns a roman number into arabic form is possible. Just simple
-  # romans only...
-  def arabic
-    return 1 if self =~ /^I$/;
-    return 2 if self =~ /^II$/;
-    return 3 if self =~ /^III$/;
-    return 4 if self =~ /^IV$/;
-    return 5 if self =~ /^V$/;
-    return 10 if self =~ /^X$/;
-    return nil
-  end
-end
-$greek = {
-    "alpha"   => "a",
-    "beta"    => "b",
-    "gamma"   => "g",
-    "delta"   => "d",
-    "epsilon" => "e",
-    "zeta"    => "z",
-    "eta"     => "e",
-    "theta"   => "th",
-    "iota"    => "i",
-    "kappa"   => "k",
-    "lambda"  => "l",
-    "mu"      => "m",
-    "nu"      => "n",
-    "xi"      => "x",
-    "omicron" => "o",
-    "pi"      => "p",
-    "rho"     => "r",
-    "sigma"   => "s",
-    "tau"     => "t",
-    "upsilon" => "u",
-    "phi"     => "ph",
-    "chi"     => "ch",
-    "psi"     => "ps",
-    "omega"   => "o"
-}
-$inverse_greek = Hash.new
-$greek.each{|l,s| $inverse_greek[s] = l }
-$stopwords = Open.read(File.join(Rbbt.datadir, 'wordlists/stopwords')).scan(/\w+/) if File.exists? File.join(Rbbt.datadir, 'wordlists/stopwords')
-class Array
-  # Divides the array into +num+ chunks of the same size by placing one
-  # element in each chunk iteratively.
-  def chunk(num)
-    chunks = []
-    each_with_index{|e, i|
-      c = i % num
-      chunks[c] ||=[]
-      chunks[c] << e
-    }
-    chunks
-  end
-end