RubyGems - rbbt - Versions diffs - 1.1.7 → 2.0.0 - Mend

rbbt 1.1.7 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

checksums.yaml +7 -0
data/README.rdoc +2 -138
metadata +72 -136
data/LICENSE +0 -20
data/bin/rbbt_config +0 -246
data/install_scripts/classifier/R/classify.R +0 -36
data/install_scripts/classifier/Rakefile +0 -145
data/install_scripts/get_abner.sh +0 -2
data/install_scripts/get_banner.sh +0 -25
data/install_scripts/get_biocreative.sh +0 -72
data/install_scripts/get_crf++.sh +0 -26
data/install_scripts/get_entrez.sh +0 -4
data/install_scripts/get_go.sh +0 -4
data/install_scripts/get_polysearch.sh +0 -8
data/install_scripts/ner/Rakefile +0 -206
data/install_scripts/ner/config/default.rb +0 -52
data/install_scripts/norm/Rakefile +0 -219
data/install_scripts/norm/config/cue_default.rb +0 -10
data/install_scripts/norm/config/tokens_default.rb +0 -79
data/install_scripts/norm/functions.sh +0 -23
data/install_scripts/organisms/Rakefile +0 -43
data/install_scripts/organisms/cgd.Rakefile +0 -84
data/install_scripts/organisms/human.Rakefile +0 -145
data/install_scripts/organisms/mgi.Rakefile +0 -77
data/install_scripts/organisms/pombe.Rakefile +0 -40
data/install_scripts/organisms/rake-include.rb +0 -258
data/install_scripts/organisms/rgd.Rakefile +0 -88
data/install_scripts/organisms/sgd.Rakefile +0 -66
data/install_scripts/organisms/tair.Rakefile +0 -54
data/install_scripts/organisms/worm.Rakefile +0 -109
data/install_scripts/wordlists/consonants +0 -897
data/install_scripts/wordlists/stopwords +0 -1
data/lib/rbbt.rb +0 -86
data/lib/rbbt/bow/bow.rb +0 -88
data/lib/rbbt/bow/classifier.rb +0 -116
data/lib/rbbt/bow/dictionary.rb +0 -187
data/lib/rbbt/ner/abner.rb +0 -34
data/lib/rbbt/ner/banner.rb +0 -73
data/lib/rbbt/ner/dictionaryNER.rb +0 -98
data/lib/rbbt/ner/regexpNER.rb +0 -70
data/lib/rbbt/ner/rner.rb +0 -227
data/lib/rbbt/ner/rnorm.rb +0 -143
data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
data/lib/rbbt/sources/biocreative.rb +0 -75
data/lib/rbbt/sources/biomart.rb +0 -105
data/lib/rbbt/sources/entrez.rb +0 -211
data/lib/rbbt/sources/go.rb +0 -40
data/lib/rbbt/sources/organism.rb +0 -245
data/lib/rbbt/sources/polysearch.rb +0 -117
data/lib/rbbt/sources/pubmed.rb +0 -111
data/lib/rbbt/util/arrayHash.rb +0 -255
data/lib/rbbt/util/filecache.rb +0 -72
data/lib/rbbt/util/index.rb +0 -47
data/lib/rbbt/util/misc.rb +0 -106
data/lib/rbbt/util/open.rb +0 -235
data/lib/rbbt/util/rake.rb +0 -183
data/lib/rbbt/util/simpleDSL.rb +0 -87
data/lib/rbbt/util/tmpfile.rb +0 -19
data/tasks/install.rake +0 -124

data/lib/rbbt/sources/biomart.rb DELETED

@@ -1,105 +0,0 @@
-require 'rbbt/util/open'
-require 'rbbt'
-# This module interacts with BioMart. It performs queries to BioMart and
-# synthesises a hash with the results. Note that this module connects to the
-# online BioMart WS using the Open in 'rbbt/util/open' module which offers
-# caching by default. To obtain up to date results you may need to clear the
-# cache from previous queries.
-module BioMart
-  class BioMart::QueryError < StandardError; end
-  private
-  @@biomart_query_xml = <<-EOT
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE Query>
-<Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
-<Dataset name = "<!--DATABASE-->" interface = "default" >
-<!--FILTERS-->
-<!--MAIN-->
-<!--ATTRIBUTES-->
-</Dataset>
-</Query>
-  EOT
-  def self.get(database, main, attrs = nil, filters = nil, data = nil)
-    attrs   ||= []
-    filters ||= ["with_#{main}"]
-    data    ||= {}
-    query = @@biomart_query_xml.clone
-    query.sub!(/<!--DATABASE-->/,database)
-    query.sub!(/<!--FILTERS-->/, filters.collect{|name| "<Filter name = \"#{ name }\" excluded = \"0\"/>"}.join("\n") )
-    query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
-    query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
-    response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '))
-    if response =~ /Query ERROR:/
-      raise BioMart::QueryError, response
-    end
-    response.each_line{|l|
-      parts = l.chomp.split(/\t/)
-      main = parts.shift
-      next if main.nil? || main.empty?
-      data[main] ||= {}
-      attrs.each{|name|
-        value = parts.shift
-        data[main][name] ||= []
-        next if value.nil?
-        data[main][name] << value
-      }
-    }
-    data
-  end
-  public
-  # This method performs a query in biomart for a datasets and a given set of
-  # attributes, there must be a main attribute that will be used as the key in
-  # the result hash, optionally there may be a list of additional attributes
-  # and filters. The data parameter at the end is used internally to
-  # incrementally building the result, due to a limitation of the BioMart WS
-  # that only allows 3 external arguments, users normally should leave it
-  # unspecified or nil. The result is a hash, where the keys are the different
-  # values for the main attribute, and the value is a hash with every other
-  # attribute as key, and as value and array with all possible values (Note
-  # that for a given value of the main attribute, there may be more than one
-  # value for another attribute). If filters is left a nil it adds a filter to
-  # the BioMart query to remove results with the main attribute empty, this may
-  # cause an error if the BioMart WS does not allow filtering with that
-  # attribute.
-  def self.query(database, main, attrs = nil, filters = nil, data = nil)
-    attrs   ||= []
-    data    ||= {}
-    chunks = []
-    chunk = []
-    attrs.each{|a|
-      chunk << a
-      if chunk.length == 2
-        chunks << chunk
-        chunk = []
-      end
-    }
-    chunks << chunk if chunk.any?
-    chunks.each{|chunk|
-      data = get(database,main,chunk, filters, data)
-    }
-    data
-  end
-end

data/lib/rbbt/sources/entrez.rb DELETED

@@ -1,211 +0,0 @@
-require 'rbbt'
-require 'rbbt/util/open'
-require 'rbbt/util/tmpfile'
-require 'rbbt/util/filecache'
-require 'rbbt/bow/bow.rb'
-require 'set'
-# This module is used to parse and extract information from the
-# gene_info file at Entrez Gene, as well as from the gene2pubmed file.
-# Both need to be downloaded and accessible for Rbbt, which is done as
-# part of a normal installation.
-module Entrez
-  class NoFileError < StandardError; end
-  # Given a taxonomy, or set of taxonomies, it returns an inverse hash,
-  # where each key is the entrez id of a gene, and the value is an array
-  # of possible synonyms in other databases. Is mostly used to translate
-  # entrez ids to the native database id of the organism. The parameter
-  # +native+ specifies the position of the key containing synonym, the
-  # fifth by default, +fix+ and +check+ are Procs used, if present, to
-  # pre-process lines and to check if they should be processed.
-  def self.entrez2native(taxs, native = nil, fix = nil, check = nil)
-    raise NoFileError, "Install the Entrez gene_info file" unless File.exists? File.join(Rbbt.datadir, 'dbs/entrez/gene_info')
-    native ||= 5
-    taxs = [taxs] unless taxs.is_a?(Array)
-    taxs = taxs.collect{|t| t.to_s}
-    lexicon = {}
-    tmp = TmpFile.tmp_file("entrez-")
-    system "cat '#{File.join(Rbbt.datadir, 'dbs/entrez/gene_info')}' |grep '^\\(#{taxs.join('\\|')}\\)[[:space:]]' > #{tmp}"
-    File.open(tmp).each{|l|
-      parts = l.chomp.split(/\t/)
-      next if parts[native] == '-'
-      entrez = parts[1]
-      parts[native].split(/\|/).each{|id|
-        id = fix.call(id) if fix
-        next if check && !check.call(id)
-        lexicon[entrez] ||= []
-        lexicon[entrez] << id
-      }
-    }
-    FileUtils.rm tmp
-    lexicon
-  end
-  # For a given taxonomy, or set of taxonomies, it returns a hash with
-  # genes as keys and arrays of related PubMed ids as values, as
-  # extracted from the gene2pubmed file from Entrez Gene.
-  def self.entrez2pubmed(taxs)
-    raise NoFileError, "Install the Entrez gene2pubmed file" unless File.exists? File.join(Rbbt.datadir, 'dbs/entrez/gene2pubmed')
-    taxs = [taxs] unless taxs.is_a?(Array)
-    taxs = taxs.collect{|t| t.to_s}
-    data = {}
-    tmp = TmpFile.tmp_file("entrez-")
-    system "cat '#{File.join(Rbbt.datadir, 'dbs/entrez/gene2pubmed')}' |grep '^\\(#{taxs.join('\\|')}\\)[[:space:]]' > #{tmp}"
-    data = Open.to_hash(tmp, :native => 1, :extra => 2).each{|code, value_lists| value_lists.flatten!}
-    FileUtils.rm tmp
-    data
-  end
-  # This class parses an xml containing the information for a particular
-  # gene as served by Entrez Gene, and hold some of its information.
-  class Gene
-    attr_reader :organism, :symbol, :description, :aka, :protnames, :summary, :comentaries
-    def initialize(xml)
-      return if xml.nil?
-      @organism    = xml.scan(/<Org-ref_taxname>(.*)<\/Org-ref_taxname>/s)
-      @symbol      = xml.scan(/<Gene-ref_locus>(.*)<\/Gene-ref_locus>/s)
-      @description = xml.scan(/<Gene-ref_desc>(.*)<\/Gene-ref_desc>/s)
-      @aka         = xml.scan(/<Gene-ref_syn_E>(.*)<\Gene-ref_syn_E>/s)
-      @protnames   = xml.scan(/<Prot-ref_name_E>(.*)<\/Prot-ref_name_E>/s)
-      @summary     = xml.scan(/<Entrezgene_summary>(.*)<\/Entrezgene_summary>/s)
-      @comentaries = xml.scan(/<Gene-commentary_text>(.*)<\/Gene-commentary_text>/s)
-    end
-    # Joins the text from symbol, description, aka, protnames, and
-    # summary
-    def text
-      #[@organism, @symbol, @description, @aka,  @protnames, @summary,@comentaries.join(". ")].join(". ")
-      [@symbol, @description, @aka,  @protnames, @summary].flatten.join(". ")
-    end
-  end
-  private
-  @@last = Time.now
-  @@entrez_lag = 1
-  def self.get_online(geneids)
-    geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
-    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"
-    diff = Time.now - @@last
-    sleep @@entrez_lag - diff unless diff > @@entrez_lag
-    xml = Open.read(url, :quiet => true, :nocache => true)
-    @@last = Time.now
-    genes = xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten
-    if geneids.is_a? Array
-      list = {}
-      genes.each_with_index{|gene,i|
-        #geneid  = gene.scan(/<Gene-track_geneid>(.*?)<\/Gene-track_geneid>/).flatten.first
-        geneid = geneids[i]
-        list[geneid ] = gene
-      }
-      return list
-    else
-      return genes.first
-    end
-  end
-  public
-  # Build a file name for a gene based on the id. Prefix the id by 'gene-',
-  # substitute the slashes with '_SLASH_', and add a '.xml' extension.
-  def self.gene_filename(id)
-    FileCache.clean_path('gene-' + id.to_s + '.xml')
-  end
-  # Returns a Gene object for the given Entrez Gene id. If an array of
-  # ids is given instead, a hash is returned. This method uses the
-  # caching facilities from Rbbt.
-  def self.get_gene(geneid)
-    return nil if geneid.nil?
-    if Array === geneid
-      missing = []
-      list = {}
-      geneid.each{|p|
-        next if p.nil?
-        filename = gene_filename p
-        if File.exists? FileCache.path(filename)
-          list[p] = Gene.new(Open.read(FileCache.path(filename)))
-        else
-          missing << p
-        end
-      }
-      return list unless missing.any?
-      genes = get_online(missing)
-      genes.each{|p, xml|
-        filename = gene_filename p
-        FileCache.add_file(filename,xml) unless File.exist? FileCache.path(filename)
-        list[p] =  Gene.new(xml)
-      }
-      return list
-    else
-      filename = gene_filename geneid
-      if File.exists? FileCache.path(filename)
-        return Gene.new(Open.read(FileCache.path(filename)))
-      else
-        xml = get_online(geneid)
-        FileCache.add_file(filename,xml)
-        return Gene.new(xml)
-      end
-    end
-  end
-  # Counts the words in common between a chunk of text and the text
-  # found in Entrez Gene for that particular gene. The +gene+ may be a
-  # gene identifier or a Gene class instance.
-  def self.gene_text_similarity(gene, text)
-    case
-    when Entrez::Gene === gene
-      gene_text = gene.text
-    when String === gene || Fixnum === gene
-      gene_text =  get_gene(gene).text
-    else
-      return 0
-    end
-    gene_words = gene_text.words.to_set
-    text_words = text.words.to_set
-    return 0 if gene_words.empty? || text_words.empty?
-    common = gene_words.intersection(text_words)
-    common.length / (gene_words.length + text_words.length).to_f
-  end
-end

data/lib/rbbt/sources/go.rb DELETED

@@ -1,40 +0,0 @@
-require 'rbbt'
-# This module holds helper methods to deal with the Gene Ontology files. Right
-# now all it does is provide a translation from id to the actual names.
-module GO
-  @@info = nil
-  # This method needs to be called before any translations can be made, it is
-  # called automatically the first time the id2name method is called. It loads
-  # the gene_ontology.obo file and extracts all the fields, although right now,
-  # only the name field is used.
-  def self.init
-    @@info = {}
-    File.open(File.join(Rbbt.datadir, 'dbs/go/gene_ontology.obo')).read.
-      split(/\[Term\]/).
-      each{|term|
-        term_info = {}
-        term.split(/\n/).
-          select{|l| l =~ /:/}.
-          each{|l|
-            key, value = l.chomp.match(/(.*?):(.*)/).values_at(1,2)
-            term_info[key.strip] = value.strip
-          }
-        @@info[term_info["id"]] = term_info
-      }
-  end
-  def self.id2name(id)
-    self.init unless @@info
-    if id.kind_of? Array
-      @@info.values_at(*id).collect{|i| i['name'] if i}
-    else
-      return "Name not found" unless @@info[id]
-      @@info[id]['name']
-    end
-  end
-end

data/lib/rbbt/sources/organism.rb DELETED

@@ -1,245 +0,0 @@
-require 'rbbt'
-require 'rbbt/util/open'
-require 'rbbt/util/index'
-# This module contains some Organism centric functionalities. Each organism is
-# identified by a keyword.
-module Organism
-  # Raised when trying to access information for an organism that has not been
-  # prepared already.
-  class OrganismNotProcessedError < StandardError; end
-  # Return the list of all supported organisms. The prepared flag is used to
-  # show only those that have been prepared.
-  def self.all(prepared = true)
-    if prepared
-      Dir.glob(File.join(Rbbt.datadir,'/organisms/') + '/*/identifiers').collect{|f| File.basename(File.dirname(f))}
-    else
-      Dir.glob(File.join(Rbbt.datadir,'/organisms/') + '/*').select{|f| File.directory? f}.collect{|f| File.basename(f)}
-    end
-  end
-  # Return the complete name of an organism. The org parameter is the organism
-  # keyword
-  def self.name(org)
-    raise OrganismNotProcessedError, "Missing 'name' file" if ! File.exists? File.join(Rbbt.datadir,"organisms/#{ org }/name")
-    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/name"))
-  end
-  # Hash linking all the organism log names with their keywords in Rbbt. Its
-  # the inverse of the name method.
-  NAME2ORG = {}
-  Organism::all.each{|org|
-    name = Organism.name(org).strip.downcase
-    NAME2ORG[name] = org
-  }
-  # Return the key word associated with an organism.
-  def self.name2org(name)
-    NAME2ORG[name.strip.downcase]
-  end
-  # FIXME: The NER related stuff is harder to install, thats why we hide the
-  # requires next to where they are needed, next to options
-  # Return a NER object which could be of RNER, Abner or Banner class, this is
-  # selected using the type parameter.
-  def self.ner(org, type=:rner, options = {})
-    case type.to_sym
-    when :abner
-      require 'rbbt/ner/abner'
-      return Abner.new
-    when :banner
-      require 'rbbt/ner/banner'
-      return Banner.new
-    when :rner
-      require 'rbbt/ner/rner'
-      model = options[:model]
-      model ||= File.join(Rbbt.datadir,"ner/model/#{ org }") if File.exist? File.join(Rbbt.datadir,"ner/model/#{ org }")
-      model ||= File.join(Rbbt.datadir,'ner/model/BC2')
-      return NER.new(model)
-    else
-      raise "Ner type (#{ type }) unknown"
-    end
-  end
-  # Return a normalization object.
-  def self.norm(org, to_entrez = nil)
-    require 'rbbt/ner/rnorm'
-    if to_entrez.nil?
-      to_entrez = id_index(org, :native => 'Entrez Gene ID', :other => [supported_ids(org).first])
-    end
-    token_file = File.join(Rbbt.datadir, 'norm','config',org.to_s + '.config')
-    if !File.exists? token_file
-      token_file = nil
-    end
-    Normalizer.new(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"), :to_entrez => to_entrez, :file => token_file, :max_candidates => 20)
-  end
-  # Returns a hash with the names associated with each gene id. The ids are
-  # in Rbbt native format for that organism.
-  def self.lexicon(org, options = {})
-    options = {:sep => "\t|\\|", :flatten => true}.merge(options)
-    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"),options)
-  end
-  # Returns a hash with the list of go terms for each gene id. Gene ids are in
-  # Rbbt native format for that organism.
-  def self.goterms(org)
-    goterms = {}
-    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/gene.go")).each_line{|l|
-      gene, go = l.chomp.split(/\t/)
-      goterms[gene.strip] ||= []
-      goterms[gene.strip] << go.strip
-    }
-    goterms
-  end
-  # Return list of PubMed ids associated to the organism. Determined using a
-  # PubMed query with the name of the organism
-  def self.literature(org)
-    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/all.pmid")).scan(/\d+/)
-  end
-  # Return hash that associates genes to a list of PubMed ids.
-  def self.gene_literature(org)
-    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene.pmid"), :flatten => true)
-  end
-  # Return hash that associates genes to a list of PubMed ids. Includes only
-  # those found to support GO term associations.
-  def self.gene_literature_go(org)
-    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene_go.pmid"), :flatten => true)
-  end
-  # Returns a list with the names of the id formats supported for an organism.
-  # If examples are produced, the list is of [format, example] pairs.
-  #
-  # *Options:*
-  #
-  # *examples:* Include example ids for each format
-  def self.supported_ids(org, options = {})
-    formats  = []
-    examples = [] if options[:examples]
-    i= 0
-    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers")).each_line{|l|
-      if i == 0
-        i += 1
-        next unless l=~/^\s*#/
-          formats  = Open.fields(l.sub(/^[\s#]+/,'')).collect{|n| n.strip}
-        return formats unless examples
-        next
-      end
-      if Open.fields(l).select{|name| name && name =~ /\w/}.length > examples.length
-        examples = Open.fields(l).collect{|name| name.split(/\|/).first}
-      end
-      i += 1
-    }
-    formats.zip(examples)
-  end
-  # Creates a hash where each possible id is associated with the names of the
-  # formats (its potentially possible for different formats to have the same
-  # id). This is used in the guessIdFormat method.
-  def self.id_formats(org)
-    id_types = {}
-    formats = supported_ids(org)
-    text = Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"))
-    if text.respond_to? :collect
-      lines = text.collect
-    else
-      lines = text.lines
-    end
-    lines.each{|l|
-      ids_per_type = Open.fields(l)
-      formats.zip(ids_per_type).each{|p|
-        format = p[0]
-        p[1] ||= ""
-        ids = p[1].split(/\|/)
-        ids.each{|id|
-          next if id.nil? || id == ""
-          id_types[id.downcase] ||= []
-          id_types[id.downcase] << format unless id_types[id.downcase].include? format
-        }
-      }
-    }
-    return id_types
-  end
-  def self.guessIdFormat(formats, query)
-    query = query.compact.collect{|gene| gene.downcase}.uniq
-    if String === formats
-      formats = id_formats(formats)
-    end
-    return nil if formats.values.empty?
-    values = formats.values_at(*query)
-    return nil if values.empty?
-    format_count = {}
-    values.compact.collect{|types| types.uniq}.flatten.each{|f|
-      format_count[f] ||= 0
-      format_count[f] += 1
-    }
-    return nil if format_count.values.empty?
-    format_count.select{|k,v| v > (query.length / 10)}.sort{|a,b| b[1] <=> a[1]}.first
-  end
-  def self.id_position(supported_ids, id_name, options = {})
-    pos = 0
-    supported_ids.each_with_index{|id, i|
-      if id.strip == id_name.strip || !options[:case_sensitive] && id.strip.downcase == id_name.strip.downcase
-        pos = i;
-      end
-    }
-    pos
-  end
-  def self.id_index(org, option = {})
-    native = option[:native]
-    other  = option[:other]
-    option[:case_sensitive] = false if option[:case_sensitive].nil?
-    if native.nil? and other.nil?
-      Index.index(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"), option)
-    else
-      supported = Organism.supported_ids(org)
-      first = nil
-      if native
-        first = id_position(supported,native,option)
-      else
-        first = 0
-      end
-      rest = nil
-      if other
-        rest = other.collect{|name| id_position(supported,name, option)}
-      else
-        rest = (0..supported.length - 1).to_a - [first]
-      end
-      option[:native] = first
-      option[:extra] = rest
-      index = Index.index(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"), option)
-      index
-    end
-  end
-end