rbbt-sources 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/sources/bibtex.rb +130 -0
- data/lib/rbbt/sources/biomart.rb +104 -0
- data/lib/rbbt/sources/entrez.rb +145 -0
- data/lib/rbbt/sources/go.rb +84 -0
- data/lib/rbbt/sources/gscholar.rb +73 -0
- data/lib/rbbt/sources/organism.rb +9 -0
- data/lib/rbbt/sources/pubmed.rb +239 -0
- data/test/rbbt/sources/test_biomart.rb +30 -0
- data/test/rbbt/sources/test_entrez.rb +48 -0
- data/test/rbbt/sources/test_go.rb +24 -0
- data/test/rbbt/sources/test_pubmed.rb +39 -0
- data/test/test_helper.rb +4 -0
- metadata +123 -0
@@ -0,0 +1,130 @@
|
|
1
|
+
# In-memory representation of a BibTeX bibliography file: parse, edit and
# serialize entries.
class BibTexFile

  # One BibTeX entry: @type{name, field = {value}, ...}.
  class Entry

    # Standard fields, emitted first (in this order) by #to_s.
    FIELDS = %w(pmid title author journal pages number volume year abstract)

    FIELDS.each do |field|
      define_method(field, proc{@info[field]})
    end

    attr_reader :info, :fields, :name, :type

    # name:: entry key; type:: entry type (e.g. "article");
    # info:: hash of field name => value.
    def initialize(name, type, info)
      @name = name
      @type = type
      @info = info
      @fields = info.keys
    end

    # Dynamic field accessors: +entry.journal+ reads, +entry.journal = v+
    # writes. Writing is only allowed for FIELDS or fields parsed from the
    # original entry; anything else raises.
    def method_missing(name, *args)
      if name.to_s =~ /(.*)=$/
        if (FIELDS + @fields).include?($1.to_s)
          return @info[$1.to_s] = args[0].chomp
        else
          raise "No field named '#{ $1 }'"
        end
      else
        if @fields.include?(name.to_s)
          return @info[name.to_s]
        else
          raise "No field named '#{ name }'"
        end
      end
    end

    # Keep respond_to? consistent with method_missing (readers for parsed
    # fields, writers for FIELDS and parsed fields).
    def respond_to_missing?(name, include_private = false)
      base = name.to_s.sub(/=$/, '')
      (FIELDS + @fields).include?(base) || super
    end

    # Renders the entry back to BibTeX text. Standard FIELDS are printed
    # even when empty (preserving historic output); extra fields follow in
    # alphabetical order.
    def to_s
      str = "@#{type}{#{name},\n"

      FIELDS.each do |field|
        str += " #{field} = {#{@info[field]}},\n"
      end

      (fields - FIELDS).sort.each do |field|
        str += " #{field} = {#{@info[field]}},\n"
      end

      str += "}"

      str
    end
  end

  # Removes the braces BibTeX uses to protect capitalization.
  def self.clean_string(string)
    string.gsub(/[{}]/,'')
  end

  # Splits a BibTeX document into individual entry strings; an entry runs
  # from '@type{' up to a closing '}' at the start of a line.
  def self.parse_bibtex(bibtex)
    bibtex.scan(/@\w+\{.*?^\}\s*/m)
  end

  # Parses one entry string. Returns [type, name, info] where info is a
  # hash of field name => value (braces and trailing commas stripped).
  def self.parse_entry(entry)
    info = {}

    type, name = entry.match(/@([^\s]+)\{([^\s]+)\s*,/).values_at(1,2)

    entry.scan(/\s*(.*?)\s*=\s*\{?\s*(.*?)\s*\}?\s*,?\s*$/).each do |pair|
      info[pair.first.chomp] = pair.last.chomp
    end

    [ type.chomp, name.chomp, info]
  end

  # Loads entries from an IO, a filename, or a raw BibTeX string, returning
  # a hash of entry name => Entry.
  #
  # Fixes over the previous version: the IO case is checked first
  # (File.exists? raises TypeError when handed an IO), and File.read is
  # used instead of File.open(...).read, which leaked the file handle.
  def self.load_file(file)
    entries = {}

    case
    when IO === file
      self.parse_bibtex file.read
    when String === file
      if File.exist?(file)
        self.parse_bibtex File.read(file)
      else
        self.parse_bibtex file
      end
    else
      raise "Input format not recognized"
    end.each do |entry|
      type, name, info = self.parse_entry entry
      entries[name] = Entry.new name, type, info
    end

    entries
  end

  def initialize(file)
    @entries = BibTexFile.load_file(file)
  end

  # Writes every entry to +file+, newest year first, ties broken by entry
  # name, separated by blank lines.
  def save(file)
    text = entries.collect{|e| entry e }.sort{|a,b|
      if a.year.to_i != b.year.to_i
        a.year.to_i <=> b.year.to_i
      else
        a.name <=> b.name
      end
    }.reverse.collect do |entry|
      entry.to_s
    end * "\n"

    File.open(file, 'w') do |fout| fout.puts text end
  end

  # Parses a raw BibTeX entry string and registers it under its name.
  def add(bibtex)
    type, name, info = BibTexFile.parse_entry bibtex
    @entries[name] = BibTexFile::Entry.new name, type, info
  end

  # Names of all loaded entries.
  def entries
    @entries.keys
  end

  # The Entry registered under +bibentry+, or nil.
  def entry(bibentry)
    @entries[bibentry]
  end
end
|
124
|
+
|
125
|
+
if __FILE__ == $0
  # Ad-hoc smoke test: load a local bibliography, print one known entry,
  # then write the whole collection back out.
  bibliography = BibTexFile.new('/home/miki/git/DrugReview/drug.bib')
  puts bibliography.entry("yao2009novel").to_s
  bibliography.save('foo.bib')
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/util/open'
|
3
|
+
|
4
|
+
# This module interacts with BioMart. It performs queries to BioMart and
|
5
|
+
# synthesises a hash with the results. Note that this module connects to the
|
6
|
+
# online BioMart WS using the Open in 'rbbt/util/open' module which offers
|
7
|
+
# caching by default. To obtain up to date results you may need to clear the
|
8
|
+
# cache from previous queries.
|
9
|
+
module BioMart

  class BioMart::QueryError < StandardError; end
  private

  # XML template for the BioMart web service; the <!--...--> placeholders
  # are substituted per query.
  @@biomart_query_xml = <<-EOT
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "<!--DATABASE-->" interface = "default" >
<!--FILTERS-->
<!--MAIN-->
<!--ATTRIBUTES-->
</Dataset>
</Query>
  EOT

  # Performs a single BioMart request for +main+ plus the given +attrs+,
  # accumulating the TSV response into +data+ (hash keyed by the values of
  # +main+; each value is a hash attr => array of values). Raises
  # BioMart::QueryError when the web service reports an error. Used
  # internally by .query, which splits attributes into chunks.
  def self.get(database, main, attrs = nil, filters = nil, data = nil, options = {})
    attrs   ||= []
    filters ||= ["with_#{main}"]
    data    ||= {}

    xml = @@biomart_query_xml.dup
    xml.sub!(/<!--DATABASE-->/, database)
    xml.sub!(/<!--FILTERS-->/, filters.collect{|name| "<Filter name = \"#{ name }\" excluded = \"0\"/>"}.join("\n"))
    xml.sub!(/<!--MAIN-->/, "<Attribute name = \"#{main}\" />")
    xml.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n"))

    response = Open.read('http://www.biomart.org/biomart/martservice?query=' + xml.gsub(/\n/, ' '), options)

    raise BioMart::QueryError, response if response =~ /Query ERROR:/

    response.each_line do |line|
      values = line.chomp.split(/\t/)
      key = values.shift
      next if key.nil? || key.empty?

      data[key] ||= {}
      attrs.each do |attr|
        value = values.shift
        data[key][attr] ||= []
        data[key][attr] << value unless value.nil?
      end
    end

    data
  end

  public

  # This method performs a query in biomart for a dataset and a given set
  # of attributes. There must be a main attribute, used as the key of the
  # result hash; optionally a list of additional attributes and filters.
  # The +data+ parameter is used internally to build the result
  # incrementally (the BioMart WS only allows a few external attributes per
  # request), so callers normally leave it nil. The result maps each value
  # of the main attribute to a hash of attribute => array of values (one
  # main value may have several values per attribute). If +filters+ is nil
  # a "with_<main>" filter is added to drop rows with an empty main
  # attribute, which may fail on datasets that do not support it.
  def self.query(database, main, attrs = nil, filters = nil, data = nil, options = {})
    attrs ||= []
    data  ||= {}

    # Request attributes two at a time; each call folds into +data+.
    attrs.each_slice(2) do |pair|
      data = get(database, main, pair, filters, data, options)
    end

    data
  end

end
|
104
|
+
|
@@ -0,0 +1,145 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/util/tsv'
|
3
|
+
require 'rbbt/bow/bow'
|
4
|
+
require 'set'
|
5
|
+
|
6
|
+
# Access to NCBI Entrez Gene: local datafile lookups (gene_info,
# gene2pubmed) and online retrieval of gene XML records, with FileCache
# caching.
module Entrez

  Rbbt.add_datafiles "gene_info" => ['databases/entrez', 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'],
    "gene2pubmed" => ["databases/entrez", "ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz" ]

  # TSV mapping Entrez Gene ids to native (organism specific) ids for the
  # given taxonomy id(s), read from the gene_info datafile.
  def self.entrez2native(taxs, options = {})
    options = Misc.add_defaults options, :native => 1, :extra => 5, :flatten => true, :persistence => true

    taxs = [taxs] unless Array === taxs
    options.merge! :grep => taxs

    TSV.new(Rbbt.find_datafile('gene_info'), options)
  end

  # TSV mapping Entrez Gene ids to PubMed ids for the given taxonomy
  # id(s), read from the gene2pubmed datafile.
  def self.entrez2pubmed(taxs)
    options = {:native => 1, :extra => 2, :flatten => true, :persistence => true}

    taxs = [taxs] unless taxs.is_a?(Array)
    taxs = taxs.collect{|t| t.to_s}
    options.merge! :grep => taxs

    TSV.new(Rbbt.find_datafile('gene2pubmed'), options)
  end

  # Lightweight view over an Entrez Gene XML record, extracted with
  # regular expressions rather than a full XML parse.
  class Gene
    attr_reader :organism, :symbol, :description, :aka, :protnames, :summary, :comentaries

    def initialize(xml)
      return if xml.nil?

      @organism    = xml.scan(/<Org-ref_taxname>(.*)<\/Org-ref_taxname>/s)
      @symbol      = xml.scan(/<Gene-ref_locus>(.*)<\/Gene-ref_locus>/s)
      @description = xml.scan(/<Gene-ref_desc>(.*)<\/Gene-ref_desc>/s)
      # Fix: the closing tag was written '<\Gene-ref_syn_E>' (missing the
      # slash), so synonyms were never captured.
      @aka         = xml.scan(/<Gene-ref_syn_E>(.*)<\/Gene-ref_syn_E>/s)
      @protnames   = xml.scan(/<Prot-ref_name_E>(.*)<\/Prot-ref_name_E>/s)
      @summary     = xml.scan(/<Entrezgene_summary>(.*)<\/Entrezgene_summary>/s)
      @comentaries = xml.scan(/<Gene-commentary_text>(.*)<\/Gene-commentary_text>/s)
    end

    # Joins the text from symbol, description, aka, protnames, and
    # summary
    def text
      [@symbol, @description, @aka, @protnames, @summary].flatten.join(". ")
    end
  end

  private

  # Downloads gene XML from NCBI efetch. For an Array input returns a hash
  # geneid => xml; otherwise the single record's xml.
  # NOTE(review): the Array branch pairs records with ids by position,
  # which assumes efetch returns exactly one record per requested id, in
  # request order — confirm against the efetch behavior for missing ids.
  def self.get_online(geneids)
    geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"

    xml = Open.read(url, :wget_options => {:quiet => true}, :nocache => true)

    genes = xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten

    if geneids.is_a? Array
      list = {}
      genes.each_with_index{|gene,i|
        geneid = geneids[i]
        list[geneid] = gene
      }
      return list
    else
      return genes.first
    end
  end

  public

  # Cache filename used in the FileCache for a gene id.
  def self.gene_filename(id)
    'gene-' + id.to_s + '.xml'
  end

  # Returns a Gene for +geneid+ (or a hash geneid => Gene for Array
  # input), consulting the FileCache before downloading.
  def self.get_gene(geneid)
    return nil if geneid.nil?

    if Array === geneid
      missing = []
      list = {}

      geneid.each{|p|
        next if p.nil?
        if FileCache.found(gene_filename p)
          list[p] = Gene.new(Open.read(FileCache.path(gene_filename p)))
        else
          missing << p
        end
      }

      return list unless missing.any?
      genes = get_online(missing)

      genes.each{|p, xml|
        filename = gene_filename p
        FileCache.add(filename, xml) unless FileCache.found(filename)
        list[p] = Gene.new(xml)
      }

      return list
    else
      filename = gene_filename geneid

      if FileCache.found(filename)
        return Gene.new(Open.read(FileCache.path(filename)))
      else
        xml = get_online(geneid)
        FileCache.add(filename, xml) unless FileCache.found(filename)

        return Gene.new(xml)
      end
    end
  end

  # Counts the words in common between a chunk of text and the text
  # found in Entrez Gene for that particular gene. The +gene+ may be a
  # gene identifier or a Gene class instance. Returns 0 for unusable
  # input or empty texts; otherwise |common| / (|gene| + |text|).
  def self.gene_text_similarity(gene, text)

    case
    when Entrez::Gene === gene
      gene_text = gene.text
    # Integer replaces the removed Fixnum class (covers Bignum too, so
    # this is backward compatible).
    when String === gene || Integer === gene
      gene_text = get_gene(gene).text
    else
      return 0
    end

    gene_words = gene_text.words.to_set
    text_words = text.words.to_set

    return 0 if gene_words.empty? || text_words.empty?

    common = gene_words.intersection(text_words)
    common.length / (gene_words.length + text_words.length).to_f
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
|
3
|
+
# This module holds helper methods to deal with the Gene Ontology files. Right
|
4
|
+
# now all it does is provide a translation form id to the actual names.
|
5
|
+
# This module holds helper methods to deal with the Gene Ontology files.
# Right now all it does is provide a translation from id to the actual
# names, namespaces and is_a ancestors.
module GO

  @@info = nil
  # Fields that may appear several times per [Term] stanza and are
  # therefore collected into arrays.
  MULTIPLE_VALUE_FIELDS = %w(is_a)

  # This method needs to be called before any translations can be made; it
  # is called automatically the first time any accessor below is used. It
  # loads the gene_ontology.obo file and caches every [Term] stanza's
  # key/value pairs in @@info, keyed by GO id.
  def self.init
    @@info = {}
    # File.read avoids the file-handle leak of File.open(...).read.
    File.read(File.join(Rbbt.datadir, 'dbs/go/gene_ontology.obo')).
      split(/\[Term\]/).
      each{|term|
        term_info = {}
        term.split(/\n/).
          select{|l| l =~ /:/}.
          each{|l|
            key, value = l.chomp.match(/(.*?):(.*)/).values_at(1,2)
            if MULTIPLE_VALUE_FIELDS.include? key.strip
              term_info[key.strip] ||= []
              term_info[key.strip] << value.strip
            else
              term_info[key.strip] = value.strip
            end
          }
        @@info[term_info["id"]] = term_info
      }
  end

  # The complete id => stanza-hash cache.
  def self.info
    self.init unless @@info
    @@info
  end

  # All known GO term ids.
  def self.goterms
    self.init unless @@info
    @@info.keys
  end

  # Translates a GO id (or array of ids) to term name(s); nil for unknown
  # ids.
  def self.id2name(id)
    self.init unless @@info
    if id.kind_of? Array
      @@info.values_at(*id).collect{|i| i['name'] if i}
    else
      return nil if @@info[id].nil?
      @@info[id]['name']
    end
  end

  # Direct is_a ancestors for a GO id (or array of ids); [] when unknown
  # or with no is_a field.
  def self.id2ancestors(id)
    self.init unless @@info
    if id.kind_of? Array
      @@info.values_at(*id).
        select{|i| ! i['is_a'].nil?}.
        # Block parameter renamed from 'id' to avoid shadowing the method
        # argument.
        collect{|i| i['is_a'].collect{|parent|
          parent.match(/(GO:\d+)/)[1] if parent.match(/(GO:\d+)/)
        }.compact
      }
    else
      return [] if @@info[id].nil? || @@info[id]['is_a'].nil?
      @@info[id]['is_a'].
        collect{|parent|
          parent.match(/(GO:\d+)/)[1] if parent.match(/(GO:\d+)/)
        }.compact
    end
  end

  # Namespace (e.g. biological_process) for a GO id or array of ids.
  def self.id2namespace(id)
    self.init unless @@info
    if id.kind_of? Array
      @@info.values_at(*id).collect{|i| i['namespace'] if i}
    else
      return nil if @@info[id].nil?
      @@info[id]['namespace']
    end
  end

end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
|
3
|
+
# Scrapes Google Scholar search results to find citation counts and
# full-text PDF links for an article title.
module GoogleScholar
  # Shared, lazily-created Mechanize agent for all lookups.
  def self.user_agent
    @@a ||= Mechanize.new
  end

  # Returns the "cited by" link node of the first search result matching
  # +title+, or nil when there is no result. (The dead local
  # 'citation_link = nil' from the previous version was removed: the
  # method always returns from inside the block.)
  def self.citation_link(title)
    # Get citation page
    user_agent.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
      article = page.search('div[@class=gs_r]').first
      return nil if article.nil?

      return article.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
    end
  end

  # Returns the URL of a full-text PDF advertised for the first search
  # result matching +title+, or nil when none is found.
  def self.full_text_url(title)
    # Get page
    user_agent.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
      article = page.search('div[@class=gs_r]').first
      return nil if article.nil?

      link = article.search('a').select{ |link|
        link['href'] =~ /\.pdf$/ || link['href'] =~ /type=pdf/
      }.first

      return nil if link.nil?

      return link['href']
    end
  end

  # Number of citations Google Scholar reports for +title+; 0 when the
  # article or its citation link cannot be found.
  def self.number_cites(title)

    link = citation_link title
    return 0 if link.nil?

    link.inner_html =~ /(\d+)$/

    return $1.to_i
  end

end
|
50
|
+
|
51
|
+
|
52
|
+
#def get_citers(title)
|
53
|
+
# puts title
|
54
|
+
# citation_link = nil
|
55
|
+
#
|
56
|
+
# # Get citation page
|
57
|
+
# $a.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
|
58
|
+
# citation_link = page.search('div[@class=gs_r]').first.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
|
59
|
+
# end
|
60
|
+
#
|
61
|
+
# return [] if citation_link.nil?
|
62
|
+
#
|
63
|
+
# # Parse citations
|
64
|
+
#
|
65
|
+
# citers = []
|
66
|
+
# $a.get("http://scholar.google.es" + citation_link['href']) do |page|
|
67
|
+
# citers = page.search('div[@class=gs_r]').collect do |entry|
|
68
|
+
# entry.search('h3').first.search('a').first.inner_html
|
69
|
+
# end
|
70
|
+
# end
|
71
|
+
#
|
72
|
+
# return citers
|
73
|
+
#end
|
@@ -0,0 +1,239 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'libxml'
|
3
|
+
|
4
|
+
# This module offers an interface with PubMed, to perform queries, and
|
5
|
+
# retrieve simple information from articles. It uses the caching
|
6
|
+
# services of Rbbt.
|
7
|
+
# This module offers an interface with PubMed, to perform queries, and
# retrieve simple information from articles. It uses the caching
# services of Rbbt.
module PubMed

  private
  # Politeness lag (seconds) between PubMed requests.
  @@pubmed_lag = 1

  # Fetches MedLine XML for one pmid or an Array of pmids. For an Array it
  # returns a hash pmid => xml (keyed by the PMID found inside each
  # record); otherwise the single record's xml.
  def self.get_online(pmids)

    pmid_list = ( pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s )
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"

    xml = Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed")

    articles = xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten

    if pmids.is_a? Array
      list = {}
      articles.each{|article|
        pmid = article.scan(/<PMID>(.*?)<\/PMID>/).flatten.first
        list[pmid] = article
      }
      return list
    else
      return articles.first
    end

  end

  public

  # Processes the xml with an articles as served by MedLine and extracts
  # the abstract, title and journal information
  class Article

    # [accessor name, XPath relative to the Article node].
    XML_KEYS = [
      [:title    , "ArticleTitle"],
      [:journal  , "Journal/Title"],
      [:issue    , "Journal/JournalIssue/Issue"],
      [:volume   , "Journal/JournalIssue/Volume"],
      [:issn     , "Journal/ISSN"],
      [:year     , "Journal/JournalIssue/PubDate/Year"],
      [:month    , "Journal/JournalIssue/PubDate/Month"],
      [:pages    , "Pagination/MedlinePgn"],
      [:abstract , "Abstract/AbstractText"],
    ]

    PMC_PDF_URL = "http://www.ncbi.nlm.nih.gov/pmc/articles/PMCID/pdf/"

    # Protects runs of capitals with braces so BibTeX keeps their
    # capitalization.
    def self.escape_title(title)
      title.gsub(/(\w*[A-Z][A-Z]+\w*)/, '{\1}')
    end

    # Builds a bibentry key such as "vazquez2008sent": lastname + year
    # (or NOYEAR) + first title word, or the initials of the first three
    # words when the first word is short.
    def self.make_bibentry(lastname, year, title)
      words = title.downcase.scan(/\w+/)
      if words.empty?
        # Robustness fix: an empty title used to raise NoMethodError.
        abrev = ""
      elsif words.first.length > 3
        abrev = words.first
      else
        abrev = words[0..2].collect{|w| w.chars.first} * ""
      end
      [lastname.gsub(/\s/,'_'), year || "NOYEAR", abrev] * ""
    end

    # Parses a PubmedArticle XML record with LibXML and returns a hash of
    # the XML_KEYS fields plus :pmid, :author, :bibentry and :pmc_pdf.
    def self.parse_xml(xml)
      parser = LibXML::XML::Parser.string(xml)
      pubmed = parser.parse.find("/PubmedArticle").first
      medline = pubmed.find("MedlineCitation").first
      article = medline.find("Article").first

      info = {}

      info[:pmid] = medline.find("PMID").first.content

      XML_KEYS.each do |p|
        name, key = p
        node = article.find(key).first

        next if node.nil?

        info[name] = node.content
      end

      bibentry = nil
      info[:author] = article.find("AuthorList/Author").collect do |author|
        begin
          lastname = author.find("LastName").first.content
          if author.find("ForeName").first.nil?
            forename = nil
          else
            # Abbreviate single-letter name parts with a trailing dot.
            forename = author.find("ForeName").first.content.split(/\s/).collect{|word| if word.length == 1; then word + '.'; else word; end} * " "
          end
          bibentry ||= make_bibentry lastname, info[:year], info[:title]
        rescue
        end
        [lastname, forename] * ", "
      end * " and "

      info[:bibentry] = bibentry.downcase if bibentry

      info[:pmc_pdf] = pubmed.find("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc"}.first

      if info[:pmc_pdf]
        info[:pmc_pdf] = PMC_PDF_URL.sub(/PMCID/, info[:pmc_pdf].content)
      end

      info
    end

    attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url
    attr_accessor *XML_KEYS.collect{|p| p.first }

    def initialize(xml)
      if xml && ! xml.empty?
        info = PubMed::Article.parse_xml xml
        info.each do |key, value|
          self.send("#{ key }=", value)
        end
      end
    end

    # PDF location: PubMed Central when available, otherwise a memoized
    # Google Scholar lookup.
    def pdf_url
      return pmc_pdf if pmc_pdf
      @gscholar_pdf ||= GoogleScholar::full_text_url title
    end

    # Downloads the PDF and extracts its text with pdftotext; nil when no
    # PDF is available.
    def full_text
      return nil if pdf_url.nil?

      text = nil
      TmpFile.with_file do |pdf|

        # Change user-agent, oh well...
        `wget --user-agent=firefox #{ pdf_url } -O #{ pdf }`
        TmpFile.with_file do |txt|
          `pdftotext #{ pdf } #{ txt }`
          text = Open.read(txt) if File.exist? txt
        end
      end

      text
    end

    # Renders the article as a BibTeX @article entry.
    def bibtex
      keys = [:author] + XML_KEYS.collect{|p| p.first } - [:bibentry]
      bibtex = "@article{#{bibentry},\n"

      keys.each do |key|
        next if self.send(key).nil?

        case key

        when :title
          bibtex += " title = { #{ PubMed::Article.escape_title title } },\n"

        when :issue
          bibtex += " number = { #{ issue } },\n"

        else
          bibtex += " #{ key } = { #{ self.send(key) } },\n"
        end

      end

      bibtex += " fulltext = { #{ pdf_url } },\n" if pdf_url
      bibtex += " pmid = { #{ pmid } }\n}"


      bibtex
    end

    # Join the text from title and abstract
    def text
      [title, abstract].join("\n")
    end
  end

  # Returns the Article object containing the information for the PubMed
  # ID specified as an argument. If +pmid+ is an array instead of a single
  # identifier it returns an hash with the Article object for each id.
  # It uses the Rbbt cache to save the articles xml.
  def self.get_article(pmid)

    if pmid.is_a? Array
      missing = []
      list = {}

      pmid.each{|p|
        filename = p.to_s + '.xml'
        if File.exist? FileCache.path(filename)
          list[p] = Article.new(Open.read(FileCache.path(filename)))
        else
          missing << p
        end
      }

      return list unless missing.any?
      chunk_size = [100, missing.length].min
      chunks = (missing.length.to_f / chunk_size).ceil

      articles = {}
      chunks.times do |chunk|
        # Fix: the previous inclusive range missing[a..b] overlapped
        # consecutive chunks by one element, fetching boundary pmids
        # twice. Array#slice(start, length) partitions cleanly.
        pmids = missing[chunk * chunk_size, chunk_size]
        articles.merge!(get_online(pmids))
      end

      articles.each{|p, xml|
        filename = p + '.xml'
        FileCache.add(filename,xml)
        list[p] = Article.new(xml)
      }

      return list

    else
      filename = pmid.to_s + '.xml'

      if File.exist? FileCache.path(filename)
        return Article.new(Open.read(FileCache.path(filename)))
      else
        xml = get_online(pmid)
        FileCache.add(filename,xml)

        return Article.new(xml)
      end
    end
  end

  # Performs the specified query and returns an array with the PubMed
  # Ids returned. +retmax+ can be used to limit the number of ids
  # returned, if is not specified 30000 is used.
  def self.query(query, retmax=nil)
    retmax ||= 30000

    Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmax=#{retmax}&db=pubmed&term=#{query}",:quiet => true, :nocache => true).scan(/<Id>(\d+)<\/Id>/).flatten
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../test_helper'
|
2
|
+
require 'rbbt/sources/biomart'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
class TestBioMart < Test::Unit::TestCase

  # An unknown filter must surface as a BioMart::QueryError; valid
  # requests accumulate attribute values keyed by the main attribute.
  def test_get
    assert_raise BioMart::QueryError do
      BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
    end

    opts = { :nocache => true, :wget_options => { :quiet => false } }

    data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'], [], nil, opts)
    assert(data['856452']['protein_id'].include? 'AAB68382')

    # A second call folds new attributes into the same hash.
    data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'], [], data, opts)
    assert(data['856452']['protein_id'].include? 'AAB68382')
    assert(data['856452']['external_gene_id'].include? 'CUP1-2')
  end

  # .query chunks attributes internally but returns the same shape.
  def test_query
    attributes = ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id']
    data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', attributes, [], nil, :nocache => true, :wget_options => { :quiet => false})

    assert(data['856452']['protein_id'].include? 'AAB68382')
    assert(data['856452']['external_gene_id'].include? 'CUP1-2')
  end

end
|
29
|
+
|
30
|
+
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
|
2
|
+
require 'rbbt/sources/entrez'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
class TestEntrez < Test::Unit::TestCase
  # Saccharomyces cerevisiae taxonomy id, shared across tests.
  $yeast_tax = 559292

  def test_entrez2native
    fix    = proc{|line| line.sub(/SGD:S0/,'S0') }
    select = proc{|line| line.match(/\tSGD:S0/)}
    lexicon = Entrez.entrez2native($yeast_tax, :fix => fix, :select => select)

    assert(lexicon['855611'].include? 'S000005056')
  end

  def test_entrez2pubmed
    pubmed = Entrez.entrez2pubmed($yeast_tax)
    assert(pubmed['850320'].include? '15102838')
  end

  # Single ids return raw xml; arrays return a hash keyed by id.
  def test_getonline
    assert_match(/PRP3 pre-mRNA processing factor/s, Entrez.get_online(9129))
    assert_match(/PRP3 pre-mRNA processing factor/s, Entrez.get_online([9129, 9])[9129])
  end

  def test_getgene
    expected = [["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]]
    assert_equal(expected, Entrez.get_gene(9129).description)
    assert_equal(expected, Entrez.get_gene([9129, 728049])[9129].description)
  end

  def test_similarity
    text = "PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"
    assert(Entrez.gene_text_similarity(9129, text) > 0)
    assert_equal(0, Entrez.gene_text_similarity("NON EXISTEN GENEID", text))
  end

end
|
47
|
+
|
48
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
|
2
|
+
|
3
|
+
require 'rbbt/sources/go'
|
4
|
+
require 'test/unit'
|
5
|
+
|
6
|
+
class TestGo < Test::Unit::TestCase

  # id2name resolves a single id as well as an array of ids.
  def test_go
    assert_match('vacuole inheritance', GO::id2name('GO:0000011'))

    names = GO::id2name(['GO:0000011','GO:0000017'])
    assert_equal(['vacuole inheritance','alpha-glucoside transport'], names)
  end

  def test_ancestors
    assert GO.id2ancestors('GO:0000001').include? 'GO:0048308'
  end

  def test_namespace
    assert_equal 'biological_process', GO.id2namespace('GO:0000001')
  end

end
|
23
|
+
|
24
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
|
2
|
+
|
3
|
+
require 'rbbt/sources/pubmed'
|
4
|
+
require 'test/unit'
|
5
|
+
|
6
|
+
class TestPubMed < Test::Unit::TestCase

  def test_get_online
    pmid = '16438716'
    title_re = /Discovering semantic features in the literature: a foundation for building functional associations./

    assert(PubMed.get_online(pmid) =~ title_re)

    # Array form returns a hash keyed by pmid (as a String).
    assert(PubMed.get_online(['16438716', 17204154])[pmid] =~ title_re)
  end

  def test_get_article
    pmid  = '16438716'
    title = "Discovering semantic features in the literature: a foundation for building functional associations."

    assert(PubMed.get_article(pmid).title == title)
    assert(PubMed.get_article(['16438716', 17204154])[pmid].title == title)
  end

  def test_full_text
    assert(PubMed.get_article('16438716').full_text =~ /Discovering/)
  end

  def test_query
    results = PubMed.query('chagoyen[All Fields] AND ("loattrfull text"[sb] AND hasabstract[text])')
    assert(results.include? '16438716')
  end

  def test_bibentry
    assert("vazquez2008sent", PubMed::Article.make_bibentry('vazquez', 2008, "SENT: Semantic features in text"))
    assert("vazquez2008aes", PubMed::Article.make_bibentry('vazquez', 2008, "An Example System"))
  end
end
|
38
|
+
|
39
|
+
|
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rbbt-sources
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Miguel Vazquez
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-12-01 00:00:00 +01:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: rbbt-util
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: mechanize
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
44
|
+
segments:
|
45
|
+
- 0
|
46
|
+
version: "0"
|
47
|
+
type: :runtime
|
48
|
+
version_requirements: *id002
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: libxml-ruby
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
hash: 3
|
58
|
+
segments:
|
59
|
+
- 0
|
60
|
+
version: "0"
|
61
|
+
type: :runtime
|
62
|
+
version_requirements: *id003
|
63
|
+
description: Data sources like PubMed, Entrez Gene, or Gene Ontology
|
64
|
+
email: miguel.vazquez@fdi.ucm.es
|
65
|
+
executables: []
|
66
|
+
|
67
|
+
extensions: []
|
68
|
+
|
69
|
+
extra_rdoc_files: []
|
70
|
+
|
71
|
+
files:
|
72
|
+
- lib/rbbt/sources/bibtex.rb
|
73
|
+
- lib/rbbt/sources/biomart.rb
|
74
|
+
- lib/rbbt/sources/entrez.rb
|
75
|
+
- lib/rbbt/sources/go.rb
|
76
|
+
- lib/rbbt/sources/gscholar.rb
|
77
|
+
- lib/rbbt/sources/organism.rb
|
78
|
+
- lib/rbbt/sources/pubmed.rb
|
79
|
+
- test/rbbt/sources/test_biomart.rb
|
80
|
+
- test/rbbt/sources/test_entrez.rb
|
81
|
+
- test/rbbt/sources/test_go.rb
|
82
|
+
- test/rbbt/sources/test_pubmed.rb
|
83
|
+
- test/test_helper.rb
|
84
|
+
has_rdoc: true
|
85
|
+
homepage: http://github.com/mikisvaz/rbbt-sources
|
86
|
+
licenses: []
|
87
|
+
|
88
|
+
post_install_message:
|
89
|
+
rdoc_options: []
|
90
|
+
|
91
|
+
require_paths:
|
92
|
+
- lib
|
93
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
94
|
+
none: false
|
95
|
+
requirements:
|
96
|
+
- - ">="
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
hash: 3
|
99
|
+
segments:
|
100
|
+
- 0
|
101
|
+
version: "0"
|
102
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
103
|
+
none: false
|
104
|
+
requirements:
|
105
|
+
- - ">="
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
hash: 3
|
108
|
+
segments:
|
109
|
+
- 0
|
110
|
+
version: "0"
|
111
|
+
requirements: []
|
112
|
+
|
113
|
+
rubyforge_project:
|
114
|
+
rubygems_version: 1.3.7
|
115
|
+
signing_key:
|
116
|
+
specification_version: 3
|
117
|
+
summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)
|
118
|
+
test_files:
|
119
|
+
- test/rbbt/sources/test_biomart.rb
|
120
|
+
- test/rbbt/sources/test_entrez.rb
|
121
|
+
- test/rbbt/sources/test_go.rb
|
122
|
+
- test/rbbt/sources/test_pubmed.rb
|
123
|
+
- test/test_helper.rb
|