RubyGems - rbbt - Versions diffs - 1.2.1 → 1.2.2 - Mend

rbbt 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

data/lib/rbbt/sources/gscholar.rb +74 -0
data/lib/rbbt/sources/organism.rb +3 -2
data/lib/rbbt/sources/pubmed.rb +107 -7
data/lib/rbbt/util/open.rb +13 -0
data/test/rbbt/util/test_open.rb +13 -0
metadata +45 -25

data/lib/rbbt/sources/gscholar.rb ADDED

@@ -0,0 +1,74 @@
+require 'mechanize'
+module GoogleScholar
+  def self.user_agent
+    @@a ||= Mechanize.new
+  end
+  def self.citation_link(title)
+    citation_link = nil
+    # Get citation page
+    user_agent.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
+      article = page.search('div[@class=gs_r]').first
+      return nil if article.nil?
+      return article.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
+    end
+  end
+  def self.full_text_url(title)
+    full_text_link = nil
+    # Get page
+    user_agent.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
+      article = page.search('div[@class=gs_r]').first
+      return nil if article.nil?
+      link =  article.search('a').select{ |link|
+        link['href'] =~ /\.pdf$/ || link['href'] =~ /type=pdf/
+      }.first
+      return nil if link.nil?
+      return link['href']
+    end
+  end
+  def self.number_cites(title)
+    link = citation_link title
+    return 0 if link.nil?
+    link.inner_html =~ /(\d+)$/
+    return $1.to_i
+  end
+end
+#def get_citers(title)
+#  puts title
+#  citation_link = nil
+#
+#  # Get citation page
+#  $a.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
+#    citation_link = page.search('div[@class=gs_r]').first.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
+#  end
+#
+#  return [] if citation_link.nil?
+#
+#  # Parse citations
+#
+#  citers = []
+#  $a.get("http://scholar.google.es" + citation_link['href']) do |page|
+#    citers = page.search('div[@class=gs_r]').collect do |entry|
+#      entry.search('h3').first.search('a').first.inner_html
+#    end
+#  end
+#
+#  return citers
+#end

data/lib/rbbt/sources/organism.rb CHANGED

@@ -127,12 +127,12 @@ module Organism
       if i == 0
         i += 1
         next unless l=~/^\s*#/
-          formats  = Open.fields(l.sub(/^[\s#]+/,'')).collect{|n| n.strip}
+        formats  = Open.fields(l.sub(/^[\s#]+/,'')).collect{|n| n.strip}
         return formats unless examples
         next
       end
-      if Open.fields(l).select{|name| name && name =~ /\w/}.length > examples.length
+      if Open.fields(l).select{|name| name && name =~ /\w/}.length > examples.compact.length
         examples = Open.fields(l).collect{|name| name.split(/\|/).first}
       end
       i += 1
@@ -216,6 +216,7 @@ module Organism
       first = nil
       if native
         first = id_position(supported,native,options)
+        raise "No match for native format '#{ native }'"
       else
         first = 0
       end

data/lib/rbbt/sources/pubmed.rb CHANGED

@@ -1,6 +1,8 @@
 require 'rbbt/util/filecache'
 require 'rbbt/util/open'
+require 'rbbt/sources/gscholar'
 require 'rbbt'
+require 'libxml'
 # This module offers an interface with PubMed, to perform queries, and
 # retrieve simple information from articles. It uses the caching
@@ -42,17 +44,115 @@ module PubMed
   # Processes the xml with an articles as served by MedLine and extracts
   # the abstract, title and journal information
   class Article
-    attr_reader :title, :abstract, :journal
+    XML_KEYS = [
+      [:title    , "ArticleTitle"],
+      [:journal  , "Journal/Title"],
+      [:issue    , "Journal/JournalIssue/Issue"],
+      [:volume   , "Journal/JournalIssue/Volume"],
+      [:issn     , "Journal/ISSN"],
+      [:year     , "Journal/JournalIssue/PubDate/Year"],
+      [:pages    , "Pagination/MedlinePgn"],
+      [:abstract , "Abstract/AbstractText"],
+    ]
+    PMC_PDF_URL = "http://www.ncbi.nlm.nih.gov/pmc/articles/PMCID/pdf/"
+    def self.escape_title(title)
+      title.gsub(/(\w*[A-Z][A-Z]+\w*)/, '{\1}')
+    end
+    def self.parse_xml(xml)
+      parser  = LibXML::XML::Parser.string(xml)
+      pubmed  = parser.parse.find("/PubmedArticle").first
+      medline = pubmed.find("MedlineCitation").first
+      article = medline.find("Article").first
+      info = {}
+      info[:pmid] = medline.find("PMID").first.content
+      XML_KEYS.each do |p|
+        name, key = p
+        node = article.find(key).first
+        next if node.nil?
+        info[name] = node.content
+      end
+      bibentry = nil
+      info[:author] = article.find("AuthorList/Author").collect do |author|
+        lastname = author.find("LastName").first.content
+        if author.find("ForeName").first.nil?
+          forename = nil
+        else
+          forename = author.find("ForeName").first.content.split(/\s/).collect{|word| if word.length == 1; then word + '.'; else word; end} * " "
+        end
+        bibentry ||= [lastname, (info[:year] || "NOYEAR"), info[:title].scan(/\w+/)[0]] * ""
+        [lastname, forename] * ", "
+      end * " and "
+      info[:bibentry] = bibentry.downcase
+      info[:pmc_pdf] = pubmed.find("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc"}.first
+      if info[:pmc_pdf]
+        info[:pmc_pdf] = PMC_PDF_URL.sub(/PMCID/, info[:pmc_pdf].content)
+      end
+      info
+    end
+    attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url
+    attr_accessor *XML_KEYS.collect{|p| p.first }
     def initialize(xml)
-      xml ||= ""
-      @abstract = $1 if xml.match(/<AbstractText>(.*)<\/AbstractText>/sm)
-      @title    = $1 if xml.match(/<ArticleTitle>(.*)<\/ArticleTitle>/sm)
-      @journal  = $1 if xml.match(/<Title>(.*)<\/Title>/sm)
+      if xml && ! xml.empty?
+        info = PubMed::Article.parse_xml xml
+        info.each do |key, value|
+          self.send("#{ key }=", value)
+        end
+      end
+    end
+    def pdf_url
+      return pmc_pdf if pmc_pdf
+      @gscholar_pdf ||= GoogleScholar::full_text_url title
+    end
+    def bibtex
+      keys = [:author] + XML_KEYS.collect{|p| p.first } - [:bibentry]
+      bibtex = "@article{#{bibentry},\n"
+      keys.each do |key|
+        next if self.send(key).nil?
+        case key
+        when :title
+          bibtex += "  title = { #{ PubMed::Article.escape_title title } },\n"
+        when :issue
+          bibtex += "  number = { #{ issue } },\n"
+        else
+          bibtex += "  #{ key } = { #{ self.send(key) } },\n"
+        end
+      end
+      bibtex += "  fulltext = { #{ pdf_url } },\n" if pdf_url
+      bibtex += "  pmid = { #{ pmid } }\n}"
+      bibtex
     end
     # Join the text from title and abstract
     def text
-      [@title, @abstract].join("\n")
+      [title, abstract].join("\n")
     end
   end
@@ -78,7 +178,7 @@ module PubMed
       return list unless missing.any?
       chunk_size = [100, missing.length].min
       chunks = (missing.length.to_f / chunk_size).ceil
       articles = {}
       chunks.times do |chunk|
         pmids = missing[(chunk * chunk_size)..((chunk + 1) *chunk_size)]

data/lib/rbbt/util/open.rb CHANGED

@@ -6,6 +6,16 @@ require 'rbbt/util/tmpfile'
 # for accessing remote files. It supports caching the files.
 module Open
+  # Return a Proc to use in the :select parameter of the Open.to_hash method.
+  # It selects those lines with the content of the first field present on the
+  # entities array. The field can be chosen to be a different one in the
+  # options hash, also the separation string or regexp to determine fields.
+  def self.func_match_field(entities, options = {})
+    field, sep = {:field => 0, :sep => "\t"}.merge(options).values_at(:field, :sep)
+    Proc.new {|line| entities.include? line.split(sep)[field] }
+  end
   def self.fields(line, sep = "\t")
     chunks = line.chomp.split(/(#{sep})/).select{|c| c !~ /^#{sep}$/ }
     if line =~ /#{sep}$/
@@ -176,10 +186,12 @@ module Open
   # * :single => for each key select only the first of the values, instead of the complete array.
   # * :fix  => A Proc that is called to pre-process the line
   # * :exclude => A Proc that is called to check if the line must be excluded from the process.
+  # * :select => A Proc that is called to check if the line must be selected to process.
   def self.to_hash(input, options = {})
     native  = options[:native]  || 0
     extra   = options[:extra]
     exclude = options[:exclude]
+    select  = options[:select]
     fix     = options[:fix]
     sep     = options[:sep]     || "\t"
     sep2    = options[:sep2]    || "|"
@@ -200,6 +212,7 @@ module Open
     content.each_line{|l|
       l = fix.call(l) if fix
       next if exclude and exclude.call(l)
+      next if select  and ! select.call(l)
       row_fields = self.fields(l, sep)
       id = row_fields[native]

data/test/rbbt/util/test_open.rb CHANGED

@@ -89,6 +89,19 @@ row2 a d e r
     assert_equal(["","",""] , Open.fields("\t\t") )
   end
+  def test_select_field
+    data =<<-EOD
+row1 a b 3
+row1 aa bb 33
+row2 a d e r
+    EOD
+    TmpFile.with_file(data) do |file|
+      data = Open.to_hash(file, :select => Open.func_match_field(%w(row1), :sep => " "), :sep => " ")
+      assert ! data.include?('row2')
+      assert data.include?('row1')
+    end
+  end

metadata CHANGED

@@ -1,7 +1,12 @@
 --- !ruby/object:Gem::Specification
 name: rbbt
 version: !ruby/object:Gem::Version
-  version: 1.2.1
+  prerelease: false
+  segments:
+  - 1
+  - 2
+  - 2
+  version: 1.2.2
 platform: ruby
 authors:
 - Miguel Vazquez
@@ -9,59 +14,71 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-02-15 00:00:00 +01:00
+date: 2010-05-27 00:00:00 +02:00
 default_executable: rbbt_config
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        segments:
+        - 0
+        - 8
+        - 4
         version: 0.8.4
-    version:
+  type: :runtime
+  version_requirements: *id001
 - !ruby/object:Gem::Dependency
   name: simpleconsole
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        segments:
+        - 0
         version: "0"
-    version:
+  type: :runtime
+  version_requirements: *id002
 - !ruby/object:Gem::Dependency
   name: stemmer
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        segments:
+        - 0
         version: "0"
-    version:
+  type: :runtime
+  version_requirements: *id003
 - !ruby/object:Gem::Dependency
   name: progress-monitor
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        segments:
+        - 0
         version: "0"
-    version:
+  type: :runtime
+  version_requirements: *id004
 - !ruby/object:Gem::Dependency
   name: simpleconsole
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id005 !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        segments:
+        - 0
         version: "0"
-    version:
+  type: :runtime
+  version_requirements: *id005
 description: |-
   This toolbox includes modules for text-mining, like Named Entity Recognition and Normalization and document
       classification, as well as data integration modules that interface with PubMed, Entrez Gene, BioMart.
@@ -118,6 +135,7 @@ files:
 - lib/rbbt/sources/biomart.rb
 - lib/rbbt/sources/entrez.rb
 - lib/rbbt/sources/go.rb
+- lib/rbbt/sources/gscholar.rb
 - lib/rbbt/sources/organism.rb
 - lib/rbbt/sources/polysearch.rb
 - lib/rbbt/sources/pubmed.rb
@@ -145,18 +163,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      segments:
+      - 0
       version: "0"
-  version:
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      segments:
+      - 0
       version: "0"
-  version:
 requirements: []
 rubyforge_project:
-rubygems_version: 1.3.5
+rubygems_version: 1.3.6
 signing_key:
 specification_version: 3
 summary: Bioinformatics and text mining toolbox