rbbt-sources 0.1.0

lib/rbbt/sources/bibtex.rb ADDED
# Parses BibTeX files into Entry objects, allows fields to be read and
# updated, and writes the entries back sorted by year and entry name.
class BibTexFile

  class Entry

    FIELDS = %w(pmid title author journal pages number volume year abstract)

    FIELDS.each do |field|
      define_method(field, proc{@info[field]})
    end

    attr_reader :info, :fields, :name, :type
    def initialize(name, type, info)
      @name   = name
      @type   = type
      @info   = info
      @fields = info.keys
    end

    # Unknown methods are treated as field accessors: 'field=' assigns a
    # known field, a bare 'field' reads one.
    def method_missing(name, *args)
      if name.to_s =~ /(.*)=$/
        if (FIELDS + @fields).include?($1.to_s)
          return @info[$1.to_s] = args[0].chomp
        else
          raise "No field named '#{ $1 }'"
        end
      else
        if @fields.include?(name.to_s)
          return @info[name.to_s]
        else
          raise "No field named '#{ name }'"
        end
      end
    end

    def to_s
      str = "@#{type}{#{name},\n"

      FIELDS.each do |field|
        next if @info[field].nil? # skip standard fields not present in this entry
        str += "  #{field} = {#{@info[field]}},\n"
      end

      (fields - FIELDS).sort.each do |field|
        str += "  #{field} = {#{@info[field]}},\n"
      end

      str += "}"

      str
    end
  end

  def self.clean_string(string)
    string.gsub(/[{}]/,'')
  end

  def self.parse_bibtex(bibtex)
    bibtex.scan(/@\w+\{.*?^\}\s*/m)
  end

  def self.parse_entry(entry)
    info = {}

    type, name = entry.match(/@([^\s]+)\{([^\s]+)\s*,/).values_at(1,2)

    entry.scan(/\s*(.*?)\s*=\s*\{?\s*(.*?)\s*\}?\s*,?\s*$/).each do |pair|
      info[pair.first.chomp] = pair.last.chomp
    end

    [type.chomp, name.chomp, info]
  end

  def self.load_file(file)
    entries = {}

    # Check IO before File.exists?, which would choke on an IO object.
    case
    when IO === file
      self.parse_bibtex file.read
    when File.exists?(file)
      self.parse_bibtex File.read(file)
    when String === file
      self.parse_bibtex file
    else
      raise "Input format not recognized"
    end.each do |entry|
      type, name, info = self.parse_entry entry
      entries[name] = Entry.new name, type, info
    end

    entries
  end

  def initialize(file)
    @entries = BibTexFile.load_file(file)
  end

  def save(file)
    text = entries.collect{|e| entry e }.sort{|a,b|
      if a.year.to_i != b.year.to_i
        a.year.to_i <=> b.year.to_i
      else
        a.name <=> b.name
      end
    }.reverse.collect do |entry|
      entry.to_s
    end * "\n"

    File.open(file, 'w') do |fout| fout.puts text end
  end

  def add(bibtex)
    type, name, info = BibTexFile.parse_entry bibtex
    @entries[name] = BibTexFile::Entry.new name, type, info
  end

  def entries
    @entries.keys
  end

  def entry(bibentry)
    @entries[bibentry]
  end
end

if __FILE__ == $0
  b = BibTexFile.new('/home/miki/git/DrugReview/drug.bib')
  puts b.entry("yao2009novel").to_s
  b.save('foo.bib')
end
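
A minimal usage sketch of BibTexFile beyond the demo above. The file path and entry keys are hypothetical; setters go through method_missing, so only known fields are accepted.

  # 'refs.bib', 'smith2010foo' and the added entry are made-up examples.
  bib = BibTexFile.new('refs.bib')
  entry = bib.entry('smith2010foo')
  entry.title = "A revised title\n"   # stored via method_missing, trailing newline chomped
  bib.add("@article{doe2011bar,\n  title = {Another entry},\n  year = {2011},\n}")
  bib.save('refs-sorted.bib')         # written sorted by year, then entry name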

lib/rbbt/sources/biomart.rb ADDED
require 'rbbt'
require 'rbbt/util/open'

# This module interacts with BioMart. It performs queries to BioMart and
# synthesises a hash with the results. Note that this module connects to the
# online BioMart WS using the Open module in 'rbbt/util/open', which offers
# caching by default. To obtain up to date results you may need to clear the
# cache from previous queries.
module BioMart

  class QueryError < StandardError; end

  private

  @@biomart_query_xml = <<-EOT
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
  <Dataset name = "<!--DATABASE-->" interface = "default" >
    <!--FILTERS-->
    <!--MAIN-->
    <!--ATTRIBUTES-->
  </Dataset>
</Query>
  EOT

  def self.get(database, main, attrs = nil, filters = nil, data = nil, options = {})
    attrs   ||= []
    filters ||= ["with_#{main}"]
    data    ||= {}

    query = @@biomart_query_xml.dup
    query.sub!(/<!--DATABASE-->/, database)
    query.sub!(/<!--FILTERS-->/, filters.collect{|name| "<Filter name = \"#{ name }\" excluded = \"0\"/>"}.join("\n"))
    query.sub!(/<!--MAIN-->/, "<Attribute name = \"#{main}\" />")
    query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n"))

    response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '), options)
    if response =~ /Query ERROR:/
      raise BioMart::QueryError, response
    end

    response.each_line{|l|
      parts = l.chomp.split(/\t/)
      key   = parts.shift # value of the main attribute for this row
      next if key.nil? || key.empty?

      data[key] ||= {}
      attrs.each{|name|
        value = parts.shift
        data[key][name] ||= []
        next if value.nil?
        data[key][name] << value
      }
    }

    data
  end

  public

  # This method performs a query in BioMart for a dataset and a given set of
  # attributes. There must be a main attribute, which will be used as the key
  # in the result hash; optionally there may be a list of additional
  # attributes and filters. The data parameter at the end is used internally
  # to incrementally build the result (the BioMart WS only allows 3 external
  # attributes per query, so the attributes are sent in chunks); users should
  # normally leave it unspecified or nil. The result is a hash where the keys
  # are the different values of the main attribute and each value is a hash
  # with every other attribute as key and an array of all its values as value
  # (for a given value of the main attribute there may be more than one value
  # for another attribute). If filters is left as nil, a filter is added to
  # the BioMart query to remove results with an empty main attribute; this
  # may cause an error if the BioMart WS does not allow filtering with that
  # attribute.
  def self.query(database, main, attrs = nil, filters = nil, data = nil, options = {})
    attrs ||= []
    data  ||= {}

    # Send the attributes two at a time (plus the main attribute) to stay
    # within the BioMart limit, merging each partial result into data.
    attrs.each_slice(2){|chunk|
      data = get(database, main, chunk, filters, data, options)
    }

    data
  end
end
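
A short usage sketch of BioMart.query; the dataset, attribute and id values are the ones exercised in the BioMart test further down, and the results depend on the live BioMart service.

  require 'rbbt/sources/biomart'

  # Map S. cerevisiae Entrez Gene ids to protein and gene names.
  data = BioMart.query('scerevisiae_gene_ensembl', 'entrezgene', ['protein_id', 'external_gene_id'])
  data['856452']['protein_id']        # => ["AAB68382", ...]
  data['856452']['external_gene_id']  # => ["CUP1-2", ...]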

lib/rbbt/sources/entrez.rb ADDED
require 'rbbt-util'
require 'rbbt/util/tsv'
require 'rbbt/bow/bow'
require 'set'

module Entrez

  Rbbt.add_datafiles "gene_info"   => ['databases/entrez', 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'],
                     "gene2pubmed" => ["databases/entrez", "ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz"]

  def self.entrez2native(taxs, options = {})
    options = Misc.add_defaults options, :native => 1, :extra => 5, :flatten => true, :persistence => true

    taxs = [taxs] unless Array === taxs
    options.merge! :grep => taxs

    TSV.new(Rbbt.find_datafile('gene_info'), options)
  end

  def self.entrez2pubmed(taxs)
    options = {:native => 1, :extra => 2, :flatten => true, :persistence => true}

    taxs = [taxs] unless taxs.is_a?(Array)
    taxs = taxs.collect{|t| t.to_s}
    options.merge! :grep => taxs

    TSV.new(Rbbt.find_datafile('gene2pubmed'), options)
  end

  class Gene
    attr_reader :organism, :symbol, :description, :aka, :protnames, :summary, :comentaries

    def initialize(xml)
      return if xml.nil?

      @organism    = xml.scan(/<Org-ref_taxname>(.*)<\/Org-ref_taxname>/s)
      @symbol      = xml.scan(/<Gene-ref_locus>(.*)<\/Gene-ref_locus>/s)
      @description = xml.scan(/<Gene-ref_desc>(.*)<\/Gene-ref_desc>/s)
      @aka         = xml.scan(/<Gene-ref_syn_E>(.*)<\/Gene-ref_syn_E>/s)
      @protnames   = xml.scan(/<Prot-ref_name_E>(.*)<\/Prot-ref_name_E>/s)
      @summary     = xml.scan(/<Entrezgene_summary>(.*)<\/Entrezgene_summary>/s)
      @comentaries = xml.scan(/<Gene-commentary_text>(.*)<\/Gene-commentary_text>/s)
    end

    # Joins the text from symbol, description, aka, protnames, and
    # summary
    def text
      #[@organism, @symbol, @description, @aka, @protnames, @summary, @comentaries.join(". ")].join(". ")
      [@symbol, @description, @aka, @protnames, @summary].flatten.join(". ")
    end
  end

  private

  def self.get_online(geneids)
    geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"

    xml = Open.read(url, :wget_options => {:quiet => true}, :nocache => true)

    genes = xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten

    if geneids.is_a? Array
      list = {}
      # The response is assumed to preserve the order of the requested ids
      genes.each_with_index{|gene, i|
        geneid = geneids[i]
        list[geneid] = gene
      }
      return list
    else
      return genes.first
    end
  end

  public

  def self.gene_filename(id)
    'gene-' + id.to_s + '.xml'
  end

  def self.get_gene(geneid)
    return nil if geneid.nil?

    if Array === geneid
      missing = []
      list = {}

      geneid.each{|p|
        next if p.nil?
        if FileCache.found(gene_filename(p))
          list[p] = Gene.new(Open.read(FileCache.path(gene_filename(p))))
        else
          missing << p
        end
      }

      return list unless missing.any?
      genes = get_online(missing)

      genes.each{|p, xml|
        filename = gene_filename(p)
        FileCache.add(filename, xml) unless FileCache.found(filename)
        list[p] = Gene.new(xml)
      }

      return list
    else
      filename = gene_filename(geneid)

      if FileCache.found(filename)
        return Gene.new(Open.read(FileCache.path(filename)))
      else
        xml = get_online(geneid)
        FileCache.add(filename, xml) unless FileCache.found(filename)

        return Gene.new(xml)
      end
    end
  end

  # Counts the words in common between a chunk of text and the text
  # found in Entrez Gene for that particular gene. The +gene+ may be a
  # gene identifier or a Gene class instance.
  def self.gene_text_similarity(gene, text)

    case
    when Entrez::Gene === gene
      gene_text = gene.text
    when String === gene || Fixnum === gene
      gene_text = get_gene(gene).text
    else
      return 0
    end

    gene_words = gene_text.words.to_set
    text_words = text.words.to_set

    return 0 if gene_words.empty? || text_words.empty?

    common = gene_words.intersection(text_words)
    common.length / (gene_words.length + text_words.length).to_f
  end
end
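
A usage sketch of the Entrez helpers, mirroring the Entrez test below; the calls hit the NCBI eutils service and cache the XML through Rbbt's FileCache.

  require 'rbbt/sources/entrez'

  gene = Entrez.get_gene(9129)             # single id => Entrez::Gene
  gene.description                         # => [["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]]
  genes = Entrez.get_gene([9129, 728049])  # array of ids => hash of id => Gene
  Entrez.gene_text_similarity(9129, "pre-mRNA processing factor")  # => small positive Float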

lib/rbbt/sources/go.rb ADDED
require 'rbbt-util'

# This module holds helper methods to deal with the Gene Ontology files. Right
# now all it does is provide a translation from id to the actual names.
module GO

  @@info = nil
  MULTIPLE_VALUE_FIELDS = %w(is_a)

  # This method needs to be called before any translations can be made; it is
  # called automatically the first time any of the lookup methods is used. It
  # loads the gene_ontology.obo file and extracts all the fields, although
  # right now only the name field is used.
  def self.init
    @@info = {}
    File.read(File.join(Rbbt.datadir, 'dbs/go/gene_ontology.obo')).
      split(/\[Term\]/).
      each{|term|
        term_info = {}
        term.split(/\n/).
          select{|l| l =~ /:/}.
          each{|l|
            key, value = l.chomp.match(/(.*?):(.*)/).values_at(1,2)
            if MULTIPLE_VALUE_FIELDS.include? key.strip
              term_info[key.strip] ||= []
              term_info[key.strip] << value.strip
            else
              term_info[key.strip] = value.strip
            end
          }
        @@info[term_info["id"]] = term_info
      }
  end

  def self.info
    self.init unless @@info
    @@info
  end

  def self.goterms
    self.init unless @@info
    @@info.keys
  end

  def self.id2name(id)
    self.init unless @@info
    if id.kind_of? Array
      @@info.values_at(*id).collect{|i| i['name'] if i}
    else
      return nil if @@info[id].nil?
      @@info[id]['name']
    end
  end

  def self.id2ancestors(id)
    self.init unless @@info
    if id.kind_of? Array
      @@info.values_at(*id).
        select{|i| ! i['is_a'].nil?}.
        collect{|i|
          i['is_a'].collect{|parent|
            parent.match(/(GO:\d+)/)[1] if parent.match(/(GO:\d+)/)
          }.compact
        }
    else
      return [] if @@info[id].nil? || @@info[id]['is_a'].nil?
      @@info[id]['is_a'].
        collect{|parent|
          parent.match(/(GO:\d+)/)[1] if parent.match(/(GO:\d+)/)
        }.compact
    end
  end

  def self.id2namespace(id)
    self.init unless @@info
    if id.kind_of? Array
      @@info.values_at(*id).collect{|i| i['namespace'] if i}
    else
      return nil if @@info[id].nil?
      @@info[id]['namespace']
    end
  end
end
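
A usage sketch for the GO helpers; it assumes the gene_ontology.obo file is already available under Rbbt.datadir, and the ids and names are the ones used in the GO test below.

  require 'rbbt/sources/go'

  GO.id2name('GO:0000011')                  # => "vacuole inheritance"
  GO.id2name(['GO:0000011', 'GO:0000017'])  # => ["vacuole inheritance", "alpha-glucoside transport"]
  GO.id2ancestors('GO:0000001')             # => ["GO:0048308", ...]
  GO.id2namespace('GO:0000001')             # => "biological_process"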

lib/rbbt/sources/gscholar.rb ADDED
require 'mechanize'

module GoogleScholar
  def self.user_agent
    @@agent ||= Mechanize.new
  end

  def self.citation_link(title)
    # Get citation page
    user_agent.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
      article = page.search('div[@class=gs_r]').first
      return nil if article.nil?

      return article.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
    end
  end

  def self.full_text_url(title)
    # Get page
    user_agent.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
      article = page.search('div[@class=gs_r]').first
      return nil if article.nil?

      link = article.search('a').select{ |link|
        link['href'] =~ /\.pdf$/ || link['href'] =~ /type=pdf/
      }.first

      return nil if link.nil?

      return link['href']
    end
  end

  def self.number_cites(title)
    link = citation_link title
    return 0 if link.nil?

    link.inner_html =~ /(\d+)$/

    return $1.to_i
  end
end

#def get_citers(title)
#  puts title
#  citation_link = nil
#
#  # Get citation page
#  $a.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
#    citation_link = page.search('div[@class=gs_r]').first.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
#  end
#
#  return [] if citation_link.nil?
#
#  # Parse citations
#
#  citers = []
#  $a.get("http://scholar.google.es" + citation_link['href']) do |page|
#    citers = page.search('div[@class=gs_r]').collect do |entry|
#      entry.search('h3').first.search('a').first.inner_html
#    end
#  end
#
#  return citers
#end
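
A usage sketch for GoogleScholar; the title is taken from the PubMed test below, and the results depend entirely on the current Google Scholar markup, so treat this as illustrative only.

  require 'rbbt/sources/gscholar'

  title = "Discovering semantic features in the literature: a foundation for building functional associations"
  GoogleScholar.number_cites(title)    # => Integer (0 if no citation link is found)
  GoogleScholar.full_text_url(title)   # => URL of a PDF link, or nil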

lib/rbbt/sources/organism.rb ADDED
require 'rbbt-util'
module Organism
  class OrganismNotProcessedError < StandardError; end

  def self.datadir(org)
    File.join(Rbbt.datadir, 'organisms', org)
  end
end

lib/rbbt/sources/pubmed.rb ADDED
require 'rbbt-util'
require 'libxml'

# This module offers an interface with PubMed, to perform queries, and
# retrieve simple information from articles. It uses the caching
# services of Rbbt.
module PubMed

  private

  @@pubmed_lag = 1

  def self.get_online(pmids)
    pmid_list = ( pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s )
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"

    xml = Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed")

    articles = xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten

    if pmids.is_a? Array
      list = {}
      articles.each{|article|
        pmid = article.scan(/<PMID>(.*?)<\/PMID>/).flatten.first
        list[pmid] = article
      }
      return list
    else
      return articles.first
    end
  end

  public

  # Processes the XML of an article as served by MedLine and extracts
  # the abstract, title and journal information
  class Article

    XML_KEYS = [
      [:title    , "ArticleTitle"],
      [:journal  , "Journal/Title"],
      [:issue    , "Journal/JournalIssue/Issue"],
      [:volume   , "Journal/JournalIssue/Volume"],
      [:issn     , "Journal/ISSN"],
      [:year     , "Journal/JournalIssue/PubDate/Year"],
      [:month    , "Journal/JournalIssue/PubDate/Month"],
      [:pages    , "Pagination/MedlinePgn"],
      [:abstract , "Abstract/AbstractText"],
    ]

    PMC_PDF_URL = "http://www.ncbi.nlm.nih.gov/pmc/articles/PMCID/pdf/"

    def self.escape_title(title)
      title.gsub(/(\w*[A-Z][A-Z]+\w*)/, '{\1}')
    end

    def self.make_bibentry(lastname, year, title)
      words = title.downcase.scan(/\w+/)
      if words.first.length > 3
        abrev = words.first
      else
        abrev = words[0..2].collect{|w| w.chars.first} * ""
      end
      [lastname.gsub(/\s/,'_'), year || "NOYEAR", abrev] * ""
    end

    def self.parse_xml(xml)
      parser  = LibXML::XML::Parser.string(xml)
      pubmed  = parser.parse.find("/PubmedArticle").first
      medline = pubmed.find("MedlineCitation").first
      article = medline.find("Article").first

      info = {}

      info[:pmid] = medline.find("PMID").first.content

      XML_KEYS.each do |p|
        name, key = p
        node = article.find(key).first

        next if node.nil?

        info[name] = node.content
      end

      bibentry = nil
      info[:author] = article.find("AuthorList/Author").collect do |author|
        begin
          lastname = author.find("LastName").first.content
          if author.find("ForeName").first.nil?
            forename = nil
          else
            forename = author.find("ForeName").first.content.split(/\s/).collect{|word| if word.length == 1; then word + '.'; else word; end} * " "
          end
          bibentry ||= make_bibentry lastname, info[:year], info[:title]
        rescue
        end
        [lastname, forename] * ", "
      end * " and "

      info[:bibentry] = bibentry.downcase if bibentry

      info[:pmc_pdf] = pubmed.find("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc"}.first

      if info[:pmc_pdf]
        info[:pmc_pdf] = PMC_PDF_URL.sub(/PMCID/, info[:pmc_pdf].content)
      end

      info
    end

    attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url
    attr_accessor(*XML_KEYS.collect{|p| p.first })

    def initialize(xml)
      if xml && ! xml.empty?
        info = PubMed::Article.parse_xml xml
        info.each do |key, value|
          self.send("#{ key }=", value)
        end
      end
    end

    def pdf_url
      return pmc_pdf if pmc_pdf
      @gscholar_pdf ||= GoogleScholar::full_text_url title
    end

    def full_text
      return nil if pdf_url.nil?

      text = nil
      TmpFile.with_file do |pdf|
        # Change user-agent, oh well...
        `wget --user-agent=firefox "#{ pdf_url }" -O "#{ pdf }"`
        TmpFile.with_file do |txt|
          `pdftotext "#{ pdf }" "#{ txt }"`
          text = Open.read(txt) if File.exists? txt
        end
      end

      text
    end

    def bibtex
      keys = [:author] + XML_KEYS.collect{|p| p.first } - [:bibentry]
      bibtex = "@article{#{bibentry},\n"

      keys.each do |key|
        next if self.send(key).nil?

        case key
        when :title
          bibtex += "  title = { #{ PubMed::Article.escape_title title } },\n"
        when :issue
          bibtex += "  number = { #{ issue } },\n"
        else
          bibtex += "  #{ key } = { #{ self.send(key) } },\n"
        end
      end

      bibtex += "  fulltext = { #{ pdf_url } },\n" if pdf_url
      bibtex += "  pmid = { #{ pmid } }\n}"

      bibtex
    end

    # Join the text from title and abstract
    def text
      [title, abstract].join("\n")
    end
  end

  # Returns the Article object containing the information for the PubMed
  # ID specified as an argument. If +pmid+ is an array instead of a single
  # identifier it returns a hash with the Article object for each id.
  # It uses the Rbbt cache to save the articles xml.
  def self.get_article(pmid)

    if pmid.is_a? Array
      missing = []
      list = {}

      pmid.each{|p|
        filename = p.to_s + '.xml'
        if File.exists? FileCache.path(filename)
          list[p] = Article.new(Open.read(FileCache.path(filename)))
        else
          missing << p
        end
      }

      return list unless missing.any?
      chunk_size = [100, missing.length].min
      chunks = (missing.length.to_f / chunk_size).ceil

      articles = {}
      chunks.times do |chunk|
        # Take non-overlapping slices of chunk_size pmids
        pmids = missing[chunk * chunk_size, chunk_size]
        articles.merge!(get_online(pmids))
      end

      articles.each{|p, xml|
        filename = p + '.xml'
        FileCache.add(filename, xml)
        list[p] = Article.new(xml)
      }

      return list
    else
      filename = pmid.to_s + '.xml'

      if File.exists? FileCache.path(filename)
        return Article.new(Open.read(FileCache.path(filename)))
      else
        xml = get_online(pmid)
        FileCache.add(filename, xml)

        return Article.new(xml)
      end
    end
  end

  # Performs the specified query and returns an array with the PubMed
  # Ids returned. +retmax+ can be used to limit the number of ids
  # returned; if it is not specified, 30000 is used.
  def self.query(query, retmax = nil)
    retmax ||= 30000

    Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmax=#{retmax}&db=pubmed&term=#{query}", :quiet => true, :nocache => true).scan(/<Id>(\d+)<\/Id>/).flatten
  end
end
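
A usage sketch of the PubMed interface; the PubMed id and query come from the PubMed test below, and both calls go through NCBI eutils with Rbbt's cache.

  require 'rbbt/sources/pubmed'

  article = PubMed.get_article('16438716')
  article.title    # => "Discovering semantic features in the literature: ..."
  article.bibtex   # => "@article{...}" BibTeX entry built from the XML fields
  PubMed.query('chagoyen[All Fields]', 100)   # => array of up to 100 matching PubMed ids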

test/rbbt/sources/test_biomart.rb ADDED
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
require 'rbbt/sources/biomart'
require 'test/unit'

class TestBioMart < Test::Unit::TestCase

  def test_get
    assert_raise BioMart::QueryError do
      BioMart.get('scerevisiae_gene_ensembl', 'entrezgene', ['protein_id'], ['with_unknownattr'])
    end

    data = BioMart.get('scerevisiae_gene_ensembl', 'entrezgene', ['protein_id'], [], nil, :nocache => true, :wget_options => { :quiet => false})
    assert(data['856452']['protein_id'].include? 'AAB68382')

    data = BioMart.get('scerevisiae_gene_ensembl', 'entrezgene', ['external_gene_id'], [], data, :nocache => true, :wget_options => { :quiet => false})
    assert(data['856452']['protein_id'].include? 'AAB68382')
    assert(data['856452']['external_gene_id'].include? 'CUP1-2')
  end

  def test_query
    data = BioMart.query('scerevisiae_gene_ensembl', 'entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => true, :wget_options => { :quiet => false})

    assert(data['856452']['protein_id'].include? 'AAB68382')
    assert(data['856452']['external_gene_id'].include? 'CUP1-2')
  end
end

test/rbbt/sources/test_entrez.rb ADDED
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
require 'rbbt/sources/entrez'
require 'test/unit'

class TestEntrez < Test::Unit::TestCase
  $yeast_tax = 559292

  def test_entrez2native
    tax = $yeast_tax
    fix = proc{|line| line.sub(/SGD:S0/,'S0') }
    select = proc{|line| line.match(/\tSGD:S0/)}
    lexicon = Entrez.entrez2native(tax, :fix => fix, :select => select)

    assert(lexicon['855611'].include? 'S000005056')
  end

  def test_entrez2pubmed
    tax = $yeast_tax

    data = Entrez.entrez2pubmed(tax)
    assert(data['850320'].include? '15102838')
  end

  def test_getonline
    geneids = 9129

    assert_match(/PRP3 pre-mRNA processing factor/s, Entrez.get_online(geneids))

    geneids = [9129, 9]
    assert_match(/PRP3 pre-mRNA processing factor/s, Entrez.get_online(geneids)[9129])
  end

  def test_getgene
    geneids = 9129
    assert_equal([["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]], Entrez.get_gene(geneids).description)

    geneids = [9129, 728049]
    assert_equal([["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]], Entrez.get_gene(geneids)[9129].description)
  end

  def test_similarity
    assert(Entrez.gene_text_similarity(9129, "PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)") > 0)
    assert_equal(0, Entrez.gene_text_similarity("NON EXISTEN GENEID", "PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"))
  end
end

test/rbbt/sources/test_go.rb ADDED
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')

require 'rbbt/sources/go'
require 'test/unit'

class TestGo < Test::Unit::TestCase

  def test_go
    assert_match('vacuole inheritance', GO::id2name('GO:0000011'))
    assert_equal(['vacuole inheritance', 'alpha-glucoside transport'], GO::id2name(['GO:0000011', 'GO:0000017']))
  end

  def test_ancestors
    assert GO.id2ancestors('GO:0000001').include? 'GO:0048308'
  end

  def test_namespace
    assert_equal 'biological_process', GO.id2namespace('GO:0000001')
  end
end

test/rbbt/sources/test_pubmed.rb ADDED
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')

require 'rbbt/sources/pubmed'
require 'test/unit'

class TestPubMed < Test::Unit::TestCase

  def test_get_online
    pmid = '16438716'
    assert(PubMed.get_online(pmid) =~ /Discovering semantic features in the literature: a foundation for building functional associations./)

    pmids = ['16438716', 17204154]
    assert(PubMed.get_online(pmids)[pmid] =~ /Discovering semantic features in the literature: a foundation for building functional associations./)
  end

  def test_get_article
    pmid = '16438716'
    assert(PubMed.get_article(pmid).title == "Discovering semantic features in the literature: a foundation for building functional associations.")

    pmids = ['16438716', 17204154]
    assert(PubMed.get_article(pmids)[pmid].title == "Discovering semantic features in the literature: a foundation for building functional associations.")
  end

  def test_full_text
    pmid = '16438716'
    assert(PubMed.get_article(pmid).full_text =~ /Discovering/)
  end

  def test_query
    assert(PubMed.query('chagoyen[All Fields] AND ("loattrfull text"[sb] AND hasabstract[text])').include? '16438716')
  end

  def test_bibentry
    assert_equal("vazquez2008sent", PubMed::Article.make_bibentry('vazquez', 2008, "SENT: Semantic features in text"))
    assert_equal("vazquez2008aes", PubMed::Article.make_bibentry('vazquez', 2008, "An Example System"))
  end
end

test/test_helper.rb ADDED
require 'test/unit'
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))

metadata ADDED
--- !ruby/object:Gem::Specification
name: rbbt-sources
version: !ruby/object:Gem::Version
  hash: 27
  prerelease: false
  segments:
  - 0
  - 1
  - 0
  version: 0.1.0
platform: ruby
authors:
- Miguel Vazquez
autorequire:
bindir: bin
cert_chain: []

date: 2010-12-01 00:00:00 +01:00
default_executable:
dependencies:
- !ruby/object:Gem::Dependency
  name: rbbt-util
  prerelease: false
  requirement: &id001 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        hash: 3
        segments:
        - 0
        version: "0"
  type: :runtime
  version_requirements: *id001
- !ruby/object:Gem::Dependency
  name: mechanize
  prerelease: false
  requirement: &id002 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        hash: 3
        segments:
        - 0
        version: "0"
  type: :runtime
  version_requirements: *id002
- !ruby/object:Gem::Dependency
  name: libxml-ruby
  prerelease: false
  requirement: &id003 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        hash: 3
        segments:
        - 0
        version: "0"
  type: :runtime
  version_requirements: *id003
description: Data sources like PubMed, Entrez Gene, or Gene Ontology
email: miguel.vazquez@fdi.ucm.es
executables: []

extensions: []

extra_rdoc_files: []

files:
- lib/rbbt/sources/bibtex.rb
- lib/rbbt/sources/biomart.rb
- lib/rbbt/sources/entrez.rb
- lib/rbbt/sources/go.rb
- lib/rbbt/sources/gscholar.rb
- lib/rbbt/sources/organism.rb
- lib/rbbt/sources/pubmed.rb
- test/rbbt/sources/test_biomart.rb
- test/rbbt/sources/test_entrez.rb
- test/rbbt/sources/test_go.rb
- test/rbbt/sources/test_pubmed.rb
- test/test_helper.rb
has_rdoc: true
homepage: http://github.com/mikisvaz/rbbt-sources
licenses: []

post_install_message:
rdoc_options: []

require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      hash: 3
      segments:
      - 0
      version: "0"
required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      hash: 3
      segments:
      - 0
      version: "0"
requirements: []

rubyforge_project:
rubygems_version: 1.3.7
signing_key:
specification_version: 3
summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)
test_files:
- test/rbbt/sources/test_biomart.rb
- test/rbbt/sources/test_entrez.rb
- test/rbbt/sources/test_go.rb
- test/rbbt/sources/test_pubmed.rb
- test/test_helper.rb