rbbt-sources 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,130 @@
1
# In-memory view of a BibTeX bibliography: entries are kept in a hash indexed
# by their citation key and can be loaded, extended and written back to disk.
class BibTexFile

  # One BibTeX record: its citation key (+name+), its record type (+type+,
  # e.g. "article") and a hash (+info+) from field names to values.
  class Entry

    # Standard fields; #to_s writes these first, in this order.
    FIELDS = %w(pmid title author journal pages number volume year abstract)

    # Readers for the standard fields; they return nil when the field is unset.
    FIELDS.each do |field|
      define_method(field) { @info[field] }
    end

    attr_reader :info, :fields, :name, :type

    # +info+ is stored as given (not copied); its keys at construction time
    # become the list of extra +fields+ this entry answers to.
    def initialize(name, type, info)
      @name   = name
      @type   = type
      @info   = info
      @fields = info.keys
    end

    # Dynamic accessors. <tt>entry.foo = value</tt> stores the chomped value
    # when +foo+ is a standard field or a field the entry was created with;
    # <tt>entry.foo</tt> reads a field the entry was created with. Anything
    # else raises a RuntimeError naming the unknown field.
    def method_missing(name, *args)
      method_name = name.to_s
      if method_name.end_with?('=')
        field = method_name.chomp('=')
        raise "No field named '#{ field }'" unless known_field?(field)
        @info[field] = args[0].chomp
      else
        raise "No field named '#{ name }'" unless @fields.include?(method_name)
        @info[method_name]
      end
    end

    # Keeps respond_to? in sync with method_missing. Without it Ruby probes
    # implicit conversions (to_ary, to_str, ...) through method_missing and the
    # RuntimeError raised there escapes from calls such as Array#flatten.
    def respond_to_missing?(name, include_private = false)
      method_name = name.to_s
      handled = if method_name.end_with?('=')
                  known_field?(method_name.chomp('='))
                else
                  @fields.include?(method_name)
                end
      handled || super
    end

    # Renders the entry as BibTeX text: standard fields first (in FIELDS
    # order), then the remaining fields sorted by name. Standard fields that
    # have no value are left out instead of being written as empty braces.
    def to_s
      lines = ["@#{type}{#{name},"]

      FIELDS.each do |field|
        next if @info[field].nil?
        lines << "  #{field} = {#{@info[field]}},"
      end

      (fields - FIELDS).sort.each do |field|
        lines << "  #{field} = {#{@info[field]}},"
      end

      lines << "}"
      lines.join("\n")
    end

    private

    # True when +field+ may be assigned through the dynamic setter.
    def known_field?(field)
      FIELDS.include?(field) || @fields.include?(field)
    end
  end

  # Removes every curly brace from +string+.
  def self.clean_string(string)
    string.gsub(/[{}]/, '')
  end

  # Splits BibTeX text into an array with the raw text of each entry. An entry
  # runs from its "@type{" header to the first line that starts with "}".
  def self.parse_bibtex(bibtex)
    bibtex.scan(/@\w+\{.*?^\}\s*/m)
  end

  # Parses the raw text of one entry and returns [type, name, info], where
  # +info+ maps each "key = value" field found in the text to its value.
  # Raises ArgumentError when the text has no "@type{name," header.
  def self.parse_entry(entry)
    header = entry.match(/@([^\s]+)\{([^\s]+)\s*,/)
    raise ArgumentError, "Not a BibTeX entry: #{entry.to_s[0, 40].inspect}" if header.nil?
    type, name = header.values_at(1, 2)

    info = {}
    entry.scan(/\s*(.*?)\s*=\s*\{?\s*(.*?)\s*\}?\s*,?\s*$/).each do |key, value|
      info[key.chomp] = value.chomp
    end

    [type.chomp, name.chomp, info]
  end

  # Loads entries from +file+, which may be anything that responds to +read+
  # (an open IO, a StringIO, a Pathname), the path of an existing file, or a
  # string holding the BibTeX text itself. Returns a hash of citation key =>
  # Entry. Raises a RuntimeError for any other kind of input.
  def self.load_file(file)
    text =
      if file.respond_to?(:read)
        file.read
      elsif String === file
        # File.read closes the handle; a string that is not a path is the text.
        File.exist?(file) ? File.read(file) : file
      else
        raise "Input format not recognized"
      end

    entries = {}
    parse_bibtex(text).each do |raw|
      type, name, info = parse_entry(raw)
      entries[name] = Entry.new(name, type, info)
    end
    entries
  end

  def initialize(file)
    @entries = BibTexFile.load_file(file)
  end

  # Writes all entries to +file+, newest year first; entries of the same year
  # are written in descending order of citation key.
  def save(file)
    ordered = @entries.values.sort_by { |e| [e.year.to_i, e.name] }.reverse
    text = ordered.collect { |e| e.to_s } * "\n"

    File.open(file, 'w') { |fout| fout.puts text }
  end

  # Parses the raw text of a single entry and stores it under its citation
  # key, replacing any entry that already uses that key.
  def add(bibtex)
    type, name, info = BibTexFile.parse_entry bibtex
    @entries[name] = BibTexFile::Entry.new name, type, info
  end

  # Citation keys of all the entries.
  def entries
    @entries.keys
  end

  # The Entry stored under citation key +bibentry+, or nil.
  def entry(bibentry)
    @entries[bibentry]
  end
end
124
+
125
# Manual smoke test, run only when this file is executed directly:
#   ruby bibtex.rb [input.bib [citation_key [output.bib]]]
# Each argument falls back to the value that used to be hard-coded here.
if __FILE__ == $0
  input  = ARGV[0] || '/home/miki/git/DrugReview/drug.bib'
  key    = ARGV[1] || "yao2009novel"
  output = ARGV[2] || 'foo.bib'

  b = BibTexFile.new(input)
  puts b.entry(key).to_s
  b.save(output)
end
@@ -0,0 +1,104 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+
4
+ # This module interacts with BioMart. It performs queries to BioMart and
5
+ # synthesises a hash with the results. Note that this module connects to the
6
+ # online BioMart WS using the Open in 'rbbt/util/open' module which offers
7
+ # caching by default. To obtain up to date results you may need to clear the
8
+ # cache from previous queries.
9
module BioMart

  # Raised when the BioMart service answers with a "Query ERROR:" message.
  class BioMart::QueryError < StandardError; end

  # Query template; the <!--...--> placeholders are filled in by BioMart.get.
  @@biomart_query_xml = <<-EOT
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "<!--DATABASE-->" interface = "default" >
<!--FILTERS-->
<!--MAIN-->
<!--ATTRIBUTES-->
</Dataset>
</Query>
  EOT

  # Runs one BioMart query for the +main+ attribute plus the attributes in
  # +attrs+ and merges the answer into +data+ (a new hash when nil), which is
  # also the return value: main value => { attribute => [values] }. When
  # +filters+ is nil a "with_<main>" filter is used. +options+ are handed to
  # Open.read. Raises BioMart::QueryError when the response contains
  # "Query ERROR:".
  #
  # Note: module-level +private+ does not apply to methods defined with
  # <tt>def self.</tt>, so this method is (and always was) publicly callable.
  def self.get(database, main, attrs = nil, filters = nil, data = nil, options = {})
    attrs   ||= []
    filters ||= ["with_#{main}"]
    data    ||= {}

    filter_xml    = filters.collect { |name| "<Filter name = \"#{ name }\" excluded = \"0\"/>" }.join("\n")
    attribute_xml = attrs.collect { |name| "<Attribute name = \"#{ name }\"/>" }.join("\n")

    # Block form of sub! so that backslashes in the substituted text are not
    # interpreted as back-references.
    query = @@biomart_query_xml.dup
    query.sub!(/<!--DATABASE-->/)   { database }
    query.sub!(/<!--FILTERS-->/)    { filter_xml }
    query.sub!(/<!--MAIN-->/)       { "<Attribute name = \"#{main}\" />" }
    query.sub!(/<!--ATTRIBUTES-->/) { attribute_xml }

    response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '), options)
    if response =~ /Query ERROR:/
      raise BioMart::QueryError, response
    end

    # Each line is: main value, then one column per requested attribute.
    response.each_line do |line|
      parts = line.chomp.split(/\t/)
      key   = parts.shift
      next if key.nil? || key.empty?

      data[key] ||= {}
      attrs.each do |name|
        value = parts.shift
        data[key][name] ||= []
        data[key][name] << value unless value.nil?
      end
    end

    data
  end

  # Performs a query in BioMart for a dataset and a given set of attributes.
  # There must be a main attribute, which is used as the key of the result
  # hash; optionally there may be a list of additional attributes and filters.
  # The +data+ parameter is used internally to build the result incrementally
  # (the attributes are requested two at a time, due to a limitation of the
  # BioMart WS that only allows 3 external arguments); users should normally
  # leave it unspecified or nil. The result is a hash whose keys are the
  # different values of the main attribute and whose values are hashes with
  # every other attribute as key and, as value, an array with all its values
  # (for a given value of the main attribute there may be more than one value
  # of another attribute). If +filters+ is left as nil a filter is added to
  # the query to remove results with the main attribute empty; this may cause
  # an error if the BioMart WS does not allow filtering with that attribute.
  def self.query(database, main, attrs = nil, filters = nil, data = nil, options = {})
    attrs ||= []
    data  ||= {}

    chunks = attrs.each_slice(2).to_a
    # With no additional attributes still run one query, so that the values
    # of the main attribute are returned.
    chunks << [] if chunks.empty?

    chunks.each do |chunk|
      data = get(database, main, chunk, filters, data, options)
    end

    data
  end

end
104
+
@@ -0,0 +1,145 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/util/tsv'
3
+ require 'rbbt/bow/bow'
4
+ require 'set'
5
+
6
# Access to Entrez Gene: identifier and PubMed tables taken from the NCBI
# gene_info / gene2pubmed files, and per-gene XML records fetched through the
# NCBI E-utilities and kept in the FileCache.
module Entrez

  Rbbt.add_datafiles "gene_info" => ['databases/entrez', 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'],
    "gene2pubmed" => ["databases/entrez", "ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz" ]

  # Builds a TSV from the gene_info file, restricted (through the :grep
  # option) to the taxonomy id or ids in +taxs+. +options+ override the
  # defaults (:native => 1, :extra => 5, :flatten => true,
  # :persistence => true).
  def self.entrez2native(taxs, options = {})
    options = Misc.add_defaults options, :native => 1, :extra => 5, :flatten => true, :persistence => true

    taxs = [taxs] unless Array === taxs
    options.merge! :grep => taxs

    TSV.new(Rbbt.find_datafile('gene_info'), options)
  end

  # Builds a TSV from the gene2pubmed file, restricted (through the :grep
  # option) to the taxonomy id or ids in +taxs+.
  def self.entrez2pubmed(taxs)
    options = {:native => 1, :extra => 2, :flatten => true, :persistence => true}

    taxs = [taxs] unless taxs.is_a?(Array)
    taxs = taxs.collect{|t| t.to_s}
    options.merge! :grep => taxs

    TSV.new(Rbbt.find_datafile('gene2pubmed'), options)
  end

  # Text fields extracted from the XML record of one gene. Each attribute is
  # the result of String#scan with one capture group, that is, an array of
  # one-element arrays. All attributes are nil when built from a nil record.
  class Gene
    attr_reader :organism, :symbol, :description, :aka, :protnames, :summary, :comentaries

    def initialize(xml)
      return if xml.nil?

      # Lazy captures so that several elements on one line are not merged.
      # The former /s flag selected a character encoding, not dot-all, and
      # has been dropped.
      @organism    = xml.scan(/<Org-ref_taxname>(.*?)<\/Org-ref_taxname>/)
      @symbol      = xml.scan(/<Gene-ref_locus>(.*?)<\/Gene-ref_locus>/)
      @description = xml.scan(/<Gene-ref_desc>(.*?)<\/Gene-ref_desc>/)
      # The closing tag used to be written "<\Gene-ref_syn_E>", where \G is a
      # regexp anchor, so synonyms were never found.
      @aka         = xml.scan(/<Gene-ref_syn_E>(.*?)<\/Gene-ref_syn_E>/)
      @protnames   = xml.scan(/<Prot-ref_name_E>(.*?)<\/Prot-ref_name_E>/)
      @summary     = xml.scan(/<Entrezgene_summary>(.*?)<\/Entrezgene_summary>/)
      @comentaries = xml.scan(/<Gene-commentary_text>(.*?)<\/Gene-commentary_text>/)
    end

    # Joins the text from symbol, description, aka, protnames, and
    # summary. Empty when the gene was built from a nil record.
    def text
      [@symbol, @description, @aka, @protnames, @summary].flatten.compact.join(". ")
    end
  end

  # Fetches the XML records for +geneids+ from the NCBI efetch service,
  # bypassing the Open cache. For a single id returns the XML of the first
  # record (nil when there is none). For an array returns a hash id => XML
  # keyed by the ids as given by the caller: each record is matched to the
  # requested id through its <Gene-track_geneid> element, falling back to the
  # position in the response when that element does not name a requested id.
  #
  # Note: module-level +private+ does not apply to <tt>def self.</tt>
  # methods, so this method is publicly callable.
  def self.get_online(geneids)
    geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"

    xml = Open.read(url, :wget_options => {:quiet => true}, :nocache => true)

    genes = xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/m).flatten
    return genes.first unless geneids.is_a? Array

    requested = {}
    geneids.each { |id| requested[id.to_s] = id }

    list = {}
    genes.each_with_index do |gene, i|
      found = gene[/<Gene-track_geneid>(\d+)<\/Gene-track_geneid>/, 1]
      key   = requested.key?(found) ? requested[found] : geneids[i]
      list[key] = gene
    end
    list
  end

  # Name under which the XML record of gene +id+ is kept in the FileCache.
  def self.gene_filename(id)
    'gene-' + id.to_s + '.xml'
  end

  # Returns the Gene for +geneid+, or a hash id => Gene when +geneid+ is an
  # array (nil ids are skipped). Records found in the FileCache are read from
  # there; the rest are fetched with get_online and added to the cache.
  # Returns nil when +geneid+ is nil.
  def self.get_gene(geneid)
    return nil if geneid.nil?

    if Array === geneid
      missing = []
      list = {}

      geneid.each do |p|
        next if p.nil?
        if FileCache.found(gene_filename p)
          list[p] = Gene.new(Open.read(FileCache.path(gene_filename p)))
        else
          missing << p
        end
      end

      return list unless missing.any?
      genes = get_online(missing)

      genes.each do |p, xml|
        filename = gene_filename p
        FileCache.add(filename,xml) unless FileCache.found(filename)
        list[p] = Gene.new(xml)
      end

      list
    else
      filename = gene_filename geneid

      if FileCache.found(filename)
        Gene.new(Open.read(FileCache.path(filename)))
      else
        xml = get_online(geneid)
        FileCache.add(filename, xml) unless FileCache.found(filename)

        Gene.new(xml)
      end
    end
  end

  # Counts the words in common between a chunk of text and the text
  # found in Entrez Gene for that particular gene, divided by the sum of the
  # number of distinct words on each side. The +gene+ may be a gene
  # identifier (String or Integer) or a Gene instance; anything else gives 0.
  def self.gene_text_similarity(gene, text)
    gene_text =
      case gene
      when Entrez::Gene
        gene.text
      when String, Integer   # Integer: Fixnum was removed in Ruby 3.2
        get_gene(gene).text
      else
        return 0
      end

    gene_words = gene_text.words.to_set
    text_words = text.words.to_set

    return 0 if gene_words.empty? || text_words.empty?

    common = gene_words.intersection(text_words)
    common.length / (gene_words.length + text_words.length).to_f
  end
end
@@ -0,0 +1,84 @@
1
+ require 'rbbt-util'
2
+
3
+ # This module holds helper methods to deal with the Gene Ontology files. Right
4
+ # now all it does is provide a translation from id to the actual names.
5
module GO

  @@info = nil
  # Fields that may appear several times in a term; kept as arrays.
  MULTIPLE_VALUE_FIELDS = %w(is_a)

  # This method needs to be called before any translations can be made, it is
  # called automatically the first time any of the query methods is called.
  # It loads the gene_ontology.obo file and stores, for each [Term] stanza,
  # a hash with all of its "key: value" fields indexed by the term id.
  # The file header and other stanza types (for example [Typedef]) are
  # ignored, so they can neither create an entry without id nor overwrite
  # the fields of the term that precedes them.
  def self.init
    @@info = {}
    obo = File.read(File.join(Rbbt.datadir, 'dbs/go/gene_ontology.obo'))

    obo.split(/^\[Term\]/).each do |stanza|
      # Keep only the text up to the next stanza header of any other type.
      body = stanza.split(/^\[\w+\]/).first.to_s

      term_info = {}
      body.split(/\n/).each do |line|
        next unless line =~ /:/
        key, value = line.chomp.match(/(.*?):(.*)/).values_at(1,2)
        key = key.strip
        if MULTIPLE_VALUE_FIELDS.include? key
          (term_info[key] ||= []) << value.strip
        else
          term_info[key] = value.strip
        end
      end

      next if term_info["id"].nil?
      @@info[term_info["id"]] = term_info
    end
  end

  # Hash term id => hash of fields, loading the ontology if needed.
  def self.info
    self.init unless @@info
    @@info
  end

  # Ids of all the terms.
  def self.goterms
    self.init unless @@info
    @@info.keys
  end

  # Name of term +id+, or nil when unknown. For an array of ids, an array
  # with the name (or nil) of each one, in the same order.
  def self.id2name(id)
    self.init unless @@info
    if id.kind_of? Array
      @@info.values_at(*id).collect{|term| term['name'] if term}
    else
      return nil if @@info[id].nil?
      @@info[id]['name']
    end
  end

  # Ids of the terms that +id+ is declared to be (its is_a fields); an empty
  # array when the term is unknown or has none. For an array of ids, an array
  # with one such list per id, in the same order as the input.
  def self.id2ancestors(id)
    self.init unless @@info
    if id.kind_of? Array
      id.collect{|term_id| ancestors_of(term_id) }
    else
      ancestors_of(id)
    end
  end

  # Namespace of term +id+, or nil when unknown. For an array of ids, an
  # array with the namespace (or nil) of each one, in the same order.
  def self.id2namespace(id)
    self.init unless @@info
    if id.kind_of? Array
      @@info.values_at(*id).collect{|term| term['namespace'] if term}
    else
      return nil if @@info[id].nil?
      @@info[id]['namespace']
    end
  end

  # GO ids found in the is_a fields of a single term.
  def self.ancestors_of(id)
    term = @@info[id]
    return [] if term.nil? || term['is_a'].nil?
    term['is_a'].collect{|parent| parent[/GO:\d+/] }.compact
  end
  private_class_method :ancestors_of

end
@@ -0,0 +1,73 @@
1
+ require 'mechanize'
2
+
3
# Scrapes the first Google Scholar result for an article title to obtain its
# number of citations and a link to its full text.
module GoogleScholar
  require 'uri' # for URI.encode_www_form_component

  # Shared Mechanize agent, created on first use.
  def self.user_agent
    @@a ||= Mechanize.new
  end

  # Searches Google Scholar for +title+ and returns the first result element
  # (a 'div' with class 'gs_r'), or nil when the page has none. The title is
  # escaped so that characters such as '&' or '#' cannot alter the query.
  def self.first_result(title)
    query = URI.encode_www_form_component(title.to_s)
    page  = user_agent.get("http://scholar.google.es/scholar?q='#{ query }'&hl=es&lr=&lr=")
    page.search('div[@class=gs_r]').first
  end

  # Returns the "cited by" link of the first result for +title+: the first
  # link whose href contains "scholar?cites" and whose text ends in a number.
  # nil when there is no result or no such link.
  def self.citation_link(title)
    article = first_result(title)
    return nil if article.nil?

    article.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
  end

  # Returns the href of the first link of the first result for +title+ that
  # ends in ".pdf" or contains "type=pdf", or nil when there is none.
  def self.full_text_url(title)
    article = first_result(title)
    return nil if article.nil?

    link = article.search('a').select{ |l|
      l['href'] =~ /\.pdf$/ || l['href'] =~ /type=pdf/
    }.first

    return nil if link.nil?

    link['href']
  end

  # Number at the end of the text of the citation link for +title+, or 0
  # when there is no citation link.
  def self.number_cites(title)
    link = citation_link title
    return 0 if link.nil?

    link.inner_html[/(\d+)$/, 1].to_i
  end

end
50
+
51
+
52
+ #def get_citers(title)
53
+ # puts title
54
+ # citation_link = nil
55
+ #
56
+ # # Get citation page
57
+ # $a.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
58
+ # citation_link = page.search('div[@class=gs_r]').first.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
59
+ # end
60
+ #
61
+ # return [] if citation_link.nil?
62
+ #
63
+ # # Parse citations
64
+ #
65
+ # citers = []
66
+ # $a.get("http://scholar.google.es" + citation_link['href']) do |page|
67
+ # citers = page.search('div[@class=gs_r]').collect do |entry|
68
+ # entry.search('h3').first.search('a').first.inner_html
69
+ # end
70
+ # end
71
+ #
72
+ # return citers
73
+ #end
@@ -0,0 +1,9 @@
1
+ require 'rbbt-util'
2
# Helpers to locate the data of each organism.
module Organism
  # NOTE(review): nothing visible here raises this error; presumably it
  # signals that the data of an organism has not been prepared - confirm.
  class OrganismNotProcessedError < StandardError; end

  # Path of the directory of organism +org+, which lives in the 'organisms'
  # directory under Rbbt.datadir.
  def self.datadir(org)
    base = Rbbt.datadir
    File.join(base, 'organisms', org)
  end
end
@@ -0,0 +1,239 @@
1
+ require 'rbbt-util'
2
+ require 'libxml'
3
+
4
+ # This module offers an interface with PubMed, to perform queries, and
5
+ # retrieve simple information from articles. It uses the caching
6
+ # services of Rbbt.
7
module PubMed

  # Value passed as :nice to Open.read on every efetch request.
  @@pubmed_lag = 1

  # Fetches the XML of the articles in +pmids+ from the NCBI efetch service,
  # bypassing the Open cache. For a single id returns the XML of the first
  # article (nil when there is none). For an array returns a hash id => XML,
  # keyed by the ids as given by the caller (an article whose PMID was not
  # requested is keyed by the PMID string found in its XML).
  #
  # Note: module-level +private+ does not apply to <tt>def self.</tt>
  # methods, so this method is publicly callable.
  def self.get_online(pmids)
    pmid_list = ( pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s )
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"

    xml = Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed")

    # /m only: the former s and u flags select (conflicting) encodings.
    articles = xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/m).flatten
    return articles.first unless pmids.is_a? Array

    requested = {}
    pmids.each { |id| requested[id.to_s] = id }

    list = {}
    articles.each do |article|
      # The PMID element may carry attributes, e.g. <PMID Version="1">.
      pmid = article[/<PMID[^>]*>(.*?)<\/PMID>/, 1]
      list[requested.fetch(pmid, pmid)] = article
    end
    list
  end

  # Processes the XML of an article as served by MedLine and extracts
  # the abstract, title, authors and journal information.
  class Article

    # Attribute name and the XPath, relative to the Article element, of the
    # node that holds its value.
    XML_KEYS = [
      [:title    , "ArticleTitle"],
      [:journal  , "Journal/Title"],
      [:issue    , "Journal/JournalIssue/Issue"],
      [:volume   , "Journal/JournalIssue/Volume"],
      [:issn     , "Journal/ISSN"],
      [:year     , "Journal/JournalIssue/PubDate/Year"],
      [:month    , "Journal/JournalIssue/PubDate/Month"],
      [:pages    , "Pagination/MedlinePgn"],
      [:abstract , "Abstract/AbstractText"],
    ]

    # PMCID is replaced by the PubMed Central id of the article.
    PMC_PDF_URL = "http://www.ncbi.nlm.nih.gov/pmc/articles/PMCID/pdf/"

    # Wraps in braces every word that has two or more consecutive capital
    # letters.
    def self.escape_title(title)
      title.gsub(/(\w*[A-Z][A-Z]+\w*)/, '{\1}')
    end

    # Builds a citation key: the last name (whitespace turned into '_'), the
    # year (or "NOYEAR") and an abbreviation of the title, which is its first
    # word when longer than three characters, otherwise the initials of its
    # first three words. A nil or empty title contributes nothing.
    def self.make_bibentry(lastname, year, title)
      words = title.to_s.downcase.scan(/\w+/)
      abrev =
        if words.empty?
          ""
        elsif words.first.length > 3
          words.first
        else
          words[0..2].collect{|w| w.chars.first} * ""
        end
      [lastname.gsub(/\s/,'_'), year || "NOYEAR", abrev] * ""
    end

    # Parses the XML of one PubmedArticle and returns a hash with :pmid, the
    # XML_KEYS attributes that are present, :author (authors written as
    # "Last, Fore" and joined with " and "), :bibentry (citation key built
    # from the first author with a last name, downcased; absent when there is
    # none) and :pmc_pdf (PDF URL in PubMed Central, nil when the article has
    # no "pmc" id).
    def self.parse_xml(xml)
      parser  = LibXML::XML::Parser.string(xml)
      pubmed  = parser.parse.find("/PubmedArticle").first
      medline = pubmed.find("MedlineCitation").first
      article = medline.find("Article").first

      info = {}

      info[:pmid] = medline.find("PMID").first.content

      XML_KEYS.each do |name, key|
        node = article.find(key).first
        info[name] = node.content unless node.nil?
      end

      bibentry = nil
      authors = article.find("AuthorList/Author").collect do |author|
        lastname_node = author.find("LastName").first
        # Authors without a last name are skipped rather than turned into
        # an empty ", " item.
        next if lastname_node.nil?
        lastname = lastname_node.content

        forename_node = author.find("ForeName").first
        forename = forename_node &&
          forename_node.content.split(/\s/).collect{|word| word.length == 1 ? word + '.' : word } * " "

        bibentry ||= make_bibentry lastname, info[:year], info[:title]
        [lastname, forename].compact * ", "
      end
      info[:author] = authors.compact * " and "

      info[:bibentry] = bibentry.downcase if bibentry

      pmc_id = pubmed.find("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc"}.first
      info[:pmc_pdf] = pmc_id && PMC_PDF_URL.sub(/PMCID/, pmc_id.content)

      info
    end

    attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url
    attr_accessor(*XML_KEYS.collect{|p| p.first })

    # Fills the attributes from +xml+; a nil or empty +xml+ leaves them unset.
    def initialize(xml)
      if xml && ! xml.empty?
        info = PubMed::Article.parse_xml xml
        info.each do |key, value|
          self.send("#{ key }=", value)
        end
      end
    end

    # URL of the PDF: the PubMed Central one when known, otherwise the one
    # found (and remembered) through GoogleScholar.full_text_url.
    # NOTE(review): nothing visible here loads GoogleScholar; presumably the
    # caller must have required 'rbbt/sources/gscholar' - confirm.
    def pdf_url
      return pmc_pdf if pmc_pdf
      @gscholar_pdf ||= GoogleScholar::full_text_url title
    end

    # Downloads the PDF with wget, converts it with pdftotext and returns the
    # resulting text; nil when there is no PDF URL or no text was produced.
    # The commands are run without a shell, so the URL (which comes from a
    # remote page) cannot inject shell commands.
    def full_text
      url = pdf_url
      return nil if url.nil?

      text = nil
      TmpFile.with_file do |pdf|

        # Change user-agent, oh well...
        system('wget', '--user-agent=firefox', url.to_s, '-O', pdf)
        TmpFile.with_file do |txt|
          system('pdftotext', pdf, txt)
          text = Open.read(txt) if File.exist? txt
        end
      end

      text
    end

    # BibTeX @article record for this article: the author and every XML_KEYS
    # attribute that is set (the issue is written as "number"), then the
    # full text URL when there is one, then the pmid.
    def bibtex
      keys = [:author] + XML_KEYS.collect{|p| p.first } - [:bibentry]
      bibtex = "@article{#{bibentry},\n"

      keys.each do |key|
        value = self.send(key)
        next if value.nil?

        case key
        when :title
          bibtex << "  title = { #{ PubMed::Article.escape_title value } },\n"
        when :issue
          bibtex << "  number = { #{ value } },\n"
        else
          bibtex << "  #{ key } = { #{ value } },\n"
        end
      end

      url = pdf_url
      bibtex << "  fulltext = { #{ url } },\n" if url
      bibtex << "  pmid = { #{ pmid } }\n}"

      bibtex
    end

    # Join the text from title and abstract
    def text
      [title, abstract].join("\n")
    end
  end

  # Returns the Article object containing the information for the PubMed
  # ID specified as an argument. If +pmid+ is an array instead of a single
  # identifier it returns a hash with the Article object for each id, keyed
  # by the ids as given. It uses the Rbbt cache to save the articles' XML;
  # missing articles are requested in groups of at most 100.
  def self.get_article(pmid)

    if pmid.is_a? Array
      missing = []
      list = {}

      pmid.each do |p|
        filename = p.to_s + '.xml'
        if File.exist? FileCache.path(filename)
          list[p] = Article.new(Open.read(FileCache.path(filename)))
        else
          missing << p
        end
      end

      return list unless missing.any?

      articles = {}
      # each_slice gives disjoint groups; the former inclusive ranges made
      # consecutive groups share one id.
      missing.each_slice(100) do |pmids|
        articles.merge!(get_online(pmids))
      end

      articles.each do |p, xml|
        FileCache.add(p.to_s + '.xml', xml)
        list[p] = Article.new(xml)
      end

      list
    else
      filename = pmid.to_s + '.xml'

      if File.exist? FileCache.path(filename)
        Article.new(Open.read(FileCache.path(filename)))
      else
        xml = get_online(pmid)
        FileCache.add(filename,xml)

        Article.new(xml)
      end
    end
  end

  # Performs the specified query and returns an array with the PubMed
  # Ids returned. +retmax+ can be used to limit the number of ids
  # returned, if it is not specified 30000 is used.
  def self.query(query, retmax=nil)
    retmax ||= 30000

    Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmax=#{retmax}&db=pubmed&term=#{query}",:quiet => true, :nocache => true).scan(/<Id>(\d+)<\/Id>/).flatten
  end
end
@@ -0,0 +1,30 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/sources/biomart'
3
+ require 'test/unit'
4
+
5
# Tests against the online BioMart service.
class TestBioMart < Test::Unit::TestCase

  # BioMart.get: an unknown filter raises QueryError; a result hash can be
  # passed back in to accumulate a second attribute.
  def test_get
    assert_raise(BioMart::QueryError) {
      BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
    }

    proteins = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => true, :wget_options => { :quiet => false})
    assert(proteins['856452']['protein_id'].include?('AAB68382'))

    combined = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'],[], proteins, :nocache => true, :wget_options => { :quiet => false} )
    gene = combined['856452']
    assert(gene['protein_id'].include?('AAB68382'))
    assert(gene['external_gene_id'].include?('CUP1-2'))
  end

  # BioMart.query: more attributes than fit in a single request.
  def test_query
    attributes = ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id']
    result = BioMart.query('scerevisiae_gene_ensembl','entrezgene', attributes, [], nil, :nocache => true, :wget_options => { :quiet => false})

    gene = result['856452']
    assert(gene['protein_id'].include?('AAB68382'))
    assert(gene['external_gene_id'].include?('CUP1-2'))
  end

end
29
+
30
+
@@ -0,0 +1,48 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
2
+ require 'rbbt/sources/entrez'
3
+ require 'test/unit'
4
+
5
# Tests against the NCBI data files and the online Entrez Gene service.
class TestEntrez < Test::Unit::TestCase
  # Taxonomy id used by the table tests.
  $yeast_tax = 559292

  def test_entrez2native
    tax = $yeast_tax
    fix = proc{|line| line.sub(/SGD:S0/,'S0') }
    select = proc{|line| line.match(/\tSGD:S0/)}
    lexicon = Entrez.entrez2native(tax, :fix => fix, :select => select)

    assert(lexicon['855611'].include? 'S000005056')
  end

  def test_entrez2pubmed
    tax = $yeast_tax

    data = Entrez.entrez2pubmed(tax)
    assert(data['850320'].include? '15102838')
  end

  # The /s flag was removed from the patterns below: in Ruby it selects a
  # character encoding for the regexp, it does not make '.' match newlines.
  def test_getonline
    geneids = 9129

    assert_match(/PRP3 pre-mRNA processing factor/, Entrez.get_online(geneids))

    geneids = [9129,9]
    assert_match(/PRP3 pre-mRNA processing factor/, Entrez.get_online(geneids)[9129])
  end

  def test_getgene
    geneids = 9129
    assert_equal([["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]], Entrez.get_gene(geneids).description)

    geneids = [9129, 728049]
    assert_equal([["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]], Entrez.get_gene(geneids)[9129].description)
  end

  def test_similarity
    assert(Entrez.gene_text_similarity(9129, "PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)") > 0)
    assert_equal(0, Entrez.gene_text_similarity("NON EXISTEN GENEID", "PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"))
  end

end
47
+
48
+
@@ -0,0 +1,24 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
2
+
3
+ require 'rbbt/sources/go'
4
+ require 'test/unit'
5
+
6
# Tests for the GO module; they need the gene_ontology.obo data file.
class TestGo < Test::Unit::TestCase

  # Translation of one id and of a list of ids into names.
  def test_go
    single = GO.id2name('GO:0000011')
    assert_match('vacuole inheritance', single)

    several = GO.id2name(['GO:0000011','GO:0000017'])
    assert_equal(['vacuole inheritance','alpha-glucoside transport'], several)
  end

  def test_ancestors
    ancestors = GO.id2ancestors('GO:0000001')
    assert(ancestors.include?('GO:0048308'))
  end

  def test_namespace
    assert_equal('biological_process', GO.id2namespace('GO:0000001'))
  end

end
23
+
24
+
@@ -0,0 +1,39 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
2
+
3
+ require 'rbbt/sources/pubmed'
4
+ require 'test/unit'
5
+
6
# Tests against the online PubMed service (and, for the full text, against
# whatever site serves the PDF).
class TestPubMed < Test::Unit::TestCase

  def test_get_online
    pmid = '16438716'
    assert(PubMed.get_online(pmid) =~ /Discovering semantic features in the literature: a foundation for building functional associations./)

    pmids = ['16438716', 17204154]
    assert(PubMed.get_online(pmids)[pmid] =~ /Discovering semantic features in the literature: a foundation for building functional associations./)
  end

  def test_get_article
    pmid = '16438716'
    assert(PubMed.get_article(pmid).title == "Discovering semantic features in the literature: a foundation for building functional associations.")

    pmids = ['16438716', 17204154]
    assert(PubMed.get_article(pmids)[pmid].title == "Discovering semantic features in the literature: a foundation for building functional associations.")
  end

  def test_full_text
    pmid = '16438716'
    assert(PubMed.get_article(pmid).full_text =~ /Discovering/)
  end

  def test_query
    assert(PubMed.query('chagoyen[All Fields] AND ("loattrfull text"[sb] AND hasabstract[text])').include? '16438716')
  end

  # assert_equal rather than assert: assert(expected, actual) treats the
  # second argument as the failure message and passes for any truthy first
  # argument, so the citation keys were never actually compared.
  def test_bibentry
    assert_equal("vazquez2008sent", PubMed::Article.make_bibentry('vazquez', 2008, "SENT: Semantic features in text"))
    assert_equal("vazquez2008aes", PubMed::Article.make_bibentry('vazquez', 2008, "An Example System"))
  end
end
38
+
39
+
@@ -0,0 +1,4 @@
1
+ require 'test/unit'
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
4
+
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rbbt-sources
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Miguel Vazquez
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-12-01 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: rbbt-util
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ - !ruby/object:Gem::Dependency
36
+ name: mechanize
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ hash: 3
44
+ segments:
45
+ - 0
46
+ version: "0"
47
+ type: :runtime
48
+ version_requirements: *id002
49
+ - !ruby/object:Gem::Dependency
50
+ name: libxml-ruby
51
+ prerelease: false
52
+ requirement: &id003 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ hash: 3
58
+ segments:
59
+ - 0
60
+ version: "0"
61
+ type: :runtime
62
+ version_requirements: *id003
63
+ description: Data sources like PubMed, Entrez Gene, or Gene Ontology
64
+ email: miguel.vazquez@fdi.ucm.es
65
+ executables: []
66
+
67
+ extensions: []
68
+
69
+ extra_rdoc_files: []
70
+
71
+ files:
72
+ - lib/rbbt/sources/bibtex.rb
73
+ - lib/rbbt/sources/biomart.rb
74
+ - lib/rbbt/sources/entrez.rb
75
+ - lib/rbbt/sources/go.rb
76
+ - lib/rbbt/sources/gscholar.rb
77
+ - lib/rbbt/sources/organism.rb
78
+ - lib/rbbt/sources/pubmed.rb
79
+ - test/rbbt/sources/test_biomart.rb
80
+ - test/rbbt/sources/test_entrez.rb
81
+ - test/rbbt/sources/test_go.rb
82
+ - test/rbbt/sources/test_pubmed.rb
83
+ - test/test_helper.rb
84
+ has_rdoc: true
85
+ homepage: http://github.com/mikisvaz/rbbt-sources
86
+ licenses: []
87
+
88
+ post_install_message:
89
+ rdoc_options: []
90
+
91
+ require_paths:
92
+ - lib
93
+ required_ruby_version: !ruby/object:Gem::Requirement
94
+ none: false
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ hash: 3
99
+ segments:
100
+ - 0
101
+ version: "0"
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ none: false
104
+ requirements:
105
+ - - ">="
106
+ - !ruby/object:Gem::Version
107
+ hash: 3
108
+ segments:
109
+ - 0
110
+ version: "0"
111
+ requirements: []
112
+
113
+ rubyforge_project:
114
+ rubygems_version: 1.3.7
115
+ signing_key:
116
+ specification_version: 3
117
+ summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)
118
+ test_files:
119
+ - test/rbbt/sources/test_biomart.rb
120
+ - test/rbbt/sources/test_entrez.rb
121
+ - test/rbbt/sources/test_go.rb
122
+ - test/rbbt/sources/test_pubmed.rb
123
+ - test/test_helper.rb