rbbt-sources 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/sources/bibtex.rb +130 -0
- data/lib/rbbt/sources/biomart.rb +104 -0
- data/lib/rbbt/sources/entrez.rb +145 -0
- data/lib/rbbt/sources/go.rb +84 -0
- data/lib/rbbt/sources/gscholar.rb +73 -0
- data/lib/rbbt/sources/organism.rb +9 -0
- data/lib/rbbt/sources/pubmed.rb +239 -0
- data/test/rbbt/sources/test_biomart.rb +30 -0
- data/test/rbbt/sources/test_entrez.rb +48 -0
- data/test/rbbt/sources/test_go.rb +24 -0
- data/test/rbbt/sources/test_pubmed.rb +39 -0
- data/test/test_helper.rb +4 -0
- metadata +123 -0
@@ -0,0 +1,130 @@
|
|
1
|
+
# In-memory representation of a BibTeX bibliography file: parse, edit and
# serialize entries.
class BibTexFile

  # One BibTeX entry: @type{name, field = {value}, ...}.
  class Entry

    # Standard fields, emitted first (in this order) by #to_s.
    FIELDS = %w(pmid title author journal pages number volume year abstract)

    FIELDS.each do |field|
      define_method(field, proc{@info[field]})
    end

    attr_reader :info, :fields, :name, :type

    # name:: entry key; type:: entry type (e.g. "article");
    # info:: hash of field name => value.
    def initialize(name, type, info)
      @name = name
      @type = type
      @info = info
      @fields = info.keys
    end

    # Dynamic field accessors: +entry.journal+ reads, +entry.journal = v+
    # writes. Writing is only allowed for FIELDS or fields parsed from the
    # original entry; anything else raises.
    def method_missing(name, *args)
      if name.to_s =~ /(.*)=$/
        if (FIELDS + @fields).include?($1.to_s)
          return @info[$1.to_s] = args[0].chomp
        else
          raise "No field named '#{ $1 }'"
        end
      else
        if @fields.include?(name.to_s)
          return @info[name.to_s]
        else
          raise "No field named '#{ name }'"
        end
      end
    end

    # Keep respond_to? consistent with method_missing (readers for parsed
    # fields, writers for FIELDS and parsed fields).
    def respond_to_missing?(name, include_private = false)
      base = name.to_s.sub(/=$/, '')
      (FIELDS + @fields).include?(base) || super
    end

    # Renders the entry back to BibTeX text. Standard FIELDS are printed
    # even when empty (preserving historic output); extra fields follow in
    # alphabetical order.
    def to_s
      str = "@#{type}{#{name},\n"

      FIELDS.each do |field|
        str += " #{field} = {#{@info[field]}},\n"
      end

      (fields - FIELDS).sort.each do |field|
        str += " #{field} = {#{@info[field]}},\n"
      end

      str += "}"

      str
    end
  end

  # Removes the braces BibTeX uses to protect capitalization.
  def self.clean_string(string)
    string.gsub(/[{}]/,'')
  end

  # Splits a BibTeX document into individual entry strings; an entry runs
  # from '@type{' up to a closing '}' at the start of a line.
  def self.parse_bibtex(bibtex)
    bibtex.scan(/@\w+\{.*?^\}\s*/m)
  end

  # Parses one entry string. Returns [type, name, info] where info is a
  # hash of field name => value (braces and trailing commas stripped).
  def self.parse_entry(entry)
    info = {}

    type, name = entry.match(/@([^\s]+)\{([^\s]+)\s*,/).values_at(1,2)

    entry.scan(/\s*(.*?)\s*=\s*\{?\s*(.*?)\s*\}?\s*,?\s*$/).each do |pair|
      info[pair.first.chomp] = pair.last.chomp
    end

    [ type.chomp, name.chomp, info]
  end

  # Loads entries from an IO, a filename, or a raw BibTeX string, returning
  # a hash of entry name => Entry.
  #
  # Fixes over the previous version: the IO case is checked first
  # (File.exists? raises TypeError when handed an IO), and File.read is
  # used instead of File.open(...).read, which leaked the file handle.
  def self.load_file(file)
    entries = {}

    case
    when IO === file
      self.parse_bibtex file.read
    when String === file
      if File.exist?(file)
        self.parse_bibtex File.read(file)
      else
        self.parse_bibtex file
      end
    else
      raise "Input format not recognized"
    end.each do |entry|
      type, name, info = self.parse_entry entry
      entries[name] = Entry.new name, type, info
    end

    entries
  end

  def initialize(file)
    @entries = BibTexFile.load_file(file)
  end

  # Writes every entry to +file+, newest year first, ties broken by entry
  # name, separated by blank lines.
  def save(file)
    text = entries.collect{|e| entry e }.sort{|a,b|
      if a.year.to_i != b.year.to_i
        a.year.to_i <=> b.year.to_i
      else
        a.name <=> b.name
      end
    }.reverse.collect do |entry|
      entry.to_s
    end * "\n"

    File.open(file, 'w') do |fout| fout.puts text end
  end

  # Parses a raw BibTeX entry string and registers it under its name.
  def add(bibtex)
    type, name, info = BibTexFile.parse_entry bibtex
    @entries[name] = BibTexFile::Entry.new name, type, info
  end

  # Names of all loaded entries.
  def entries
    @entries.keys
  end

  # The Entry registered under +bibentry+, or nil.
  def entry(bibentry)
    @entries[bibentry]
  end
end
|
124
|
+
|
125
|
+
if __FILE__ == $0
  # Ad-hoc smoke test: load a local bibliography, print one known entry,
  # then write the whole collection back out.
  bibliography = BibTexFile.new('/home/miki/git/DrugReview/drug.bib')
  puts bibliography.entry("yao2009novel").to_s
  bibliography.save('foo.bib')
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/util/open'
|
3
|
+
|
4
|
+
# This module interacts with BioMart. It performs queries to BioMart and
|
5
|
+
# synthesises a hash with the results. Note that this module connects to the
|
6
|
+
# online BioMart WS using the Open in 'rbbt/util/open' module which offers
|
7
|
+
# caching by default. To obtain up to date results you may need to clear the
|
8
|
+
# cache from previous queries.
|
9
|
+
module BioMart

  class BioMart::QueryError < StandardError; end
  private

  # XML template for the BioMart web service; the <!--...--> placeholders
  # are substituted per query.
  @@biomart_query_xml = <<-EOT
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "<!--DATABASE-->" interface = "default" >
<!--FILTERS-->
<!--MAIN-->
<!--ATTRIBUTES-->
</Dataset>
</Query>
  EOT

  # Performs a single BioMart request for +main+ plus the given +attrs+,
  # accumulating the TSV response into +data+ (hash keyed by the values of
  # +main+; each value is a hash attr => array of values). Raises
  # BioMart::QueryError when the web service reports an error. Used
  # internally by .query, which splits attributes into chunks.
  def self.get(database, main, attrs = nil, filters = nil, data = nil, options = {})
    attrs   ||= []
    filters ||= ["with_#{main}"]
    data    ||= {}

    xml = @@biomart_query_xml.dup
    xml.sub!(/<!--DATABASE-->/, database)
    xml.sub!(/<!--FILTERS-->/, filters.collect{|name| "<Filter name = \"#{ name }\" excluded = \"0\"/>"}.join("\n"))
    xml.sub!(/<!--MAIN-->/, "<Attribute name = \"#{main}\" />")
    xml.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n"))

    response = Open.read('http://www.biomart.org/biomart/martservice?query=' + xml.gsub(/\n/, ' '), options)

    raise BioMart::QueryError, response if response =~ /Query ERROR:/

    response.each_line do |line|
      values = line.chomp.split(/\t/)
      key = values.shift
      next if key.nil? || key.empty?

      data[key] ||= {}
      attrs.each do |attr|
        value = values.shift
        data[key][attr] ||= []
        data[key][attr] << value unless value.nil?
      end
    end

    data
  end

  public

  # This method performs a query in biomart for a dataset and a given set
  # of attributes. There must be a main attribute, used as the key of the
  # result hash; optionally a list of additional attributes and filters.
  # The +data+ parameter is used internally to build the result
  # incrementally (the BioMart WS only allows a few external attributes per
  # request), so callers normally leave it nil. The result maps each value
  # of the main attribute to a hash of attribute => array of values (one
  # main value may have several values per attribute). If +filters+ is nil
  # a "with_<main>" filter is added to drop rows with an empty main
  # attribute, which may fail on datasets that do not support it.
  def self.query(database, main, attrs = nil, filters = nil, data = nil, options = {})
    attrs ||= []
    data  ||= {}

    # Request attributes two at a time; each call folds into +data+.
    attrs.each_slice(2) do |pair|
      data = get(database, main, pair, filters, data, options)
    end

    data
  end

end
|
104
|
+
|
@@ -0,0 +1,145 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/util/tsv'
|
3
|
+
require 'rbbt/bow/bow'
|
4
|
+
require 'set'
|
5
|
+
|
6
|
+
# Access to NCBI Entrez Gene: local datafile lookups (gene_info,
# gene2pubmed) and online retrieval of gene XML records, with FileCache
# caching.
module Entrez

  Rbbt.add_datafiles "gene_info" => ['databases/entrez', 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'],
    "gene2pubmed" => ["databases/entrez", "ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz" ]

  # TSV mapping Entrez Gene ids to native (organism specific) ids for the
  # given taxonomy id(s), read from the gene_info datafile.
  def self.entrez2native(taxs, options = {})
    options = Misc.add_defaults options, :native => 1, :extra => 5, :flatten => true, :persistence => true

    taxs = [taxs] unless Array === taxs
    options.merge! :grep => taxs

    TSV.new(Rbbt.find_datafile('gene_info'), options)
  end

  # TSV mapping Entrez Gene ids to PubMed ids for the given taxonomy
  # id(s), read from the gene2pubmed datafile.
  def self.entrez2pubmed(taxs)
    options = {:native => 1, :extra => 2, :flatten => true, :persistence => true}

    taxs = [taxs] unless taxs.is_a?(Array)
    taxs = taxs.collect{|t| t.to_s}
    options.merge! :grep => taxs

    TSV.new(Rbbt.find_datafile('gene2pubmed'), options)
  end

  # Lightweight view over an Entrez Gene XML record, extracted with
  # regular expressions rather than a full XML parse.
  class Gene
    attr_reader :organism, :symbol, :description, :aka, :protnames, :summary, :comentaries

    def initialize(xml)
      return if xml.nil?

      @organism    = xml.scan(/<Org-ref_taxname>(.*)<\/Org-ref_taxname>/s)
      @symbol      = xml.scan(/<Gene-ref_locus>(.*)<\/Gene-ref_locus>/s)
      @description = xml.scan(/<Gene-ref_desc>(.*)<\/Gene-ref_desc>/s)
      # Fix: the closing tag was written '<\Gene-ref_syn_E>' (missing the
      # slash), so synonyms were never captured.
      @aka         = xml.scan(/<Gene-ref_syn_E>(.*)<\/Gene-ref_syn_E>/s)
      @protnames   = xml.scan(/<Prot-ref_name_E>(.*)<\/Prot-ref_name_E>/s)
      @summary     = xml.scan(/<Entrezgene_summary>(.*)<\/Entrezgene_summary>/s)
      @comentaries = xml.scan(/<Gene-commentary_text>(.*)<\/Gene-commentary_text>/s)
    end

    # Joins the text from symbol, description, aka, protnames, and
    # summary
    def text
      [@symbol, @description, @aka, @protnames, @summary].flatten.join(". ")
    end
  end

  private

  # Downloads gene XML from NCBI efetch. For an Array input returns a hash
  # geneid => xml; otherwise the single record's xml.
  # NOTE(review): the Array branch pairs records with ids by position,
  # which assumes efetch returns exactly one record per requested id, in
  # request order — confirm against the efetch behavior for missing ids.
  def self.get_online(geneids)
    geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"

    xml = Open.read(url, :wget_options => {:quiet => true}, :nocache => true)

    genes = xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten

    if geneids.is_a? Array
      list = {}
      genes.each_with_index{|gene,i|
        geneid = geneids[i]
        list[geneid] = gene
      }
      return list
    else
      return genes.first
    end
  end

  public

  # Cache filename used in the FileCache for a gene id.
  def self.gene_filename(id)
    'gene-' + id.to_s + '.xml'
  end

  # Returns a Gene for +geneid+ (or a hash geneid => Gene for Array
  # input), consulting the FileCache before downloading.
  def self.get_gene(geneid)
    return nil if geneid.nil?

    if Array === geneid
      missing = []
      list = {}

      geneid.each{|p|
        next if p.nil?
        if FileCache.found(gene_filename p)
          list[p] = Gene.new(Open.read(FileCache.path(gene_filename p)))
        else
          missing << p
        end
      }

      return list unless missing.any?
      genes = get_online(missing)

      genes.each{|p, xml|
        filename = gene_filename p
        FileCache.add(filename, xml) unless FileCache.found(filename)
        list[p] = Gene.new(xml)
      }

      return list
    else
      filename = gene_filename geneid

      if FileCache.found(filename)
        return Gene.new(Open.read(FileCache.path(filename)))
      else
        xml = get_online(geneid)
        FileCache.add(filename, xml) unless FileCache.found(filename)

        return Gene.new(xml)
      end
    end
  end

  # Counts the words in common between a chunk of text and the text
  # found in Entrez Gene for that particular gene. The +gene+ may be a
  # gene identifier or a Gene class instance. Returns 0 for unusable
  # input or empty texts; otherwise |common| / (|gene| + |text|).
  def self.gene_text_similarity(gene, text)

    case
    when Entrez::Gene === gene
      gene_text = gene.text
    # Integer replaces the removed Fixnum class (covers Bignum too, so
    # this is backward compatible).
    when String === gene || Integer === gene
      gene_text = get_gene(gene).text
    else
      return 0
    end

    gene_words = gene_text.words.to_set
    text_words = text.words.to_set

    return 0 if gene_words.empty? || text_words.empty?

    common = gene_words.intersection(text_words)
    common.length / (gene_words.length + text_words.length).to_f
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
|
3
|
+
# This module holds helper methods to deal with the Gene Ontology files. Right
|
4
|
+
# now all it does is provide a translation form id to the actual names.
|
5
|
+
# This module holds helper methods to deal with the Gene Ontology files.
# Right now all it does is provide a translation from id to the actual
# names, namespaces and is_a ancestors.
module GO

  @@info = nil
  # Fields that may appear several times per [Term] stanza and are
  # therefore collected into arrays.
  MULTIPLE_VALUE_FIELDS = %w(is_a)

  # This method needs to be called before any translations can be made; it
  # is called automatically the first time any accessor below is used. It
  # loads the gene_ontology.obo file and caches every [Term] stanza's
  # key/value pairs in @@info, keyed by GO id.
  def self.init
    @@info = {}
    # File.read avoids the file-handle leak of File.open(...).read.
    File.read(File.join(Rbbt.datadir, 'dbs/go/gene_ontology.obo')).
      split(/\[Term\]/).
      each{|term|
        term_info = {}
        term.split(/\n/).
          select{|l| l =~ /:/}.
          each{|l|
            key, value = l.chomp.match(/(.*?):(.*)/).values_at(1,2)
            if MULTIPLE_VALUE_FIELDS.include? key.strip
              term_info[key.strip] ||= []
              term_info[key.strip] << value.strip
            else
              term_info[key.strip] = value.strip
            end
          }
        @@info[term_info["id"]] = term_info
      }
  end

  # The complete id => stanza-hash cache.
  def self.info
    self.init unless @@info
    @@info
  end

  # All known GO term ids.
  def self.goterms
    self.init unless @@info
    @@info.keys
  end

  # Translates a GO id (or array of ids) to term name(s); nil for unknown
  # ids.
  def self.id2name(id)
    self.init unless @@info
    if id.kind_of? Array
      @@info.values_at(*id).collect{|i| i['name'] if i}
    else
      return nil if @@info[id].nil?
      @@info[id]['name']
    end
  end

  # Direct is_a ancestors for a GO id (or array of ids); [] when unknown
  # or with no is_a field.
  def self.id2ancestors(id)
    self.init unless @@info
    if id.kind_of? Array
      @@info.values_at(*id).
        select{|i| ! i['is_a'].nil?}.
        # Block parameter renamed from 'id' to avoid shadowing the method
        # argument.
        collect{|i| i['is_a'].collect{|parent|
          parent.match(/(GO:\d+)/)[1] if parent.match(/(GO:\d+)/)
        }.compact
      }
    else
      return [] if @@info[id].nil? || @@info[id]['is_a'].nil?
      @@info[id]['is_a'].
        collect{|parent|
          parent.match(/(GO:\d+)/)[1] if parent.match(/(GO:\d+)/)
        }.compact
    end
  end

  # Namespace (e.g. biological_process) for a GO id or array of ids.
  def self.id2namespace(id)
    self.init unless @@info
    if id.kind_of? Array
      @@info.values_at(*id).collect{|i| i['namespace'] if i}
    else
      return nil if @@info[id].nil?
      @@info[id]['namespace']
    end
  end

end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
|
3
|
+
# Scrapes Google Scholar search results to find citation counts and
# full-text PDF links for an article title.
module GoogleScholar
  # Shared, lazily-created Mechanize agent for all lookups.
  def self.user_agent
    @@a ||= Mechanize.new
  end

  # Returns the "cited by" link node of the first search result matching
  # +title+, or nil when there is no result. (The dead local
  # 'citation_link = nil' from the previous version was removed: the
  # method always returns from inside the block.)
  def self.citation_link(title)
    # Get citation page
    user_agent.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
      article = page.search('div[@class=gs_r]').first
      return nil if article.nil?

      return article.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
    end
  end

  # Returns the URL of a full-text PDF advertised for the first search
  # result matching +title+, or nil when none is found.
  def self.full_text_url(title)
    # Get page
    user_agent.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
      article = page.search('div[@class=gs_r]').first
      return nil if article.nil?

      link = article.search('a').select{ |link|
        link['href'] =~ /\.pdf$/ || link['href'] =~ /type=pdf/
      }.first

      return nil if link.nil?

      return link['href']
    end
  end

  # Number of citations Google Scholar reports for +title+; 0 when the
  # article or its citation link cannot be found.
  def self.number_cites(title)

    link = citation_link title
    return 0 if link.nil?

    link.inner_html =~ /(\d+)$/

    return $1.to_i
  end

end
|
50
|
+
|
51
|
+
|
52
|
+
#def get_citers(title)
|
53
|
+
# puts title
|
54
|
+
# citation_link = nil
|
55
|
+
#
|
56
|
+
# # Get citation page
|
57
|
+
# $a.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
|
58
|
+
# citation_link = page.search('div[@class=gs_r]').first.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
|
59
|
+
# end
|
60
|
+
#
|
61
|
+
# return [] if citation_link.nil?
|
62
|
+
#
|
63
|
+
# # Parse citations
|
64
|
+
#
|
65
|
+
# citers = []
|
66
|
+
# $a.get("http://scholar.google.es" + citation_link['href']) do |page|
|
67
|
+
# citers = page.search('div[@class=gs_r]').collect do |entry|
|
68
|
+
# entry.search('h3').first.search('a').first.inner_html
|
69
|
+
# end
|
70
|
+
# end
|
71
|
+
#
|
72
|
+
# return citers
|
73
|
+
#end
|
@@ -0,0 +1,239 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'libxml'
|
3
|
+
|
4
|
+
# This module offers an interface with PubMed, to perform queries, and
|
5
|
+
# retrieve simple information from articles. It uses the caching
|
6
|
+
# services of Rbbt.
|
7
|
+
# This module offers an interface with PubMed, to perform queries, and
# retrieve simple information from articles. It uses the caching
# services of Rbbt.
module PubMed

  private
  # Politeness lag (seconds) between PubMed requests.
  @@pubmed_lag = 1

  # Fetches MedLine XML for one pmid or an Array of pmids. For an Array it
  # returns a hash pmid => xml (keyed by the PMID found inside each
  # record); otherwise the single record's xml.
  def self.get_online(pmids)

    pmid_list = ( pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s )
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"

    xml = Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed")

    articles = xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten

    if pmids.is_a? Array
      list = {}
      articles.each{|article|
        pmid = article.scan(/<PMID>(.*?)<\/PMID>/).flatten.first
        list[pmid] = article
      }
      return list
    else
      return articles.first
    end

  end

  public

  # Processes the xml with an articles as served by MedLine and extracts
  # the abstract, title and journal information
  class Article

    # [accessor name, XPath relative to the Article node].
    XML_KEYS = [
      [:title    , "ArticleTitle"],
      [:journal  , "Journal/Title"],
      [:issue    , "Journal/JournalIssue/Issue"],
      [:volume   , "Journal/JournalIssue/Volume"],
      [:issn     , "Journal/ISSN"],
      [:year     , "Journal/JournalIssue/PubDate/Year"],
      [:month    , "Journal/JournalIssue/PubDate/Month"],
      [:pages    , "Pagination/MedlinePgn"],
      [:abstract , "Abstract/AbstractText"],
    ]

    PMC_PDF_URL = "http://www.ncbi.nlm.nih.gov/pmc/articles/PMCID/pdf/"

    # Protects runs of capitals with braces so BibTeX keeps their
    # capitalization.
    def self.escape_title(title)
      title.gsub(/(\w*[A-Z][A-Z]+\w*)/, '{\1}')
    end

    # Builds a bibentry key such as "vazquez2008sent": lastname + year
    # (or NOYEAR) + first title word, or the initials of the first three
    # words when the first word is short.
    def self.make_bibentry(lastname, year, title)
      words = title.downcase.scan(/\w+/)
      if words.empty?
        # Robustness fix: an empty title used to raise NoMethodError.
        abrev = ""
      elsif words.first.length > 3
        abrev = words.first
      else
        abrev = words[0..2].collect{|w| w.chars.first} * ""
      end
      [lastname.gsub(/\s/,'_'), year || "NOYEAR", abrev] * ""
    end

    # Parses a PubmedArticle XML record with LibXML and returns a hash of
    # the XML_KEYS fields plus :pmid, :author, :bibentry and :pmc_pdf.
    def self.parse_xml(xml)
      parser = LibXML::XML::Parser.string(xml)
      pubmed = parser.parse.find("/PubmedArticle").first
      medline = pubmed.find("MedlineCitation").first
      article = medline.find("Article").first

      info = {}

      info[:pmid] = medline.find("PMID").first.content

      XML_KEYS.each do |p|
        name, key = p
        node = article.find(key).first

        next if node.nil?

        info[name] = node.content
      end

      bibentry = nil
      info[:author] = article.find("AuthorList/Author").collect do |author|
        begin
          lastname = author.find("LastName").first.content
          if author.find("ForeName").first.nil?
            forename = nil
          else
            # Abbreviate single-letter name parts with a trailing dot.
            forename = author.find("ForeName").first.content.split(/\s/).collect{|word| if word.length == 1; then word + '.'; else word; end} * " "
          end
          bibentry ||= make_bibentry lastname, info[:year], info[:title]
        rescue
        end
        [lastname, forename] * ", "
      end * " and "

      info[:bibentry] = bibentry.downcase if bibentry

      info[:pmc_pdf] = pubmed.find("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc"}.first

      if info[:pmc_pdf]
        info[:pmc_pdf] = PMC_PDF_URL.sub(/PMCID/, info[:pmc_pdf].content)
      end

      info
    end

    attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url
    attr_accessor *XML_KEYS.collect{|p| p.first }

    def initialize(xml)
      if xml && ! xml.empty?
        info = PubMed::Article.parse_xml xml
        info.each do |key, value|
          self.send("#{ key }=", value)
        end
      end
    end

    # PDF location: PubMed Central when available, otherwise a memoized
    # Google Scholar lookup.
    def pdf_url
      return pmc_pdf if pmc_pdf
      @gscholar_pdf ||= GoogleScholar::full_text_url title
    end

    # Downloads the PDF and extracts its text with pdftotext; nil when no
    # PDF is available.
    def full_text
      return nil if pdf_url.nil?

      text = nil
      TmpFile.with_file do |pdf|

        # Change user-agent, oh well...
        `wget --user-agent=firefox #{ pdf_url } -O #{ pdf }`
        TmpFile.with_file do |txt|
          `pdftotext #{ pdf } #{ txt }`
          text = Open.read(txt) if File.exist? txt
        end
      end

      text
    end

    # Renders the article as a BibTeX @article entry.
    def bibtex
      keys = [:author] + XML_KEYS.collect{|p| p.first } - [:bibentry]
      bibtex = "@article{#{bibentry},\n"

      keys.each do |key|
        next if self.send(key).nil?

        case key

        when :title
          bibtex += " title = { #{ PubMed::Article.escape_title title } },\n"

        when :issue
          bibtex += " number = { #{ issue } },\n"

        else
          bibtex += " #{ key } = { #{ self.send(key) } },\n"
        end

      end

      bibtex += " fulltext = { #{ pdf_url } },\n" if pdf_url
      bibtex += " pmid = { #{ pmid } }\n}"


      bibtex
    end

    # Join the text from title and abstract
    def text
      [title, abstract].join("\n")
    end
  end

  # Returns the Article object containing the information for the PubMed
  # ID specified as an argument. If +pmid+ is an array instead of a single
  # identifier it returns an hash with the Article object for each id.
  # It uses the Rbbt cache to save the articles xml.
  def self.get_article(pmid)

    if pmid.is_a? Array
      missing = []
      list = {}

      pmid.each{|p|
        filename = p.to_s + '.xml'
        if File.exist? FileCache.path(filename)
          list[p] = Article.new(Open.read(FileCache.path(filename)))
        else
          missing << p
        end
      }

      return list unless missing.any?
      chunk_size = [100, missing.length].min
      chunks = (missing.length.to_f / chunk_size).ceil

      articles = {}
      chunks.times do |chunk|
        # Fix: the previous inclusive range missing[a..b] overlapped
        # consecutive chunks by one element, fetching boundary pmids
        # twice. Array#slice(start, length) partitions cleanly.
        pmids = missing[chunk * chunk_size, chunk_size]
        articles.merge!(get_online(pmids))
      end

      articles.each{|p, xml|
        filename = p + '.xml'
        FileCache.add(filename,xml)
        list[p] = Article.new(xml)
      }

      return list

    else
      filename = pmid.to_s + '.xml'

      if File.exist? FileCache.path(filename)
        return Article.new(Open.read(FileCache.path(filename)))
      else
        xml = get_online(pmid)
        FileCache.add(filename,xml)

        return Article.new(xml)
      end
    end
  end

  # Performs the specified query and returns an array with the PubMed
  # Ids returned. +retmax+ can be used to limit the number of ids
  # returned, if is not specified 30000 is used.
  def self.query(query, retmax=nil)
    retmax ||= 30000

    Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmax=#{retmax}&db=pubmed&term=#{query}",:quiet => true, :nocache => true).scan(/<Id>(\d+)<\/Id>/).flatten
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../test_helper'
|
2
|
+
require 'rbbt/sources/biomart'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
class TestBioMart < Test::Unit::TestCase

  # An unknown filter must surface as a BioMart::QueryError; valid
  # requests accumulate attribute values keyed by the main attribute.
  def test_get
    assert_raise BioMart::QueryError do
      BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
    end

    opts = { :nocache => true, :wget_options => { :quiet => false } }

    data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'], [], nil, opts)
    assert(data['856452']['protein_id'].include? 'AAB68382')

    # A second call folds new attributes into the same hash.
    data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'], [], data, opts)
    assert(data['856452']['protein_id'].include? 'AAB68382')
    assert(data['856452']['external_gene_id'].include? 'CUP1-2')
  end

  # .query chunks attributes internally but returns the same shape.
  def test_query
    attributes = ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id']
    data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', attributes, [], nil, :nocache => true, :wget_options => { :quiet => false})

    assert(data['856452']['protein_id'].include? 'AAB68382')
    assert(data['856452']['external_gene_id'].include? 'CUP1-2')
  end

end
|
29
|
+
|
30
|
+
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
|
2
|
+
require 'rbbt/sources/entrez'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
class TestEntrez < Test::Unit::TestCase
  # Saccharomyces cerevisiae taxonomy id, shared across tests.
  $yeast_tax = 559292

  def test_entrez2native
    fix    = proc{|line| line.sub(/SGD:S0/,'S0') }
    select = proc{|line| line.match(/\tSGD:S0/)}
    lexicon = Entrez.entrez2native($yeast_tax, :fix => fix, :select => select)

    assert(lexicon['855611'].include? 'S000005056')
  end

  def test_entrez2pubmed
    pubmed = Entrez.entrez2pubmed($yeast_tax)
    assert(pubmed['850320'].include? '15102838')
  end

  # Single ids return raw xml; arrays return a hash keyed by id.
  def test_getonline
    assert_match(/PRP3 pre-mRNA processing factor/s, Entrez.get_online(9129))
    assert_match(/PRP3 pre-mRNA processing factor/s, Entrez.get_online([9129, 9])[9129])
  end

  def test_getgene
    expected = [["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]]
    assert_equal(expected, Entrez.get_gene(9129).description)
    assert_equal(expected, Entrez.get_gene([9129, 728049])[9129].description)
  end

  def test_similarity
    text = "PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"
    assert(Entrez.gene_text_similarity(9129, text) > 0)
    assert_equal(0, Entrez.gene_text_similarity("NON EXISTEN GENEID", text))
  end

end
|
47
|
+
|
48
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
|
2
|
+
|
3
|
+
require 'rbbt/sources/go'
|
4
|
+
require 'test/unit'
|
5
|
+
|
6
|
+
class TestGo < Test::Unit::TestCase

  # id2name resolves a single id as well as an array of ids.
  def test_go
    assert_match('vacuole inheritance', GO::id2name('GO:0000011'))

    names = GO::id2name(['GO:0000011','GO:0000017'])
    assert_equal(['vacuole inheritance','alpha-glucoside transport'], names)
  end

  def test_ancestors
    assert GO.id2ancestors('GO:0000001').include? 'GO:0048308'
  end

  def test_namespace
    assert_equal 'biological_process', GO.id2namespace('GO:0000001')
  end

end
|
23
|
+
|
24
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
|
2
|
+
|
3
|
+
require 'rbbt/sources/pubmed'
|
4
|
+
require 'test/unit'
|
5
|
+
|
6
|
+
class TestPubMed < Test::Unit::TestCase

  def test_get_online
    pmid = '16438716'
    title_re = /Discovering semantic features in the literature: a foundation for building functional associations./

    assert(PubMed.get_online(pmid) =~ title_re)

    # Array form returns a hash keyed by pmid (as a String).
    assert(PubMed.get_online(['16438716', 17204154])[pmid] =~ title_re)
  end

  def test_get_article
    pmid  = '16438716'
    title = "Discovering semantic features in the literature: a foundation for building functional associations."

    assert(PubMed.get_article(pmid).title == title)
    assert(PubMed.get_article(['16438716', 17204154])[pmid].title == title)
  end

  def test_full_text
    assert(PubMed.get_article('16438716').full_text =~ /Discovering/)
  end

  def test_query
    results = PubMed.query('chagoyen[All Fields] AND ("loattrfull text"[sb] AND hasabstract[text])')
    assert(results.include? '16438716')
  end

  def test_bibentry
    assert("vazquez2008sent", PubMed::Article.make_bibentry('vazquez', 2008, "SENT: Semantic features in text"))
    assert("vazquez2008aes", PubMed::Article.make_bibentry('vazquez', 2008, "An Example System"))
  end
end
|
38
|
+
|
39
|
+
|
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rbbt-sources
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Miguel Vazquez
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-12-01 00:00:00 +01:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: rbbt-util
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: mechanize
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
44
|
+
segments:
|
45
|
+
- 0
|
46
|
+
version: "0"
|
47
|
+
type: :runtime
|
48
|
+
version_requirements: *id002
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: libxml-ruby
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
hash: 3
|
58
|
+
segments:
|
59
|
+
- 0
|
60
|
+
version: "0"
|
61
|
+
type: :runtime
|
62
|
+
version_requirements: *id003
|
63
|
+
description: Data sources like PubMed, Entrez Gene, or Gene Ontology
|
64
|
+
email: miguel.vazquez@fdi.ucm.es
|
65
|
+
executables: []
|
66
|
+
|
67
|
+
extensions: []
|
68
|
+
|
69
|
+
extra_rdoc_files: []
|
70
|
+
|
71
|
+
files:
|
72
|
+
- lib/rbbt/sources/bibtex.rb
|
73
|
+
- lib/rbbt/sources/biomart.rb
|
74
|
+
- lib/rbbt/sources/entrez.rb
|
75
|
+
- lib/rbbt/sources/go.rb
|
76
|
+
- lib/rbbt/sources/gscholar.rb
|
77
|
+
- lib/rbbt/sources/organism.rb
|
78
|
+
- lib/rbbt/sources/pubmed.rb
|
79
|
+
- test/rbbt/sources/test_biomart.rb
|
80
|
+
- test/rbbt/sources/test_entrez.rb
|
81
|
+
- test/rbbt/sources/test_go.rb
|
82
|
+
- test/rbbt/sources/test_pubmed.rb
|
83
|
+
- test/test_helper.rb
|
84
|
+
has_rdoc: true
|
85
|
+
homepage: http://github.com/mikisvaz/rbbt-sources
|
86
|
+
licenses: []
|
87
|
+
|
88
|
+
post_install_message:
|
89
|
+
rdoc_options: []
|
90
|
+
|
91
|
+
require_paths:
|
92
|
+
- lib
|
93
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
94
|
+
none: false
|
95
|
+
requirements:
|
96
|
+
- - ">="
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
hash: 3
|
99
|
+
segments:
|
100
|
+
- 0
|
101
|
+
version: "0"
|
102
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
103
|
+
none: false
|
104
|
+
requirements:
|
105
|
+
- - ">="
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
hash: 3
|
108
|
+
segments:
|
109
|
+
- 0
|
110
|
+
version: "0"
|
111
|
+
requirements: []
|
112
|
+
|
113
|
+
rubyforge_project:
|
114
|
+
rubygems_version: 1.3.7
|
115
|
+
signing_key:
|
116
|
+
specification_version: 3
|
117
|
+
summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)
|
118
|
+
test_files:
|
119
|
+
- test/rbbt/sources/test_biomart.rb
|
120
|
+
- test/rbbt/sources/test_entrez.rb
|
121
|
+
- test/rbbt/sources/test_go.rb
|
122
|
+
- test/rbbt/sources/test_pubmed.rb
|
123
|
+
- test/test_helper.rb
|