rbbt 1.2.1 → 1.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/sources/gscholar.rb +74 -0
- data/lib/rbbt/sources/organism.rb +3 -2
- data/lib/rbbt/sources/pubmed.rb +107 -7
- data/lib/rbbt/util/open.rb +13 -0
- data/test/rbbt/util/test_open.rb +13 -0
- metadata +45 -25
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
|
3
|
+
|
4
|
+
module GoogleScholar
|
5
|
+
def self.user_agent
|
6
|
+
@@a ||= Mechanize.new
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.citation_link(title)
|
10
|
+
citation_link = nil
|
11
|
+
|
12
|
+
# Get citation page
|
13
|
+
user_agent.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
|
14
|
+
article = page.search('div[@class=gs_r]').first
|
15
|
+
return nil if article.nil?
|
16
|
+
|
17
|
+
return article.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.full_text_url(title)
|
22
|
+
full_text_link = nil
|
23
|
+
|
24
|
+
# Get page
|
25
|
+
user_agent.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
|
26
|
+
article = page.search('div[@class=gs_r]').first
|
27
|
+
return nil if article.nil?
|
28
|
+
|
29
|
+
link = article.search('a').select{ |link|
|
30
|
+
link['href'] =~ /\.pdf$/ || link['href'] =~ /type=pdf/
|
31
|
+
}.first
|
32
|
+
|
33
|
+
return nil if link.nil?
|
34
|
+
|
35
|
+
return link['href']
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
|
40
|
+
def self.number_cites(title)
|
41
|
+
|
42
|
+
link = citation_link title
|
43
|
+
return 0 if link.nil?
|
44
|
+
|
45
|
+
link.inner_html =~ /(\d+)$/
|
46
|
+
|
47
|
+
return $1.to_i
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
51
|
+
|
52
|
+
|
53
|
+
#def get_citers(title)
|
54
|
+
# puts title
|
55
|
+
# citation_link = nil
|
56
|
+
#
|
57
|
+
# # Get citation page
|
58
|
+
# $a.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
|
59
|
+
# citation_link = page.search('div[@class=gs_r]').first.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
|
60
|
+
# end
|
61
|
+
#
|
62
|
+
# return [] if citation_link.nil?
|
63
|
+
#
|
64
|
+
# # Parse citations
|
65
|
+
#
|
66
|
+
# citers = []
|
67
|
+
# $a.get("http://scholar.google.es" + citation_link['href']) do |page|
|
68
|
+
# citers = page.search('div[@class=gs_r]').collect do |entry|
|
69
|
+
# entry.search('h3').first.search('a').first.inner_html
|
70
|
+
# end
|
71
|
+
# end
|
72
|
+
#
|
73
|
+
# return citers
|
74
|
+
#end
|
@@ -127,12 +127,12 @@ module Organism
|
|
127
127
|
if i == 0
|
128
128
|
i += 1
|
129
129
|
next unless l=~/^\s*#/
|
130
|
-
|
130
|
+
formats = Open.fields(l.sub(/^[\s#]+/,'')).collect{|n| n.strip}
|
131
131
|
return formats unless examples
|
132
132
|
next
|
133
133
|
end
|
134
134
|
|
135
|
-
if Open.fields(l).select{|name| name && name =~ /\w/}.length > examples.length
|
135
|
+
if Open.fields(l).select{|name| name && name =~ /\w/}.length > examples.compact.length
|
136
136
|
examples = Open.fields(l).collect{|name| name.split(/\|/).first}
|
137
137
|
end
|
138
138
|
i += 1
|
@@ -216,6 +216,7 @@ module Organism
|
|
216
216
|
first = nil
|
217
217
|
if native
|
218
218
|
first = id_position(supported,native,options)
|
219
|
+
raise "No match for native format '#{ native }'"
|
219
220
|
else
|
220
221
|
first = 0
|
221
222
|
end
|
data/lib/rbbt/sources/pubmed.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'rbbt/util/filecache'
|
2
2
|
require 'rbbt/util/open'
|
3
|
+
require 'rbbt/sources/gscholar'
|
3
4
|
require 'rbbt'
|
5
|
+
require 'libxml'
|
4
6
|
|
5
7
|
# This module offers an interface with PubMed, to perform queries, and
|
6
8
|
# retrieve simple information from articles. It uses the caching
|
@@ -42,17 +44,115 @@ module PubMed
|
|
42
44
|
# Processes the XML of an article as served by MedLine and extracts
|
43
45
|
# the abstract, title and journal information
|
44
46
|
class Article
|
45
|
-
|
47
|
+
|
48
|
+
|
49
|
+
XML_KEYS = [
|
50
|
+
[:title , "ArticleTitle"],
|
51
|
+
[:journal , "Journal/Title"],
|
52
|
+
[:issue , "Journal/JournalIssue/Issue"],
|
53
|
+
[:volume , "Journal/JournalIssue/Volume"],
|
54
|
+
[:issn , "Journal/ISSN"],
|
55
|
+
[:year , "Journal/JournalIssue/PubDate/Year"],
|
56
|
+
[:pages , "Pagination/MedlinePgn"],
|
57
|
+
[:abstract , "Abstract/AbstractText"],
|
58
|
+
]
|
59
|
+
|
60
|
+
PMC_PDF_URL = "http://www.ncbi.nlm.nih.gov/pmc/articles/PMCID/pdf/"
|
61
|
+
|
62
|
+
def self.escape_title(title)
|
63
|
+
title.gsub(/(\w*[A-Z][A-Z]+\w*)/, '{\1}')
|
64
|
+
end
|
65
|
+
|
66
|
+
def self.parse_xml(xml)
|
67
|
+
parser = LibXML::XML::Parser.string(xml)
|
68
|
+
pubmed = parser.parse.find("/PubmedArticle").first
|
69
|
+
medline = pubmed.find("MedlineCitation").first
|
70
|
+
article = medline.find("Article").first
|
71
|
+
|
72
|
+
info = {}
|
73
|
+
|
74
|
+
info[:pmid] = medline.find("PMID").first.content
|
75
|
+
|
76
|
+
XML_KEYS.each do |p|
|
77
|
+
name, key = p
|
78
|
+
node = article.find(key).first
|
79
|
+
|
80
|
+
next if node.nil?
|
81
|
+
|
82
|
+
info[name] = node.content
|
83
|
+
end
|
84
|
+
|
85
|
+
bibentry = nil
|
86
|
+
info[:author] = article.find("AuthorList/Author").collect do |author|
|
87
|
+
lastname = author.find("LastName").first.content
|
88
|
+
if author.find("ForeName").first.nil?
|
89
|
+
forename = nil
|
90
|
+
else
|
91
|
+
forename = author.find("ForeName").first.content.split(/\s/).collect{|word| if word.length == 1; then word + '.'; else word; end} * " "
|
92
|
+
end
|
93
|
+
bibentry ||= [lastname, (info[:year] || "NOYEAR"), info[:title].scan(/\w+/)[0]] * ""
|
94
|
+
[lastname, forename] * ", "
|
95
|
+
end * " and "
|
96
|
+
|
97
|
+
info[:bibentry] = bibentry.downcase
|
98
|
+
|
99
|
+
info[:pmc_pdf] = pubmed.find("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc"}.first
|
100
|
+
|
101
|
+
if info[:pmc_pdf]
|
102
|
+
info[:pmc_pdf] = PMC_PDF_URL.sub(/PMCID/, info[:pmc_pdf].content)
|
103
|
+
end
|
104
|
+
|
105
|
+
info
|
106
|
+
end
|
107
|
+
|
108
|
+
attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url
|
109
|
+
attr_accessor *XML_KEYS.collect{|p| p.first }
|
110
|
+
|
46
111
|
def initialize(xml)
|
47
|
-
xml
|
48
|
-
|
49
|
-
|
50
|
-
|
112
|
+
if xml && ! xml.empty?
|
113
|
+
info = PubMed::Article.parse_xml xml
|
114
|
+
info.each do |key, value|
|
115
|
+
self.send("#{ key }=", value)
|
116
|
+
end
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
def pdf_url
|
121
|
+
return pmc_pdf if pmc_pdf
|
122
|
+
@gscholar_pdf ||= GoogleScholar::full_text_url title
|
123
|
+
end
|
124
|
+
|
125
|
+
def bibtex
|
126
|
+
keys = [:author] + XML_KEYS.collect{|p| p.first } - [:bibentry]
|
127
|
+
bibtex = "@article{#{bibentry},\n"
|
128
|
+
|
129
|
+
keys.each do |key|
|
130
|
+
next if self.send(key).nil?
|
131
|
+
|
132
|
+
case key
|
133
|
+
|
134
|
+
when :title
|
135
|
+
bibtex += " title = { #{ PubMed::Article.escape_title title } },\n"
|
136
|
+
|
137
|
+
when :issue
|
138
|
+
bibtex += " number = { #{ issue } },\n"
|
139
|
+
|
140
|
+
else
|
141
|
+
bibtex += " #{ key } = { #{ self.send(key) } },\n"
|
142
|
+
end
|
143
|
+
|
144
|
+
end
|
145
|
+
|
146
|
+
bibtex += " fulltext = { #{ pdf_url } },\n" if pdf_url
|
147
|
+
bibtex += " pmid = { #{ pmid } }\n}"
|
148
|
+
|
149
|
+
|
150
|
+
bibtex
|
51
151
|
end
|
52
152
|
|
53
153
|
# Join the text from title and abstract
|
54
154
|
def text
|
55
|
-
[
|
155
|
+
[title, abstract].join("\n")
|
56
156
|
end
|
57
157
|
end
|
58
158
|
|
@@ -78,7 +178,7 @@ module PubMed
|
|
78
178
|
return list unless missing.any?
|
79
179
|
chunk_size = [100, missing.length].min
|
80
180
|
chunks = (missing.length.to_f / chunk_size).ceil
|
81
|
-
|
181
|
+
|
82
182
|
articles = {}
|
83
183
|
chunks.times do |chunk|
|
84
184
|
pmids = missing[(chunk * chunk_size)..((chunk + 1) *chunk_size)]
|
data/lib/rbbt/util/open.rb
CHANGED
@@ -6,6 +6,16 @@ require 'rbbt/util/tmpfile'
|
|
6
6
|
# for accessing remote files. It supports caching the files.
|
7
7
|
module Open
|
8
8
|
|
9
|
+
# Return a Proc to use in the :select parameter of the Open.to_hash method.
|
10
|
+
# It selects those lines with the content of the first field present on the
|
11
|
+
# entities array. The field can be chosen to be a different one in the
|
12
|
+
# options hash, as can the separator string or regexp used to split fields.
|
13
|
+
def self.func_match_field(entities, options = {})
|
14
|
+
field, sep = {:field => 0, :sep => "\t"}.merge(options).values_at(:field, :sep)
|
15
|
+
|
16
|
+
Proc.new {|line| entities.include? line.split(sep)[field] }
|
17
|
+
end
|
18
|
+
|
9
19
|
def self.fields(line, sep = "\t")
|
10
20
|
chunks = line.chomp.split(/(#{sep})/).select{|c| c !~ /^#{sep}$/ }
|
11
21
|
if line =~ /#{sep}$/
|
@@ -176,10 +186,12 @@ module Open
|
|
176
186
|
# * :single => for each key select only the first of the values, instead of the complete array.
|
177
187
|
# * :fix => A Proc that is called to pre-process the line
|
178
188
|
# * :exclude => A Proc that is called to check if the line must be excluded from the process.
|
189
|
+
# * :select => A Proc that is called to check if the line must be selected to process.
|
179
190
|
def self.to_hash(input, options = {})
|
180
191
|
native = options[:native] || 0
|
181
192
|
extra = options[:extra]
|
182
193
|
exclude = options[:exclude]
|
194
|
+
select = options[:select]
|
183
195
|
fix = options[:fix]
|
184
196
|
sep = options[:sep] || "\t"
|
185
197
|
sep2 = options[:sep2] || "|"
|
@@ -200,6 +212,7 @@ module Open
|
|
200
212
|
content.each_line{|l|
|
201
213
|
l = fix.call(l) if fix
|
202
214
|
next if exclude and exclude.call(l)
|
215
|
+
next if select and ! select.call(l)
|
203
216
|
|
204
217
|
row_fields = self.fields(l, sep)
|
205
218
|
id = row_fields[native]
|
data/test/rbbt/util/test_open.rb
CHANGED
@@ -89,6 +89,19 @@ row2 a d e r
|
|
89
89
|
assert_equal(["","",""] , Open.fields("\t\t") )
|
90
90
|
end
|
91
91
|
|
92
|
+
def test_select_field
|
93
|
+
data =<<-EOD
|
94
|
+
row1 a b 3
|
95
|
+
row1 aa bb 33
|
96
|
+
row2 a d e r
|
97
|
+
EOD
|
98
|
+
|
99
|
+
TmpFile.with_file(data) do |file|
|
100
|
+
data = Open.to_hash(file, :select => Open.func_match_field(%w(row1), :sep => " "), :sep => " ")
|
101
|
+
assert ! data.include?('row2')
|
102
|
+
assert data.include?('row1')
|
103
|
+
end
|
104
|
+
end
|
92
105
|
|
93
106
|
|
94
107
|
|
metadata
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 1
|
7
|
+
- 2
|
8
|
+
- 2
|
9
|
+
version: 1.2.2
|
5
10
|
platform: ruby
|
6
11
|
authors:
|
7
12
|
- Miguel Vazquez
|
@@ -9,59 +14,71 @@ autorequire:
|
|
9
14
|
bindir: bin
|
10
15
|
cert_chain: []
|
11
16
|
|
12
|
-
date: 2010-
|
17
|
+
date: 2010-05-27 00:00:00 +02:00
|
13
18
|
default_executable: rbbt_config
|
14
19
|
dependencies:
|
15
20
|
- !ruby/object:Gem::Dependency
|
16
21
|
name: rake
|
17
|
-
|
18
|
-
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
24
|
requirements:
|
21
25
|
- - ">="
|
22
26
|
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
29
|
+
- 8
|
30
|
+
- 4
|
23
31
|
version: 0.8.4
|
24
|
-
|
32
|
+
type: :runtime
|
33
|
+
version_requirements: *id001
|
25
34
|
- !ruby/object:Gem::Dependency
|
26
35
|
name: simpleconsole
|
27
|
-
|
28
|
-
|
29
|
-
version_requirements: !ruby/object:Gem::Requirement
|
36
|
+
prerelease: false
|
37
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
30
38
|
requirements:
|
31
39
|
- - ">="
|
32
40
|
- !ruby/object:Gem::Version
|
41
|
+
segments:
|
42
|
+
- 0
|
33
43
|
version: "0"
|
34
|
-
|
44
|
+
type: :runtime
|
45
|
+
version_requirements: *id002
|
35
46
|
- !ruby/object:Gem::Dependency
|
36
47
|
name: stemmer
|
37
|
-
|
38
|
-
|
39
|
-
version_requirements: !ruby/object:Gem::Requirement
|
48
|
+
prerelease: false
|
49
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
40
50
|
requirements:
|
41
51
|
- - ">="
|
42
52
|
- !ruby/object:Gem::Version
|
53
|
+
segments:
|
54
|
+
- 0
|
43
55
|
version: "0"
|
44
|
-
|
56
|
+
type: :runtime
|
57
|
+
version_requirements: *id003
|
45
58
|
- !ruby/object:Gem::Dependency
|
46
59
|
name: progress-monitor
|
47
|
-
|
48
|
-
|
49
|
-
version_requirements: !ruby/object:Gem::Requirement
|
60
|
+
prerelease: false
|
61
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
50
62
|
requirements:
|
51
63
|
- - ">="
|
52
64
|
- !ruby/object:Gem::Version
|
65
|
+
segments:
|
66
|
+
- 0
|
53
67
|
version: "0"
|
54
|
-
|
68
|
+
type: :runtime
|
69
|
+
version_requirements: *id004
|
55
70
|
- !ruby/object:Gem::Dependency
|
56
71
|
name: simpleconsole
|
57
|
-
|
58
|
-
|
59
|
-
version_requirements: !ruby/object:Gem::Requirement
|
72
|
+
prerelease: false
|
73
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
60
74
|
requirements:
|
61
75
|
- - ">="
|
62
76
|
- !ruby/object:Gem::Version
|
77
|
+
segments:
|
78
|
+
- 0
|
63
79
|
version: "0"
|
64
|
-
|
80
|
+
type: :runtime
|
81
|
+
version_requirements: *id005
|
65
82
|
description: |-
|
66
83
|
This toolbox includes modules for text-mining, like Named Entity Recognition and Normalization and document
|
67
84
|
classification, as well as data integration modules that interface with PubMed, Entrez Gene, BioMart.
|
@@ -118,6 +135,7 @@ files:
|
|
118
135
|
- lib/rbbt/sources/biomart.rb
|
119
136
|
- lib/rbbt/sources/entrez.rb
|
120
137
|
- lib/rbbt/sources/go.rb
|
138
|
+
- lib/rbbt/sources/gscholar.rb
|
121
139
|
- lib/rbbt/sources/organism.rb
|
122
140
|
- lib/rbbt/sources/polysearch.rb
|
123
141
|
- lib/rbbt/sources/pubmed.rb
|
@@ -145,18 +163,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
145
163
|
requirements:
|
146
164
|
- - ">="
|
147
165
|
- !ruby/object:Gem::Version
|
166
|
+
segments:
|
167
|
+
- 0
|
148
168
|
version: "0"
|
149
|
-
version:
|
150
169
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
151
170
|
requirements:
|
152
171
|
- - ">="
|
153
172
|
- !ruby/object:Gem::Version
|
173
|
+
segments:
|
174
|
+
- 0
|
154
175
|
version: "0"
|
155
|
-
version:
|
156
176
|
requirements: []
|
157
177
|
|
158
178
|
rubyforge_project:
|
159
|
-
rubygems_version: 1.3.
|
179
|
+
rubygems_version: 1.3.6
|
160
180
|
signing_key:
|
161
181
|
specification_version: 3
|
162
182
|
summary: Bioinformatics and text mining toolbox
|