rbbt 1.2.1 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/sources/gscholar.rb +74 -0
- data/lib/rbbt/sources/organism.rb +3 -2
- data/lib/rbbt/sources/pubmed.rb +107 -7
- data/lib/rbbt/util/open.rb +13 -0
- data/test/rbbt/util/test_open.rb +13 -0
- metadata +45 -25
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
|
3
|
+
|
4
|
+
require 'cgi'

# Helpers that query the Google Scholar search page for an article title and
# inspect the first result: its "cited by" link, its citation count and a
# link to a full-text PDF, when one is listed.
module GoogleScholar

  # Shared Mechanize agent, created on first use and reused afterwards.
  def self.user_agent
    @user_agent ||= Mechanize.new
  end

  # Returns the "cited by" anchor of the first search result for +title+:
  # the first link whose href contains "scholar?cites" and whose text ends
  # in a number. Returns nil when there is no result or no such link.
  def self.citation_link(title)
    article = first_result(title)
    return nil if article.nil?

    article.search('a').detect do |anchor|
      anchor['href'].to_s =~ /scholar\?cites/ && anchor.inner_html =~ /\d+$/
    end
  end

  # Returns the href of the first link in the first search result that looks
  # like a PDF (href ends in ".pdf" or contains "type=pdf"), or nil when
  # there is no result or no such link.
  def self.full_text_url(title)
    article = first_result(title)
    return nil if article.nil?

    pdf_anchor = article.search('a').detect do |anchor|
      href = anchor['href'].to_s
      href =~ /\.pdf$/ || href =~ /type=pdf/
    end

    pdf_anchor.nil? ? nil : pdf_anchor['href']
  end

  # Number of citations reported for +title+: the trailing integer of the
  # "cited by" link text, or 0 when no such link is found.
  def self.number_cites(title)
    link = citation_link title
    return 0 if link.nil?

    link.inner_html[/(\d+)$/, 1].to_i
  end

  # Builds the search URL. The single-quoted title is escaped as one query
  # value so that characters such as '&', '#' or '+' inside the title cannot
  # break or truncate the query string.
  def self.search_url(title)
    "http://scholar.google.es/scholar?q=#{ CGI.escape("'#{ title }'") }&hl=es&lr=&lr="
  end

  # Fetches the search page for +title+ and returns its first result entry
  # (first 'div' with class gs_r), or nil when the page lists none.
  def self.first_result(title)
    user_agent.get(search_url(title)).search('div[@class=gs_r]').first
  end

  private_class_method :search_url, :first_result
end
|
51
|
+
|
52
|
+
|
53
|
+
#def get_citers(title)
|
54
|
+
# puts title
|
55
|
+
# citation_link = nil
|
56
|
+
#
|
57
|
+
# # Get citation page
|
58
|
+
# $a.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
|
59
|
+
# citation_link = page.search('div[@class=gs_r]').first.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
|
60
|
+
# end
|
61
|
+
#
|
62
|
+
# return [] if citation_link.nil?
|
63
|
+
#
|
64
|
+
# # Parse citations
|
65
|
+
#
|
66
|
+
# citers = []
|
67
|
+
# $a.get("http://scholar.google.es" + citation_link['href']) do |page|
|
68
|
+
# citers = page.search('div[@class=gs_r]').collect do |entry|
|
69
|
+
# entry.search('h3').first.search('a').first.inner_html
|
70
|
+
# end
|
71
|
+
# end
|
72
|
+
#
|
73
|
+
# return citers
|
74
|
+
#end
|
@@ -127,12 +127,12 @@ module Organism
|
|
127
127
|
if i == 0
|
128
128
|
i += 1
|
129
129
|
next unless l=~/^\s*#/
|
130
|
-
|
130
|
+
formats = Open.fields(l.sub(/^[\s#]+/,'')).collect{|n| n.strip}
|
131
131
|
return formats unless examples
|
132
132
|
next
|
133
133
|
end
|
134
134
|
|
135
|
-
if Open.fields(l).select{|name| name && name =~ /\w/}.length > examples.length
|
135
|
+
if Open.fields(l).select{|name| name && name =~ /\w/}.length > examples.compact.length
|
136
136
|
examples = Open.fields(l).collect{|name| name.split(/\|/).first}
|
137
137
|
end
|
138
138
|
i += 1
|
@@ -216,6 +216,7 @@ module Organism
|
|
216
216
|
first = nil
|
217
217
|
if native
|
218
218
|
first = id_position(supported,native,options)
|
219
|
+
raise "No match for native format '#{ native }'"
|
219
220
|
else
|
220
221
|
first = 0
|
221
222
|
end
|
data/lib/rbbt/sources/pubmed.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'rbbt/util/filecache'
|
2
2
|
require 'rbbt/util/open'
|
3
|
+
require 'rbbt/sources/gscholar'
|
3
4
|
require 'rbbt'
|
5
|
+
require 'libxml'
|
4
6
|
|
5
7
|
# This module offers an interface with PubMed, to perform queries, and
|
6
8
|
# retrieve simple information from articles. It uses the caching
|
@@ -42,17 +44,115 @@ module PubMed
|
|
42
44
|
# Processes the xml with an articles as served by MedLine and extracts
|
43
45
|
# the abstract, title and journal information
|
44
46
|
class Article

  # Pairs of [attribute name, XPath relative to the <Article> node] for the
  # fields that are copied verbatim from the XML.
  XML_KEYS = [
    [:title    , "ArticleTitle"],
    [:journal  , "Journal/Title"],
    [:issue    , "Journal/JournalIssue/Issue"],
    [:volume   , "Journal/JournalIssue/Volume"],
    [:issn     , "Journal/ISSN"],
    [:year     , "Journal/JournalIssue/PubDate/Year"],
    [:pages    , "Pagination/MedlinePgn"],
    [:abstract , "Abstract/AbstractText"],
  ].freeze

  # Template for the PubMed Central PDF location; PMCID is replaced by the
  # article's pmc identifier.
  PMC_PDF_URL = "http://www.ncbi.nlm.nih.gov/pmc/articles/PMCID/pdf/"

  # Wraps every word that contains two or more consecutive capital letters
  # in braces, e.g. "DNA" becomes "{DNA}".
  def self.escape_title(title)
    title.gsub(/(\w*[A-Z][A-Z]+\w*)/, '{\1}')
  end

  # Parses the XML of a single PubmedArticle and returns a hash with :pmid,
  # the XML_KEYS fields that are present, :author (names joined by " and "),
  # :bibentry (lowercased first author + year + first title word) and
  # :pmc_pdf (PDF url built from the pmc id, or nil when there is none).
  def self.parse_xml(xml)
    pubmed  = LibXML::XML::Parser.string(xml).parse.find("/PubmedArticle").first
    medline = pubmed.find("MedlineCitation").first
    article = medline.find("Article").first

    info = { :pmid => medline.find("PMID").first.content }

    XML_KEYS.each do |name, path|
      node = article.find(path).first
      info[name] = node.content unless node.nil?
    end

    authors = article.find("AuthorList/Author").collect { |author| author_names(author) }

    # Missing parts are dropped rather than joined, so an author without a
    # forename is rendered as "Last" and not as "Last, ".
    info[:author] = authors.collect { |names| names.compact * ", " } * " and "

    # Articles without authors or title still get a (shorter) bibentry
    # instead of raising on nil.
    first_lastname  = authors.empty? ? nil : authors.first.first
    info[:bibentry] = [
      first_lastname,
      info[:year] || "NOYEAR",
      info[:title].to_s.scan(/\w+/).first
    ].compact.join.downcase

    pmc_id = pubmed.find("PubmedData/ArticleIdList/ArticleId").detect { |id| id[:IdType] == "pmc" }
    info[:pmc_pdf] = pmc_id.nil? ? nil : PMC_PDF_URL.sub(/PMCID/, pmc_id.content)

    info
  end

  # Returns [lastname, forename] for an <Author> node. Either may be nil.
  # When there is no LastName the CollectiveName, if any, is used instead.
  # Single-letter forename words get a trailing period ("J" -> "J.").
  def self.author_names(author)
    last_node = author.find("LastName").first || author.find("CollectiveName").first
    fore_node = author.find("ForeName").first

    lastname = last_node.nil? ? nil : last_node.content
    forename = fore_node.nil? ? nil : fore_node.content.split(/\s/).collect { |word|
      word.length == 1 ? word + '.' : word
    } * " "

    [lastname, forename]
  end
  private_class_method :author_names

  attr_accessor :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf
  attr_accessor(*XML_KEYS.collect { |pair| pair.first })
  # Writer kept for backward compatibility; the reader is defined below.
  attr_writer :pdf_url

  # Builds the article from the XML of a PubmedArticle. A nil or empty
  # +xml+ leaves every attribute unset.
  def initialize(xml)
    return if xml.nil? || xml.empty?

    self.class.parse_xml(xml).each do |key, value|
      send("#{ key }=", value)
    end
  end

  # Url of the full text: the PubMed Central PDF when known, otherwise the
  # (memoized) result of looking the title up in Google Scholar. Returns nil
  # without a lookup when there is no title to search for.
  def pdf_url
    return pmc_pdf if pmc_pdf
    return nil if title.nil?
    @gscholar_pdf ||= GoogleScholar::full_text_url title
  end

  # Renders the article as a BibTeX @article entry. Unset fields are
  # omitted; :issue is emitted as "number" and the title is passed through
  # escape_title.
  def bibtex
    keys  = [:author] + XML_KEYS.collect { |pair| pair.first }
    entry = "@article{#{bibentry},\n"

    keys.each do |key|
      value = send(key)
      next if value.nil?

      case key
      when :title
        entry << " title = { #{ self.class.escape_title value } },\n"
      when :issue
        entry << " number = { #{ value } },\n"
      else
        entry << " #{ key } = { #{ value } },\n"
      end
    end

    # Resolve the url once: pdf_url may trigger a remote lookup.
    fulltext = pdf_url
    entry << " fulltext = { #{ fulltext } },\n" if fulltext
    entry << " pmid = { #{ pmid } }\n}"

    entry
  end

  # Join the text from title and abstract
  def text
    [title, abstract].join("\n")
  end
end
|
58
158
|
|
@@ -78,7 +178,7 @@ module PubMed
|
|
78
178
|
return list unless missing.any?
|
79
179
|
chunk_size = [100, missing.length].min
|
80
180
|
chunks = (missing.length.to_f / chunk_size).ceil
|
81
|
-
|
181
|
+
|
82
182
|
articles = {}
|
83
183
|
chunks.times do |chunk|
|
84
184
|
pmids = missing[(chunk * chunk_size)..((chunk + 1) *chunk_size)]
|
data/lib/rbbt/util/open.rb
CHANGED
@@ -6,6 +6,16 @@ require 'rbbt/util/tmpfile'
|
|
6
6
|
# for accessing remote files. It supports caching the files.
|
7
7
|
module Open
|
8
8
|
|
9
|
+
# Return a Proc to use in the :select parameter of the Open.to_hash method.
# It selects those lines whose chosen field is present in the entities
# array.
#
# Options:
# * :field => position of the field to check (default 0, the first one)
# * :sep => string or regexp used to split the line into fields (default "\t")
#
# The line terminator is removed before splitting, so the last field of a
# line can be matched as well.
def self.func_match_field(entities, options = {})
  field, sep = {:field => 0, :sep => "\t"}.merge(options).values_at(:field, :sep)

  Proc.new {|line| entities.include? line.chomp.split(sep)[field] }
end
|
18
|
+
|
9
19
|
def self.fields(line, sep = "\t")
|
10
20
|
chunks = line.chomp.split(/(#{sep})/).select{|c| c !~ /^#{sep}$/ }
|
11
21
|
if line =~ /#{sep}$/
|
@@ -176,10 +186,12 @@ module Open
|
|
176
186
|
# * :single => for each key select only the first of the values, instead of the complete array.
|
177
187
|
# * :fix => A Proc that is called to pre-process the line
|
178
188
|
# * :exclude => A Proc that is called to check if the line must be excluded from the process.
|
189
|
+
# * :select => A Proc that is called to check if the line must be selected to process.
|
179
190
|
def self.to_hash(input, options = {})
|
180
191
|
native = options[:native] || 0
|
181
192
|
extra = options[:extra]
|
182
193
|
exclude = options[:exclude]
|
194
|
+
select = options[:select]
|
183
195
|
fix = options[:fix]
|
184
196
|
sep = options[:sep] || "\t"
|
185
197
|
sep2 = options[:sep2] || "|"
|
@@ -200,6 +212,7 @@ module Open
|
|
200
212
|
content.each_line{|l|
|
201
213
|
l = fix.call(l) if fix
|
202
214
|
next if exclude and exclude.call(l)
|
215
|
+
next if select and ! select.call(l)
|
203
216
|
|
204
217
|
row_fields = self.fields(l, sep)
|
205
218
|
id = row_fields[native]
|
data/test/rbbt/util/test_open.rb
CHANGED
@@ -89,6 +89,19 @@ row2 a d e r
|
|
89
89
|
assert_equal(["","",""] , Open.fields("\t\t") )
|
90
90
|
end
|
91
91
|
|
92
|
+
def test_select_field
  # Two rows keyed 'row1' and one keyed 'row2', space separated.
  content =<<-EOD
row1 a b 3
row1 aa bb 33
row2 a d e r
  EOD

  TmpFile.with_file(content) do |path|
    # Keep only the lines whose first field is 'row1'.
    only_row1 = Open.func_match_field(%w(row1), :sep => " ")
    hash = Open.to_hash(path, :select => only_row1, :sep => " ")

    assert ! hash.include?('row2')
    assert hash.include?('row1')
  end
end
|
92
105
|
|
93
106
|
|
94
107
|
|
metadata
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 1
|
7
|
+
- 2
|
8
|
+
- 2
|
9
|
+
version: 1.2.2
|
5
10
|
platform: ruby
|
6
11
|
authors:
|
7
12
|
- Miguel Vazquez
|
@@ -9,59 +14,71 @@ autorequire:
|
|
9
14
|
bindir: bin
|
10
15
|
cert_chain: []
|
11
16
|
|
12
|
-
date: 2010-
|
17
|
+
date: 2010-05-27 00:00:00 +02:00
|
13
18
|
default_executable: rbbt_config
|
14
19
|
dependencies:
|
15
20
|
- !ruby/object:Gem::Dependency
|
16
21
|
name: rake
|
17
|
-
|
18
|
-
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
24
|
requirements:
|
21
25
|
- - ">="
|
22
26
|
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
29
|
+
- 8
|
30
|
+
- 4
|
23
31
|
version: 0.8.4
|
24
|
-
|
32
|
+
type: :runtime
|
33
|
+
version_requirements: *id001
|
25
34
|
- !ruby/object:Gem::Dependency
|
26
35
|
name: simpleconsole
|
27
|
-
|
28
|
-
|
29
|
-
version_requirements: !ruby/object:Gem::Requirement
|
36
|
+
prerelease: false
|
37
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
30
38
|
requirements:
|
31
39
|
- - ">="
|
32
40
|
- !ruby/object:Gem::Version
|
41
|
+
segments:
|
42
|
+
- 0
|
33
43
|
version: "0"
|
34
|
-
|
44
|
+
type: :runtime
|
45
|
+
version_requirements: *id002
|
35
46
|
- !ruby/object:Gem::Dependency
|
36
47
|
name: stemmer
|
37
|
-
|
38
|
-
|
39
|
-
version_requirements: !ruby/object:Gem::Requirement
|
48
|
+
prerelease: false
|
49
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
40
50
|
requirements:
|
41
51
|
- - ">="
|
42
52
|
- !ruby/object:Gem::Version
|
53
|
+
segments:
|
54
|
+
- 0
|
43
55
|
version: "0"
|
44
|
-
|
56
|
+
type: :runtime
|
57
|
+
version_requirements: *id003
|
45
58
|
- !ruby/object:Gem::Dependency
|
46
59
|
name: progress-monitor
|
47
|
-
|
48
|
-
|
49
|
-
version_requirements: !ruby/object:Gem::Requirement
|
60
|
+
prerelease: false
|
61
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
50
62
|
requirements:
|
51
63
|
- - ">="
|
52
64
|
- !ruby/object:Gem::Version
|
65
|
+
segments:
|
66
|
+
- 0
|
53
67
|
version: "0"
|
54
|
-
|
68
|
+
type: :runtime
|
69
|
+
version_requirements: *id004
|
55
70
|
- !ruby/object:Gem::Dependency
|
56
71
|
name: simpleconsole
|
57
|
-
|
58
|
-
|
59
|
-
version_requirements: !ruby/object:Gem::Requirement
|
72
|
+
prerelease: false
|
73
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
60
74
|
requirements:
|
61
75
|
- - ">="
|
62
76
|
- !ruby/object:Gem::Version
|
77
|
+
segments:
|
78
|
+
- 0
|
63
79
|
version: "0"
|
64
|
-
|
80
|
+
type: :runtime
|
81
|
+
version_requirements: *id005
|
65
82
|
description: |-
|
66
83
|
This toolbox includes modules for text-mining, like Named Entity Recognition and Normalization and document
|
67
84
|
classification, as well as data integration modules that interface with PubMed, Entrez Gene, BioMart.
|
@@ -118,6 +135,7 @@ files:
|
|
118
135
|
- lib/rbbt/sources/biomart.rb
|
119
136
|
- lib/rbbt/sources/entrez.rb
|
120
137
|
- lib/rbbt/sources/go.rb
|
138
|
+
- lib/rbbt/sources/gscholar.rb
|
121
139
|
- lib/rbbt/sources/organism.rb
|
122
140
|
- lib/rbbt/sources/polysearch.rb
|
123
141
|
- lib/rbbt/sources/pubmed.rb
|
@@ -145,18 +163,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
145
163
|
requirements:
|
146
164
|
- - ">="
|
147
165
|
- !ruby/object:Gem::Version
|
166
|
+
segments:
|
167
|
+
- 0
|
148
168
|
version: "0"
|
149
|
-
version:
|
150
169
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
151
170
|
requirements:
|
152
171
|
- - ">="
|
153
172
|
- !ruby/object:Gem::Version
|
173
|
+
segments:
|
174
|
+
- 0
|
154
175
|
version: "0"
|
155
|
-
version:
|
156
176
|
requirements: []
|
157
177
|
|
158
178
|
rubyforge_project:
|
159
|
-
rubygems_version: 1.3.
|
179
|
+
rubygems_version: 1.3.6
|
160
180
|
signing_key:
|
161
181
|
specification_version: 3
|
162
182
|
summary: Bioinformatics and text mining toolbox
|