rbbt 1.2.5 → 2.0.0
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +69 -214
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -245
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -140
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -86
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Ath.Rakefile +0 -55
- data/install_scripts/organisms/Cal.Rakefile +0 -84
- data/install_scripts/organisms/Cel.Rakefile +0 -109
- data/install_scripts/organisms/Hsa.Rakefile +0 -140
- data/install_scripts/organisms/Mmu.Rakefile +0 -77
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/Rno.Rakefile +0 -88
- data/install_scripts/organisms/Sce.Rakefile +0 -66
- data/install_scripts/organisms/Spo.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -252
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -83
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -85
- data/lib/rbbt/sources/gscholar.rb +0 -74
- data/lib/rbbt/sources/organism.rb +0 -241
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -248
- data/lib/rbbt/util/arrayHash.rb +0 -266
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -251
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -35
- data/tasks/install.rake +0 -124
- data/test/rbbt/bow/test_bow.rb +0 -33
- data/test/rbbt/bow/test_classifier.rb +0 -72
- data/test/rbbt/bow/test_dictionary.rb +0 -91
- data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
- data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
- data/test/rbbt/ner/test_abner.rb +0 -17
- data/test/rbbt/ner/test_banner.rb +0 -17
- data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
- data/test/rbbt/ner/test_regexpNER.rb +0 -33
- data/test/rbbt/ner/test_rner.rb +0 -126
- data/test/rbbt/ner/test_rnorm.rb +0 -47
- data/test/rbbt/sources/test_biocreative.rb +0 -38
- data/test/rbbt/sources/test_biomart.rb +0 -31
- data/test/rbbt/sources/test_entrez.rb +0 -49
- data/test/rbbt/sources/test_go.rb +0 -24
- data/test/rbbt/sources/test_organism.rb +0 -59
- data/test/rbbt/sources/test_polysearch.rb +0 -27
- data/test/rbbt/sources/test_pubmed.rb +0 -39
- data/test/rbbt/util/test_arrayHash.rb +0 -257
- data/test/rbbt/util/test_filecache.rb +0 -37
- data/test/rbbt/util/test_index.rb +0 -31
- data/test/rbbt/util/test_misc.rb +0 -20
- data/test/rbbt/util/test_open.rb +0 -110
- data/test/rbbt/util/test_simpleDSL.rb +0 -57
- data/test/rbbt/util/test_tmpfile.rb +0 -21
- data/test/test_helper.rb +0 -4
- data/test/test_rbbt.rb +0 -11
data/lib/rbbt/sources/organism.rb
DELETED
@@ -1,241 +0,0 @@
-require 'rbbt'
-require 'rbbt/util/open'
-require 'rbbt/util/index'
-
-# This module contains some Organism centric functionalities. Each organism is
-# identified by a keyword.
-module Organism
-
-  # Raised when trying to access information for an organism that has not been
-  # prepared already.
-  class OrganismNotProcessedError < StandardError; end
-
-  # Return the list of all supported organisms. The prepared flag is used to
-  # show only those that have been prepared.
-  def self.all(prepared = true)
-    if prepared
-      Dir.glob(File.join(Rbbt.datadir,'/organisms/') + '/*/identifiers').collect{|f| File.basename(File.dirname(f))}
-    else
-      Dir.glob(File.join(Rbbt.datadir,'/organisms/') + '/*').select{|f| File.directory? f}.collect{|f| File.basename(f)}
-    end
-  end
-
-
-  # Return the complete name of an organism. The org parameter is the organism
-  # keyword
-  def self.name(org)
-    raise OrganismNotProcessedError, "Missing 'name' file" if ! File.exists? File.join(Rbbt.datadir,"organisms/#{ org }/name")
-    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/name"))
-  end
-
-  # Hash linking all the organism log names with their keywords in Rbbt. Its
-  # the inverse of the name method.
-  NAME2ORG = {}
-  Organism::all.each{|org|
-    name = Organism.name(org).strip.downcase
-    NAME2ORG[name] = org
-  }
-
-
-  # Return the key word associated with an organism.
-  def self.name2org(name)
-    NAME2ORG[name.strip.downcase]
-  end
-
-  # FIXME: The NER related stuff is harder to install, thats why we hide the
-  # requires next to where they are needed, next to options
-
-  # Return a NER object which could be of RNER, Abner or Banner class, this is
-  # selected using the type parameter.
-  def self.ner(org, type=:rner, options = {})
-
-    case type.to_sym
-    when :abner
-      require 'rbbt/ner/abner'
-      return Abner.new
-    when :banner
-      require 'rbbt/ner/banner'
-      return Banner.new
-    when :rner
-      require 'rbbt/ner/rner'
-      model = options[:model]
-      model ||= File.join(Rbbt.datadir,"ner/model/#{ org }") if File.exist? File.join(Rbbt.datadir,"ner/model/#{ org }")
-      model ||= File.join(Rbbt.datadir,'ner/model/BC2')
-      return NER.new(model)
-    else
-      raise "Ner type (#{ type }) unknown"
-    end
-
-  end
-
-  # Return a normalization object.
-  def self.norm(org, to_entrez = nil)
-    require 'rbbt/ner/rnorm'
-    if to_entrez.nil?
-      to_entrez = id_index(org, :native => 'Entrez Gene Id', :other => [supported_ids(org).first])
-    end
-
-    token_file = File.join(Rbbt.datadir, 'norm','config',org.to_s + '.config')
-    if !File.exists? token_file
-      token_file = nil
-    end
-
-    Normalizer.new(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"), :to_entrez => to_entrez, :file => token_file, :max_candidates => 20)
-  end
-
-  # Returns a hash with the names associated with each gene id. The ids are
-  # in Rbbt native format for that organism.
-  def self.lexicon(org, options = {})
-    options = {:sep => "\t|\\|", :flatten => true}.merge(options)
-    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"),options)
-  end
-
-  # Returns a hash with the list of go terms for each gene id. Gene ids are in
-  # Rbbt native format for that organism.
-  def self.goterms(org)
-    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene.go"), :flatten => true)
-  end
-
-  # Return list of PubMed ids associated to the organism. Determined using a
-  # PubMed query with the name of the organism
-  def self.literature(org)
-    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/all.pmid")).scan(/\d+/)
-  end
-
-  # Return hash that associates genes to a list of PubMed ids.
-  def self.gene_literature(org)
-    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene.pmid"), :flatten => true)
-  end
-
-  # Return hash that associates genes to a list of PubMed ids. Includes only
-  # those found to support GO term associations.
-  def self.gene_literature_go(org)
-    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene_go.pmid"), :flatten => true)
-  end
-
-  # Returns a list with the names of the id formats supported for an organism.
-  # If examples are produced, the list is of [format, example] pairs.
-  #
-  # *Options:*
-  #
-  # *examples:* Include example ids for each format
-  def self.supported_ids(org, options = {})
-    formats = []
-    examples = [] if options[:examples]
-    i= 0
-    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers")).each_line{|l|
-      if i == 0
-        i += 1
-        next unless l=~/^\s*#/
-        formats = Open.fields(l.sub(/^[\s#]+/,'')).collect{|n| n.strip}
-        return formats unless examples
-        next
-      end
-
-      if Open.fields(l).select{|name| name && name =~ /\w/}.length > examples.compact.length
-        examples = Open.fields(l).collect{|name| name.split(/\|/).first}
-      end
-      i += 1
-    }
-
-    formats.zip(examples)
-  end
-
-  # Creates a hash where each possible id is associated with the names of the
-  # formats (its potentially possible for different formats to have the same
-  # id). This is used in the guessIdFormat method.
-  def self.id_formats(org)
-    id_types = {}
-    formats = supported_ids(org)
-
-    text = Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"))
-
-    if text.respond_to? :collect
-      lines = text.collect
-    else
-      lines = text.lines
-    end
-
-    lines.each{|l|
-      ids_per_type = Open.fields(l)
-      formats.zip(ids_per_type).each{|p|
-        format = p[0]
-        p[1] ||= ""
-        ids = p[1].split(/\|/)
-        ids.each{|id|
-          next if id.nil? || id == ""
-          id_types[id.downcase] ||= []
-          id_types[id.downcase] << format unless id_types[id.downcase].include? format
-        }
-      }
-    }
-
-    return id_types
-  end
-
-  def self.guessIdFormat(formats, query)
-    query = query.compact.collect{|gene| gene.downcase}.uniq
-    if String === formats
-      formats = id_formats(formats)
-    end
-
-    return nil if formats.values.empty?
-    values = formats.values_at(*query)
-    return nil if values.empty?
-
-    format_count = {}
-    values.compact.collect{|types| types.uniq}.flatten.each{|f|
-      format_count[f] ||= 0
-      format_count[f] += 1
-    }
-
-    return nil if format_count.values.empty?
-    format_count.select{|k,v| v > (query.length / 10)}.sort{|a,b| b[1] <=> a[1]}.first
-  end
-
-  def self.id_position(supported_ids, id_name, options = {})
-    pos = 0
-    supported_ids.each_with_index{|id, i|
-      if id.strip == id_name.strip || !options[:case_sensitive] && id.strip.downcase == id_name.strip.downcase
-        pos = i;
-      end
-    }
-    pos
-  end
-
-  def self.id_index(org, options = {})
-    native = options[:native]
-    other = options[:other]
-    options[:case_sensitive] = false if options[:case_sensitive].nil?
-
-    if native.nil? and other.nil?
-      Index.index(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"), options)
-    else
-      supported = Organism.supported_ids(org)
-
-      first = nil
-      if native
-        first = id_position(supported,native,options)
-        raise "No match for native format '#{ native }'" if first.nil?
-      else
-        first = 0
-      end
-
-      rest = nil
-      if other
-        rest = other.collect{|name| id_position(supported,name, options)}
-      else
-        rest = (0..supported.length - 1).to_a - [first]
-      end
-
-      options[:native] = first
-      options[:extra] = rest
-      options[:sep] = "\t"
-      index = Index.index(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"), options)
-
-      index
-    end
-  end
-
-end
-
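Since the whole Organism module is removed in 2.0.0, here is, for reference, a minimal sketch of how this API was typically driven in rbbt 1.2.x. It assumes an installed data directory under Rbbt.datadir; the 'Hsa' keyword, the 'Associated Gene Name' format and the 'TP53' lookup are illustrative and depend on which organisms and identifier files were actually prepared:

  require 'rbbt/sources/organism'

  # Organisms with a prepared identifiers file
  puts Organism.all.inspect

  # Declared id formats, each paired with an example id
  puts Organism.supported_ids('Hsa', :examples => true).inspect

  # Build a translation index; :native/:extra suggest it maps ids in the
  # other formats to the native format ('Entrez Gene Id' here)
  index = Organism.id_index('Hsa', :native => 'Entrez Gene Id', :other => ['Associated Gene Name'])
  puts index['TP53']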
data/lib/rbbt/sources/polysearch.rb
DELETED
@@ -1,117 +0,0 @@
-require 'rbbt'
-require 'rbbt/util/open'
-require 'rbbt/ner/regexpNER'
-require 'rbbt/ner/dictionaryNER'
-
-# Find terms in the Polysearch thesauri using simple regular expression
-# matching. Note that the first time the methods are used the correspondent
-# thesaurus are loaded into memory. The available thesauri are: disease, drug,
-# metabolite, organ, subcellular (subcellular localization) and tissue.
-module Polysearch
-
-
-  @@names = {}
-  def self.type_names(type) #:nodoc:
-    @@names[type] ||= Open.to_hash(File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt'), :single => true)
-  end
-
-
-  @@indexes = {}
-  def self.type_index(type) #:nodoc:
-    if $stopwords
-      stopwords = $stopwords
-    else
-      stopwords = []
-    end
-
-    case type.to_sym
-    when :disease
-      stopwords << 'use'
-    end
-
-    @@indexes[type] ||= RegExpNER.new(File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt'), :stopwords => stopwords)
-  end
-
-  # Find matches in a string of text, the types array specifies which thesauri
-  # to use, if if nil it will use all.
-  def self.match(text, types = nil)
-    if types.nil?
-      types = %w(disease drug metabolite organ subcellular tissue)
-    end
-
-    types = [types] unless Array === types
-    types = types.sort
-
-    matches = {}
-    types.collect{|type|
-      matches.merge!(type_index(type).match(text))
-    }
-
-    matches
-  end
-
-  # Transform the code into a name, type is the thesaurus to use
-  def self.name(type, code)
-    type_names(type)[code]
-  end
-
-end
-
-if __FILE__ == $0
-  text =<<-EOT
-
-  Background Microorganisms adapt their transcriptome by integrating
-  multiple chemical and physical signals from their environment. Shake-flask
-  cultivation does not allow precise manipulation of individual culture
-  parameters and therefore precludes a quantitative analysis of the
-  (combinatorial) influence of these parameters on transcriptional
-  regulation. Steady-state chemostat cultures, which do enable accurate
-  control, measurement and manipulation of individual cultivation parameters
-  (e.g. specific growth rate, temperature, identity of the growth-limiting
-  nutrient) appear to provide a promising experimental platform for such a
-  combinatorial analysis. Results A microarray compendium of 170
-  steady-state chemostat cultures of the yeast Saccharomyces cerevisiae is
-  presented and analyzed. The 170 microarrays encompass 55 unique
-  conditions, which can be characterized by the combined settings of 10
-  different cultivation parameters. By applying a regression model to assess
-  the impact of (combinations of) cultivation parameters on the
-  transcriptome, most S. cerevisiae genes were shown to be influenced by
-  multiple cultivation parameters, and in many cases by combinatorial
-  effects of cultivation parameters. The inclusion of these combinatorial
-  effects in the regression model led to higher explained variance of the
-  gene expression patterns and resulted in higher function enrichment in
-  subsequent analysis. We further demonstrate the usefulness of the
-  compendium and regression analysis for interpretation of shake-flask-based
-  transcriptome studies and for guiding functional analysis of
-  (uncharacterized) genes and pathways. Conclusions Modeling the
-  combinatorial effects of environmental parameters on the transcriptome is
-  crucial for understanding transcriptional regulation. Chemostat
-  cultivation offers a powerful tool for such an approach. Keywords:
-  chemostat steady state samples Cerebellar stroke syndrome
-
-
-  EOT
-
-  require 'benchmark'
-  require 'ruby-prof'
-
-  puts Benchmark.measure{
-    p Polysearch.match(text,'disease')
-  }
-
-
-  RubyProf.start
-
-  Polysearch.match(text,'disease')
-
-  result = RubyProf.stop
-
-  # Print a flat profile to text
-  printer = RubyProf::FlatPrinter.new(result)
-  printer.print(STDOUT, 0)
-
-  puts Benchmark.measure{
-    10.times{ p Polysearch.match(text,'disease') }
-  }
-
-end
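The deleted script's own __FILE__ == $0 section above doubles as a usage example; stripped of the profiling, the removed Polysearch interface came down to this kind of call. A sketch, assuming the Polysearch thesauri files are installed under Rbbt.datadir/dbs/polysearch and using an illustrative input string:

  require 'rbbt/sources/polysearch'

  text = "Cerebellar stroke syndrome was reported in the patient cohort."

  # Match only against the disease thesaurus; keys are thesaurus codes,
  # values whatever RegExpNER#match returns for the mentions found
  matches = Polysearch.match(text, 'disease')

  matches.each do |code, mentions|
    puts "#{Polysearch.name(:disease, code)}: #{mentions.inspect}"
  end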
data/lib/rbbt/sources/pubmed.rb
DELETED
@@ -1,248 +0,0 @@
-require 'rbbt/util/filecache'
-require 'rbbt/util/open'
-require 'rbbt/sources/gscholar'
-require 'rbbt'
-require 'libxml'
-
-# This module offers an interface with PubMed, to perform queries, and
-# retrieve simple information from articles. It uses the caching
-# services of Rbbt.
-module PubMed
-
-  private
-  @@last = Time.now
-  @@pubmed_lag = 1
-  def self.get_online(pmids)
-
-    pmid_list = ( pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s )
-    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"
-
-    diff = Time.now - @@last
-    sleep @@pubmed_lag - diff unless diff > @@pubmed_lag
-
-    xml = Open.read(url, :quiet => true, :nocache => true)
-
-    @@last = Time.now
-
-    articles = xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten
-
-    if pmids.is_a? Array
-      list = {}
-      articles.each{|article|
-        pmid = article.scan(/<PMID>(.*?)<\/PMID>/).flatten.first
-        list[pmid] = article
-      }
-      return list
-    else
-      return articles.first
-    end
-
-  end
-
-  public
-
-  # Processes the xml with an articles as served by MedLine and extracts
-  # the abstract, title and journal information
-  class Article
-
-
-    XML_KEYS = [
-      [:title    , "ArticleTitle"],
-      [:journal  , "Journal/Title"],
-      [:issue    , "Journal/JournalIssue/Issue"],
-      [:volume   , "Journal/JournalIssue/Volume"],
-      [:issn     , "Journal/ISSN"],
-      [:year     , "Journal/JournalIssue/PubDate/Year"],
-      [:month    , "Journal/JournalIssue/PubDate/Month"],
-      [:pages    , "Pagination/MedlinePgn"],
-      [:abstract , "Abstract/AbstractText"],
-    ]
-
-    PMC_PDF_URL = "http://www.ncbi.nlm.nih.gov/pmc/articles/PMCID/pdf/"
-
-    def self.escape_title(title)
-      title.gsub(/(\w*[A-Z][A-Z]+\w*)/, '{\1}')
-    end
-
-    def self.make_bibentry(lastname, year, title)
-      words = title.downcase.scan(/\w+/)
-      if words.first.length > 3
-        abrev = words.first
-      else
-        abrev = words[0..2].collect{|w| w.chars.first} * ""
-      end
-      [lastname.gsub(/\s/,'_'), year || "NOYEAR", abrev] * ""
-    end
-    def self.parse_xml(xml)
-      parser = LibXML::XML::Parser.string(xml)
-      pubmed = parser.parse.find("/PubmedArticle").first
-      medline = pubmed.find("MedlineCitation").first
-      article = medline.find("Article").first
-
-      info = {}
-
-      info[:pmid] = medline.find("PMID").first.content
-
-      XML_KEYS.each do |p|
-        name, key = p
-        node = article.find(key).first
-
-        next if node.nil?
-
-        info[name] = node.content
-      end
-
-      bibentry = nil
-      info[:author] = article.find("AuthorList/Author").collect do |author|
-        begin
-          lastname = author.find("LastName").first.content
-          if author.find("ForeName").first.nil?
-            forename = nil
-          else
-            forename = author.find("ForeName").first.content.split(/\s/).collect{|word| if word.length == 1; then word + '.'; else word; end} * " "
-          end
-          bibentry ||= make_bibentry lastname, info[:year], info[:title]
-        rescue
-        end
-        [lastname, forename] * ", "
-      end * " and "
-
-      info[:bibentry] = bibentry.downcase if bibentry
-
-      info[:pmc_pdf] = pubmed.find("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc"}.first
-
-      if info[:pmc_pdf]
-        info[:pmc_pdf] = PMC_PDF_URL.sub(/PMCID/, info[:pmc_pdf].content)
-      end
-
-      info
-    end
-
-    attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url
-    attr_accessor *XML_KEYS.collect{|p| p.first }
-
-    def initialize(xml)
-      if xml && ! xml.empty?
-        info = PubMed::Article.parse_xml xml
-        info.each do |key, value|
-          self.send("#{ key }=", value)
-        end
-      end
-    end
-
-    def pdf_url
-      return pmc_pdf if pmc_pdf
-      @gscholar_pdf ||= GoogleScholar::full_text_url title
-    end
-
-    def full_text
-      return nil if pdf_url.nil?
-
-      text = nil
-      TmpFile.with_file do |pdf|
-
-        # Change user-agent, oh well...
-        `wget --user-agent=firefox #{ pdf_url } -O #{ pdf }`
-        TmpFile.with_file do |txt|
-          `pdftotext #{ pdf } #{ txt }`
-          text = Open.read(txt) if File.exists? txt
-        end
-      end
-
-      text
-    end
-
-    def bibtex
-      keys = [:author] + XML_KEYS.collect{|p| p.first } - [:bibentry]
-      bibtex = "@article{#{bibentry},\n"
-
-      keys.each do |key|
-        next if self.send(key).nil?
-
-        case key
-
-        when :title
-          bibtex += "  title = { #{ PubMed::Article.escape_title title } },\n"
-
-        when :issue
-          bibtex += "  number = { #{ issue } },\n"
-
-        else
-          bibtex += "  #{ key } = { #{ self.send(key) } },\n"
-        end
-
-      end
-
-      bibtex += "  fulltext = { #{ pdf_url } },\n" if pdf_url
-      bibtex += "  pmid = { #{ pmid } }\n}"
-
-
-      bibtex
-    end
-
-    # Join the text from title and abstract
-    def text
-      [title, abstract].join("\n")
-    end
-  end
-
-  # Returns the Article object containing the information for the PubMed
-  # ID specified as an argument. If +pmid+ is an array instead of a single
-  # identifier it returns an hash with the Article object for each id.
-  # It uses the Rbbt cache to save the articles xml.
-  def self.get_article(pmid)
-
-    if pmid.is_a? Array
-      missing = []
-      list = {}
-
-      pmid.each{|p|
-        filename = p.to_s + '.xml'
-        if File.exists? FileCache.path(filename)
-          list[p] = Article.new(Open.read(FileCache.path(filename)))
-        else
-          missing << p
-        end
-      }
-
-      return list unless missing.any?
-      chunk_size = [100, missing.length].min
-      chunks = (missing.length.to_f / chunk_size).ceil
-
-      articles = {}
-      chunks.times do |chunk|
-        pmids = missing[(chunk * chunk_size)..((chunk + 1) *chunk_size)]
-        articles.merge!(get_online(pmids))
-      end
-
-      articles.each{|p, xml|
-        filename = p + '.xml'
-        FileCache.add_file(filename,xml, :force => true)
-        list[p] = Article.new(xml)
-      }
-
-      return list
-
-    else
-      filename = pmid.to_s + '.xml'
-
-      if File.exists? FileCache.path(filename)
-        return Article.new(Open.read(FileCache.path(filename)))
-      else
-        xml = get_online(pmid)
-        FileCache.add_file(filename,xml)
-
-        return Article.new(xml)
-      end
-    end
-  end
-
-  # Performs the specified query and returns an array with the PubMed
-  # Ids returned. +retmax+ can be used to limit the number of ids
-  # returned, if is not specified 30000 is used.
-  def self.query(query, retmax=nil)
-    retmax ||= 30000
-
-    Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmax=#{retmax}&db=pubmed&term=#{query}",:quiet => true, :nocache => true).scan(/<Id>(\d+)<\/Id>/).flatten
-  end
-end
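For reference, the removed PubMed interface was exercised roughly like this. A sketch only: the query string is illustrative, and network access to the NCBI eutils endpoints plus a writable Rbbt file cache are assumed:

  require 'rbbt/sources/pubmed'

  # Up to 10 PubMed ids matching the query
  pmids = PubMed.query('Saccharomyces cerevisiae chemostat', 10)

  # Passing an array returns a hash of pmid => PubMed::Article
  articles = PubMed.get_article(pmids)

  articles.each do |pmid, article|
    puts article.title
    puts article.bibtex
  end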