rbbt 1.2.5 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +69 -214
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -245
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -140
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -86
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Ath.Rakefile +0 -55
- data/install_scripts/organisms/Cal.Rakefile +0 -84
- data/install_scripts/organisms/Cel.Rakefile +0 -109
- data/install_scripts/organisms/Hsa.Rakefile +0 -140
- data/install_scripts/organisms/Mmu.Rakefile +0 -77
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/Rno.Rakefile +0 -88
- data/install_scripts/organisms/Sce.Rakefile +0 -66
- data/install_scripts/organisms/Spo.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -252
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -83
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -85
- data/lib/rbbt/sources/gscholar.rb +0 -74
- data/lib/rbbt/sources/organism.rb +0 -241
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -248
- data/lib/rbbt/util/arrayHash.rb +0 -266
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -251
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -35
- data/tasks/install.rake +0 -124
- data/test/rbbt/bow/test_bow.rb +0 -33
- data/test/rbbt/bow/test_classifier.rb +0 -72
- data/test/rbbt/bow/test_dictionary.rb +0 -91
- data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
- data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
- data/test/rbbt/ner/test_abner.rb +0 -17
- data/test/rbbt/ner/test_banner.rb +0 -17
- data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
- data/test/rbbt/ner/test_regexpNER.rb +0 -33
- data/test/rbbt/ner/test_rner.rb +0 -126
- data/test/rbbt/ner/test_rnorm.rb +0 -47
- data/test/rbbt/sources/test_biocreative.rb +0 -38
- data/test/rbbt/sources/test_biomart.rb +0 -31
- data/test/rbbt/sources/test_entrez.rb +0 -49
- data/test/rbbt/sources/test_go.rb +0 -24
- data/test/rbbt/sources/test_organism.rb +0 -59
- data/test/rbbt/sources/test_polysearch.rb +0 -27
- data/test/rbbt/sources/test_pubmed.rb +0 -39
- data/test/rbbt/util/test_arrayHash.rb +0 -257
- data/test/rbbt/util/test_filecache.rb +0 -37
- data/test/rbbt/util/test_index.rb +0 -31
- data/test/rbbt/util/test_misc.rb +0 -20
- data/test/rbbt/util/test_open.rb +0 -110
- data/test/rbbt/util/test_simpleDSL.rb +0 -57
- data/test/rbbt/util/test_tmpfile.rb +0 -21
- data/test/test_helper.rb +0 -4
- data/test/test_rbbt.rb +0 -11
@@ -1,241 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rbbt/util/open'
|
3
|
-
require 'rbbt/util/index'
|
4
|
-
|
5
|
-
# This module contains some Organism centric functionalities. Each organism is
|
6
|
-
# identified by a keyword.
|
7
|
-
module Organism
|
8
|
-
|
9
|
-
# Raised when trying to access information for an organism that has not been
|
10
|
-
# prepared already.
|
11
|
-
class OrganismNotProcessedError < StandardError; end
|
12
|
-
|
13
|
-
# Return the list of all supported organisms. The prepared flag is used to
|
14
|
-
# show only those that have been prepared.
|
15
|
-
# Lists the organism keywords found as directories under
# <Rbbt.datadir>/organisms.
#
# prepared:: when true (the default) only directories that contain an
#            'identifiers' file are reported; when false every directory is.
#
# Returns an Array with the directory base names.
def self.all(prepared = true)
  organisms_dir = File.join(Rbbt.datadir,'/organisms/')

  unless prepared
    directories = Dir.glob(organisms_dir + '/*').select { |path| File.directory?(path) }
    return directories.collect { |path| File.basename(path) }
  end

  # The organism keyword is the directory holding the identifiers file.
  Dir.glob(organisms_dir + '/*/identifiers').collect do |identifiers|
    File.basename(File.dirname(identifiers))
  end
end
|
22
|
-
|
23
|
-
|
24
|
-
# Return the complete name of an organism. The org parameter is the organism
|
25
|
-
# keyword
|
26
|
-
# Returns the content of the organism's 'name' file, i.e. its complete name.
#
# org:: organism keyword, used as the directory name under
#       <Rbbt.datadir>/organisms.
#
# Raises OrganismNotProcessedError when the 'name' file does not exist.
def self.name(org)
  name_file = File.join(Rbbt.datadir,"organisms/#{ org }/name")

  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  raise OrganismNotProcessedError, "Missing 'name' file" unless File.exist?(name_file)

  Open.read(name_file)
end
|
30
|
-
|
31
|
-
# Hash linking all the organism log names with their keywords in Rbbt. Its
|
32
|
-
# the inverse of the name method.
|
33
|
-
# Maps the stripped, lower-cased complete name of every prepared organism to
# its keyword. Filled once, while the module is being loaded.
NAME2ORG = {}
Organism.all.each do |org|
  NAME2ORG[Organism.name(org).strip.downcase] = org
end
|
38
|
-
|
39
|
-
|
40
|
-
# Return the key word associated with an organism.
|
41
|
-
# Looks up the organism keyword for a complete organism name. Surrounding
# whitespace and letter case are ignored. Returns nil for unknown names.
def self.name2org(name)
  normalized = name.strip.downcase
  NAME2ORG[normalized]
end
|
44
|
-
|
45
|
-
# FIXME: The NER related stuff is harder to install, thats why we hide the
|
46
|
-
# requires next to where they are needed, next to options
|
47
|
-
|
48
|
-
# Return a NER object which could be of RNER, Abner or Banner class, this is
|
49
|
-
# selected using the type parameter.
|
50
|
-
# Builds a named entity recognizer.
#
# org::     organism keyword; only used to look for an organism specific
#           model when type is :rner.
# type::    :abner, :banner or :rner (anything responding to to_sym).
# options:: :model overrides the model path used for :rner.
#
# The NER libraries are required lazily, right before they are used, because
# they are harder to install than the rest of the package.
#
# Raises RuntimeError for an unknown type.
def self.ner(org, type=:rner, options = {})
  kind = type.to_sym

  if kind == :abner
    require 'rbbt/ner/abner'
    Abner.new
  elsif kind == :banner
    require 'rbbt/ner/banner'
    Banner.new
  elsif kind == :rner
    require 'rbbt/ner/rner'
    organism_model = File.join(Rbbt.datadir,"ner/model/#{ org }")
    fallback_model = File.join(Rbbt.datadir,'ner/model/BC2')

    # Precedence: explicit option, then organism model if present, then BC2.
    model = options[:model]
    model ||= File.exist?(organism_model) ? organism_model : fallback_model
    NER.new(model)
  else
    raise "Ner type (#{ type }) unknown"
  end
end
|
70
|
-
|
71
|
-
# Return a normalization object.
|
72
|
-
# Builds a Normalizer for the organism's lexicon file.
#
# org::       organism keyword.
# to_entrez:: index handed to the Normalizer as :to_entrez. When nil it is
#             built with id_index, using 'Entrez Gene Id' as native format
#             and the organism's first supported format as the other one.
#
# The Normalizer receives <datadir>/norm/config/<org>.config as :file when
# that file exists and nil otherwise, plus :max_candidates => 20.
def self.norm(org, to_entrez = nil)
  require 'rbbt/ner/rnorm'

  if to_entrez.nil?
    to_entrez = id_index(org, :native => 'Entrez Gene Id', :other => [supported_ids(org).first])
  end

  token_file = File.join(Rbbt.datadir, 'norm','config',org.to_s + '.config')
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  token_file = nil unless File.exist?(token_file)

  Normalizer.new(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"), :to_entrez => to_entrez, :file => token_file, :max_candidates => 20)
end
|
85
|
-
|
86
|
-
# Returns a hash with the names associated with each gene id. The ids are
|
87
|
-
# in Rbbt native format for that organism.
|
88
|
-
# Loads the organism's lexicon file through Open.to_hash.
#
# options:: merged over the defaults :sep => "\t|\\|" and :flatten => true
#           before being passed to Open.to_hash.
def self.lexicon(org, options = {})
  defaults = {:sep => "\t|\\|", :flatten => true}
  lexicon_file = File.join(Rbbt.datadir,"organisms/#{ org }/lexicon")
  Open.to_hash(lexicon_file, defaults.merge(options))
end
|
92
|
-
|
93
|
-
# Returns a hash with the list of go terms for each gene id. Gene ids are in
|
94
|
-
# Rbbt native format for that organism.
|
95
|
-
# Loads the organism's 'gene.go' file through Open.to_hash with
# :flatten => true.
def self.goterms(org)
  go_file = File.join(Rbbt.datadir,"organisms/#{ org }/gene.go")
  Open.to_hash(go_file, :flatten => true)
end
|
98
|
-
|
99
|
-
# Return list of PubMed ids associated to the organism. Determined using a
|
100
|
-
# PubMed query with the name of the organism
|
101
|
-
# Returns every run of digits found in the organism's 'all.pmid' file, as an
# Array of Strings.
def self.literature(org)
  pmid_file = File.join(Rbbt.datadir,"organisms/#{ org }/all.pmid")
  content = Open.read(pmid_file)
  content.scan(/\d+/)
end
|
104
|
-
|
105
|
-
# Return hash that associates genes to a list of PubMed ids.
|
106
|
-
# Loads the organism's 'gene.pmid' file through Open.to_hash with
# :flatten => true.
def self.gene_literature(org)
  pmid_file = File.join(Rbbt.datadir,"organisms/#{ org }/gene.pmid")
  Open.to_hash(pmid_file, :flatten => true)
end
|
109
|
-
|
110
|
-
# Return hash that associates genes to a list of PubMed ids. Includes only
|
111
|
-
# those found to support GO term associations.
|
112
|
-
# Loads the organism's 'gene_go.pmid' file through Open.to_hash with
# :flatten => true.
def self.gene_literature_go(org)
  pmid_file = File.join(Rbbt.datadir,"organisms/#{ org }/gene_go.pmid")
  Open.to_hash(pmid_file, :flatten => true)
end
|
115
|
-
|
116
|
-
# Returns a list with the names of the id formats supported for an organism.
|
117
|
-
# If examples are produced, the list is of [format, example] pairs.
|
118
|
-
#
|
119
|
-
# *Options:*
|
120
|
-
#
|
121
|
-
# *examples:* Include example ids for each format
|
122
|
-
# Returns the names of the id formats listed in the header of the organism's
# 'identifiers' file.
#
# The header is the first line of the file and is only recognised when it
# starts with '#'; a first line without '#' is skipped and yields no formats.
#
# options:: with :examples set, the result is an Array of [format, example]
#           pairs. The examples come from the data line with the most fields
#           containing a word character; for each field the first of its
#           '|' separated values is used.
#
# Returns an empty Array, instead of failing, when the file has no header
# line or is empty.
def self.supported_ids(org, options = {})
  want_examples = options[:examples]
  formats = []
  examples = []

  identifiers = Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"))
  identifiers.each_line.each_with_index do |line, number|
    if number == 0
      formats = Open.fields(line.sub(/^[\s#]+/,'')).collect { |n| n.strip } if line =~ /^\s*#/
      # Without examples nothing after the first line matters.
      return formats unless want_examples
      next
    end

    fields = Open.fields(line)
    if fields.select { |name| name && name =~ /\w/ }.length > examples.compact.length
      examples = fields.collect { |name| name.to_s.split(/\|/).first }
    end
  end

  return formats unless want_examples

  formats.zip(examples)
end
|
143
|
-
|
144
|
-
# Creates a hash where each possible id is associated with the names of the
|
145
|
-
# formats (its potentially possible for different formats to have the same
|
146
|
-
# id). This is used in the guessIdFormat method.
|
147
|
-
# Builds a Hash that maps every lower-cased id found in the organism's
# 'identifiers' file to the Array of format names it appears under (the same
# id can show up in several formats). Used by guessIdFormat.
#
# Columns are matched to the formats returned by supported_ids by position,
# and each column may hold several ids separated by '|'.
#
# NOTE(review): every line is processed, including the '#' header line, so
# the header fields themselves also end up as keys.
def self.id_formats(org)
  formats = supported_ids(org)
  id_types = {}

  # each_line is what supported_ids already calls on this same value.
  Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers")).each_line do |line|
    formats.zip(Open.fields(line)).each do |format, ids|
      # zip pads missing columns with nil
      (ids || "").split(/\|/).each do |id|
        next if id.nil? || id == ""

        known = (id_types[id.downcase] ||= [])
        known << format unless known.include? format
      end
    end
  end

  id_types
end
|
175
|
-
|
176
|
-
# Guesses which id format a list of ids is written in.
#
# formats:: either the Hash produced by id_formats (lower-cased id => Array
#           of format names) or a String organism keyword, in which case
#           id_formats is called for it.
# query::   Array of ids; nil entries are dropped and the rest are compared
#           lower-cased and de-duplicated.
#
# Returns the [format, count] pair with the highest count among the formats
# matched by more than a tenth (integer division) of the distinct ids, or
# nil when nothing qualifies.
def self.guessIdFormat(formats, query)
  genes = query.compact.collect { |gene| gene.downcase }.uniq
  formats = id_formats(formats) if String === formats

  return nil if formats.values.empty?

  hits = formats.values_at(*genes)
  return nil if hits.empty?

  # Count each format once per id that matched it.
  tally = {}
  hits.compact.each do |types|
    types.uniq.flatten.each do |format|
      tally[format] = (tally[format] || 0) + 1
    end
  end

  return nil if tally.values.empty?

  threshold = genes.length / 10
  tally.select { |_format, count| count > threshold }.sort { |a, b| b[1] <=> a[1] }.first
end
|
195
|
-
|
196
|
-
# Returns the index of id_name within supported_ids.
#
# Names are compared after stripping whitespace. Unless
# options[:case_sensitive] is set, a match that only differs in letter case
# is accepted too. When several entries match, the last one wins.
#
# NOTE(review): when nothing matches the result is 0, not nil, so a missing
# format is indistinguishable from the first one.
def self.id_position(supported_ids, id_name, options = {})
  position = 0

  supported_ids.each_with_index do |id, index|
    candidate = id.strip
    wanted = id_name.strip
    matched = candidate == wanted ||
              (!options[:case_sensitive] && candidate.downcase == wanted.downcase)
    position = index if matched
  end

  position
end
|
205
|
-
|
206
|
-
# Builds an index over the organism's 'identifiers' file with Index.index.
#
# options:: :native         name of the format to use as native (target) ids.
#           :other          Array with the names of the formats to map from.
#           :case_sensitive defaults to false.
#           Everything in options is passed on to Index.index.
#
# With neither :native nor :other the options are passed on untouched (apart
# from the :case_sensitive default). Otherwise the format names are turned
# into column positions: :native becomes the position of the native format
# (0 when not given), :extra the positions of the other formats (every
# remaining column when :other is not given) and :sep is set to "\t".
#
# The caller's Hash is never modified: the positions are written to a copy,
# so the same options can be reused for another call.
def self.id_index(org, options = {})
  options = options.dup
  native = options[:native]
  other = options[:other]
  options[:case_sensitive] = false if options[:case_sensitive].nil?

  identifiers = File.join(Rbbt.datadir,"organisms/#{ org }/identifiers")
  return Index.index(identifiers, options) if native.nil? && other.nil?

  supported = Organism.supported_ids(org)

  first = 0
  if native
    first = id_position(supported,native,options)
    # NOTE(review): only reachable if id_position reports a miss as nil.
    raise "No match for native format '#{ native }'" if first.nil?
  end

  rest = if other
           other.collect { |name| id_position(supported,name, options) }
         else
           (0..supported.length - 1).to_a - [first]
         end

  options[:native] = first
  options[:extra] = rest
  options[:sep] = "\t"

  Index.index(identifiers, options)
end
|
239
|
-
|
240
|
-
end
|
241
|
-
|
@@ -1,117 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rbbt/util/open'
|
3
|
-
require 'rbbt/ner/regexpNER'
|
4
|
-
require 'rbbt/ner/dictionaryNER'
|
5
|
-
|
6
|
-
# Find terms in the Polysearch thesauri using simple regular expression
|
7
|
-
# matching. Note that the first time the methods are used the correspondent
|
8
|
-
# thesaurus are loaded into memory. The available thesauri are: disease, drug,
|
9
|
-
# metabolite, organ, subcellular (subcellular localization) and tissue.
|
10
|
-
# Finds terms from the Polysearch thesauri in text. Each thesaurus is a file
# named <type>.txt under <Rbbt.datadir>/dbs/polysearch and is loaded the
# first time it is needed, then kept in memory.
module Polysearch

  # Caches, keyed by the thesaurus name as a String so that 'disease' and
  # :disease share one entry.
  @@names = {}
  @@indexes = {}

  # Path of the thesaurus file for the given type.
  def self.thesaurus_file(type)
    File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt')
  end
  private_class_method :thesaurus_file

  # Hash loaded from the thesaurus file with Open.to_hash and
  # :single => true; used by name to translate codes.
  def self.type_names(type) #:nodoc:
    @@names[type.to_s] ||= Open.to_hash(thesaurus_file(type), :single => true)
  end

  # RegExpNER built over the thesaurus file. The stopwords are the content of
  # the global $stopwords (none when it is not set), plus 'use' for the
  # disease thesaurus. They are collected in a copy, so $stopwords itself is
  # never modified.
  def self.type_index(type) #:nodoc:
    @@indexes[type.to_s] ||= begin
      stopwords = ($stopwords || []).dup
      stopwords << 'use' if type.to_sym == :disease
      RegExpNER.new(thesaurus_file(type), :stopwords => stopwords)
    end
  end

  # Finds matches in a string of text.
  #
  # types:: thesaurus name or Array of names; nil means all of disease, drug,
  #         metabolite, organ, subcellular and tissue.
  #
  # Returns a Hash with the results of every requested thesaurus merged
  # together; the thesauri are processed in sorted order and later ones
  # overwrite earlier ones on equal keys.
  def self.match(text, types = nil)
    types = %w(disease drug metabolite organ subcellular tissue) if types.nil?
    types = [types] unless Array === types

    matches = {}
    types.sort.each do |type|
      matches.merge!(type_index(type).match(text))
    end
    matches
  end

  # Transform the code into a name, type is the thesaurus to use.
  def self.name(type, code)
    type_names(type)[code]
  end

end
|
59
|
-
|
60
|
-
# Ad-hoc benchmark, only run when this file is executed directly: times one
# disease lookup over a sample abstract, prints a flat ruby-prof profile of a
# second lookup and finally times ten more lookups.
if $PROGRAM_NAME == __FILE__
  text =<<-EOT

Background Microorganisms adapt their transcriptome by integrating
multiple chemical and physical signals from their environment. Shake-flask
cultivation does not allow precise manipulation of individual culture
parameters and therefore precludes a quantitative analysis of the
(combinatorial) influence of these parameters on transcriptional
regulation. Steady-state chemostat cultures, which do enable accurate
control, measurement and manipulation of individual cultivation parameters
(e.g. specific growth rate, temperature, identity of the growth-limiting
nutrient) appear to provide a promising experimental platform for such a
combinatorial analysis. Results A microarray compendium of 170
steady-state chemostat cultures of the yeast Saccharomyces cerevisiae is
presented and analyzed. The 170 microarrays encompass 55 unique
conditions, which can be characterized by the combined settings of 10
different cultivation parameters. By applying a regression model to assess
the impact of (combinations of) cultivation parameters on the
transcriptome, most S. cerevisiae genes were shown to be influenced by
multiple cultivation parameters, and in many cases by combinatorial
effects of cultivation parameters. The inclusion of these combinatorial
effects in the regression model led to higher explained variance of the
gene expression patterns and resulted in higher function enrichment in
subsequent analysis. We further demonstrate the usefulness of the
compendium and regression analysis for interpretation of shake-flask-based
transcriptome studies and for guiding functional analysis of
(uncharacterized) genes and pathways. Conclusions Modeling the
combinatorial effects of environmental parameters on the transcriptome is
crucial for understanding transcriptional regulation. Chemostat
cultivation offers a powerful tool for such an approach. Keywords:
chemostat steady state samples Cerebellar stroke syndrome


  EOT

  require 'benchmark'
  require 'ruby-prof'

  # First lookup: includes loading the thesaurus.
  first_run = Benchmark.measure do
    p Polysearch.match(text,'disease')
  end
  puts first_run

  RubyProf.start
  Polysearch.match(text,'disease')
  profile = RubyProf.stop

  # Print a flat profile to text
  RubyProf::FlatPrinter.new(profile).print(STDOUT, 0)

  repeated_runs = Benchmark.measure do
    10.times { p Polysearch.match(text,'disease') }
  end
  puts repeated_runs
end
|
data/lib/rbbt/sources/pubmed.rb
DELETED
@@ -1,248 +0,0 @@
|
|
1
|
-
require 'rbbt/util/filecache'
|
2
|
-
require 'rbbt/util/open'
|
3
|
-
require 'rbbt/sources/gscholar'
|
4
|
-
require 'rbbt'
|
5
|
-
require 'libxml'
|
6
|
-
|
7
|
-
# This module offers an interface with PubMed, to perform queries, and
|
8
|
-
# retrieve simple information from articles. It uses the caching
|
9
|
-
# services of Rbbt.
|
10
|
-
# Interface with PubMed: performs queries and retrieves simple information
# from articles, keeping the downloaded article XML in the Rbbt file cache.
module PubMed

  # Time of the last efetch request and the minimum number of seconds left
  # between two of them.
  @@last = Time.now
  @@pubmed_lag = 1

  # Largest number of ids sent to efetch in a single request by get_article.
  EFETCH_CHUNK_SIZE = 100

  # Downloads the XML of one or several articles from efetch. Meant for
  # internal use by get_article; it does not look at the cache.
  #
  # pmids:: a single id or an Array of ids.
  #
  # Returns the <PubmedArticle> XML String of the first article for a single
  # id (nil when none came back), or a Hash of PMID String => XML String for
  # an Array. Articles in which no PMID can be found are left out.
  def self.get_online(pmids)
    pmid_list = pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"

    # Wait until @@pubmed_lag seconds have passed since the last request.
    elapsed = Time.now - @@last
    sleep(@@pubmed_lag - elapsed) unless elapsed > @@pubmed_lag

    xml = Open.read(url, :quiet => true, :nocache => true)

    @@last = Time.now

    # /m makes '.' match newlines; the pattern is plain ASCII so no encoding
    # flag is needed.
    articles = xml.scan(/<PubmedArticle>.*?<\/PubmedArticle>/m)

    return articles.first unless pmids.is_a? Array

    list = {}
    articles.each do |article|
      # The opening tag may carry attributes, e.g. <PMID Version="1">.
      pmid = article[/<PMID[^>]*>(.*?)<\/PMID>/, 1]
      list[pmid] = article unless pmid.nil?
    end
    list
  end

  # Processes the xml of an article as served by MedLine and extracts
  # the abstract, title and journal information
  class Article

    # Attribute name and the XPath, relative to the Article node, it is read
    # from.
    XML_KEYS = [
      [:title    , "ArticleTitle"],
      [:journal  , "Journal/Title"],
      [:issue    , "Journal/JournalIssue/Issue"],
      [:volume   , "Journal/JournalIssue/Volume"],
      [:issn     , "Journal/ISSN"],
      [:year     , "Journal/JournalIssue/PubDate/Year"],
      [:month    , "Journal/JournalIssue/PubDate/Month"],
      [:pages    , "Pagination/MedlinePgn"],
      [:abstract , "Abstract/AbstractText"],
    ]

    # Template for the PDF address; PMCID is replaced by the article's id.
    PMC_PDF_URL = "http://www.ncbi.nlm.nih.gov/pmc/articles/PMCID/pdf/"

    # Wraps every word that contains two consecutive capital letters in
    # braces, so that BibTeX keeps its capitalisation.
    def self.escape_title(title)
      title.gsub(/(\w*[A-Z][A-Z]+\w*)/, '{\1}')
    end

    # Builds a BibTeX key from the last name (whitespace turned into '_'),
    # the year ("NOYEAR" when nil) and an abbreviation of the title: its
    # first word when longer than three characters, otherwise the initials
    # of its first three words. A title without words adds nothing.
    def self.make_bibentry(lastname, year, title)
      words = title.downcase.scan(/\w+/)
      first_word = words.first.to_s

      if first_word.length > 3
        abrev = first_word
      else
        abrev = words[0..2].collect { |w| w.chars.first } * ""
      end

      [lastname.gsub(/\s/,'_'), year || "NOYEAR", abrev] * ""
    end

    # Parses the XML of one <PubmedArticle> with LibXML.
    #
    # Returns a Hash with :pmid, the XML_KEYS fields that are present,
    # :author ("Last, Fore" entries joined by " and "), :bibentry (lower
    # case, built from the first author with a last name, when the article
    # has a title) and :pmc_pdf (PDF address, or nil without a "pmc" id).
    #
    # Authors without a LastName element are left out of :author.
    def self.parse_xml(xml)
      parser  = LibXML::XML::Parser.string(xml)
      pubmed  = parser.parse.find("/PubmedArticle").first
      medline = pubmed.find("MedlineCitation").first
      article = medline.find("Article").first

      info = {}

      info[:pmid] = medline.find("PMID").first.content

      XML_KEYS.each do |name, key|
        node = article.find(key).first
        info[name] = node.content unless node.nil?
      end

      bibentry = nil
      authors = article.find("AuthorList/Author").collect do |author|
        lastname_node = author.find("LastName").first
        next nil if lastname_node.nil?

        lastname = lastname_node.content
        forename_node = author.find("ForeName").first
        forename = nil
        unless forename_node.nil?
          # Single letters are initials and get a trailing dot.
          forename = forename_node.content.split(/\s/).collect { |word| word.length == 1 ? word + '.' : word } * " "
        end

        bibentry ||= make_bibentry(lastname, info[:year], info[:title]) if info[:title]

        [lastname, forename].compact * ", "
      end
      info[:author] = authors.compact * " and "

      info[:bibentry] = bibentry.downcase if bibentry

      pmc_id = pubmed.find("PubmedData/ArticleIdList/ArticleId").detect { |id| id[:IdType] == "pmc" }
      info[:pmc_pdf] = pmc_id && PMC_PDF_URL.sub(/PMCID/, pmc_id.content)

      info
    end

    attr_accessor :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf
    attr_accessor(*XML_KEYS.collect { |p| p.first })
    # Only the writer: the reader is the pdf_url method below.
    attr_writer :pdf_url

    # xml:: XML of one <PubmedArticle>; nil or empty leaves every attribute
    #       unset.
    def initialize(xml)
      return if xml.nil? || xml.empty?

      PubMed::Article.parse_xml(xml).each do |key, value|
        send("#{ key }=", value)
      end
    end

    # Address of the article's PDF: the PMC one when known, otherwise
    # whatever GoogleScholar::full_text_url finds for the title (remembered
    # in gscholar_pdf).
    def pdf_url
      return pmc_pdf if pmc_pdf
      @gscholar_pdf ||= GoogleScholar::full_text_url title
    end

    # Downloads the PDF with wget and converts it with pdftotext.
    #
    # Returns the text, or nil when there is no PDF address or pdftotext
    # produced no file. Both commands are run without a shell, so characters
    # such as '&' or ';' in the address are passed through literally.
    def full_text
      url = pdf_url
      return nil if url.nil?

      text = nil
      TmpFile.with_file do |pdf|

        # Change user-agent, oh well...
        system('wget', '--user-agent=firefox', url.to_s, '-O', pdf)
        TmpFile.with_file do |txt|
          system('pdftotext', pdf, txt)
          text = Open.read(txt) if File.exist? txt
        end
      end

      text
    end

    # BibTeX @article entry with the author, every XML_KEYS field that is
    # set (issue is written as 'number'), the PDF address when there is one
    # and the PMID.
    def bibtex
      keys = [:author] + XML_KEYS.collect { |p| p.first } - [:bibentry]
      entry = "@article{#{bibentry},\n"

      keys.each do |key|
        value = send(key)
        next if value.nil?

        case key
        when :title
          entry << " title = { #{ PubMed::Article.escape_title title } },\n"
        when :issue
          entry << " number = { #{ issue } },\n"
        else
          entry << " #{ key } = { #{ value } },\n"
        end
      end

      url = pdf_url
      entry << " fulltext = { #{ url } },\n" if url
      entry << " pmid = { #{ pmid } }\n}"

      entry
    end

    # Join the text from title and abstract
    def text
      [title, abstract].join("\n")
    end
  end

  # Returns the Article object containing the information for the PubMed
  # ID specified as an argument. If +pmid+ is an array instead of a single
  # identifier it returns a hash with the Article object for each id, keyed
  # by the ids exactly as they were passed in.
  #
  # Article XML is taken from the Rbbt cache when present; otherwise it is
  # downloaded, at most EFETCH_CHUNK_SIZE ids per request, and cached.
  def self.get_article(pmid)
    unless pmid.is_a? Array
      filename = pmid.to_s + '.xml'
      cached = FileCache.path(filename)
      return Article.new(Open.read(cached)) if File.exist? cached

      xml = get_online(pmid)
      FileCache.add_file(filename,xml)
      return Article.new(xml)
    end

    list = {}
    missing = []

    pmid.each do |p|
      cached = FileCache.path(p.to_s + '.xml')
      if File.exist? cached
        list[p] = Article.new(Open.read(cached))
      else
        missing << p
      end
    end

    return list if missing.empty?

    # get_online reports ids as Strings; map them back to the caller's ids.
    requested = {}
    missing.each { |p| requested[p.to_s] = p }

    missing.each_slice(EFETCH_CHUNK_SIZE) do |chunk|
      get_online(chunk).each do |id, xml|
        FileCache.add_file(id + '.xml',xml, :force => true)
        list[requested.fetch(id, id)] = Article.new(xml)
      end
    end

    list
  end

  # Performs the specified query and returns an array with the PubMed
  # Ids returned. +retmax+ can be used to limit the number of ids
  # returned, if is not specified 30000 is used.
  #
  # NOTE(review): the query is inserted in the URL as given; callers
  # presumably have to escape it themselves - confirm against Open.read.
  def self.query(query, retmax=nil)
    retmax ||= 30000

    Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmax=#{retmax}&db=pubmed&term=#{query}",:quiet => true, :nocache => true).scan(/<Id>(\d+)<\/Id>/).flatten
  end
end
|