bio 1.1.0 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,7 +5,7 @@
5
5
  #
6
6
  # License:: The Ruby License
7
7
  #
8
- # $Id: flatfile.rb,v 1.60 2007/07/09 14:08:34 ngoto Exp $
8
+ # $Id: flatfile.rb,v 1.61 2007/11/15 07:07:16 k Exp $
9
9
  #
10
10
  #
11
11
  # Bio::FlatFile is a helper and wrapper class to read a biological data file.
@@ -1130,7 +1130,7 @@ module Bio
1130
1130
  genpept = RuleRegexp[ 'Bio::GenPept',
1131
1131
  /^LOCUS .+ aa .+/ ],
1132
1132
  medline = RuleRegexp[ 'Bio::MEDLINE',
1133
- /^UI \- [0-9]+$/ ],
1133
+ /^PMID\- [0-9]+$/ ],
1134
1134
  embl = RuleRegexp[ 'Bio::EMBL',
1135
1135
  /^ID .+\; .*(DNA|RNA|XXX)\;/ ],
1136
1136
  sptr = RuleRegexp2[ 'Bio::SPTR',
@@ -4,7 +4,7 @@
4
4
  # Copyright:: Copyright (C) 2002 GOTO Naohisa <ng@bioruby.org>
5
5
  # License:: The Ruby License
6
6
  #
7
- # $Id: indexer.rb,v 1.25 2007/04/05 23:35:41 trevor Exp $
7
+ # $Id: indexer.rb,v 1.26 2007/12/11 15:13:32 ngoto Exp $
8
8
  #
9
9
 
10
10
  require 'bio/io/flatfile/index'
@@ -714,7 +714,7 @@ module Bio
714
714
 
715
715
  ##############################################################
716
716
  def self.formatstring2class(format_string)
717
- case format
717
+ case format_string
718
718
  when /genbank/i
719
719
  dbclass = Bio::GenBank
720
720
  when /genpept/i
@@ -4,7 +4,7 @@
4
4
  # Copyright:: Copyright (C) 2003, 2004 Toshiaki Katayama <k@bioruby.org>
5
5
  # License:: The Ruby License
6
6
  #
7
- # $Id: keggapi.rb,v 1.14 2007/04/05 23:35:41 trevor Exp $
7
+ # $Id: keggapi.rb,v 1.15 2007/07/20 21:56:45 k Exp $
8
8
  #
9
9
 
10
10
  require 'bio/io/soapwsdl'
@@ -331,6 +331,7 @@ class API < Bio::SOAPWSDL
331
331
  def add_filter(results)
332
332
  if results.is_a?(Array)
333
333
  results.each do |result|
334
+ next if result.is_a?(Fixnum)
334
335
  def result.filter(fields)
335
336
  fields.collect { |field| self.send(field) }
336
337
  end
@@ -1,16 +1,15 @@
1
1
  #
2
2
  # = bio/io/pubmed.rb - NCBI Entrez/PubMed client module
3
3
  #
4
- # Copyright:: Copyright (C) 2001 Toshiaki Katayama <k@bioruby.org>
4
+ # Copyright:: Copyright (C) 2001, 2007 Toshiaki Katayama <k@bioruby.org>
5
5
  # Copyright:: Copyright (C) 2006 Jan Aerts <jan.aerts@bbsrc.ac.uk>
6
6
  # License:: The Ruby License
7
7
  #
8
- # $Id: pubmed.rb,v 1.16 2007/04/05 23:35:41 trevor Exp $
8
+ # $Id: pubmed.rb,v 1.23 2007/12/12 13:53:26 k Exp $
9
9
  #
10
10
 
11
- require 'net/http'
12
- require 'cgi' unless defined?(CGI)
13
11
  require 'bio/command'
12
+ require 'cgi' unless defined?(CGI)
14
13
 
15
14
  module Bio
16
15
 
@@ -18,18 +17,19 @@ module Bio
18
17
  #
19
18
  # The Bio::PubMed class provides several ways to retrieve bibliographic
20
19
  # information from the PubMed database at
21
- # http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=PubMed. Basically, two
22
- # types of queries are possible:
20
+ # http://www.ncbi.nlm.nih.gov/sites/entrez?db=PubMed
21
+ #
22
+ # Basically, two types of queries are possible:
23
23
  #
24
24
  # * searching for PubMed IDs given a query string:
25
- # * Bio::PubMed#search
26
- # * Bio::PubMed#esearch
25
+ # * Bio::PubMed#esearch (recommended)
26
+ # * Bio::PubMed#search (only retrieves top 20 hits)
27
27
  #
28
28
  # * retrieving the MEDLINE text (i.e. authors, journal, abstract, ...)
29
29
  # given a PubMed ID
30
- # * Bio::PubMed#query
31
- # * Bio::PubMed#pmfetch
32
- # * Bio::PubMed#efetch
30
+ # * Bio::PubMed#efetch (recommended)
31
+ # * Bio::PubMed#query (unstable: depends on the HTML design, which may change)
32
+ # * Bio::PubMed#pmfetch (still working but could be obsoleted by NCBI)
33
33
  #
34
34
  # The different methods within the same group are interchangeable and should
35
35
  # return the same result.
@@ -37,54 +37,61 @@ module Bio
37
37
  # Additional information about the MEDLINE format and PubMed programmable
38
38
  # APIs can be found on the following websites:
39
39
  #
40
- # * Overview: http://www.ncbi.nlm.nih.gov/entrez/query/static/overview.html
41
- # * How to link: http://www.ncbi.nlm.nih.gov/entrez/query/static/linking.html
42
- # * MEDLINE format: http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html#MEDLINEDisplayFormat
43
- # * Search field descriptions and tags: http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html#SearchFieldDescriptionsandTags
44
- # * Entrez utilities index: http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html
45
- # * PmFetch CGI help: http://www.ncbi.nlm.nih.gov/entrez/utils/pmfetch_help.html
46
- # * E-Utilities CGI help: http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
40
+ # * PubMed Overview:
41
+ # http://www.ncbi.nlm.nih.gov/entrez/query/static/overview.html
42
+ # * PubMed help:
43
+ # http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html
44
+ # * Entrez utilities index:
45
+ # http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html
46
+ # * How to link:
47
+ # http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helplinks.chapter.linkshelp
47
48
  #
48
49
  # == Usage
49
50
  #
50
51
  # require 'bio'
51
52
  #
52
53
  # # If you don't know the pubmed ID:
53
- # Bio::PubMed.search("(genome AND analysis) OR bioinformatics)").each do |x|
54
+ # Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics").each do |x|
54
55
  # p x
55
56
  # end
56
- # Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics)").each do |x|
57
+ #
58
+ # Bio::PubMed.search("(genome AND analysis) OR bioinformatics").each do |x|
57
59
  # p x
58
60
  # end
59
61
  #
60
62
  # # To retrieve the MEDLINE entry for a given PubMed ID:
63
+ # puts Bio::PubMed.efetch(["10592173", "14693808"])
61
64
  # puts Bio::PubMed.query("10592173")
62
65
  # puts Bio::PubMed.pmfetch("10592173")
63
- # puts Bio::PubMed.efetch("10592173", "14693808")
66
+ #
64
67
  # # This can be converted into a Bio::MEDLINE object:
65
68
  # manuscript = Bio::PubMed.query("10592173")
66
- # medline = Bio::MEDLINE(manuscript)
69
+ # medline = Bio::MEDLINE.new(manuscript)
67
70
  #
68
71
  class PubMed
69
72
 
70
- # Search the PubMed database by given keywords using entrez query and returns
71
- # an array of PubMed IDs.
72
- # ---
73
- # *Arguments*:
74
- # * _id_: query string (required)
75
- # *Returns*:: array of PubMed IDs
76
- def self.search(str)
77
- host = "www.ncbi.nlm.nih.gov"
78
- path = "/entrez/query.fcgi?tool=bioruby&cmd=Search&doptcmdl=MEDLINE&db=PubMed&term="
73
+ # Run retrieval scripts on weekends or between 9 pm and 5 am Eastern Time
74
+ # weekdays for any series of more than 100 requests.
75
+ # -> Not implemented yet in BioRuby
79
76
 
80
- http = Bio::Command.new_http(host)
81
- response, = http.get(path + CGI.escape(str))
82
- result = response.body
83
- result = result.gsub("\r", "\n").squeeze("\n")
84
- result = result.scan(/<pre>(.*?)<\/pre>/m).flatten
85
- return result
77
+ # Make no more than one request every 3 seconds.
78
+ NCBI_INTERVAL = 3
79
+ @@last_access = nil
80
+
81
+ private
82
+
83
+ def ncbi_access_wait(wait = NCBI_INTERVAL)
84
+ if @@last_access
85
+ duration = Time.now - @@last_access
86
+ if wait > duration
87
+ sleep wait - duration
88
+ end
89
+ end
90
+ @@last_access = Time.now
86
91
  end
87
92
 
93
+ public
94
+
88
95
  # Search the PubMed database by given keywords using E-Utils and returns
89
96
  # an array of PubMed IDs.
90
97
  #
@@ -102,22 +109,80 @@ class PubMed
102
109
  # * _retmax_ (default 100)
103
110
  # * _retmode_
104
111
  # * _rettype_
105
- # *Returns*:: array of PubMed IDs
106
- def self.esearch(str, hash = {})
107
- hash['retmax'] = 100 unless hash['retmax']
112
+ # *Returns*:: array of PubMed IDs or a number of results
113
+ def esearch(str, hash = {})
114
+ return nil if str.empty?
108
115
 
109
- opts = []
110
- hash.each do |k, v|
111
- opts << "#{k}=#{v}"
116
+ serv = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
117
+ opts = {
118
+ "retmax" => 100,
119
+ "tool" => "bioruby",
120
+ "db" => "pubmed",
121
+ "term" => str
122
+ }
123
+ opts.update(hash)
124
+
125
+ ncbi_access_wait
126
+
127
+ response, = Bio::Command.post_form(serv, opts)
128
+ result = response.body
129
+ if opts['rettype'] == 'count'
130
+ result = result.scan(/<Count>(.*?)<\/Count>/m).flatten.first.to_i
131
+ else
132
+ result = result.scan(/<Id>(.*?)<\/Id>/m).flatten
112
133
  end
134
+ return result
135
+ end
136
+
137
+ # Retrieve PubMed entry by PMID and returns MEDLINE formatted string using
138
+ # entrez efetch. Multiple PubMed IDs can be provided:
139
+ # Bio::PubMed.efetch(123)
140
+ # Bio::PubMed.efetch([123,456,789])
141
+ # ---
142
+ # *Arguments*:
143
+ # * _ids_: list of PubMed IDs (required)
144
+ # *Returns*:: Array of MEDLINE formatted Strings (the unsplit response body when retmode is not "text")
145
+ def efetch(ids, hash = {})
146
+ return nil if ids.to_s.empty?
147
+ ids = ids.join(",") if ids === Array
148
+
149
+ serv = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
150
+ opts = {
151
+ "tool" => "bioruby",
152
+ "db" => "pubmed",
153
+ "retmode" => "text",
154
+ "rettype" => "medline",
155
+ "id" => ids,
156
+ }
157
+ opts.update(hash)
158
+
159
+ ncbi_access_wait
160
+
161
+ response, = Bio::Command.post_form(serv, opts)
162
+ result = response.body
163
+ if opts["retmode"] == "text"
164
+ result = result.split(/\n\n+/)
165
+ end
166
+ return result
167
+ end
168
+
169
+ # Search the PubMed database by given keywords using entrez query and returns
170
+ # an array of PubMed IDs. Caution: this method returns the first 20 hits only.
171
+ # Instead, use of the 'esearch' method is strongly recommended.
172
+ # ---
173
+ # *Arguments*:
174
+ # * _str_: query string (required)
175
+ # *Returns*:: array of PubMed IDs
176
+ def search(str)
177
+ host = "www.ncbi.nlm.nih.gov"
178
+ path = "/sites/entrez?tool=bioruby&cmd=Search&doptcmdl=Brief&db=PubMed&term="
113
179
 
114
- host = "eutils.ncbi.nlm.nih.gov"
115
- path = "/entrez/eutils/esearch.fcgi?tool=bioruby&db=pubmed&#{opts.join('&')}&term="
180
+ ncbi_access_wait
116
181
 
117
182
  http = Bio::Command.new_http(host)
118
183
  response, = http.get(path + CGI.escape(str))
119
184
  result = response.body
120
- result = result.scan(/<Id>(.*?)<\/Id>/m).flatten
185
+ result = result.scan(/value="(\d+)" id="UidCheckBox"/m).flatten
121
186
  return result
122
187
  end
123
188
 
@@ -127,18 +192,27 @@ class PubMed
127
192
  # *Arguments*:
128
193
  # * _id_: PubMed ID (required)
129
194
  # *Returns*:: MEDLINE formatted String
130
- def self.query(id)
195
+ def query(*ids)
131
196
  host = "www.ncbi.nlm.nih.gov"
132
- path = "/entrez/query.fcgi?tool=bioruby&cmd=Text&dopt=MEDLINE&db=PubMed&uid="
197
+ path = "/sites/entrez?tool=bioruby&cmd=Text&dopt=MEDLINE&db=PubMed&uid="
198
+ list = ids.join(",")
199
+
200
+ ncbi_access_wait
133
201
 
134
202
  http = Bio::Command.new_http(host)
135
- response, = http.get(path + id.to_s)
203
+ response, = http.get(path + list)
136
204
  result = response.body
137
- if result =~ /#{id}\s+Error/
205
+ result = result.scan(/<pre>\s*(.*?)<\/pre>/m).flatten
206
+
207
+ if result =~ /id:.*Error occurred/
208
+ # id: xxxxx Error occurred: Article does not exist
138
209
  raise( result )
139
210
  else
140
- result = result.gsub("\r", "\n").squeeze("\n").gsub(/<\/?pre>/, '')
141
- return result
211
+ if ids.size > 1
212
+ return result
213
+ else
214
+ return result.first
215
+ end
142
216
  end
143
217
  end
144
218
 
@@ -148,10 +222,12 @@ class PubMed
148
222
  # *Arguments*:
149
223
  # * _id_: PubMed ID (required)
150
224
  # *Returns*:: MEDLINE formatted String
151
- def self.pmfetch(id)
225
+ def pmfetch(id)
152
226
  host = "www.ncbi.nlm.nih.gov"
153
227
  path = "/entrez/utils/pmfetch.fcgi?tool=bioruby&mode=text&report=medline&db=PubMed&id="
154
228
 
229
+ ncbi_access_wait
230
+
155
231
  http = Bio::Command.new_http(host)
156
232
  response, = http.get(path + id.to_s)
157
233
  result = response.body
@@ -163,28 +239,24 @@ class PubMed
163
239
  end
164
240
  end
165
241
 
166
- # Retrieve PubMed entry by PMID and returns MEDLINE formatted string using
167
- # entrez efetch. Multiple PubMed IDs can be provided:
168
- # Bio::PubMed.efetch(123)
169
- # Bio::PubMed.efetch(123,456,789)
170
- # Bio::PubMed.efetch([123,456,789])
171
- # ---
172
- # *Arguments*:
173
- # * _ids_: list of PubMed IDs (required)
174
- # *Returns*:: MEDLINE formatted String
175
- def self.efetch(*ids)
176
- return [] if ids.empty?
242
+ def self.esearch(*args)
243
+ self.new.esearch(*args)
244
+ end
245
+
246
+ def self.efetch(*args)
247
+ self.new.efetch(*args)
248
+ end
177
249
 
178
- host = "eutils.ncbi.nlm.nih.gov"
179
- path = "/entrez/eutils/efetch.fcgi?tool=bioruby&db=pubmed&retmode=text&rettype=medline&id="
250
+ def self.search(*args)
251
+ self.new.search(*args)
252
+ end
180
253
 
181
- ids = ids.join(",")
254
+ def self.query(*args)
255
+ self.new.query(*args)
256
+ end
182
257
 
183
- http = Bio::Command.new_http(host)
184
- response, = http.get(path + ids)
185
- result = response.body
186
- result = result.split(/\n\n+/)
187
- return result
258
+ def self.pmfetch(*args)
259
+ self.new.pmfetch(*args)
188
260
  end
189
261
 
190
262
  end # PubMed
@@ -194,18 +266,88 @@ end # Bio
194
266
 
195
267
  if __FILE__ == $0
196
268
 
197
- puts Bio::PubMed.query("10592173")
198
- puts "--- ---"
199
- puts Bio::PubMed.pmfetch("10592173")
200
- puts "--- ---"
201
- Bio::PubMed.search("(genome AND analysis) OR bioinformatics)").each do |x|
269
+ puts "=== instance methods ==="
270
+
271
+ pubmed = Bio::PubMed.new
272
+
273
+ puts "--- Search PubMed by E-Utils ---"
274
+ opts = {"rettype" => "count"}
275
+ puts Time.now
276
+ puts pubmed.esearch("(genome AND analysis) OR bioinformatics", opts)
277
+ puts Time.now
278
+ puts pubmed.esearch("(genome AND analysis) OR bioinformatics", opts)
279
+ puts Time.now
280
+ puts pubmed.esearch("(genome AND analysis) OR bioinformatics", opts)
281
+ puts Time.now
282
+ pubmed.esearch("(genome AND analysis) OR bioinformatics").each do |x|
283
+ puts x
284
+ end
285
+
286
+ puts "--- Retrieve PubMed entry by E-Utils ---"
287
+ puts Time.now
288
+ puts pubmed.efetch(16381885)
289
+ puts Time.now
290
+ puts pubmed.efetch("16381885")
291
+ puts Time.now
292
+ puts pubmed.efetch("16381885")
293
+ puts Time.now
294
+ opts = {"retmode" => "xml"}
295
+ puts pubmed.efetch([10592173, 14693808], opts)
296
+ puts Time.now
297
+ puts pubmed.efetch(["10592173", "14693808"], opts)
298
+
299
+ puts "--- Search PubMed by Entrez CGI ---"
300
+ pubmed.search("(genome AND analysis) OR bioinformatics").each do |x|
202
301
  p x
203
302
  end
204
- puts "--- ---"
205
- Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics)").each do |x|
303
+
304
+ puts "--- Retrieve PubMed entry by Entrez CGI ---"
305
+ puts pubmed.query("16381885")
306
+
307
+
308
+ puts "--- Retrieve PubMed entry by PMfetch ---"
309
+ puts pubmed.pmfetch("16381885")
310
+
311
+
312
+ puts "=== class methods ==="
313
+
314
+
315
+ puts "--- Search PubMed by E-Utils ---"
316
+ opts = {"rettype" => "count"}
317
+ puts Time.now
318
+ puts Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics", opts)
319
+ puts Time.now
320
+ puts Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics", opts)
321
+ puts Time.now
322
+ puts Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics", opts)
323
+ puts Time.now
324
+ Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics").each do |x|
325
+ puts x
326
+ end
327
+
328
+ puts "--- Retrieve PubMed entry by E-Utils ---"
329
+ puts Time.now
330
+ puts Bio::PubMed.efetch(16381885)
331
+ puts Time.now
332
+ puts Bio::PubMed.efetch("16381885")
333
+ puts Time.now
334
+ puts Bio::PubMed.efetch("16381885")
335
+ puts Time.now
336
+ opts = {"retmode" => "xml"}
337
+ puts Bio::PubMed.efetch([10592173, 14693808], opts)
338
+ puts Time.now
339
+ puts Bio::PubMed.efetch(["10592173", "14693808"], opts)
340
+
341
+ puts "--- Search PubMed by Entrez CGI ---"
342
+ Bio::PubMed.search("(genome AND analysis) OR bioinformatics").each do |x|
206
343
  p x
207
344
  end
208
- puts "--- ---"
209
- puts Bio::PubMed.efetch("10592173", "14693808")
345
+
346
+ puts "--- Retrieve PubMed entry by Entrez CGI ---"
347
+ puts Bio::PubMed.query("16381885")
348
+
349
+
350
+ puts "--- Retrieve PubMed entry by PMfetch ---"
351
+ puts Bio::PubMed.pmfetch("16381885")
210
352
 
211
353
  end