bio 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,7 @@
5
5
  #
6
6
  # License:: The Ruby License
7
7
  #
8
- # $Id: flatfile.rb,v 1.60 2007/07/09 14:08:34 ngoto Exp $
8
+ # $Id: flatfile.rb,v 1.61 2007/11/15 07:07:16 k Exp $
9
9
  #
10
10
  #
11
11
  # Bio::FlatFile is a helper and wrapper class to read a biological data file.
@@ -1130,7 +1130,7 @@ module Bio
1130
1130
  genpept = RuleRegexp[ 'Bio::GenPept',
1131
1131
  /^LOCUS .+ aa .+/ ],
1132
1132
  medline = RuleRegexp[ 'Bio::MEDLINE',
1133
- /^UI \- [0-9]+$/ ],
1133
+ /^PMID\- [0-9]+$/ ],
1134
1134
  embl = RuleRegexp[ 'Bio::EMBL',
1135
1135
  /^ID .+\; .*(DNA|RNA|XXX)\;/ ],
1136
1136
  sptr = RuleRegexp2[ 'Bio::SPTR',
@@ -4,7 +4,7 @@
4
4
  # Copyright:: Copyright (C) 2002 GOTO Naohisa <ng@bioruby.org>
5
5
  # License:: The Ruby License
6
6
  #
7
- # $Id: indexer.rb,v 1.25 2007/04/05 23:35:41 trevor Exp $
7
+ # $Id: indexer.rb,v 1.26 2007/12/11 15:13:32 ngoto Exp $
8
8
  #
9
9
 
10
10
  require 'bio/io/flatfile/index'
@@ -714,7 +714,7 @@ module Bio
714
714
 
715
715
  ##############################################################
716
716
  def self.formatstring2class(format_string)
717
- case format
717
+ case format_string
718
718
  when /genbank/i
719
719
  dbclass = Bio::GenBank
720
720
  when /genpept/i
@@ -4,7 +4,7 @@
4
4
  # Copyright:: Copyright (C) 2003, 2004 Toshiaki Katayama <k@bioruby.org>
5
5
  # License:: The Ruby License
6
6
  #
7
- # $Id: keggapi.rb,v 1.14 2007/04/05 23:35:41 trevor Exp $
7
+ # $Id: keggapi.rb,v 1.15 2007/07/20 21:56:45 k Exp $
8
8
  #
9
9
 
10
10
  require 'bio/io/soapwsdl'
@@ -331,6 +331,7 @@ class API < Bio::SOAPWSDL
331
331
  def add_filter(results)
332
332
  if results.is_a?(Array)
333
333
  results.each do |result|
334
+ next if result.is_a?(Fixnum)
334
335
  def result.filter(fields)
335
336
  fields.collect { |field| self.send(field) }
336
337
  end
@@ -1,16 +1,15 @@
1
1
  #
2
2
  # = bio/io/pubmed.rb - NCBI Entrez/PubMed client module
3
3
  #
4
- # Copyright:: Copyright (C) 2001 Toshiaki Katayama <k@bioruby.org>
4
+ # Copyright:: Copyright (C) 2001, 2007 Toshiaki Katayama <k@bioruby.org>
5
5
  # Copyright:: Copyright (C) 2006 Jan Aerts <jan.aerts@bbsrc.ac.uk>
6
6
  # License:: The Ruby License
7
7
  #
8
- # $Id: pubmed.rb,v 1.16 2007/04/05 23:35:41 trevor Exp $
8
+ # $Id: pubmed.rb,v 1.23 2007/12/12 13:53:26 k Exp $
9
9
  #
10
10
 
11
- require 'net/http'
12
- require 'cgi' unless defined?(CGI)
13
11
  require 'bio/command'
12
+ require 'cgi' unless defined?(CGI)
14
13
 
15
14
  module Bio
16
15
 
@@ -18,18 +17,19 @@ module Bio
18
17
  #
19
18
  # The Bio::PubMed class provides several ways to retrieve bibliographic
20
19
  # information from the PubMed database at
21
- # http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=PubMed. Basically, two
22
- # types of queries are possible:
20
+ # http://www.ncbi.nlm.nih.gov/sites/entrez?db=PubMed
21
+ #
22
+ # Basically, two types of queries are possible:
23
23
  #
24
24
  # * searching for PubMed IDs given a query string:
25
- # * Bio::PubMed#search
26
- # * Bio::PubMed#esearch
25
+ # * Bio::PubMed#esearch (recommended)
26
+ # * Bio::PubMed#search (only retrieves top 20 hits)
27
27
  #
28
28
  # * retrieving the MEDLINE text (i.e. authors, journal, abstract, ...)
29
29
  # given a PubMed ID
30
- # * Bio::PubMed#query
31
- # * Bio::PubMed#pmfetch
32
- # * Bio::PubMed#efetch
30
+ # * Bio::PubMed#efetch (recommended)
31
+ # * Bio::PubMed#query (unstable: breaks when NCBI changes the HTML design)
32
+ # * Bio::PubMed#pmfetch (still working but could be obsoleted by NCBI)
33
33
  #
34
34
  # The different methods within the same group are interchangeable and should
35
35
  # return the same result.
@@ -37,54 +37,61 @@ module Bio
37
37
  # Additional information about the MEDLINE format and PubMed programmable
38
38
  # APIs can be found on the following websites:
39
39
  #
40
- # * Overview: http://www.ncbi.nlm.nih.gov/entrez/query/static/overview.html
41
- # * How to link: http://www.ncbi.nlm.nih.gov/entrez/query/static/linking.html
42
- # * MEDLINE format: http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html#MEDLINEDisplayFormat
43
- # * Search field descriptions and tags: http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html#SearchFieldDescriptionsandTags
44
- # * Entrez utilities index: http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html
45
- # * PmFetch CGI help: http://www.ncbi.nlm.nih.gov/entrez/utils/pmfetch_help.html
46
- # * E-Utilities CGI help: http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
40
+ # * PubMed Overview:
41
+ # http://www.ncbi.nlm.nih.gov/entrez/query/static/overview.html
42
+ # * PubMed help:
43
+ # http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html
44
+ # * Entrez utilities index:
45
+ # http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html
46
+ # * How to link:
47
+ # http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helplinks.chapter.linkshelp
47
48
  #
48
49
  # == Usage
49
50
  #
50
51
  # require 'bio'
51
52
  #
52
53
  # # If you don't know the pubmed ID:
53
- # Bio::PubMed.search("(genome AND analysis) OR bioinformatics)").each do |x|
54
+ # Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics").each do |x|
54
55
  # p x
55
56
  # end
56
- # Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics)").each do |x|
57
+ #
58
+ # Bio::PubMed.search("(genome AND analysis) OR bioinformatics").each do |x|
57
59
  # p x
58
60
  # end
59
61
  #
60
62
  # # To retrieve the MEDLINE entry for a given PubMed ID:
63
+ # puts Bio::PubMed.efetch("10592173", "14693808")
61
64
  # puts Bio::PubMed.query("10592173")
62
65
  # puts Bio::PubMed.pmfetch("10592173")
63
- # puts Bio::PubMed.efetch("10592173", "14693808")
66
+ #
64
67
  # # This can be converted into a Bio::MEDLINE object:
65
68
  # manuscript = Bio::PubMed.query("10592173")
66
- # medline = Bio::MEDLINE(manuscript)
69
+ # medline = Bio::MEDLINE.new(manuscript)
67
70
  #
68
71
  class PubMed
69
72
 
70
- # Search the PubMed database by given keywords using entrez query and returns
71
- # an array of PubMed IDs.
72
- # ---
73
- # *Arguments*:
74
- # * _id_: query string (required)
75
- # *Returns*:: array of PubMed IDs
76
- def self.search(str)
77
- host = "www.ncbi.nlm.nih.gov"
78
- path = "/entrez/query.fcgi?tool=bioruby&cmd=Search&doptcmdl=MEDLINE&db=PubMed&term="
73
+ # Run retrieval scripts on weekends or between 9 pm and 5 am Eastern Time
74
+ # weekdays for any series of more than 100 requests.
75
+ # -> Not implemented yet in BioRuby
79
76
 
80
- http = Bio::Command.new_http(host)
81
- response, = http.get(path + CGI.escape(str))
82
- result = response.body
83
- result = result.gsub("\r", "\n").squeeze("\n")
84
- result = result.scan(/<pre>(.*?)<\/pre>/m).flatten
85
- return result
77
+ # Make no more than one request every 3 seconds.
78
+ NCBI_INTERVAL = 3
79
+ @@last_access = nil
80
+
81
+ private
82
+
83
# Throttles requests to NCBI: when a previous access time is recorded in
# @@last_access and fewer than +wait+ seconds have passed since then,
# sleeps for the remainder. Always records the current time as the new
# last access before returning.
# ---
# *Arguments*:
# * _wait_: minimum number of seconds between two accesses
#   (default NCBI_INTERVAL)
# *Returns*:: the Time stored as the new last access
def ncbi_access_wait(wait = NCBI_INTERVAL)
  previous = @@last_access
  if previous
    elapsed = Time.now - previous
    # Only the part of the interval that has not yet elapsed is slept.
    sleep(wait - elapsed) if elapsed < wait
  end
  @@last_access = Time.now
end
87
92
 
93
+ public
94
+
88
95
  # Search the PubMed database by given keywords using E-Utils and returns
89
96
  # an array of PubMed IDs.
90
97
  #
@@ -102,22 +109,80 @@ class PubMed
102
109
  # * _retmax_ (default 100)
103
110
  # * _retmode_
104
111
  # * _rettype_
105
- # *Returns*:: array of PubMed IDs
106
- def self.esearch(str, hash = {})
107
- hash['retmax'] = 100 unless hash['retmax']
112
+ # *Returns*:: array of PubMed IDs or a number of results
113
def esearch(str, hash = {})
  # A nil query is treated like an empty one (efetch already returns nil
  # for nil input); previously nil raised NoMethodError on #empty?.
  return nil if str.nil? || str.empty?

  serv = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
  # Defaults; any key given in +hash+ overrides the value set here.
  opts = {
    "retmax" => 100,
    "tool"   => "bioruby",
    "db"     => "pubmed",
    "term"   => str
  }
  opts.update(hash)

  # Honor the minimum interval between two NCBI requests.
  ncbi_access_wait

  response, = Bio::Command.post_form(serv, opts)
  body = response.body

  if opts['rettype'] == 'count'
    # Only the number of hits was requested: first <Count> element as Integer
    # (0 when the element is missing, because nil.to_i is 0).
    body.scan(/<Count>(.*?)<\/Count>/m).flatten.first.to_i
  else
    # Contents of every <Id> element, as an Array of Strings.
    body.scan(/<Id>(.*?)<\/Id>/m).flatten
  end
end
136
+
137
# Retrieves PubMed entries by PMID through the NCBI E-Utils efetch service.
# A single ID or an Array of IDs can be provided:
#   Bio::PubMed.efetch(123)
#   Bio::PubMed.efetch([123,456,789])
# ---
# *Arguments*:
# * _ids_: a PubMed ID, or an Array of PubMed IDs (required)
# * _hash_: request parameters merged over the defaults
#   ("tool", "db", "retmode", "rettype", "id") (optional)
# *Returns*:: Array of MEDLINE formatted String when "retmode" is "text"
#             (the default); the unsplit response body String for any
#             other "retmode"; nil when no ID is given
def efetch(ids, hash = {})
  # The class test must be ids.is_a?(Array) (or Array === ids);
  # "ids === Array" compares the value with the class object and is never
  # true, so Array input was not joined. Joining first also lets an empty
  # Array be caught by the emptiness check below.
  ids = ids.join(",") if ids.is_a?(Array)
  return nil if ids.to_s.empty?

  serv = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
  # Defaults; any key given in +hash+ overrides the value set here.
  opts = {
    "tool"    => "bioruby",
    "db"      => "pubmed",
    "retmode" => "text",
    "rettype" => "medline",
    "id"      => ids,
  }
  opts.update(hash)

  # Honor the minimum interval between two NCBI requests.
  ncbi_access_wait

  response, = Bio::Command.post_form(serv, opts)
  result = response.body
  # In text mode the entries are separated by blank lines.
  result = result.split(/\n\n+/) if opts["retmode"] == "text"
  result
end
168
+
169
# Searches the PubMed database for the given keywords through the Entrez
# web query page and returns the PubMed IDs found in the returned HTML.
# Caution: this method returns the first 20 hits only; use of the
# 'esearch' method instead is strongly recommended.
# ---
# *Arguments*:
# * _str_: query string (required)
# *Returns*:: array of PubMed IDs
def search(str)
  entrez_host = "www.ncbi.nlm.nih.gov"
  query_path = "/sites/entrez?tool=bioruby&cmd=Search&doptcmdl=Brief&db=PubMed&term="

  # Honor the minimum interval between two NCBI requests.
  ncbi_access_wait

  response, = Bio::Command.new_http(entrez_host).get(query_path + CGI.escape(str))
  # The IDs are read from the value attribute of the "UidCheckBox" inputs.
  response.body.scan(/value="(\d+)" id="UidCheckBox"/m).flatten
end
123
188
 
@@ -127,18 +192,27 @@ class PubMed
127
192
  # *Arguments*:
128
193
  # * _id_: PubMed ID (required)
129
194
  # *Returns*:: MEDLINE formatted String
130
- def self.query(id)
195
def query(*ids)
  host = "www.ncbi.nlm.nih.gov"
  path = "/sites/entrez?tool=bioruby&cmd=Text&dopt=MEDLINE&db=PubMed&uid="
  list = ids.join(",")

  # Honor the minimum interval between two NCBI requests.
  ncbi_access_wait

  http = Bio::Command.new_http(host)
  response, = http.get(path + list)
  # One String per <pre>...</pre> section of the returned page.
  entries = response.body.scan(/<pre>\s*(.*?)<\/pre>/m).flatten

  # The error check has to look at each extracted String: matching the
  # Array itself against a Regexp never succeeds (and raises NoMethodError
  # on Ruby >= 3.2), and raising an Array is a TypeError.
  # Expected error text: "id: xxxxx Error occurred: Article does not exist"
  # NOTE(review): assumes NCBI places that text inside a <pre> section -
  # confirm against a live response.
  failure = entries.find { |entry| entry =~ /id:.*Error occurred/ }
  raise failure if failure

  # Several IDs give an Array of entries; a single ID gives one String
  # (nil when the page held no <pre> section).
  ids.size > 1 ? entries : entries.first
end
144
218
 
@@ -148,10 +222,12 @@ class PubMed
148
222
  # *Arguments*:
149
223
  # * _id_: PubMed ID (required)
150
224
  # *Returns*:: MEDLINE formatted String
151
- def self.pmfetch(id)
225
+ def pmfetch(id)
152
226
  host = "www.ncbi.nlm.nih.gov"
153
227
  path = "/entrez/utils/pmfetch.fcgi?tool=bioruby&mode=text&report=medline&db=PubMed&id="
154
228
 
229
+ ncbi_access_wait
230
+
155
231
  http = Bio::Command.new_http(host)
156
232
  response, = http.get(path + id.to_s)
157
233
  result = response.body
@@ -163,28 +239,24 @@ class PubMed
163
239
  end
164
240
  end
165
241
 
166
- # Retrieve PubMed entry by PMID and returns MEDLINE formatted string using
167
- # entrez efetch. Multiple PubMed IDs can be provided:
168
- # Bio::PubMed.efetch(123)
169
- # Bio::PubMed.efetch(123,456,789)
170
- # Bio::PubMed.efetch([123,456,789])
171
- # ---
172
- # *Arguments*:
173
- # * _ids_: list of PubMed IDs (required)
174
- # *Returns*:: MEDLINE formatted String
175
- def self.efetch(*ids)
176
- return [] if ids.empty?
242
# Class-level entry points. Each one creates a new instance and forwards
# all of its arguments to the instance method of the same name.
class << self
  # Forwards to #esearch on a new instance.
  def esearch(*args)
    new.esearch(*args)
  end

  # Forwards to #efetch on a new instance.
  def efetch(*args)
    new.efetch(*args)
  end

  # Forwards to #search on a new instance.
  def search(*args)
    new.search(*args)
  end

  # Forwards to #query on a new instance.
  def query(*args)
    new.query(*args)
  end

  # Forwards to #pmfetch on a new instance.
  def pmfetch(*args)
    new.pmfetch(*args)
  end
end
189
261
 
190
262
  end # PubMed
@@ -194,18 +266,88 @@ end # Bio
194
266
 
195
267
  if __FILE__ == $0
196
268
 
197
- puts Bio::PubMed.query("10592173")
198
- puts "--- ---"
199
- puts Bio::PubMed.pmfetch("10592173")
200
- puts "--- ---"
201
- Bio::PubMed.search("(genome AND analysis) OR bioinformatics)").each do |x|
269
+ puts "=== instance methods ==="
270
+
271
+ pubmed = Bio::PubMed.new
272
+
273
+ puts "--- Search PubMed by E-Utils ---"
274
+ opts = {"rettype" => "count"}
275
+ puts Time.now
276
+ puts pubmed.esearch("(genome AND analysis) OR bioinformatics", opts)
277
+ puts Time.now
278
+ puts pubmed.esearch("(genome AND analysis) OR bioinformatics", opts)
279
+ puts Time.now
280
+ puts pubmed.esearch("(genome AND analysis) OR bioinformatics", opts)
281
+ puts Time.now
282
+ pubmed.esearch("(genome AND analysis) OR bioinformatics").each do |x|
283
+ puts x
284
+ end
285
+
286
+ puts "--- Retrieve PubMed entry by E-Utils ---"
287
+ puts Time.now
288
+ puts pubmed.efetch(16381885)
289
+ puts Time.now
290
+ puts pubmed.efetch("16381885")
291
+ puts Time.now
292
+ puts pubmed.efetch("16381885")
293
+ puts Time.now
294
+ opts = {"retmode" => "xml"}
295
+ puts pubmed.efetch([10592173, 14693808], opts)
296
+ puts Time.now
297
+ puts pubmed.efetch(["10592173", "14693808"], opts)
298
+
299
+ puts "--- Search PubMed by Entrez CGI ---"
300
+ pubmed.search("(genome AND analysis) OR bioinformatics").each do |x|
202
301
  p x
203
302
  end
204
- puts "--- ---"
205
- Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics)").each do |x|
303
+
304
+ puts "--- Retrieve PubMed entry by Entrez CGI ---"
305
+ puts pubmed.query("16381885")
306
+
307
+
308
+ puts "--- Retrieve PubMed entry by PMfetch ---"
309
+ puts pubmed.pmfetch("16381885")
310
+
311
+
312
+ puts "=== class methods ==="
313
+
314
+
315
+ puts "--- Search PubMed by E-Utils ---"
316
+ opts = {"rettype" => "count"}
317
+ puts Time.now
318
+ puts Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics", opts)
319
+ puts Time.now
320
+ puts Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics", opts)
321
+ puts Time.now
322
+ puts Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics", opts)
323
+ puts Time.now
324
+ Bio::PubMed.esearch("(genome AND analysis) OR bioinformatics").each do |x|
325
+ puts x
326
+ end
327
+
328
+ puts "--- Retrieve PubMed entry by E-Utils ---"
329
+ puts Time.now
330
+ puts Bio::PubMed.efetch(16381885)
331
+ puts Time.now
332
+ puts Bio::PubMed.efetch("16381885")
333
+ puts Time.now
334
+ puts Bio::PubMed.efetch("16381885")
335
+ puts Time.now
336
+ opts = {"retmode" => "xml"}
337
+ puts Bio::PubMed.efetch([10592173, 14693808], opts)
338
+ puts Time.now
339
+ puts Bio::PubMed.efetch(["10592173", "14693808"], opts)
340
+
341
+ puts "--- Search PubMed by Entrez CGI ---"
342
+ Bio::PubMed.search("(genome AND analysis) OR bioinformatics").each do |x|
206
343
  p x
207
344
  end
208
- puts "--- ---"
209
- puts Bio::PubMed.efetch("10592173", "14693808")
345
+
346
+ puts "--- Retrieve PubMed entry by Entrez CGI ---"
347
+ puts Bio::PubMed.query("16381885")
348
+
349
+
350
+ puts "--- Retrieve PubMed entry by PMfetch ---"
351
+ puts Bio::PubMed.pmfetch("16381885")
210
352
 
211
353
  end