rschenk-pubmed_search 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +5 -0
- data/LICENSE +20 -0
- data/README.rdoc +49 -0
- data/Rakefile +48 -0
- data/VERSION +1 -0
- data/lib/pubmed_search.rb +106 -0
- data/spec/pubmed_search_spec.rb +68 -0
- data/spec/responses/bangana_tonkinensis_cranksection.xml +49 -0
- data/spec/responses/biodiversity_informatics.xml +22 -0
- data/spec/responses/e_coli_0.xml +100004 -0
- data/spec/responses/e_coli_1.xml +100004 -0
- data/spec/responses/e_coli_2.xml +75241 -0
- data/spec/responses/mus_musculus.xml +100004 -0
- data/spec/spec_helper.rb +53 -0
- metadata +69 -0
data/.document
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Ryan Schenk
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
= pubmed_search
|
2
|
+
|
3
|
+
PubmedSearch is a PubMed search wrapper. It provides more (obscure) features than BioRuby's.
|
4
|
+
|
5
|
+
If all you need is a list of PubMed IDs, then use BioRuby. If you need more information from the eSearchResponse, then check this out.
|
6
|
+
|
7
|
+
== Features
|
8
|
+
|
9
|
+
In addition to returning the list of PubMed IDs, it will give you more information about your eSearch Response:
|
10
|
+
|
11
|
+
* list of Exploded MeSH terms ({More Info about MeSH Term Explosion}[http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=2651214#id443777])
|
12
|
+
* list of PhraseNotFound terms
|
13
|
+
* the Count field
|
14
|
+
|
15
|
+
Additionally, PubmedSearch can automatically fetch more results if the list of PubMed IDs is shorter than the Count. For instance, PubMed automatically caps your retmax at 100,000. However, a search for "Mus musculus" will return over 950,000 results. In this situation, PubmedSearch will automatically perform the 10 eSearches required to load the entire set of PubMed IDs, and return it to you as a single result set. This functionality is _disabled_ by default, but can be turned on via the +load_all_pmids+ option.
|
16
|
+
|
17
|
+
== Synopsis
|
18
|
+
|
19
|
+
=== Without the +load_all_pmids+ functionality (default)
|
20
|
+
|
21
|
+
results = PubmedSearch.search "Mus musculus"
|
22
|
+
|
23
|
+
results.pmids.length
|
24
|
+
#=> 100000
|
25
|
+
|
26
|
+
results.count
|
27
|
+
#=> 951134
|
28
|
+
|
29
|
+
results.exploded_mesh_terms
|
30
|
+
#=> #<Set: {"mice"}>
|
31
|
+
|
32
|
+
=== With the +load_all_pmids+ functionality
|
33
|
+
|
34
|
+
results = PubmedSearch.search "Mus musculus", :load_all_pmids => true
|
35
|
+
|
36
|
+
results.pmids.length
|
37
|
+
#=> 951134
|
38
|
+
|
39
|
+
results.count
|
40
|
+
#=> 951134
|
41
|
+
|
42
|
+
== Requirements
|
43
|
+
|
44
|
+
* libxml-ruby - sudo gem install libxml-ruby
|
45
|
+
* simple_uri_template - sudo gem install rschenk-simple_uri_template
|
46
|
+
|
47
|
+
== Copyright
|
48
|
+
|
49
|
+
Copyright (c) 2009 Ryan Schenk. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

# Gem packaging tasks are provided by Jeweler. If Jeweler cannot be loaded the
# remaining tasks (spec, rcov, rdoc) are still defined; only a hint is printed.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "pubmed_search"
    gem.summary = %Q{A PubMed searching library with more features than BioRuby}
    gem.email = "rschenk@gmail.com"
    gem.homepage = "http://github.com/rschenk/pubmed_search"
    gem.authors = ["Ryan Schenk"]
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end

rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# `rake spec` runs every *_spec.rb file under spec/.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# `rake rcov` runs the same specs with rcov enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end


task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Work out the version shown in the rdoc title. The project keeps its
  # version in a plain-text VERSION file, so that is checked first; a
  # VERSION.yml (major/minor/patch keys) is still honoured as a fallback.
  version =
    if File.exist?('VERSION')
      File.read('VERSION').strip
    elsif File.exist?('VERSION.yml')
      require 'yaml' # only needed for the YAML fallback
      config = YAML.load(File.read('VERSION.yml'))
      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
    else
      ""
    end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "pubmed_search #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
48
|
+
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,106 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'xml' # sudo gem install libxml-ruby
|
6
|
+
require 'simple_uri_template' # sudo gem install rschenk-simple_uri_template
|
7
|
+
|
8
|
+
|
9
|
+
# Models the response of a PubMed eSearch request: the matching PubMed IDs,
# the total Count reported by PubMed, the MeSH terms PubMed exploded the query
# into, and any PhraseNotFound terms.
#
# Use PubmedSearch.search to run a query; instances are plain result holders.
class PubmedSearch
  # Writers only: the matching readers are defined explicitly further down so
  # that they can lazily default to an empty collection.
  attr_writer :pmids, :exploded_mesh_terms, :phrases_not_found

  # The Count field returned by your search. If pmids < count, then you need to look at your retmax or try load_all_pmids
  attr_accessor :count

  WAIT_TIME = 1 # seconds
  DEFAULT_OPTIONS = {:retmax => 100000,
                     :retstart => 0,
                     :load_all_pmids => false }

  # URI template for the eSearch endpoint; {retmax}, {retstart} and {term}
  # are filled in per request.
  ESEARCH_URL_TEMPLATE = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmax={retmax}&retstart={retstart}&term={term}'

  class << self
    # Performs a search to PubMed via eUtils with the given term +String+, and returns a +PubmedSearch+ object modeling the response.
    #
    # Accepts a +Hash+ of options. Valid options are
    # * :retmax - Defaults to 100,000 which is the largest retmax that PubMed will honor.
    # * :retstart - Defaults to 0. Set higher if you need to page through results. You shouldn't need to do that manually, because of the +load_all_pmids+ option
    # * :load_all_pmids - Defaults to +false+. If this is set +true+, then search will continue sending eSearches with an increasing retstart until every result up to Count has been requested. For instance, an eSearch for "Mus musculus" will return ~951134 results, but the highest retmax allowable is 100000. With +load_all_pmids+ set +true+, search will automatically perform 10 eSearches and return the entire list of pmids in one go.
    #
    # Raises +ArgumentError+ if +load_all_pmids+ is requested with a :retmax
    # that is not positive, because paging could never advance.
    def search(term, options={})
      options = DEFAULT_OPTIONS.merge(options)
      page_size = options[:retmax]

      if options[:load_all_pmids] && !(page_size > 0)
        raise ArgumentError, ":retmax must be positive when :load_all_pmids is set"
      end

      results = do_search(new, term, options)

      if options[:load_all_pmids]
        # Request the remaining pages and accumulate them into +results+.
        # Paging continues from where the first request ended (retstart + retmax)
        # and stops as soon as the offset reaches Count, so no empty trailing
        # request is sent when Count is an exact multiple of retmax.
        retstart = options[:retstart] + page_size
        while retstart < results.count
          do_search(results, term, options.merge({:retstart => retstart}))
          retstart += page_size
        end
      end

      results
    end

    # As of May 2009, PubMed requires a 300ms pause between eUtils calls. It used to be 3 seconds.
    # PubmedSearch pauses for 1 second just to be on the safe side.
    def wait
      sleep WAIT_TIME unless @skip_wait
    end

    # Setting this to true will prevent PubmedSearch from pausing before sending requests to PubMed. This is a fantastic way to get yourself banned from eUtils.
    #
    # I only use this for testing.
    def skip_wait=(setting)
      @skip_wait = setting
    end

    private

    # Performs the HTTP request and parses the response.
    #
    # Overwrites +results.count+ with the Count of this response and appends
    # the response's ids, exploded MeSH terms and PhraseNotFound terms to
    # +results+. Returns +results+.
    def do_search(results, term, options)
      wait

      esearch_url = uri_template.expand(options.merge({:term => term}))
      # Parse inside the block so the HTTP stream is closed once parsing is done.
      doc = open_url(esearch_url) { |io| XML::Document.io(io) }

      results.count = doc.find('/eSearchResult/Count').first.content.to_i

      doc.find('/eSearchResult/IdList/Id').each {|n| results.pmids << n.content.to_i}

      doc.find('/eSearchResult/TranslationStack/TermSet/Term').each do |n|
        # Only terms of the form "<term>"[MeSH Terms] are exploded MeSH terms.
        if n.content =~ /"(.*)"\[MeSH Terms\]/
          results.exploded_mesh_terms << $1
        end
      end

      doc.find('/eSearchResult/ErrorList/PhraseNotFound').each {|n| results.phrases_not_found << n.content }

      results
    end

    # The eSearch URI template, built on first use.
    def uri_template
      @uri_template ||= SimpleURITemplate.new(ESEARCH_URL_TEMPLATE)
    end

    # Opens +url+ through open-uri and yields the stream to the block,
    # returning the block's value. Uses URI.open where it exists, because
    # the bare Kernel#open no longer fetches URLs on Ruby 3.0+; older Rubies
    # fall back to open-uri's Kernel#open.
    def open_url(url, &block)
      if URI.respond_to?(:open)
        URI.open(url, &block)
      else
        open(url, &block)
      end
    end
  end


  # Get the list of Pubmed IDs returned by this esearch as an +Array+ of +Numbers+
  def pmids
    @pmids ||= []
  end

  # Get the list of MeSH terms that PubMed exploded.
  # For more information on MeSH term explosion, see http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=2651214#id443777
  def exploded_mesh_terms
    @exploded_mesh_terms ||= Set.new
  end

  # Get the list of PhraseNotFound terms returned by your search
  def phrases_not_found
    @phrases_not_found ||= Set.new
  end

end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
require 'benchmark'

# Canned eSearch responses for the terms used below (helper comes from spec_helper).
fake_esearch_response "Mus musculus"
fake_esearch_response "biodiversity informatics"
fake_esearch_response "Bangana tonkinensis cranksection"

describe PubmedSearch do
  before(:each) do
    # Skip the politeness pause by default; the one example that needs it turns it back on.
    PubmedSearch.skip_wait = true
  end

  it "should have attributes count, pmids, exploded_mesh_terms, and phrases_not_found" do
    result = PubmedSearch.new
    result.should respond_to :count
    result.should respond_to :pmids
    result.should respond_to :exploded_mesh_terms
    result.should respond_to :phrases_not_found
  end

  describe "::search" do
    it "should pause before sending a subsequent request to eUtils" do
      PubmedSearch.skip_wait = false
      Benchmark.realtime{ PubmedSearch.search "biodiversity informatics" }.should > PubmedSearch::WAIT_TIME
    end

    it "should build a PubmedSearch object for the results" do
      result = PubmedSearch.search("biodiversity informatics")
      result.should be_an_instance_of PubmedSearch

      result.count.should == result.pmids.length

      result.pmids.should include 19129210, 18784790, 18483570

      result.exploded_mesh_terms.should only_include 'biodiversity', 'informatics'

      result.phrases_not_found.should be_empty
    end

    it "should allow the user to specify retmax" do
      # This example talks to the real eUtils service. The ensure block makes
      # sure network access is switched off again even when the expectation
      # fails, so later examples cannot silently hit the network.
      FakeWeb.allow_net_connect = true
      begin
        result = PubmedSearch.search "Mr T", :retmax => 5
        result.pmids.length.should == 5
      ensure
        FakeWeb.allow_net_connect = false
      end
    end

    it "should allow multiple requests to NLM if Count > Retmax if desired" do
      FakeWeb.register_uri("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmax=100000&retstart=0&term=e%20coli", :file => File.dirname(__FILE__) + '/responses/e_coli_0.xml')
      FakeWeb.register_uri("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmax=100000&retstart=100000&term=e%20coli", :file => File.dirname(__FILE__) + '/responses/e_coli_1.xml')
      FakeWeb.register_uri("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmax=100000&retstart=200000&term=e%20coli", :file => File.dirname(__FILE__) + '/responses/e_coli_2.xml')

      result = PubmedSearch.search("e coli", :load_all_pmids => true)

      result.pmids.length.should == result.count

      result.pmids.should include 19464251, 9737856, 6319486 # One PMID from each of the three e_coli_n.xml files

      result.exploded_mesh_terms.should only_include 'escherichia coli'
    end

    it "should record any PhraseNotFound elements" do
      result = PubmedSearch.search "Bangana tonkinensis cranksection"

      result.phrases_not_found.should only_include 'Bangana', 'cranksection'
    end
  end

end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
<?xml version="1.0" ?>
|
2
|
+
<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD eSearchResult, 11 May 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSearch_020511.dtd">
|
3
|
+
<eSearchResult><Count>45</Count><RetMax>45</RetMax><RetStart>0</RetStart><IdList>
|
4
|
+
<Id>19445149</Id>
|
5
|
+
<Id>19260293</Id>
|
6
|
+
<Id>19209761</Id>
|
7
|
+
<Id>19196409</Id>
|
8
|
+
<Id>19170963</Id>
|
9
|
+
<Id>18615626</Id>
|
10
|
+
<Id>18161942</Id>
|
11
|
+
<Id>17450758</Id>
|
12
|
+
<Id>17427256</Id>
|
13
|
+
<Id>17365189</Id>
|
14
|
+
<Id>16904915</Id>
|
15
|
+
<Id>16810764</Id>
|
16
|
+
<Id>16783704</Id>
|
17
|
+
<Id>16753795</Id>
|
18
|
+
<Id>16724846</Id>
|
19
|
+
<Id>16355802</Id>
|
20
|
+
<Id>16308206</Id>
|
21
|
+
<Id>16289147</Id>
|
22
|
+
<Id>16273300</Id>
|
23
|
+
<Id>16042151</Id>
|
24
|
+
<Id>16012772</Id>
|
25
|
+
<Id>15934237</Id>
|
26
|
+
<Id>15787740</Id>
|
27
|
+
<Id>15744101</Id>
|
28
|
+
<Id>15658812</Id>
|
29
|
+
<Id>15461276</Id>
|
30
|
+
<Id>15351108</Id>
|
31
|
+
<Id>15256715</Id>
|
32
|
+
<Id>15224417</Id>
|
33
|
+
<Id>15125577</Id>
|
34
|
+
<Id>15015265</Id>
|
35
|
+
<Id>14510600</Id>
|
36
|
+
<Id>12872400</Id>
|
37
|
+
<Id>12785735</Id>
|
38
|
+
<Id>12774385</Id>
|
39
|
+
<Id>12736462</Id>
|
40
|
+
<Id>12151076</Id>
|
41
|
+
<Id>10536849</Id>
|
42
|
+
<Id>9000883</Id>
|
43
|
+
<Id>7765437</Id>
|
44
|
+
<Id>8427879</Id>
|
45
|
+
<Id>1479025</Id>
|
46
|
+
<Id>1442048</Id>
|
47
|
+
<Id>1758366</Id>
|
48
|
+
<Id>2512948</Id>
|
49
|
+
</IdList><TranslationSet/><TranslationStack> <TermSet> <Term>tonkinensis[All Fields]</Term> <Field>All Fields</Field> <Count>46</Count> <Explode>Y</Explode> </TermSet> <OP>GROUP</OP> </TranslationStack><QueryTranslation>tonkinensis[All Fields]</QueryTranslation><ErrorList><PhraseNotFound>Bangana</PhraseNotFound><PhraseNotFound>cranksection</PhraseNotFound></ErrorList></eSearchResult>
|
@@ -0,0 +1,22 @@
|
|
1
|
+
<?xml version="1.0" ?>
|
2
|
+
<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD eSearchResult, 11 May 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSearch_020511.dtd">
|
3
|
+
<eSearchResult><Count>18</Count><RetMax>18</RetMax><RetStart>0</RetStart><IdList>
|
4
|
+
<Id>19129210</Id>
|
5
|
+
<Id>18784790</Id>
|
6
|
+
<Id>18483570</Id>
|
7
|
+
<Id>18445641</Id>
|
8
|
+
<Id>18335319</Id>
|
9
|
+
<Id>17704120</Id>
|
10
|
+
<Id>17597923</Id>
|
11
|
+
<Id>17594421</Id>
|
12
|
+
<Id>16956323</Id>
|
13
|
+
<Id>16701313</Id>
|
14
|
+
<Id>16680511</Id>
|
15
|
+
<Id>19455221</Id>
|
16
|
+
<Id>19455206</Id>
|
17
|
+
<Id>15253354</Id>
|
18
|
+
<Id>15192219</Id>
|
19
|
+
<Id>15063059</Id>
|
20
|
+
<Id>12376687</Id>
|
21
|
+
<Id>11009408</Id>
|
22
|
+
</IdList><TranslationSet><Translation> <From>biodiversity</From> <To>"biodiversity"[MeSH Terms] OR "biodiversity"[All Fields]</To> </Translation><Translation> <From>informatics</From> <To>"informatics"[MeSH Terms] OR "informatics"[All Fields]</To> </Translation></TranslationSet><TranslationStack> <TermSet> <Term>"biodiversity"[MeSH Terms]</Term> <Field>MeSH Terms</Field> <Count>5512</Count> <Explode>Y</Explode> </TermSet> <TermSet> <Term>"biodiversity"[All Fields]</Term> <Field>All Fields</Field> <Count>9741</Count> <Explode>Y</Explode> </TermSet> <OP>OR</OP> <OP>GROUP</OP> <TermSet> <Term>"informatics"[MeSH Terms]</Term> <Field>MeSH Terms</Field> <Count>6441</Count> <Explode>Y</Explode> </TermSet> <TermSet> <Term>"informatics"[All Fields]</Term> <Field>All Fields</Field> <Count>21168</Count> <Explode>Y</Explode> </TermSet> <OP>OR</OP> <OP>GROUP</OP> <OP>AND</OP> <OP>GROUP</OP> </TranslationStack><QueryTranslation>("biodiversity"[MeSH Terms] OR "biodiversity"[All Fields]) AND ("informatics"[MeSH Terms] OR "informatics"[All Fields])</QueryTranslation></eSearchResult>
|