pubmed_search 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Ryan Schenk
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,49 @@
1
+ = pubmed_search
2
+
3
+ PubmedSearch is a PubMed search wrapper. It provides more (obscure) features than BioRuby's.
4
+
5
+ If all you need is a list of PubMed IDs, then use BioRuby. If you need more information from the eSearchResponse, then check this out.
6
+
7
+ == Features
8
+
9
+ In addition to returning the list of PubMed IDs, it will give you more information about your eSearch Response:
10
+
11
+ * list of Exploded MeSH terms ({More Info about MeSH Term Explosion}[http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=2651214#id443777])
12
+ * list of PhraseNotFound terms
13
+ * the Count field
14
+
15
+ Additionally, PubmedSearch can automatically fetch more results if the list of PubMed IDs is shorter than the Count. For instance, PubMed automatically caps retmax at 100,000. However, a search for "Mus musculus" will return over 950,000 results. In this situation, PubmedSearch will automatically perform the 10 eSearches required to load the entire set of PubMed IDs, and return it to you as a single result set. This functionality is _disabled_ by default, but can be turned on via the +load_all_pmids+ option.
16
+
17
+ == Synopsis
18
+
19
+ === Without the +load_all_pmids+ functionality (default)
20
+
21
+ results = PubmedSearch.search "Mus musculus"
22
+
23
+ results.pmids.length
24
+ #=> 100000
25
+
26
+ results.count
27
+ #=> 951134
28
+
29
+ results.exploded_mesh_terms
30
+ #=> #<Set: {"mice"}>
31
+
32
+ === With the +load_all_pmids+ functionality
33
+
34
+ results = PubmedSearch.search "Mus musculus", :load_all_pmids => true
35
+
36
+ results.pmids.length
37
+ #=> 951134
38
+
39
+ results.count
40
+ #=> 951134
41
+
42
+ == Requirements
43
+
44
+ * nokogiri - sudo gem install nokogiri
45
+ * simple_uri_template - sudo gem install rschenk-simple_uri_template
46
+
47
+ == Copyright
48
+
49
+ Copyright (c) 2009 Ryan Schenk. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,50 @@
1
# Build, test and documentation tasks for the pubmed_search gem.
require 'rubygems'
require 'rake'
require 'yaml' # used only by the legacy VERSION.yml fallback in the rdoc task

# Gem packaging tasks are provided by Jeweler. If Jeweler cannot be loaded we
# print a hint and carry on so the rest of this Rakefile is still evaluated.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "pubmed_search"
    gem.summary = %Q{A PubMed searching library with more features than BioRuby}
    gem.email = "rschenk@gmail.com"
    gem.homepage = "http://github.com/rschenk/pubmed_search"
    gem.authors = ["Ryan Schenk"]
    gem.add_dependency 'rschenk-simple_uri_template'
    gem.add_dependency 'nokogiri'
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end

rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# `rake spec` runs every *_spec.rb file under spec/.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# `rake rcov` runs the same specs with rcov coverage enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end


task :default => :spec

# `rake rdoc` generates API documentation into rdoc/.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # The project ships a plain-text VERSION file; the YAML form is kept only as
  # a fallback for checkouts that still carry an old-style VERSION.yml.
  version =
    if File.exist?('VERSION')
      File.read('VERSION').strip
    elsif File.exist?('VERSION.yml')
      config = YAML.load(File.read('VERSION.yml'))
      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
    else
      ""
    end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "pubmed_search #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
50
+
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.0
@@ -0,0 +1,106 @@
1
+ require 'set'
2
+ require 'open-uri'
3
+
4
+ require 'rubygems'
5
+ require 'nokogiri'
6
+ require 'simple_uri_template' # sudo gem install rschenk-simple_uri_template
7
+
8
+
9
# Wrapper around PubMed's eUtils eSearch service.
#
# PubmedSearch.search sends the request(s) and returns a PubmedSearch instance
# holding the parsed response: the PubMed IDs, the Count field, the exploded
# MeSH terms and any PhraseNotFound terms.
class PubmedSearch
  # Seconds to pause before every eUtils request (see PubmedSearch.wait).
  WAIT_TIME = 1

  # Options applied by PubmedSearch.search unless the caller overrides them.
  DEFAULT_OPTIONS = {:retmax         => 100000,
                     :retstart       => 0,
                     :load_all_pmids => false }

  # URI template of an eSearch request against the pubmed database.
  #
  # NOTE(review): NCBI is believed to require HTTPS for eUtils nowadays. The
  # scheme is left untouched here because the specs register fake responses
  # for exactly these http:// URLs -- confirm and switch both together.
  ESEARCH_URL_TEMPLATE = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmax={retmax}&retstart={retstart}&term={term}'

  # The Count field returned by your search. If pmids.length < count, then you
  # need to look at your retmax or try the +load_all_pmids+ option.
  attr_accessor :count

  # Only writers are generated here; the matching readers are defined at the
  # bottom of the class because they lazily default to an empty collection.
  attr_writer :pmids, :exploded_mesh_terms, :phrases_not_found

  class << self
    # Performs a search to PubMed via eUtils with the given term +String+, and
    # returns a +PubmedSearch+ object modeling the response.
    #
    # Accepts a +Hash+ of options. Valid options are
    # * :retmax - Defaults to 100,000 which is the largest retmax that PubMed will honor.
    # * :retstart - Defaults to 0. Set higher if you need to page through
    #   results. You shouldn't need to do that manually, because of the
    #   +load_all_pmids+ option.
    # * :load_all_pmids - Defaults to +false+. If +true+, search keeps sending
    #   eSearches with an increasing retstart until every page up to Count has
    #   been requested, and returns all pmids in one result. For instance, an
    #   eSearch for "Mus musculus" returns ~951134 results but the highest
    #   retmax allowable is 100000, so 10 eSearches are performed.
    #
    # Raises a RuntimeError when a response carries no Count element, plus
    # whatever the HTTP layer raises on connection problems.
    def search(term, options={})
      options = DEFAULT_OPTIONS.merge(options)

      results = do_search(new, term, options)

      if options[:load_all_pmids]
        # Fetch the remaining pages, continuing from where the first request
        # (which may itself have started at a non-zero retstart) left off.
        page_offsets(options[:retstart], options[:retmax], results.count).each do |retstart|
          do_search(results, term, options.merge(:retstart => retstart))
        end
      end

      results
    end

    # As of May 2009, PubMed requires a 300ms pause between eUtils calls. It used to be 3 seconds.
    # PubmedSearch pauses for WAIT_TIME (1 second) just to be on the safe side.
    def wait
      sleep WAIT_TIME unless @skip_wait
    end

    # Setting this to true will prevent PubmedSearch from pausing before
    # sending requests to PubMed. This is a fantastic way to get yourself
    # banned from eUtils.
    #
    # I only use this for testing.
    def skip_wait=(setting)
      @skip_wait = setting
    end

    private

    # Returns the retstart values of the pages that follow the first request:
    # retstart + retmax, retstart + 2 * retmax, ... for as long as the value is
    # strictly below +count+. The upper bound is exclusive so that no request
    # is sent for a page that would start past the last result. A non-positive
    # +retmax+ yields no further pages instead of an invalid step.
    def page_offsets(retstart, retmax, count)
      return [] unless retmax > 0

      ((retstart + retmax)...count).step(retmax).to_a
    end

    # Template used to build request URLs. Built on first use and memoized per
    # class, so subclasses calling +search+ get a working template as well.
    def uri_template
      @uri_template ||= SimpleURITemplate.new(ESEARCH_URL_TEMPLATE)
    end

    # Performs one HTTP request, parses the response and merges it into
    # +results+: count is overwritten, pmids are appended, MeSH terms and
    # PhraseNotFound terms are added to their sets. Returns +results+.
    def do_search(results, term, options)
      wait

      esearch_url = uri_template.expand(options.merge(:term => term))

      # URI#open (provided by open-uri) is used instead of Kernel#open, which
      # no longer accepts URLs on current Rubies; the block form also closes
      # the response once it has been parsed.
      doc = URI.parse(esearch_url.to_s).open { |response| Nokogiri::XML(response) }

      count_node = doc.xpath('/eSearchResult/Count').first
      raise "PubMed eSearch response has no Count element: #{esearch_url}" unless count_node
      results.count = count_node.content.to_i

      doc.xpath('/eSearchResult/IdList/Id').each { |node| results.pmids << node.content.to_i }

      # A term PubMed exploded shows up in the translation stack as
      # "term"[MeSH Terms]; keep just the quoted term.
      doc.xpath('/eSearchResult/TranslationStack/TermSet/Term').each do |node|
        if node.content =~ /"(.*)"\[MeSH Terms\]/
          results.exploded_mesh_terms << Regexp.last_match(1)
        end
      end

      doc.xpath('/eSearchResult/ErrorList/PhraseNotFound').each do |node|
        results.phrases_not_found << node.content
      end

      results
    end
  end


  # Get the list of Pubmed IDs returned by this esearch as an +Array+ of +Numbers+
  def pmids
    @pmids ||= []
  end

  # Get the +Set+ of MeSH terms that PubMed exploded.
  # For more information on MeSH term explosion, see http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=2651214#id443777
  def exploded_mesh_terms
    @exploded_mesh_terms ||= Set.new
  end

  # Get the +Set+ of PhraseNotFound terms returned by your search
  def phrases_not_found
    @phrases_not_found ||= Set.new
  end

end
@@ -0,0 +1,62 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for pubmed_search 0.2.0.
#
# Setters that only exist on some RubyGems releases are guarded with
# respond_to? so the file loads on old and new RubyGems alike.
Gem::Specification.new do |s|
  s.name = %q{pubmed_search}
  s.version = "0.2.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Ryan Schenk"]
  s.date = %q{2010-01-30}
  s.email = %q{rschenk@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/pubmed_search.rb",
    "pubmed_search.gemspec",
    "spec/pubmed_search_spec.rb",
    "spec/responses/bangana_tonkinensis_cranksection.xml",
    "spec/responses/biodiversity_informatics.xml",
    "spec/responses/e_coli_0.xml",
    "spec/responses/e_coli_1.xml",
    "spec/responses/e_coli_2.xml",
    "spec/responses/mus_musculus.xml",
    "spec/spec_helper.rb"
  ]
  # has_rdoc= is deprecated in newer RubyGems; only call it where it exists.
  s.has_rdoc = true if s.respond_to? :has_rdoc=
  s.homepage = %q{http://github.com/rschenk/pubmed_search}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.1}
  s.summary = %q{A PubMed searching library with more features than BioRuby}
  s.test_files = [
    "spec/pubmed_search_spec.rb",
    "spec/spec_helper.rb"
  ]

  s.specification_version = 2 if s.respond_to? :specification_version

  # Declare runtime dependencies with add_runtime_dependency where RubyGems
  # offers it and fall back to add_dependency otherwise. Probing the method
  # replaces the former version comparison against Gem::RubyGemsVersion, a
  # constant that current RubyGems no longer defines.
  declare = s.respond_to?(:add_runtime_dependency) ? :add_runtime_dependency : :add_dependency
  %w[rschenk-simple_uri_template nokogiri].each do |dependency|
    s.send(declare, dependency, [">= 0"])
  end
end
@@ -0,0 +1,68 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+ require 'benchmark'
3
+
4
+ fake_esearch_response "Mus musculus"
5
+ fake_esearch_response "biodiversity informatics"
6
+ fake_esearch_response "Bangana tonkinensis cranksection"
7
+
8
# Behaviour of PubmedSearch against canned eSearch responses.
# NOTE(review): fake_esearch_response and the only_include matcher are not
# defined in this file -- presumably they come from spec_helper; verify there.
describe PubmedSearch do
  before(:each) do
    # Skip the pause between eUtils calls so the examples run quickly; the
    # pause itself is covered by a dedicated example below.
    PubmedSearch.skip_wait = true
  end

  it "should have attributes count, pmids, exploded_mesh_terms, and phrases_not_found" do
    result = PubmedSearch.new
    result.should respond_to :count
    result.should respond_to :pmids
    result.should respond_to :exploded_mesh_terms
    result.should respond_to :phrases_not_found
  end

  describe "::search" do
    it "should pause before sending a subsequent request to eUtils" do
      # Re-enable the pause for this example only; before(:each) turns it off
      # again for the examples that follow.
      PubmedSearch.skip_wait = false
      Benchmark.realtime{ PubmedSearch.search "biodiversity informatics" }.should > PubmedSearch::WAIT_TIME
    end

    it "should build a PubmedSearch object for the results" do
      result = PubmedSearch.search("biodiversity informatics")
      result.should be_an_instance_of PubmedSearch

      result.count.should == result.pmids.length

      result.pmids.should include 19129210, 18784790, 18483570

      result.exploded_mesh_terms.should only_include 'biodiversity', 'informatics'

      result.phrases_not_found.should be_empty
    end

    it "should allow the user to specify retmax" do
      # This example lets the request through FakeWeb to the real network.
      # The ensure makes sure network access is switched off again even when
      # an expectation fails, so later examples cannot hit the network.
      FakeWeb.allow_net_connect = true
      begin
        result = PubmedSearch.search "Mr T", :retmax => 5
        result.pmids.length.should == 5
      ensure
        FakeWeb.allow_net_connect = false
      end
    end

    it "should allow multiple requests to NLM if Count > Retmax if desired" do
      # One canned response per page: retstart 0, 100000 and 200000.
      FakeWeb.register_uri(:any, "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmax=100000&retstart=0&term=e%20coli", :body => File.dirname(__FILE__) + '/responses/e_coli_0.xml')
      FakeWeb.register_uri(:any, "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmax=100000&retstart=100000&term=e%20coli", :body => File.dirname(__FILE__) + '/responses/e_coli_1.xml')
      FakeWeb.register_uri(:any, "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmax=100000&retstart=200000&term=e%20coli", :body => File.dirname(__FILE__) + '/responses/e_coli_2.xml')

      result = PubmedSearch.search("e coli", :load_all_pmids => true)

      result.pmids.length.should == result.count

      result.pmids.should include 19464251, 9737856, 6319486 # One PMID from each of the three e_coli_n.xml files

      result.exploded_mesh_terms.should only_include 'escherichia coli'
    end

    it "should record any PhraseNotFound elements" do
      result = PubmedSearch.search "Bangana tonkinensis cranksection"

      result.phrases_not_found.should only_include 'Bangana', 'cranksection'
    end
  end

end
@@ -0,0 +1,49 @@
1
+ <?xml version="1.0" ?>
2
+ <!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD eSearchResult, 11 May 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSearch_020511.dtd">
3
+ <eSearchResult><Count>45</Count><RetMax>45</RetMax><RetStart>0</RetStart><IdList>
4
+ <Id>19445149</Id>
5
+ <Id>19260293</Id>
6
+ <Id>19209761</Id>
7
+ <Id>19196409</Id>
8
+ <Id>19170963</Id>
9
+ <Id>18615626</Id>
10
+ <Id>18161942</Id>
11
+ <Id>17450758</Id>
12
+ <Id>17427256</Id>
13
+ <Id>17365189</Id>
14
+ <Id>16904915</Id>
15
+ <Id>16810764</Id>
16
+ <Id>16783704</Id>
17
+ <Id>16753795</Id>
18
+ <Id>16724846</Id>
19
+ <Id>16355802</Id>
20
+ <Id>16308206</Id>
21
+ <Id>16289147</Id>
22
+ <Id>16273300</Id>
23
+ <Id>16042151</Id>
24
+ <Id>16012772</Id>
25
+ <Id>15934237</Id>
26
+ <Id>15787740</Id>
27
+ <Id>15744101</Id>
28
+ <Id>15658812</Id>
29
+ <Id>15461276</Id>
30
+ <Id>15351108</Id>
31
+ <Id>15256715</Id>
32
+ <Id>15224417</Id>
33
+ <Id>15125577</Id>
34
+ <Id>15015265</Id>
35
+ <Id>14510600</Id>
36
+ <Id>12872400</Id>
37
+ <Id>12785735</Id>
38
+ <Id>12774385</Id>
39
+ <Id>12736462</Id>
40
+ <Id>12151076</Id>
41
+ <Id>10536849</Id>
42
+ <Id>9000883</Id>
43
+ <Id>7765437</Id>
44
+ <Id>8427879</Id>
45
+ <Id>1479025</Id>
46
+ <Id>1442048</Id>
47
+ <Id>1758366</Id>
48
+ <Id>2512948</Id>
49
+ </IdList><TranslationSet/><TranslationStack> <TermSet> <Term>tonkinensis[All Fields]</Term> <Field>All Fields</Field> <Count>46</Count> <Explode>Y</Explode> </TermSet> <OP>GROUP</OP> </TranslationStack><QueryTranslation>tonkinensis[All Fields]</QueryTranslation><ErrorList><PhraseNotFound>Bangana</PhraseNotFound><PhraseNotFound>cranksection</PhraseNotFound></ErrorList></eSearchResult>
@@ -0,0 +1,22 @@
1
+ <?xml version="1.0" ?>
2
+ <!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD eSearchResult, 11 May 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSearch_020511.dtd">
3
+ <eSearchResult><Count>18</Count><RetMax>18</RetMax><RetStart>0</RetStart><IdList>
4
+ <Id>19129210</Id>
5
+ <Id>18784790</Id>
6
+ <Id>18483570</Id>
7
+ <Id>18445641</Id>
8
+ <Id>18335319</Id>
9
+ <Id>17704120</Id>
10
+ <Id>17597923</Id>
11
+ <Id>17594421</Id>
12
+ <Id>16956323</Id>
13
+ <Id>16701313</Id>
14
+ <Id>16680511</Id>
15
+ <Id>19455221</Id>
16
+ <Id>19455206</Id>
17
+ <Id>15253354</Id>
18
+ <Id>15192219</Id>
19
+ <Id>15063059</Id>
20
+ <Id>12376687</Id>
21
+ <Id>11009408</Id>
22
+ </IdList><TranslationSet><Translation> <From>biodiversity</From> <To>"biodiversity"[MeSH Terms] OR "biodiversity"[All Fields]</To> </Translation><Translation> <From>informatics</From> <To>"informatics"[MeSH Terms] OR "informatics"[All Fields]</To> </Translation></TranslationSet><TranslationStack> <TermSet> <Term>"biodiversity"[MeSH Terms]</Term> <Field>MeSH Terms</Field> <Count>5512</Count> <Explode>Y</Explode> </TermSet> <TermSet> <Term>"biodiversity"[All Fields]</Term> <Field>All Fields</Field> <Count>9741</Count> <Explode>Y</Explode> </TermSet> <OP>OR</OP> <OP>GROUP</OP> <TermSet> <Term>"informatics"[MeSH Terms]</Term> <Field>MeSH Terms</Field> <Count>6441</Count> <Explode>Y</Explode> </TermSet> <TermSet> <Term>"informatics"[All Fields]</Term> <Field>All Fields</Field> <Count>21168</Count> <Explode>Y</Explode> </TermSet> <OP>OR</OP> <OP>GROUP</OP> <OP>AND</OP> <OP>GROUP</OP> </TranslationStack><QueryTranslation>("biodiversity"[MeSH Terms] OR "biodiversity"[All Fields]) AND ("informatics"[MeSH Terms] OR "informatics"[All Fields])</QueryTranslation></eSearchResult>