doctor_scrape 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. data/.gitignore +18 -0
  2. data/.rbenv-version +1 -0
  3. data/.rspec +2 -0
  4. data/Gemfile +4 -0
  5. data/Guardfile +10 -0
  6. data/LICENSE +22 -0
  7. data/README.md +78 -0
  8. data/Rakefile +14 -0
  9. data/doctor_scrape.gemspec +35 -0
  10. data/lib/doctor_scrape/data.rb +12 -0
  11. data/lib/doctor_scrape/redirect_follower.rb +29 -0
  12. data/lib/doctor_scrape/scraper/base.rb +50 -0
  13. data/lib/doctor_scrape/scraper/bora.rb +25 -0
  14. data/lib/doctor_scrape/scraper/diva.rb +16 -0
  15. data/lib/doctor_scrape/scraper/duo.rb +32 -0
  16. data/lib/doctor_scrape/scraper/meta.rb +39 -0
  17. data/lib/doctor_scrape/scraper/unknown.rb +20 -0
  18. data/lib/doctor_scrape/search.rb +42 -0
  19. data/lib/doctor_scrape/version.rb +3 -0
  20. data/lib/doctor_scrape.rb +37 -0
  21. data/spec/cassettes/brage_bibsys_no_hibo_handle_URN_NBN_no-bibsys_brage_17854.yml +624 -0
  22. data/spec/cassettes/brage_bibsys_no_nhh_handle_URN_NBN_no-bibsys_brage_22813.yml +535 -0
  23. data/spec/cassettes/brage_bibsys_no_nhh_handle_URN_NBN_no-bibsys_brage_24121.yml +388 -0
  24. data/spec/cassettes/https___bora_hib_no_handle_10049_234.yml +429 -0
  25. data/spec/cassettes/https___bora_hib_no_handle_10049_330.yml +347 -0
  26. data/spec/cassettes/https___bora_uib_no_handle_1956_3282.yml +682 -0
  27. data/spec/cassettes/nora_search.yml +795 -0
  28. data/spec/cassettes/ntnu.diva-portal.org_smash_record.jsf?searchId=1&pid=diva2:122798.yml +398 -0
  29. data/spec/cassettes/ntnu_diva-portal_org_smash_record_jsf_searchId_1_pid_diva2_122794.yml +398 -0
  30. data/spec/cassettes/ntnu_diva-portal_org_smash_record_jsf_searchId_1_pid_diva2_122798.yml +487 -0
  31. data/spec/cassettes/www_duo_uio_no_sok_work_html_WORKID_112975.yml +248 -0
  32. data/spec/cassettes/www_duo_uio_no_sok_work_html_WORKID_149776.yml +240 -0
  33. data/spec/cassettes/www_ub_uit_no_munin_handle_10037_3822.yml +602 -0
  34. data/spec/cassettes/www_ub_uit_no_munin_handle_10037_3826.yml +534 -0
  35. data/spec/data_spec.rb +22 -0
  36. data/spec/parse/bibsys_spec.rb +40 -0
  37. data/spec/parse/bora_hib_spec.rb +29 -0
  38. data/spec/parse/bora_uib_spec.rb +18 -0
  39. data/spec/parse/diva_spec.rb +29 -0
  40. data/spec/parse/duo_spec.rb +29 -0
  41. data/spec/parse/munin_spec.rb +30 -0
  42. data/spec/redirect_follower_spec.rb +37 -0
  43. data/spec/scraper_spec.rb +43 -0
  44. data/spec/scrapers/base_spec.rb +6 -0
  45. data/spec/scrapers/bora_spec.rb +6 -0
  46. data/spec/scrapers/diva_spec.rb +6 -0
  47. data/spec/scrapers/duo_spec.rb +6 -0
  48. data/spec/scrapers/meta_spec.rb +6 -0
  49. data/spec/scrapers/unknown_spec.rb +18 -0
  50. data/spec/search_spec.rb +111 -0
  51. data/spec/shared/scraper.rb +62 -0
  52. data/spec/spec_helper.rb +29 -0
  53. data/spec/support/setup_scraper.rb +8 -0
  54. metadata +292 -0
data/spec/parse/diva_spec.rb
@@ -0,0 +1,29 @@
+ # encoding: utf-8
+ require 'spec_helper'
+
+ describe "ntnu.diva-portal.org" do
+   let(:scraper) { DoctorScrape::Scraper::Diva.new url }
+
+   context "http://ntnu.diva-portal.org/smash/record.jsf?searchId=1&pid=diva2:122798" do
+     setup_scraper_for "http://ntnu.diva-portal.org/smash/record.jsf?searchId=1&pid=diva2:122798"
+
+     its(:title) { should eq "Sporene der hjemme : Om 15-16-åringer og deres hverdagskultur – basert på en undersøkelse om kulturelle og estetiske praksiser i noen utvalgte nordiske ungdomsrom" }
+     its(:author) { should eq "Aagre, Willy" }
+     its(:issued) { should eq "2006" }
+     its(:abstract) { should be_nil }
+     its(:permalink) { should eq "http://urn.kb.se/resolve?urn=urn:nbn:no:ntnu:diva-1697" }
+     its(:pdf) { should eq "http://ntnu.diva-portal.org/smash/get/diva2:122798/FULLTEXT01" }
+   end
+
+   context "http://ntnu.diva-portal.org/smash/record.jsf?searchId=1&pid=diva2:122794" do
+     setup_scraper_for "http://ntnu.diva-portal.org/smash/record.jsf?searchId=1&pid=diva2:122794"
+
+     its(:title) { should eq "Knowledge Management in Software Engineering: A Systematic Review of Studied Concepts and Research Methods Used" }
+     its(:author) { should eq "Bjørnson, Finn Olav; Dingsøyr, Torgeir" }
+     its(:issued) { should eq "" }
+     its(:abstract) { should be_nil }
+     its(:permalink) { should eq "http://urn.kb.se/resolve?urn=urn:nbn:no:ntnu:diva-2833" }
+     its(:pdf) { should eq "" }
+   end
+
+ end
data/spec/parse/duo_spec.rb
@@ -0,0 +1,29 @@
+ # encoding: utf-8
+ require "spec_helper"
+
+ describe "duo.uio.no" do
+   let(:scraper) { DoctorScrape::Scraper::Duo.new url }
+
+   context "http://www.duo.uio.no/sok/work.html?WORKID=112975" do
+     setup_scraper_for "http://www.duo.uio.no/sok/work.html?WORKID=112975"
+
+     its(:title) { should eq "Meaningful Method Names" }
+     its(:author) { should eq "Høst, Einar" }
+     its(:issued) { should eq "2011" }
+     its(:pdf) { should eq "http://www.duo.uio.no/sok/work.html?WORKID=112975&fid=65890" }
+     its(:permalink) { should eq "http://urn.nb.no/URN:NBN:no-27629" }
+     its(:abstract) { should match /We build computer programs by creating named abtractions/ }
+   end
+
+   context "http://www.duo.uio.no/sok/work.html?WORKID=149776" do
+     setup_scraper_for "http://www.duo.uio.no/sok/work.html?WORKID=149776"
+
+     its(:title) { should eq "CacheCast: a system for efficient single source multiple destination data transfer" }
+     its(:author) { should eq "Srebrny, Piotr" }
+     its(:issued) { should eq "2011" }
+     its(:pdf) { should eq "http://www.duo.uio.no/sok/work.html?WORKID=149776&fid=91631" }
+     its(:permalink) { should eq "http://urn.nb.no/URN:NBN:no-30226" }
+     its(:abstract) { should match /^The basic function of the Internet is to.*stream to thousands of clients\.$/m }
+   end
+
+ end
data/spec/parse/munin_spec.rb
@@ -0,0 +1,30 @@
+ # encoding: utf-8
+ require 'spec_helper'
+
+ describe "ub.uit.no/munin" do
+   let(:scraper) { DoctorScrape::Scraper::Meta.new url }
+
+   context "http://www.ub.uit.no/munin/handle/10037/3822" do
+     setup_scraper_for "http://www.ub.uit.no/munin/handle/10037/3822"
+
+     its(:title) { should eq "The hidden children of Eve : Sámi poetics guovtti ilmmi gaskkas" }
+     its(:author) { should eq "Jernsletten, Kristin (Kikki)" }
+     its(:issued) { should eq "2012-02-29" }
+     its(:permalink) { should eq "http://hdl.handle.net/10037/3822" }
+     its(:pdf) { should eq "http://www.ub.uit.no/munin/bitstream/10037/3822/4/thesis.pdf" }
+     its(:isbn) { should be_nil }
+     its(:abstract) { should match /^Tesen går ut på at samisk litteraturforståelse og verdensoppfatning.*videre forskning\.$/m }
+   end
+
+   context "http://www.ub.uit.no/munin/handle/10037/3826" do
+     setup_scraper_for "http://www.ub.uit.no/munin/handle/10037/3826"
+
+     its(:title) { should eq "IKT-forbindelser i helsesektoren, Sammenvevinger av IKT, steder, yrker, kjønn og politikk" }
+     its(:author) { should eq "Dyb, Kari" }
+     its(:issued) { should eq "2011-04-29" }
+     its(:permalink) { should eq "http://hdl.handle.net/10037/3826" }
+     its(:pdf) { should eq "http://www.ub.uit.no/munin/bitstream/10037/3826/3/paper_4.pdf" }
+     its(:abstract) { should match /^Til tross for betydelig politisk satsing.*teknologien blir brukt i sektoren\./m }
+   end
+
+ end
data/spec/redirect_follower_spec.rb
@@ -0,0 +1,37 @@
+ # encoding: utf-8
+ require 'spec_helper'
+
+ describe DoctorScrape::RedirectFollower do
+   let(:url) { "http://bit.ly/foobar" }
+   let(:endpoint) { "http://example.com" }
+   let(:resolver) { DoctorScrape::RedirectFollower.new url }
+   before { stub_request(:any, endpoint).to_return(:body => "You found me!") }
+
+   context "when url doesn't redirect" do
+     before { stub_request(:any, url).to_return(:body => "Ok") }
+     specify { resolver.resolve.should eq url }
+   end
+
+   context "when url redirects" do
+     before { stub_request(:any, url).to_return(:status => [ 302, "Moved Temporarily" ], headers: { location: endpoint }) }
+     specify { resolver.resolve.should eq endpoint }
+   end
+
+   context "too many redirects" do
+     before { stub_request(:any, url).to_return(:status => [ 302, "Moved Temporarily" ], headers: { location: url }) }
+
+     it "raises error after 5 redirects" do
+       expect { resolver.resolve }.to raise_error
+       a_request(:get, url).should have_been_made.times(5)
+     end
+   end
+
+   context "when exception occurs" do
+     it "returns the last url" do
+       stub_request(:get, url).to_return(:status => [ 302, "Moved Temporarily" ], headers: { location: endpoint })
+       stub_request(:get, endpoint).to_raise Net::HTTPBadResponse
+       resolver.resolve.should eq(endpoint)
+     end
+   end
+
+ end
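The four contexts above pin down the resolver's observable behaviour: return the input URL when there is no redirect, follow Location headers, raise after five hops, and fall back to the last URL reached when a response is malformed. A minimal sketch consistent with those expectations (the MAX_REDIRECTS constant and the use of Net::HTTP are assumptions; the shipped redirect_follower.rb may be implemented differently):

    require 'net/http'
    require 'uri'

    module DoctorScrape
      class RedirectFollower
        MAX_REDIRECTS = 5 # the spec expects exactly 5 requests before failing

        def initialize(url)
          @url = url
        end

        def resolve
          current = @url
          MAX_REDIRECTS.times do
            response = Net::HTTP.get_response(URI.parse(current))
            return current unless response.is_a?(Net::HTTPRedirection)
            current = response['location'] # follow the redirect
          end
          raise "Too many redirects: #{@url}"
        rescue Net::HTTPBadResponse
          current # on a broken response, return the last url reached
        end
      end
    end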
data/spec/scraper_spec.rb
@@ -0,0 +1,43 @@
+ require 'spec_helper'
+
+ shared_examples :it_matches do |urls|
+   let(:expected_class) { described_class }
+
+   [urls].flatten.each do |url|
+     context url do
+       subject { DoctorScrape::Scraper.for url }
+       it { should be_a expected_class }
+     end
+   end
+
+ end
+
+ describe DoctorScrape::Scraper::Unknown do
+   it_behaves_like :it_matches, "http://foo.example.com"
+ end
+
+ describe DoctorScrape::Scraper::Duo do
+   it_behaves_like :it_matches, [
+     "http://duo.uio.no/sok/work.html?WORKID=1234",
+     "http://www.duo.uio.no/sok/work.html?WORKID=1234",
+   ]
+ end
+
+ describe DoctorScrape::Scraper::Diva do
+   it_behaves_like :it_matches, [
+     "http://ntnu.diva-portal.org/smash/record.jsf?searchId=1&pid=diva2:122798",
+     "http://urn.kb.se/resolve?urn=urn:nbn:no:ntnu:diva-15280"
+   ]
+ end
+
+ describe DoctorScrape::Scraper::Bora do
+   it_behaves_like :it_matches, "https://bora.hib.no/handle/10049/330"
+ end
+
+ describe DoctorScrape::Scraper::Meta do
+   it_behaves_like :it_matches, [
+     "http://brage.bibsys.no/nhh/handle/URN:NBN:no-bibsys_brage_24121",
+     "http://brage.bibsys.no/hibo/handle/URN:NBN:no-bibsys_brage_17854",
+     "http://idtjeneste.nb.no/URN:NBN:no-bibsys_brage_24791"
+   ]
+ end
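These matcher specs imply that DoctorScrape::Scraper.for maps a URL onto a scraper subclass, with Unknown as the fallback. One plausible shape for that dispatch, inferred only from the hosts exercised above (the PATTERNS table is hypothetical; the gem's scraper code may let each subclass register its own match rule instead):

    module DoctorScrape
      module Scraper
        # Host patterns inferred from the URLs in the specs above.
        PATTERNS = {
          /duo\.uio\.no/                  => :Duo,
          /diva-portal\.org|urn\.kb\.se/  => :Diva,
          /bora\.hib\.no/                 => :Bora,
          /bibsys\.no|idtjeneste\.nb\.no/ => :Meta
        }.freeze

        def self.for(url)
          PATTERNS.each do |pattern, name|
            return const_get(name).new(url) if url =~ pattern
          end
          Unknown.new url # anything unrecognised gets the null scraper
        end
      end
    end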
data/spec/scrapers/base_spec.rb
@@ -0,0 +1,6 @@
+ # encoding: utf-8
+ require "spec_helper"
+
+ describe DoctorScrape::Scraper::Base do
+   it_behaves_like :scraper
+ end
data/spec/scrapers/bora_spec.rb
@@ -0,0 +1,6 @@
+ # encoding: utf-8
+ require 'spec_helper'
+
+ describe DoctorScrape::Scraper::Bora do
+   it_behaves_like :scraper
+ end
data/spec/scrapers/diva_spec.rb
@@ -0,0 +1,6 @@
+ # encoding: utf-8
+ require "spec_helper"
+
+ describe DoctorScrape::Scraper::Diva do
+   it_behaves_like :scraper
+ end
data/spec/scrapers/duo_spec.rb
@@ -0,0 +1,6 @@
+ # encoding: utf-8
+ require "spec_helper"
+
+ describe DoctorScrape::Scraper::Duo do
+   it_behaves_like :scraper
+ end
data/spec/scrapers/meta_spec.rb
@@ -0,0 +1,6 @@
+ # encoding: utf-8
+ require "spec_helper"
+
+ describe DoctorScrape::Scraper::Meta do
+   it_behaves_like :scraper
+ end
data/spec/scrapers/unknown_spec.rb
@@ -0,0 +1,18 @@
+ # encoding: utf-8
+ require "spec_helper"
+
+ describe DoctorScrape::Scraper::Unknown do
+   subject { DoctorScrape::Scraper::Unknown.new "http://foobar.com" }
+
+   its(:errors?) { should be_true }
+   its(:errors) { should == ["Don't know how to scrape this page"] }
+
+   specify { subject.fetch.should be_false }
+   specify { subject.scrape.should be_false }
+
+   context "after scrape" do
+     before { subject.scrape }
+     its(:errors) { should == [ "Don't know how to scrape this page" ] }
+   end
+
+ end
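Unknown behaves as a null object: it never fetches, never parses, and always reports the same single error, before and after #scrape. A sketch matching those expectations (inheriting from Base is an assumption; the real unknown.rb is 20 lines and may be wired differently):

    module DoctorScrape
      module Scraper
        class Unknown < Base
          def errors
            ["Don't know how to scrape this page"] # constant, even after #scrape
          end

          def errors?
            true
          end

          def fetch
            false # nothing to download for an unrecognised page
          end

          def scrape
            false # always "fails", without raising
          end
        end
      end
    end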
data/spec/search_spec.rb
@@ -0,0 +1,111 @@
+ require 'spec_helper'
+
+ describe DoctorScrape::Search do
+
+   describe "nora" do
+     let(:default_params) do
+       {
+         "PAGESIZE" => "50",
+         "FROM" => "2007",
+         "TO" => "2012",
+         "SEARCHMODE" => "TOPIC",
+         "DOCUMENTTYPES" => "Doctoral thesis",
+         "RESULTMODE" => "rss"
+       }
+     end
+     use_vcr_cassette "nora_search"
+     let(:url) { "http://www.ub.uio.no/nora/result.html" }
+     subject { DoctorScrape::Search.nora }
+
+     specify { should be_a Array }
+     its(:first) { should match /^http:\/\// }
+
+     context "without options" do
+       it "uses default options" do
+         stub = stub_request(:get, url).with(:query => default_params)
+         DoctorScrape::Search.nora
+         stub.should have_been_requested
+       end
+
+       it "doesn't resolve redirects" do
+         DoctorScrape::Search.should_not_receive(:resolve_urls)
+         DoctorScrape::Search.should_not_receive(:resolve_scrapers)
+         DoctorScrape::Search.nora
+       end
+     end
+
+     context "with options" do
+
+       # TODO: DRY this up
+
+       it ":limit changes PAGESIZE" do
+         params = default_params.update("PAGESIZE" => "10")
+         stub_request(:any, url).with(:query => params)
+         DoctorScrape::Search.nora :limit => 10
+         a_request(:get, url).with(:query => params).should have_been_requested
+       end
+
+       it ":from changes start year" do
+         params = default_params.update("FROM" => "2010")
+         stub_request(:any, url).with(:query => params)
+         DoctorScrape::Search.nora :from => 2010
+         a_request(:get, url).with(:query => params).should have_been_requested
+       end
+
+       it ":to changes end year" do
+         params = default_params.update("TO" => "2008")
+         stub_request(:any, url).with(:query => params)
+         DoctorScrape::Search.nora :to => 2008
+         a_request(:get, url).with(:query => params).should have_been_requested
+       end
+
+       it "can be combined" do
+         params = default_params.update("TO" => "2008", "FROM" => "1980", "PAGESIZE" => "1")
+         stub_request(:any, url).with(:query => params)
+         DoctorScrape::Search.nora :to => 2008, :from => "1980", limit: 1
+         a_request(:get, url).with(:query => params).should have_been_requested
+       end
+
+       context "resolve is true" do
+         it "resolves scrapers" do
+           DoctorScrape::Search.should_receive(:resolve_scrapers)
+           DoctorScrape::Search.nora resolve: true
+         end
+       end
+
+     end
+
+     context "when search fails" do
+       it "returns empty array" do
+         stub_request(:get, url).with(:query => default_params).to_return(:body => "")
+         DoctorScrape::Search.nora.should be_empty
+       end
+     end
+
+   end
+
+   describe "resolve_urls" do
+     let(:urls) { [ "http://bit.ly", "http://example.com", "http://goo.gl" ] }
+     let(:resolved) { ["http://bitly.com", "http://example.com", "http://googl.com"] }
+
+     it "returns array with resolved urls" do
+       resolvers = resolved.map { |r| double "resolver", :resolve => r }
+       urls.each_with_index do |url, i|
+         DoctorScrape::RedirectFollower.should_receive(:new).with(url) { resolvers[i] }
+       end
+
+       DoctorScrape::Search.resolve_urls(urls).should == resolved
+     end
+   end
+
+   describe "resolve_scrapers" do
+     let(:urls) { [ "http://duo.uio.no/", "http://foo.bibsys.no", "http://bora.hib.no" ] }
+
+     it "returns array with appropriate scraper for each url" do
+       urls.each { |url| stub_request(:get, url) }
+       DoctorScrape::Scraper.should_receive(:for).exactly(3).times { DoctorScrape::Scraper::Meta.new "foo" }
+       DoctorScrape::Search.resolve_scrapers(urls).first.should be_a DoctorScrape::Scraper::Meta
+     end
+   end
+
+ end
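Taken together, these specs document the public surface of the search API. A usage sketch (the option names :limit, :from, :to and resolve: come straight from the examples above; the comments restate what the expectations assert rather than output captured from a real run):

    # Default search: an Array of result URLs from the NORA RSS feed.
    urls = DoctorScrape::Search.nora
    urls.first # matches /^http:\/\//

    # Options map onto the query parameters in default_params:
    # :limit -> PAGESIZE, :from -> FROM, :to -> TO.
    DoctorScrape::Search.nora :from => 2010, :to => 2011, :limit => 10

    # With resolve: true the search additionally follows redirects and
    # wraps each final URL in the matching scraper via resolve_scrapers.
    DoctorScrape::Search.nora resolve: true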
data/spec/shared/scraper.rb
@@ -0,0 +1,62 @@
+ # encoding: utf-8
+
+ shared_examples_for :scraper do
+   let(:url) { "http://test.host" }
+   let(:scraper) { described_class.new url }
+
+   context "#fetch" do
+     it "uses #url for the url" do
+       stub_request :any, "http://www.example.com"
+       scraper.should_receive(:url) { "http://www.example.com" }
+       scraper.fetch
+     end
+   end
+
+   context "#scrape" do
+
+     context "when successful" do
+
+       before do
+         scraper.stub :fetch
+         scraper.stub :parse
+       end
+
+       it "clears errors before scrape" do
+         scraper.instance_variable_set("@errors", ['foo', 'bar'])
+         scraper.scrape
+         scraper.errors.should be_empty
+       end
+
+       it "returns true" do
+         scraper.stub(:errors?) { false }
+         scraper.scrape.should be_true
+       end
+
+     end
+
+     context "when there are errors" do
+
+       context "parser error" do
+         before { scraper.stub(:parse).and_raise "parse error" }
+         it "returns false" do
+           scraper.scrape.should be_false
+         end
+       end
+
+       [ [ 404, "Not Found" ], [ 500, "Internal Server Error" ] ].each do |error|
+         context error.join(" ") do
+           before do
+             stub_request(:any, scraper.url).to_return(:status => [error.first, error.last])
+             scraper.scrape
+           end
+
+           specify { scraper.errors.first.should be_a OpenURI::HTTPError }
+           specify { scraper.errors.first.message.should eq error.join(" ") }
+           specify { scraper.data.fields.should == [ :url ] }
+
+         end
+       end
+     end
+
+   end
+ end
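Read together, these shared examples fix the control flow of every scraper's #scrape: reset the error list, fetch, parse, and capture anything raised along the way. Sketched out (the instance variables are assumptions; the real base.rb is 50 lines and certainly carries more, such as building #data):

    def scrape
      @errors = []   # "clears errors before scrape"
      fetch          # open-uri raises OpenURI::HTTPError on 404/500
      parse          # a parser error is trapped the same way
      !errors?       # true when nothing was recorded
    rescue => e
      @errors << e   # the exception object itself is kept, so that
      false          # errors.first can be an OpenURI::HTTPError
    end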
data/spec/spec_helper.rb
@@ -0,0 +1,29 @@
+ if ENV["SIMPLECOV"]
+   require 'simplecov'
+   SimpleCov.start do
+     add_filter "/vendor"
+   end
+ end
+
+ require 'rspec/autorun'
+ require 'webmock/rspec'
+ require 'vcr'
+ require 'doctor_scrape'
+ require 'awesome_print'
+
+ Dir["./spec/shared/**/*.rb"].each { |f| require f }
+ Dir["./spec/support/**/*.rb"].each { |f| require f }
+
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+ RSpec.configure do |config|
+   config.treat_symbols_as_metadata_keys_with_true_values = true
+   config.run_all_when_everything_filtered = true
+   config.filter_run :focus
+   config.extend VCR::RSpec::Macros
+ end
+
+ VCR.configure do |config|
+   config.cassette_library_dir = "spec/cassettes"
+   config.hook_into :webmock
+   config.default_cassette_options = { :serialize_with => :yaml }
+ end
data/spec/support/setup_scraper.rb
@@ -0,0 +1,8 @@
+ def setup_scraper_for(url)
+   use_vcr_cassette url.gsub(/^http:\/\//, '').gsub(/\//, '_')
+   let(:url) { url }
+   before { scraper.scrape }
+   subject { scraper.data }
+   specify { scraper.errors?.should be_false }
+ end
+
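The cassette name is derived mechanically from the URL, which is why the parse specs never name a cassette explicitly. A worked example of the two gsub calls (the underscored .yml file names under spec/cassettes suggest VCR further replaces the remaining punctuation when writing to disk; that last step is an inference from the file list above):

    url = "http://www.ub.uit.no/munin/handle/10037/3822"
    url.gsub(/^http:\/\//, '').gsub(/\//, '_')
    # => "www.ub.uit.no_munin_handle_10037_3822"
    # stored as spec/cassettes/www_ub_uit_no_munin_handle_10037_3822.yml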