doctor_scrape 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. data/.gitignore +18 -0
  2. data/.rbenv-version +1 -0
  3. data/.rspec +2 -0
  4. data/Gemfile +4 -0
  5. data/Guardfile +10 -0
  6. data/LICENSE +22 -0
  7. data/README.md +78 -0
  8. data/Rakefile +14 -0
  9. data/doctor_scrape.gemspec +35 -0
  10. data/lib/doctor_scrape/data.rb +12 -0
  11. data/lib/doctor_scrape/redirect_follower.rb +29 -0
  12. data/lib/doctor_scrape/scraper/base.rb +50 -0
  13. data/lib/doctor_scrape/scraper/bora.rb +25 -0
  14. data/lib/doctor_scrape/scraper/diva.rb +16 -0
  15. data/lib/doctor_scrape/scraper/duo.rb +32 -0
  16. data/lib/doctor_scrape/scraper/meta.rb +39 -0
  17. data/lib/doctor_scrape/scraper/unknown.rb +20 -0
  18. data/lib/doctor_scrape/search.rb +42 -0
  19. data/lib/doctor_scrape/version.rb +3 -0
  20. data/lib/doctor_scrape.rb +37 -0
  21. data/spec/cassettes/brage_bibsys_no_hibo_handle_URN_NBN_no-bibsys_brage_17854.yml +624 -0
  22. data/spec/cassettes/brage_bibsys_no_nhh_handle_URN_NBN_no-bibsys_brage_22813.yml +535 -0
  23. data/spec/cassettes/brage_bibsys_no_nhh_handle_URN_NBN_no-bibsys_brage_24121.yml +388 -0
  24. data/spec/cassettes/https___bora_hib_no_handle_10049_234.yml +429 -0
  25. data/spec/cassettes/https___bora_hib_no_handle_10049_330.yml +347 -0
  26. data/spec/cassettes/https___bora_uib_no_handle_1956_3282.yml +682 -0
  27. data/spec/cassettes/nora_search.yml +795 -0
  28. data/spec/cassettes/ntnu.diva-portal.org_smash_record.jsf?searchId=1&pid=diva2:122798.yml +398 -0
  29. data/spec/cassettes/ntnu_diva-portal_org_smash_record_jsf_searchId_1_pid_diva2_122794.yml +398 -0
  30. data/spec/cassettes/ntnu_diva-portal_org_smash_record_jsf_searchId_1_pid_diva2_122798.yml +487 -0
  31. data/spec/cassettes/www_duo_uio_no_sok_work_html_WORKID_112975.yml +248 -0
  32. data/spec/cassettes/www_duo_uio_no_sok_work_html_WORKID_149776.yml +240 -0
  33. data/spec/cassettes/www_ub_uit_no_munin_handle_10037_3822.yml +602 -0
  34. data/spec/cassettes/www_ub_uit_no_munin_handle_10037_3826.yml +534 -0
  35. data/spec/data_spec.rb +22 -0
  36. data/spec/parse/bibsys_spec.rb +40 -0
  37. data/spec/parse/bora_hib_spec.rb +29 -0
  38. data/spec/parse/bora_uib_spec.rb +18 -0
  39. data/spec/parse/diva_spec.rb +29 -0
  40. data/spec/parse/duo_spec.rb +29 -0
  41. data/spec/parse/munin_spec.rb +30 -0
  42. data/spec/redirect_follower_spec.rb +37 -0
  43. data/spec/scraper_spec.rb +43 -0
  44. data/spec/scrapers/base_spec.rb +6 -0
  45. data/spec/scrapers/bora_spec.rb +6 -0
  46. data/spec/scrapers/diva_spec.rb +6 -0
  47. data/spec/scrapers/duo_spec.rb +6 -0
  48. data/spec/scrapers/meta_spec.rb +6 -0
  49. data/spec/scrapers/unknown_spec.rb +18 -0
  50. data/spec/search_spec.rb +111 -0
  51. data/spec/shared/scraper.rb +62 -0
  52. data/spec/spec_helper.rb +29 -0
  53. data/spec/support/setup_scraper.rb +8 -0
  54. metadata +292 -0
# encoding: utf-8
require 'spec_helper'

# Parsing specs for the DiVA portal scraper, exercised against recorded
# VCR cassettes for two real ntnu.diva-portal.org records.
describe "ntnu.diva-portal.org" do
  let(:scraper) { DoctorScrape::Scraper::Diva.new url }

  context "http://ntnu.diva-portal.org/smash/record.jsf?searchId=1&pid=diva2:122798" do
    setup_scraper_for "http://ntnu.diva-portal.org/smash/record.jsf?searchId=1&pid=diva2:122798"

    its(:title) { should eq "Sporene der hjemme : Om 15-16-åringer og deres hverdagskultur – basert på en undersøkelse om kulturelle og estetiske praksiser i noen utvalgte nordiske ungdomsrom" }
    its(:author) { should eq "Aagre, Willy" }
    its(:issued) { should eq "2006" }
    its(:abstract) { should be_nil }
    its(:permalink) { should eq "http://urn.kb.se/resolve?urn=urn:nbn:no:ntnu:diva-1697" }
    its(:pdf) { should eq "http://ntnu.diva-portal.org/smash/get/diva2:122798/FULLTEXT01" }
  end

  context "http://ntnu.diva-portal.org/smash/record.jsf?searchId=1&pid=diva2:122794" do
    setup_scraper_for "http://ntnu.diva-portal.org/smash/record.jsf?searchId=1&pid=diva2:122794"

    # NOTE(review): the empty-string expectations for issued/pdf appear to pin
    # current scraper output for this record — confirm "" (not nil) is intended.
    its(:title) { should eq "Knowledge Management in Software Engineering: A Systematic Review of Studied Concepts and Research Methods Used" }
    its(:author) { should eq "Bjørnson, Finn Olav; Dingsøyr, Torgeir" }
    its(:issued) { should eq "" }
    its(:abstract) { should be_nil }
    its(:permalink) { should eq "http://urn.kb.se/resolve?urn=urn:nbn:no:ntnu:diva-2833" }
    its(:pdf) { should eq "" }
  end

end
# encoding: utf-8
require "spec_helper"

# Parsing specs for the DUO (University of Oslo) scraper, exercised against
# recorded VCR cassettes for two real duo.uio.no records.
describe "duo.uio.no" do
  let(:scraper) { DoctorScrape::Scraper::Duo.new url }

  context "http://www.duo.uio.no/sok/work.html?WORKID=112975" do
    setup_scraper_for "http://www.duo.uio.no/sok/work.html?WORKID=112975"

    its(:title) { should eq "Meaningful Method Names" }
    its(:author) { should eq "Høst, Einar" }
    its(:issued) { should eq "2011" }
    its(:pdf) { should eq "http://www.duo.uio.no/sok/work.html?WORKID=112975&fid=65890" }
    its(:permalink) { should eq "http://urn.nb.no/URN:NBN:no-27629" }
    # NOTE(review): "abtractions" looks misspelled but presumably mirrors the
    # abstract text inside the recorded cassette — verify before "fixing" it.
    its(:abstract) { should match /We build computer programs by creating named abtractions/ }
  end

  context "http://www.duo.uio.no/sok/work.html?WORKID=149776" do
    setup_scraper_for "http://www.duo.uio.no/sok/work.html?WORKID=149776"

    its(:title) { should eq "CacheCast: a system for efficient single source multiple destination data transfer" }
    its(:author) { should eq "Srebrny, Piotr" }
    its(:issued) { should eq "2011" }
    its(:pdf) { should eq "http://www.duo.uio.no/sok/work.html?WORKID=149776&fid=91631" }
    its(:permalink) { should eq "http://urn.nb.no/URN:NBN:no-30226" }
    # /m lets .* span newlines in the multi-paragraph abstract.
    its(:abstract) { should match /^The basic function of the Internet is to.*stream to thousands of clients\.$/m }
  end

end
# encoding: utf-8
require 'spec_helper'

# Parsing specs for the generic Meta scraper against Munin (University of
# Tromsø) records, replayed from recorded VCR cassettes.
describe "ub.uit.no/munin" do
  let(:scraper) { DoctorScrape::Scraper::Meta.new url }

  context "http://www.ub.uit.no/munin/handle/10037/3822" do
    setup_scraper_for "http://www.ub.uit.no/munin/handle/10037/3822"

    its(:title) { should eq "The hidden children of Eve : Sámi poetics guovtti ilmmi gaskkas" }
    its(:author) { should eq "Jernsletten, Kristin (Kikki)" }
    its(:issued) { should eq "2012-02-29" }
    its(:permalink) { should eq "http://hdl.handle.net/10037/3822" }
    its(:pdf) { should eq "http://www.ub.uit.no/munin/bitstream/10037/3822/4/thesis.pdf" }
    its(:isbn) { should be_nil }
    # /m lets .* span newlines in the multi-paragraph abstract.
    its(:abstract) { should match /^Tesen går ut på at samisk litteraturforståelse og verdensoppfatning.*videre forskning\.$/m }
  end

  context "http://www.ub.uit.no/munin/handle/10037/3826" do
    setup_scraper_for "http://www.ub.uit.no/munin/handle/10037/3826"

    its(:title) { should eq "IKT-forbindelser i helsesektoren, Sammenvevinger av IKT, steder, yrker, kjønn og politikk" }
    its(:author) { should eq "Dyb, Kari" }
    its(:issued) { should eq "2011-04-29" }
    its(:permalink) { should eq "http://hdl.handle.net/10037/3826" }
    its(:pdf) { should eq "http://www.ub.uit.no/munin/bitstream/10037/3826/3/paper_4.pdf" }
    its(:abstract) { should match /^Til tross for betydelig politisk satsing.*teknologien blir brukt i sektoren\./m }
  end

end
# encoding: utf-8
require 'spec_helper'

# Specs for DoctorScrape::RedirectFollower, using WebMock stubs only
# (no real network traffic).
describe DoctorScrape::RedirectFollower do
  let(:url) { "http://bit.ly/foobar" }
  let(:endpoint) { "http://example.com" }
  let(:resolver) { DoctorScrape::RedirectFollower.new url }
  # The final endpoint always answers with a plain body (no redirect).
  before { stub_request(:any, endpoint).to_return(:body => "You found me!") }

  context "when url doesn't redirect" do
    before { stub_request(:any, url).to_return(:body => "Ok") }
    specify { resolver.resolve.should eq url }
  end

  context "when url redirects" do
    before { stub_request(:any, url).to_return(:status => [ 302, "Moved Temporarily" ], headers: { location: endpoint }) }
    specify { resolver.resolve.should eq endpoint }
  end

  context "too many redirects" do
    # The url redirects to itself, so resolution can never terminate normally.
    before { stub_request(:any, url).to_return(:status => [ 302, "Moved Temporarily" ], headers: { location: url }) }

    it "raises error after 5 redirects" do
      # NOTE(review): bare raise_error matches any StandardError; consider
      # pinning the specific error class RedirectFollower raises.
      expect { resolver.resolve }.to raise_error
      a_request(:get, url).should have_been_made.times(5)
    end
  end

  context "when exception occurs" do
    it "returns the last url" do
      # First hop redirects; fetching the target blows up, and the resolver
      # is expected to fall back to the last url it reached.
      stub_request(:get, url).to_return(:status => [ 302, "Moved Temporarily" ], headers: { location: endpoint })
      stub_request(:get, endpoint).to_raise Net::HTTPBadResponse
      resolver.resolve.should eq(endpoint)
    end
  end

end
require 'spec_helper'

# Shared example group: asserts that DoctorScrape::Scraper.for hands back an
# instance of the describing class for each candidate URL. Accepts a single
# URL string or an array of them.
shared_examples :it_matches do |urls|
  let(:expected_class) { described_class }

  Array(urls).each do |candidate|
    context candidate do
      subject { DoctorScrape::Scraper.for candidate }
      it { should be_a expected_class }
    end
  end

end

# Unrecognised hosts fall through to the Unknown scraper.
describe DoctorScrape::Scraper::Unknown do
  it_behaves_like :it_matches, "http://foo.example.com"
end

describe DoctorScrape::Scraper::Duo do
  it_behaves_like :it_matches, [
    "http://duo.uio.no/sok/work.html?WORKID=1234",
    "http://www.duo.uio.no/sok/work.html?WORKID=1234",
  ]
end

describe DoctorScrape::Scraper::Diva do
  it_behaves_like :it_matches, [
    "http://ntnu.diva-portal.org/smash/record.jsf?searchId=1&pid=diva2:122798",
    "http://urn.kb.se/resolve?urn=urn:nbn:no:ntnu:diva-15280"
  ]
end

describe DoctorScrape::Scraper::Bora do
  it_behaves_like :it_matches, "https://bora.hib.no/handle/10049/330"
end

describe DoctorScrape::Scraper::Meta do
  it_behaves_like :it_matches, [
    "http://brage.bibsys.no/nhh/handle/URN:NBN:no-bibsys_brage_24121",
    "http://brage.bibsys.no/hibo/handle/URN:NBN:no-bibsys_brage_17854",
    "http://idtjeneste.nb.no/URN:NBN:no-bibsys_brage_24791"
  ]
end
# encoding: utf-8
require 'spec_helper'

# The base scraper must satisfy the shared :scraper contract.
describe DoctorScrape::Scraper::Base do
  it_should_behave_like :scraper
end
# encoding: utf-8
require "spec_helper"

# The BORA scraper must satisfy the shared :scraper contract.
describe DoctorScrape::Scraper::Bora do
  it_should_behave_like :scraper
end
# encoding: utf-8
require 'spec_helper'

# The DiVA scraper must satisfy the shared :scraper contract.
describe DoctorScrape::Scraper::Diva do
  it_should_behave_like :scraper
end
# encoding: utf-8
require 'spec_helper'

# The DUO scraper must satisfy the shared :scraper contract.
describe DoctorScrape::Scraper::Duo do
  it_should_behave_like :scraper
end
# encoding: utf-8
require 'spec_helper'

# The Meta scraper must satisfy the shared :scraper contract.
describe DoctorScrape::Scraper::Meta do
  it_should_behave_like :scraper
end
# encoding: utf-8
require "spec_helper"

# The Unknown scraper is the fallback for unrecognised hosts: it never
# fetches or scrapes, and always carries one explanatory error.
describe DoctorScrape::Scraper::Unknown do
  subject { DoctorScrape::Scraper::Unknown.new "http://foobar.com" }

  its(:errors?) { should be_true }
  its(:errors) { should == ["Don't know how to scrape this page"] }

  # Both operations are no-ops that signal failure.
  specify { subject.fetch.should be_false }
  specify { subject.scrape.should be_false }

  context "after scrape" do
    # Scraping must not clear or replace the pre-set error.
    before { subject.scrape }
    its(:errors) { should == [ "Don't know how to scrape this page" ] }
  end

end
require 'spec_helper'

# Specs for DoctorScrape::Search: the NORA search front-end plus the two
# helpers that resolve redirecting URLs and pick a scraper per URL.
describe DoctorScrape::Search do

  describe "nora" do
    # Query-string parameters the nora search is expected to send by default.
    let(:default_params) do
      {
        "PAGESIZE" => "50",
        "FROM" => "2007",
        "TO" => "2012",
        "SEARCHMODE" => "TOPIC",
        "DOCUMENTTYPES" => "Doctoral thesis",
        "RESULTMODE" => "rss"
      }
    end
    use_vcr_cassette "nora_search"
    let(:url) { "http://www.ub.uio.no/nora/result.html" }
    subject { DoctorScrape::Search.nora }

    specify { should be_a Array }
    its(:first) { should match /^http:\/\// }

    context "without options" do
      it "uses default options" do
        stub = stub_request(:get, url).with(:query => default_params)
        DoctorScrape::Search.nora
        stub.should have_been_requested
      end

      it "doesn't resolve redirects" do
        DoctorScrape::Search.should_not_receive(:resolve_urls)
        DoctorScrape::Search.should_not_receive(:resolve_scrapers)
        DoctorScrape::Search.nora
      end
    end

    context "with options" do

      # TODO: DRY this up

      # Use non-destructive merge (not update/merge!) so default_params is
      # never mutated within an example.
      it ":limit changes PAGESIZE" do
        params = default_params.merge("PAGESIZE" => "10")
        stub_request(:any, url).with(:query => params)
        DoctorScrape::Search.nora :limit => 10
        a_request(:get, url).with(:query => params).should have_been_requested
      end

      it ":from changes start year" do
        params = default_params.merge("FROM" => "2010")
        stub_request(:any, url).with(:query => params)
        DoctorScrape::Search.nora :from => 2010
        a_request(:get, url).with(:query => params).should have_been_requested
      end

      it ":to changes end year" do
        params = default_params.merge("TO" => "2008")
        stub_request(:any, url).with(:query => params)
        DoctorScrape::Search.nora :to => 2008
        a_request(:get, url).with(:query => params).should have_been_requested
      end

      it "can be combined" do
        params = default_params.merge("TO" => "2008", "FROM" => "1980", "PAGESIZE" => "1")
        stub_request(:any, url).with(:query => params)
        DoctorScrape::Search.nora :to => 2008, :from => "1980", limit: 1
        a_request(:get, url).with(:query => params).should have_been_requested
      end

      context "resolve is true" do
        it "resolves scrapers" do
          DoctorScrape::Search.should_receive(:resolve_scrapers)
          DoctorScrape::Search.nora resolve: true
        end
      end

    end

    context "when search fails" do
      it "returns empty array" do
        # An empty response body means no parseable results.
        stub_request(:get, url).with(:query => default_params).to_return(:body => "")
        DoctorScrape::Search.nora.should be_empty
      end
    end

  end

  describe "resolve_urls" do
    let(:urls) { [ "http://bit.ly", "http://example.com", "http://goo.gl" ] }
    let(:resolved) { ["http://bitly.com", "http://example.com", "http://googl.com"] }

    it "returns array with resolved urls" do
      # One RedirectFollower double per input url, each resolving to the
      # corresponding entry in `resolved`.
      resolvers = resolved.map { |r| double "resolver", :resolve => r }
      urls.each_with_index do |url, i|
        DoctorScrape::RedirectFollower.should_receive(:new).with(url) { resolvers[i] }
      end

      DoctorScrape::Search.resolve_urls(urls).should == resolved
    end
  end

  describe "resolve_scrapers" do
    # Fixed: the second fixture url was the malformed "http://http://foo.bibsys.no"
    # (doubled scheme). The scraper lookup below is stubbed, so any well-formed
    # url works.
    let(:urls) { [ "http://duo.uio.no/", "http://foo.bibsys.no", "http://bora.hib.no" ] }

    it "returns array with appropriate scraper for each url" do
      urls.each { |url| stub_request(:get, url) }
      DoctorScrape::Scraper.should_receive(:for).exactly(3).times { DoctorScrape::Scraper::Meta.new "foo" }
      DoctorScrape::Search.resolve_scrapers(urls).first.should be_a DoctorScrape::Scraper::Meta
    end
  end

end
# encoding: utf-8

# Shared contract for every scraper class: #fetch must read from #url,
# #scrape must clear stale errors, report success/failure as a boolean,
# and collect HTTP errors into #errors while keeping #data minimal.
shared_examples_for :scraper do
  let(:url) { "http://test.host" }
  let(:scraper) { described_class.new url }

  context "#fetch" do
    it "uses #url for the url" do
      # Redirect the scraper to a stubbed host by stubbing #url itself.
      stub_request :any, "http://www.example.com"
      scraper.should_receive(:url) { "http://www.example.com" }
      scraper.fetch
    end
  end

  context "#scrape" do

    context "when successful" do

      # Neutralise the network and parsing so only #scrape's own
      # bookkeeping is under test.
      before do
        scraper.stub :fetch
        scraper.stub :parse
      end

      it "clears errors before scrape" do
        # Pre-seed stale errors directly; #scrape must discard them.
        scraper.instance_variable_set("@errors", ['foo', 'bar'])
        scraper.scrape
        scraper.errors.should be_empty
      end

      it "returns true" do
        scraper.stub(:errors?) { false }
        scraper.scrape.should be_true
      end

    end

    context "when there are errors" do

      context "parser error" do
        before { scraper.stub(:parse).and_raise "parse error" }
        it "returns false " do
          scraper.scrape.should be_false
        end
      end

      # Exercise both a client and a server HTTP error status.
      [ [ 404, "Not Found" ], [ 500, "Internal Server Error" ] ].each do |error|
        context error.join(" ") do
          before do
            stub_request(:any, scraper.url).to_return(:status => [error.first, error.last])
            scraper.scrape
          end

          # The HTTP error object is collected, and no fields beyond the
          # url make it into the scraped data.
          specify { scraper.errors.first.should be_a OpenURI::HTTPError }
          specify { scraper.errors.first.message.should eq error.join(" ") }
          specify { scraper.data.fields.should == [ :url ] }

        end
      end
    end

  end
end
# Coverage is opt-in: run with SIMPLECOV=1 to enable SimpleCov.
if ENV["SIMPLECOV"]
  require 'simplecov'
  SimpleCov.start do
    add_filter "/vendor"
  end
end

require 'rspec/autorun'
require 'webmock/rspec'
require 'vcr'
require 'doctor_scrape'
require 'awesome_print'

# Sort the glob results so shared examples and support files load in a
# deterministic order (Dir[] ordering is filesystem-dependent).
Dir["./spec/shared/**/*.rb"].sort.each { |f| require f }
Dir["./spec/support/**/*.rb"].sort.each { |f| require f }

# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
RSpec.configure do |config|
  config.treat_symbols_as_metadata_keys_with_true_values = true
  config.run_all_when_everything_filtered = true
  config.filter_run :focus
  # Makes use_vcr_cassette available inside example groups.
  config.extend VCR::RSpec::Macros
end

VCR.configure do |config|
  config.cassette_library_dir = "spec/cassettes"
  config.hook_into :webmock
  config.default_cassette_options = { :serialize_with => :yaml }
end
# Wires up a scraper spec context for the given address: replays HTTP via a
# VCR cassette, runs the scrape once up front, exposes the scraped data as
# the subject, and asserts the scrape finished without errors.
def setup_scraper_for(address)
  # Cassette name: drop a leading "http://" and turn path separators into
  # underscores. NOTE(review): "https://" is left untouched on purpose — the
  # cassettes on disk were recorded under names like "https___..." (VCR
  # sanitises the remaining ":"), so stripping it would orphan them.
  cassette_name = address.gsub(%r{^http://}, '').tr('/', '_')
  use_vcr_cassette cassette_name
  let(:url) { address }
  before { scraper.scrape }
  subject { scraper.data }
  specify { scraper.errors?.should be_false }
end