doctor_scrape 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. data/.gitignore +18 -0
  2. data/.rbenv-version +1 -0
  3. data/.rspec +2 -0
  4. data/Gemfile +4 -0
  5. data/Guardfile +10 -0
  6. data/LICENSE +22 -0
  7. data/README.md +78 -0
  8. data/Rakefile +14 -0
  9. data/doctor_scrape.gemspec +35 -0
  10. data/lib/doctor_scrape/data.rb +12 -0
  11. data/lib/doctor_scrape/redirect_follower.rb +29 -0
  12. data/lib/doctor_scrape/scraper/base.rb +50 -0
  13. data/lib/doctor_scrape/scraper/bora.rb +25 -0
  14. data/lib/doctor_scrape/scraper/diva.rb +16 -0
  15. data/lib/doctor_scrape/scraper/duo.rb +32 -0
  16. data/lib/doctor_scrape/scraper/meta.rb +39 -0
  17. data/lib/doctor_scrape/scraper/unknown.rb +20 -0
  18. data/lib/doctor_scrape/search.rb +42 -0
  19. data/lib/doctor_scrape/version.rb +3 -0
  20. data/lib/doctor_scrape.rb +37 -0
  21. data/spec/cassettes/brage_bibsys_no_hibo_handle_URN_NBN_no-bibsys_brage_17854.yml +624 -0
  22. data/spec/cassettes/brage_bibsys_no_nhh_handle_URN_NBN_no-bibsys_brage_22813.yml +535 -0
  23. data/spec/cassettes/brage_bibsys_no_nhh_handle_URN_NBN_no-bibsys_brage_24121.yml +388 -0
  24. data/spec/cassettes/https___bora_hib_no_handle_10049_234.yml +429 -0
  25. data/spec/cassettes/https___bora_hib_no_handle_10049_330.yml +347 -0
  26. data/spec/cassettes/https___bora_uib_no_handle_1956_3282.yml +682 -0
  27. data/spec/cassettes/nora_search.yml +795 -0
  28. data/spec/cassettes/ntnu.diva-portal.org_smash_record.jsf?searchId=1&pid=diva2:122798.yml +398 -0
  29. data/spec/cassettes/ntnu_diva-portal_org_smash_record_jsf_searchId_1_pid_diva2_122794.yml +398 -0
  30. data/spec/cassettes/ntnu_diva-portal_org_smash_record_jsf_searchId_1_pid_diva2_122798.yml +487 -0
  31. data/spec/cassettes/www_duo_uio_no_sok_work_html_WORKID_112975.yml +248 -0
  32. data/spec/cassettes/www_duo_uio_no_sok_work_html_WORKID_149776.yml +240 -0
  33. data/spec/cassettes/www_ub_uit_no_munin_handle_10037_3822.yml +602 -0
  34. data/spec/cassettes/www_ub_uit_no_munin_handle_10037_3826.yml +534 -0
  35. data/spec/data_spec.rb +22 -0
  36. data/spec/parse/bibsys_spec.rb +40 -0
  37. data/spec/parse/bora_hib_spec.rb +29 -0
  38. data/spec/parse/bora_uib_spec.rb +18 -0
  39. data/spec/parse/diva_spec.rb +29 -0
  40. data/spec/parse/duo_spec.rb +29 -0
  41. data/spec/parse/munin_spec.rb +30 -0
  42. data/spec/redirect_follower_spec.rb +37 -0
  43. data/spec/scraper_spec.rb +43 -0
  44. data/spec/scrapers/base_spec.rb +6 -0
  45. data/spec/scrapers/bora_spec.rb +6 -0
  46. data/spec/scrapers/diva_spec.rb +6 -0
  47. data/spec/scrapers/duo_spec.rb +6 -0
  48. data/spec/scrapers/meta_spec.rb +6 -0
  49. data/spec/scrapers/unknown_spec.rb +18 -0
  50. data/spec/search_spec.rb +111 -0
  51. data/spec/shared/scraper.rb +62 -0
  52. data/spec/spec_helper.rb +29 -0
  53. data/spec/support/setup_scraper.rb +8 -0
  54. metadata +292 -0
data/spec/parse/diva_spec.rb
@@ -0,0 +1,29 @@
+ # encoding: utf-8
+ require 'spec_helper'
+
+ describe "ntnu.diva-portal.org" do
+   let(:scraper) { DoctorScrape::Scraper::Diva.new url }
+
+   context "http://ntnu.diva-portal.org/smash/record.jsf?searchId=1&pid=diva2:122798" do
+     setup_scraper_for "http://ntnu.diva-portal.org/smash/record.jsf?searchId=1&pid=diva2:122798"
+
+     its(:title) { should eq "Sporene der hjemme : Om 15-16-åringer og deres hverdagskultur – basert på en undersøkelse om kulturelle og estetiske praksiser i noen utvalgte nordiske ungdomsrom" }
+     its(:author) { should eq "Aagre, Willy" }
+     its(:issued) { should eq "2006" }
+     its(:abstract) { should be_nil }
+     its(:permalink) { should eq "http://urn.kb.se/resolve?urn=urn:nbn:no:ntnu:diva-1697" }
+     its(:pdf) { should eq "http://ntnu.diva-portal.org/smash/get/diva2:122798/FULLTEXT01" }
+   end
+
+   context "http://ntnu.diva-portal.org/smash/record.jsf?searchId=1&pid=diva2:122794" do
+     setup_scraper_for "http://ntnu.diva-portal.org/smash/record.jsf?searchId=1&pid=diva2:122794"
+
+     its(:title) { should eq "Knowledge Management in Software Engineering: A Systematic Review of Studied Concepts and Research Methods Used" }
+     its(:author) { should eq "Bjørnson, Finn Olav; Dingsøyr, Torgeir" }
+     its(:issued) { should eq "" }
+     its(:abstract) { should be_nil }
+     its(:permalink) { should eq "http://urn.kb.se/resolve?urn=urn:nbn:no:ntnu:diva-2833" }
+     its(:pdf) { should eq "" }
+   end
+
+ end
data/spec/parse/duo_spec.rb
@@ -0,0 +1,29 @@
+ # encoding: utf-8
+ require "spec_helper"
+
+ describe "duo.uio.no" do
+   let(:scraper) { DoctorScrape::Scraper::Duo.new url }
+
+   context "http://www.duo.uio.no/sok/work.html?WORKID=112975" do
+     setup_scraper_for "http://www.duo.uio.no/sok/work.html?WORKID=112975"
+
+     its(:title) { should eq "Meaningful Method Names" }
+     its(:author) { should eq "Høst, Einar" }
+     its(:issued) { should eq "2011" }
+     its(:pdf) { should eq "http://www.duo.uio.no/sok/work.html?WORKID=112975&fid=65890" }
+     its(:permalink) { should eq "http://urn.nb.no/URN:NBN:no-27629" }
+     its(:abstract) { should match /We build computer programs by creating named abtractions/ }
+   end
+
+   context "http://www.duo.uio.no/sok/work.html?WORKID=149776" do
+     setup_scraper_for "http://www.duo.uio.no/sok/work.html?WORKID=149776"
+
+     its(:title) { should eq "CacheCast: a system for efficient single source multiple destination data transfer" }
+     its(:author) { should eq "Srebrny, Piotr" }
+     its(:issued) { should eq "2011" }
+     its(:pdf) { should eq "http://www.duo.uio.no/sok/work.html?WORKID=149776&fid=91631" }
+     its(:permalink) { should eq "http://urn.nb.no/URN:NBN:no-30226" }
+     its(:abstract) { should match /^The basic function of the Internet is to.*stream to thousands of clients\.$/m }
+   end
+
+ end
data/spec/parse/munin_spec.rb
@@ -0,0 +1,30 @@
+ # encoding: utf-8
+ require 'spec_helper'
+
+ describe "ub.uit.no/munin" do
+   let(:scraper) { DoctorScrape::Scraper::Meta.new url }
+
+   context "http://www.ub.uit.no/munin/handle/10037/3822" do
+     setup_scraper_for "http://www.ub.uit.no/munin/handle/10037/3822"
+
+     its(:title) { should eq "The hidden children of Eve : Sámi poetics guovtti ilmmi gaskkas" }
+     its(:author) { should eq "Jernsletten, Kristin (Kikki)" }
+     its(:issued) { should eq "2012-02-29" }
+     its(:permalink) { should eq "http://hdl.handle.net/10037/3822" }
+     its(:pdf) { should eq "http://www.ub.uit.no/munin/bitstream/10037/3822/4/thesis.pdf" }
+     its(:isbn) { should be_nil }
+     its(:abstract) { should match /^Tesen går ut på at samisk litteraturforståelse og verdensoppfatning.*videre forskning\.$/m }
+   end
+
+   context "http://www.ub.uit.no/munin/handle/10037/3826" do
+     setup_scraper_for "http://www.ub.uit.no/munin/handle/10037/3826"
+
+     its(:title) { should eq "IKT-forbindelser i helsesektoren, Sammenvevinger av IKT, steder, yrker, kjønn og politikk" }
+     its(:author) { should eq "Dyb, Kari" }
+     its(:issued) { should eq "2011-04-29" }
+     its(:permalink) { should eq "http://hdl.handle.net/10037/3826" }
+     its(:pdf) { should eq "http://www.ub.uit.no/munin/bitstream/10037/3826/3/paper_4.pdf" }
+     its(:abstract) { should match /^Til tross for betydelig politisk satsing.*teknologien blir brukt i sektoren\./m }
+   end
+
+ end
data/spec/redirect_follower_spec.rb
@@ -0,0 +1,37 @@
+ # encoding: utf-8
+ require 'spec_helper'
+
+ describe DoctorScrape::RedirectFollower do
+   let(:url) { "http://bit.ly/foobar" }
+   let(:endpoint) { "http://example.com" }
+   let(:resolver) { DoctorScrape::RedirectFollower.new url }
+   before { stub_request(:any, endpoint).to_return(:body => "You found me!") }
+
+   context "when url doesn't redirect" do
+     before { stub_request(:any, url).to_return(:body => "Ok") }
+     specify { resolver.resolve.should eq url }
+   end
+
+   context "when url redirects" do
+     before { stub_request(:any, url).to_return(:status => [ 302, "Moved Temporarily" ], headers: { location: endpoint }) }
+     specify { resolver.resolve.should eq endpoint }
+   end
+
+   context "too many redirects" do
+     before { stub_request(:any, url).to_return(:status => [ 302, "Moved Temporarily" ], headers: { location: url }) }
+
+     it "raises error after 5 redirects" do
+       expect { resolver.resolve }.to raise_error
+       a_request(:get, url).should have_been_made.times(5)
+     end
+   end
+
+   context "when exception occurs" do
+     it "returns the last url" do
+       stub_request(:get, url).to_return(:status => [ 302, "Moved Temporarily" ], headers: { location: endpoint })
+       stub_request(:get, endpoint).to_raise Net::HTTPBadResponse
+       resolver.resolve.should eq(endpoint)
+     end
+   end
+
+ end
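The four contexts above pin down the resolver's observable behaviour: return the input URL when there is no redirect, follow Location headers, raise after five hops, and fall back to the last URL reached when a response is malformed. A minimal sketch consistent with those expectations (the MAX_REDIRECTS constant and the use of Net::HTTP are assumptions; the shipped redirect_follower.rb may be implemented differently):

    require 'net/http'
    require 'uri'

    module DoctorScrape
      class RedirectFollower
        MAX_REDIRECTS = 5 # the spec expects exactly 5 requests before failing

        def initialize(url)
          @url = url
        end

        def resolve
          current = @url
          MAX_REDIRECTS.times do
            response = Net::HTTP.get_response(URI.parse(current))
            return current unless response.is_a?(Net::HTTPRedirection)
            current = response['location'] # follow the redirect
          end
          raise "Too many redirects: #{@url}"
        rescue Net::HTTPBadResponse
          current # on a broken response, return the last url reached
        end
      end
    end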
data/spec/scraper_spec.rb
@@ -0,0 +1,43 @@
+ require 'spec_helper'
+
+ shared_examples :it_matches do |urls|
+   let(:expected_class) { described_class }
+
+   [urls].flatten.each do |url|
+     context url do
+       subject { DoctorScrape::Scraper.for url }
+       it { should be_a expected_class }
+     end
+   end
+
+ end
+
+ describe DoctorScrape::Scraper::Unknown do
+   it_behaves_like :it_matches, "http://foo.example.com"
+ end
+
+ describe DoctorScrape::Scraper::Duo do
+   it_behaves_like :it_matches, [
+     "http://duo.uio.no/sok/work.html?WORKID=1234",
+     "http://www.duo.uio.no/sok/work.html?WORKID=1234",
+   ]
+ end
+
+ describe DoctorScrape::Scraper::Diva do
+   it_behaves_like :it_matches, [
+     "http://ntnu.diva-portal.org/smash/record.jsf?searchId=1&pid=diva2:122798",
+     "http://urn.kb.se/resolve?urn=urn:nbn:no:ntnu:diva-15280"
+   ]
+ end
+
+ describe DoctorScrape::Scraper::Bora do
+   it_behaves_like :it_matches, "https://bora.hib.no/handle/10049/330"
+ end
+
+ describe DoctorScrape::Scraper::Meta do
+   it_behaves_like :it_matches, [
+     "http://brage.bibsys.no/nhh/handle/URN:NBN:no-bibsys_brage_24121",
+     "http://brage.bibsys.no/hibo/handle/URN:NBN:no-bibsys_brage_17854",
+     "http://idtjeneste.nb.no/URN:NBN:no-bibsys_brage_24791"
+   ]
+ end
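These matcher specs imply that DoctorScrape::Scraper.for maps a URL onto a scraper subclass, with Unknown as the fallback. One plausible shape for that dispatch, inferred only from the hosts exercised above (the PATTERNS table is hypothetical; the gem's scraper code may let each subclass register its own match rule instead):

    module DoctorScrape
      module Scraper
        # Host patterns inferred from the URLs in the specs above.
        PATTERNS = {
          /duo\.uio\.no/                  => :Duo,
          /diva-portal\.org|urn\.kb\.se/  => :Diva,
          /bora\.hib\.no/                 => :Bora,
          /bibsys\.no|idtjeneste\.nb\.no/ => :Meta
        }.freeze

        def self.for(url)
          PATTERNS.each do |pattern, name|
            return const_get(name).new(url) if url =~ pattern
          end
          Unknown.new url # anything unrecognised gets the null scraper
        end
      end
    end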
data/spec/scrapers/base_spec.rb
@@ -0,0 +1,6 @@
+ # encoding: utf-8
+ require "spec_helper"
+
+ describe DoctorScrape::Scraper::Base do
+   it_behaves_like :scraper
+ end
data/spec/scrapers/bora_spec.rb
@@ -0,0 +1,6 @@
+ # encoding: utf-8
+ require 'spec_helper'
+
+ describe DoctorScrape::Scraper::Bora do
+   it_behaves_like :scraper
+ end
data/spec/scrapers/diva_spec.rb
@@ -0,0 +1,6 @@
+ # encoding: utf-8
+ require "spec_helper"
+
+ describe DoctorScrape::Scraper::Diva do
+   it_behaves_like :scraper
+ end
data/spec/scrapers/duo_spec.rb
@@ -0,0 +1,6 @@
+ # encoding: utf-8
+ require "spec_helper"
+
+ describe DoctorScrape::Scraper::Duo do
+   it_behaves_like :scraper
+ end
data/spec/scrapers/meta_spec.rb
@@ -0,0 +1,6 @@
+ # encoding: utf-8
+ require "spec_helper"
+
+ describe DoctorScrape::Scraper::Meta do
+   it_behaves_like :scraper
+ end
data/spec/scrapers/unknown_spec.rb
@@ -0,0 +1,18 @@
+ # encoding: utf-8
+ require "spec_helper"
+
+ describe DoctorScrape::Scraper::Unknown do
+   subject { DoctorScrape::Scraper::Unknown.new "http://foobar.com" }
+
+   its(:errors?) { should be_true }
+   its(:errors) { should == ["Don't know how to scrape this page"] }
+
+   specify { subject.fetch.should be_false }
+   specify { subject.scrape.should be_false }
+
+   context "after scrape" do
+     before { subject.scrape }
+     its(:errors) { should == [ "Don't know how to scrape this page" ] }
+   end
+
+ end
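Unknown behaves as a null object: it never fetches, never parses, and always reports the same single error, before and after #scrape. A sketch matching those expectations (inheriting from Base is an assumption; the real unknown.rb is 20 lines and may be wired differently):

    module DoctorScrape
      module Scraper
        class Unknown < Base
          def errors
            ["Don't know how to scrape this page"] # constant, even after #scrape
          end

          def errors?
            true
          end

          def fetch
            false # nothing to download for an unrecognised page
          end

          def scrape
            false # always "fails", without raising
          end
        end
      end
    end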
data/spec/search_spec.rb
@@ -0,0 +1,111 @@
+ require 'spec_helper'
+
+ describe DoctorScrape::Search do
+
+   describe "nora" do
+     let(:default_params) do
+       {
+         "PAGESIZE" => "50",
+         "FROM" => "2007",
+         "TO" => "2012",
+         "SEARCHMODE" => "TOPIC",
+         "DOCUMENTTYPES" => "Doctoral thesis",
+         "RESULTMODE" => "rss"
+       }
+     end
+     use_vcr_cassette "nora_search"
+     let(:url) { "http://www.ub.uio.no/nora/result.html" }
+     subject { DoctorScrape::Search.nora }
+
+     specify { should be_a Array }
+     its(:first) { should match /^http:\/\// }
+
+     context "without options" do
+       it "uses default options" do
+         stub = stub_request(:get, url).with(:query => default_params)
+         DoctorScrape::Search.nora
+         stub.should have_been_requested
+       end
+
+       it "doesn't resolve redirects" do
+         DoctorScrape::Search.should_not_receive(:resolve_urls)
+         DoctorScrape::Search.should_not_receive(:resolve_scrapers)
+         DoctorScrape::Search.nora
+       end
+     end
+
+     context "with options" do
+
+       # TODO: DRY this up
+
+       it ":limit changes PAGESIZE" do
+         params = default_params.update("PAGESIZE" => "10")
+         stub_request(:any, url).with(:query => params)
+         DoctorScrape::Search.nora :limit => 10
+         a_request(:get, url).with(:query => params).should have_been_requested
+       end
+
+       it ":from changes start year" do
+         params = default_params.update("FROM" => "2010")
+         stub_request(:any, url).with(:query => params)
+         DoctorScrape::Search.nora :from => 2010
+         a_request(:get, url).with(:query => params).should have_been_requested
+       end
+
+       it ":to changes end year" do
+         params = default_params.update("TO" => "2008")
+         stub_request(:any, url).with(:query => params)
+         DoctorScrape::Search.nora :to => 2008
+         a_request(:get, url).with(:query => params).should have_been_requested
+       end
+
+       it "can be combined" do
+         params = default_params.update("TO" => "2008", "FROM" => "1980", "PAGESIZE" => "1")
+         stub_request(:any, url).with(:query => params)
+         DoctorScrape::Search.nora :to => 2008, :from => "1980", limit: 1
+         a_request(:get, url).with(:query => params).should have_been_requested
+       end
+
+       context "resolve is true" do
+         it "resolves scrapers" do
+           DoctorScrape::Search.should_receive(:resolve_scrapers)
+           DoctorScrape::Search.nora resolve: true
+         end
+       end
+
+     end
+
+     context "when search fails" do
+       it "returns empty array" do
+         stub_request(:get, url).with(:query => default_params).to_return(:body => "")
+         DoctorScrape::Search.nora.should be_empty
+       end
+     end
+
+   end
+
+   describe "resolve_urls" do
+     let(:urls) { [ "http://bit.ly", "http://example.com", "http://goo.gl" ] }
+     let(:resolved) { ["http://bitly.com", "http://example.com", "http://googl.com"] }
+
+     it "returns array with resolved urls" do
+       resolvers = resolved.map { |r| double "resolver", :resolve => r }
+       urls.each_with_index do |url, i|
+         DoctorScrape::RedirectFollower.should_receive(:new).with(url) { resolvers[i] }
+       end
+
+       DoctorScrape::Search.resolve_urls(urls).should == resolved
+     end
+   end
+
+   describe "resolve_scrapers" do
+     let(:urls) { [ "http://duo.uio.no/", "http://foo.bibsys.no", "http://bora.hib.no" ] }
+
+     it "returns array with appropriate scraper for each url" do
+       urls.each { |url| stub_request(:get, url) }
+       DoctorScrape::Scraper.should_receive(:for).exactly(3).times { DoctorScrape::Scraper::Meta.new "foo" }
+       DoctorScrape::Search.resolve_scrapers(urls).first.should be_a DoctorScrape::Scraper::Meta
+     end
+   end
+
+ end
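Taken together, these specs document the public surface of the search API. A usage sketch (the option names :limit, :from, :to and resolve: come straight from the examples above; the comments restate what the expectations assert rather than output captured from a real run):

    # Default search: an Array of result URLs from the NORA RSS feed.
    urls = DoctorScrape::Search.nora
    urls.first # matches /^http:\/\//

    # Options map onto the query parameters in default_params:
    # :limit -> PAGESIZE, :from -> FROM, :to -> TO.
    DoctorScrape::Search.nora :from => 2010, :to => 2011, :limit => 10

    # With resolve: true the search additionally follows redirects and
    # wraps each final URL in the matching scraper via resolve_scrapers.
    DoctorScrape::Search.nora resolve: true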
data/spec/shared/scraper.rb
@@ -0,0 +1,62 @@
+ # encoding: utf-8
+
+ shared_examples_for :scraper do
+   let(:url) { "http://test.host" }
+   let(:scraper) { described_class.new url }
+
+   context "#fetch" do
+     it "uses #url for the url" do
+       stub_request :any, "http://www.example.com"
+       scraper.should_receive(:url) { "http://www.example.com" }
+       scraper.fetch
+     end
+   end
+
+   context "#scrape" do
+
+     context "when successful" do
+
+       before do
+         scraper.stub :fetch
+         scraper.stub :parse
+       end
+
+       it "clears errors before scrape" do
+         scraper.instance_variable_set("@errors", ['foo', 'bar'])
+         scraper.scrape
+         scraper.errors.should be_empty
+       end
+
+       it "returns true" do
+         scraper.stub(:errors?) { false }
+         scraper.scrape.should be_true
+       end
+
+     end
+
+     context "when there are errors" do
+
+       context "parser error" do
+         before { scraper.stub(:parse).and_raise "parse error" }
+         it "returns false" do
+           scraper.scrape.should be_false
+         end
+       end
+
+       [ [ 404, "Not Found" ], [ 500, "Internal Server Error" ] ].each do |error|
+         context error.join(" ") do
+           before do
+             stub_request(:any, scraper.url).to_return(:status => [error.first, error.last])
+             scraper.scrape
+           end
+
+           specify { scraper.errors.first.should be_a OpenURI::HTTPError }
+           specify { scraper.errors.first.message.should eq error.join(" ") }
+           specify { scraper.data.fields.should == [ :url ] }
+
+         end
+       end
+     end
+
+   end
+ end
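Read together, these shared examples fix the control flow of every scraper's #scrape: reset the error list, fetch, parse, and capture anything raised along the way. Sketched out (the instance variables are assumptions; the real base.rb is 50 lines and certainly carries more, such as building #data):

    def scrape
      @errors = []   # "clears errors before scrape"
      fetch          # open-uri raises OpenURI::HTTPError on 404/500
      parse          # a parser error is trapped the same way
      !errors?       # true when nothing was recorded
    rescue => e
      @errors << e   # the exception object itself is kept, so that
      false          # errors.first can be an OpenURI::HTTPError
    end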
data/spec/spec_helper.rb
@@ -0,0 +1,29 @@
+ if ENV["SIMPLECOV"]
+   require 'simplecov'
+   SimpleCov.start do
+     add_filter "/vendor"
+   end
+ end
+
+ require 'rspec/autorun'
+ require 'webmock/rspec'
+ require 'vcr'
+ require 'doctor_scrape'
+ require 'awesome_print'
+
+ Dir["./spec/shared/**/*.rb"].each { |f| require f }
+ Dir["./spec/support/**/*.rb"].each { |f| require f }
+
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+ RSpec.configure do |config|
+   config.treat_symbols_as_metadata_keys_with_true_values = true
+   config.run_all_when_everything_filtered = true
+   config.filter_run :focus
+   config.extend VCR::RSpec::Macros
+ end
+
+ VCR.configure do |config|
+   config.cassette_library_dir = "spec/cassettes"
+   config.hook_into :webmock
+   config.default_cassette_options = { :serialize_with => :yaml }
+ end
data/spec/support/setup_scraper.rb
@@ -0,0 +1,8 @@
+ def setup_scraper_for(url)
+   use_vcr_cassette url.gsub(/^http:\/\//, '').gsub(/\//, '_')
+   let(:url) { url }
+   before { scraper.scrape }
+   subject { scraper.data }
+   specify { scraper.errors?.should be_false }
+ end
+
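The cassette name is derived mechanically from the URL, which is why the parse specs never name a cassette explicitly. A worked example of the two gsub calls (the underscored .yml file names under spec/cassettes suggest VCR further replaces the remaining punctuation when writing to disk; that last step is an inference from the file list above):

    url = "http://www.ub.uit.no/munin/handle/10037/3822"
    url.gsub(/^http:\/\//, '').gsub(/\//, '_')
    # => "www.ub.uit.no_munin_handle_10037_3822"
    # stored as spec/cassettes/www_ub_uit_no_munin_handle_10037_3822.yml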