doctor_scrape 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +18 -0
- data/.rbenv-version +1 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/Guardfile +10 -0
- data/LICENSE +22 -0
- data/README.md +78 -0
- data/Rakefile +14 -0
- data/doctor_scrape.gemspec +35 -0
- data/lib/doctor_scrape/data.rb +12 -0
- data/lib/doctor_scrape/redirect_follower.rb +29 -0
- data/lib/doctor_scrape/scraper/base.rb +50 -0
- data/lib/doctor_scrape/scraper/bora.rb +25 -0
- data/lib/doctor_scrape/scraper/diva.rb +16 -0
- data/lib/doctor_scrape/scraper/duo.rb +32 -0
- data/lib/doctor_scrape/scraper/meta.rb +39 -0
- data/lib/doctor_scrape/scraper/unknown.rb +20 -0
- data/lib/doctor_scrape/search.rb +42 -0
- data/lib/doctor_scrape/version.rb +3 -0
- data/lib/doctor_scrape.rb +37 -0
- data/spec/cassettes/brage_bibsys_no_hibo_handle_URN_NBN_no-bibsys_brage_17854.yml +624 -0
- data/spec/cassettes/brage_bibsys_no_nhh_handle_URN_NBN_no-bibsys_brage_22813.yml +535 -0
- data/spec/cassettes/brage_bibsys_no_nhh_handle_URN_NBN_no-bibsys_brage_24121.yml +388 -0
- data/spec/cassettes/https___bora_hib_no_handle_10049_234.yml +429 -0
- data/spec/cassettes/https___bora_hib_no_handle_10049_330.yml +347 -0
- data/spec/cassettes/https___bora_uib_no_handle_1956_3282.yml +682 -0
- data/spec/cassettes/nora_search.yml +795 -0
- data/spec/cassettes/ntnu.diva-portal.org_smash_record.jsf?searchId=1&pid=diva2:122798.yml +398 -0
- data/spec/cassettes/ntnu_diva-portal_org_smash_record_jsf_searchId_1_pid_diva2_122794.yml +398 -0
- data/spec/cassettes/ntnu_diva-portal_org_smash_record_jsf_searchId_1_pid_diva2_122798.yml +487 -0
- data/spec/cassettes/www_duo_uio_no_sok_work_html_WORKID_112975.yml +248 -0
- data/spec/cassettes/www_duo_uio_no_sok_work_html_WORKID_149776.yml +240 -0
- data/spec/cassettes/www_ub_uit_no_munin_handle_10037_3822.yml +602 -0
- data/spec/cassettes/www_ub_uit_no_munin_handle_10037_3826.yml +534 -0
- data/spec/data_spec.rb +22 -0
- data/spec/parse/bibsys_spec.rb +40 -0
- data/spec/parse/bora_hib_spec.rb +29 -0
- data/spec/parse/bora_uib_spec.rb +18 -0
- data/spec/parse/diva_spec.rb +29 -0
- data/spec/parse/duo_spec.rb +29 -0
- data/spec/parse/munin_spec.rb +30 -0
- data/spec/redirect_follower_spec.rb +37 -0
- data/spec/scraper_spec.rb +43 -0
- data/spec/scrapers/base_spec.rb +6 -0
- data/spec/scrapers/bora_spec.rb +6 -0
- data/spec/scrapers/diva_spec.rb +6 -0
- data/spec/scrapers/duo_spec.rb +6 -0
- data/spec/scrapers/meta_spec.rb +6 -0
- data/spec/scrapers/unknown_spec.rb +18 -0
- data/spec/search_spec.rb +111 -0
- data/spec/shared/scraper.rb +62 -0
- data/spec/spec_helper.rb +29 -0
- data/spec/support/setup_scraper.rb +8 -0
- metadata +292 -0
data/.gitignore
ADDED
data/.rbenv-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.9.3-p125-perf
|
data/.rspec
ADDED
data/Gemfile
ADDED
data/Guardfile
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
# More info at https://github.com/guard/guard#readme
|
2
|
+
|
3
|
+
# Guard configuration for the rspec guard: maps changed files to the specs to run.
guard 'rspec', version: 2 do
  # Any of these triggers re-runs everything under spec/.
  run_everything = proc { "spec" }

  # A changed spec file is passed through as-is.
  watch(%r{^spec/.+_spec\.rb$})

  # A changed scraper implementation maps to its dedicated scraper spec.
  watch(%r{^lib/doctor_scrape/scraper/(.+)\.rb$}) do |match|
    "spec/scrapers/#{match[1]}_spec.rb"
  end

  # Shared examples, the spec helper and any other library file.
  [%r{^spec/shared/}, "spec/spec_helper.rb", %r{^lib/.+\.rb$}].each do |trigger|
    watch(trigger, &run_everything)
  end
end
|
10
|
+
|
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Gudleik Rasch
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
# DoctorScrape
|
2
|
+
|
3
|
+
A collection of libraries for scraping content from Norwegian doctoral dissertations.
|
4
|
+
Used by http://avhandlinger.no
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
gem 'doctor_scrape'
|
11
|
+
|
12
|
+
And then execute:
|
13
|
+
|
14
|
+
$ bundle
|
15
|
+
|
16
|
+
Or install it yourself as:
|
17
|
+
|
18
|
+
$ gem install doctor_scrape
|
19
|
+
|
20
|
+
## Usage
|
21
|
+
|
22
|
+
Ta-da
|
23
|
+
|
24
|
+
## Contributing
|
25
|
+
|
26
|
+
1. Fork it
|
27
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
28
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
29
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
30
|
+
5. Create new Pull Request
|
31
|
+
|
32
|
+
## Resources
|
33
|
+
|
34
|
+
### Supported
|
35
|
+
|
36
|
+
* Duo: http://www.duo.uio.no/sok/search.html?documentTypes=Doktoravhandling&yearFrom=1917&yearTo=2011
|
37
|
+
* Brage/NHH: http://brage.bibsys.no/nhh/browse?type=type&order=ASC&rpp=250&value=Doctoral+thesis
|
38
|
+
* DIVA/NTNU: http://ntnu.diva-portal.org/smash/searchlist.jsf?searchId=1
|
39
|
+
* MUNIN/UiT: http://www.ub.uit.no/munin/
|
40
|
+
* Bora/HiB: https://bora.hib.no/
|
41
|
+
|
42
|
+
### Unsupported
|
43
|
+
|
44
|
+
DE STORE
|
45
|
+
|
46
|
+
* Bora/UiB: https://bora.uib.no/browse?type=documenttype&order=ASC&rpp=650&value=Doctoral+thesis
|
47
|
+
* TEORA: http://teora.hit.no/dspace/ (Merk at avhandlinger ligger under ulike enheter, ikke bare «Doktorgradsavhandlinger - dr. ingeniør».)
|
48
|
+
|
49
|
+
DE MELLOMSTORE
|
50
|
+
|
51
|
+
* Brage/BI: http://brage.bibsys.no/bi/browse?type=type&order=ASC&rpp=50&value=Doctoral+thesis
|
52
|
+
* Brage/NIH: http://brage.bibsys.no/nih/browse?type=type&order=ASC&rpp=500&value=Doctoral+thesis
|
53
|
+
* Brage/UiA: http://brage.bibsys.no/hia/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
54
|
+
* Brage/UiN: http://brage.bibsys.no/hibo/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
55
|
+
* Brage/UiS: http://brage.bibsys.no/uis/browse?type=type&order=ASC&rpp=25&value=Doctoral+thesis
|
56
|
+
|
57
|
+
DE SMÅ
|
58
|
+
|
59
|
+
* ODA/HiO: https://oda.hio.no/jspui/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
60
|
+
* Brage/AHO: http://brage.bibsys.no/aho/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
61
|
+
* Brage/DHS: http://brage.bibsys.no/diakon/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
62
|
+
* Brage/HSF: http://brage.bibsys.no/hsf/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
63
|
+
* Brage/HiNT: http://brage.bibsys.no/hint/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
64
|
+
* Brage/HiNe: http://brage.bibsys.no/hinesna/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
65
|
+
* Brage/HiL: http://brage.bibsys.no/hil/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
66
|
+
* Brage/HiST: http://brage.bibsys.no/hist/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
67
|
+
* Brage/HiL: http://brage.bibsys.no/hil/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
68
|
+
* Brage/MHS: http://brage.bibsys.no/misjon/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
69
|
+
* Brage/PHS: http://brage.bibsys.no/politihs/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
70
|
+
* Brage/NP: http://brage.bibsys.no/npolar/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
71
|
+
* Brage/HiH: http://brage.bibsys.no/hhe/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
72
|
+
* Brage/IMR: http://brage.bibsys.no/imr/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
73
|
+
* Brage/HiØ: http://brage.bibsys.no/hiof/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
74
|
+
* Brage/NMH: http://brage.bibsys.no/nmh/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
75
|
+
* Brage/SA: http://brage.bibsys.no/samall/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
76
|
+
* Brage/UMB: http://brage.bibsys.no/umb/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
77
|
+
* Brage/SSB: http://brage.bibsys.no/ssb/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
78
|
+
* Brage/KRUS: http://brage.bibsys.no/krus/browse?type=type&order=ASC&rpp=20&value=Doctoral+thesis
|
data/Rakefile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require "bundler/gem_tasks"
|
3
|
+
require 'rspec/core/rake_task'
|
4
|
+
|
5
|
+
# Spec task provided by rspec-core; it is also the default task.
desc "run specs"
RSpec::Core::RakeTask.new

task default: :spec

# Flags the run via the SIMPLECOV environment variable, then runs the spec task.
desc "Run test suite and generate coverage report"
task :coverage do
  ENV.store("SIMPLECOV", "1")
  Rake::Task[:spec].invoke
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/doctor_scrape/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem metadata for doctor_scrape. The file list is taken from git.
Gem::Specification.new do |spec|
  spec.name          = "doctor_scrape"
  spec.version       = DoctorScrape::VERSION
  spec.authors       = ["Gudleik Rasch"]
  spec.email         = ["gudleik@gmail.com"]
  spec.description   = "Library for scraping norwegian doctoral dissertations"
  spec.summary       = "Library for scraping norwegian doctoral dissertations"
  spec.homepage      = "https://github.com/Skalar/doctor_scrape"

  spec.files         = `git ls-files`.split("\n")
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies
  spec.add_dependency "mechanize", ["~> 2.3"]
  # spec.add_dependency "text", ["~> 1.0.3"]

  # Development dependencies, registered in the order listed.
  [
    ["rspec",         "~> 2.8.0"],
    ["vcr",           "~> 2.0.0.rc1"],
    ["webmock",       "< 1.8"],
    ["guard",         "~> 1.0.0"],
    ["guard-rspec",   "~> 0.6.0"],
    ["ruby_gntp",     "~> 0.3.4"],
    ["rb-fsevent",    "~> 0.9.0"],
    ["pry",           "~> 0.9.8.2"],
    ["pry-doc",       "~> 0.4.0"],
    ["pry-editline",  "~> 1.1.1"],
    ["hirb",          "~> 0.6.0"],
    ["awesome_print", "~> 1.0.2"],
    ["simplecov",     "~> 0.6.0"]
    # ["psych",       "~> 1.2.2"]
  ].each do |name, requirement|
    spec.add_development_dependency name, [requirement]
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'net/https'
|
3
|
+
module DoctorScrape
  # Raised by RedirectFollower#resolve when the URL is still redirecting
  # after the allowed number of requests.
  class TooManyRedirects < StandardError; end

  # Follows HTTP redirects for a URL and reports where it finally points.
  class RedirectFollower
    attr_accessor :url

    # @param url [String] the URL to start from
    def initialize(url)
      @url = url
    end

    # Requests the current URL and follows redirect responses, updating
    # +url+ on every hop.
    #
    # @param limit [Integer] maximum number of requests to perform
    # @return [String] the last URL reached
    # @raise [TooManyRedirects] if +limit+ requests were not enough (or
    #   +limit+ is zero or negative)
    def resolve(limit = 5)
      remaining = limit

      loop do
        # `<=` rather than `==` so a negative limit cannot loop forever.
        raise TooManyRedirects if remaining <= 0

        response = Net::HTTP.get_response URI.parse(@url)
        break unless response.is_a? Net::HTTPRedirection

        location = response['location']
        # A redirect without a target cannot be followed; keep what we have.
        break if location.nil? || location.empty?

        # Location may be relative to the current URL, so resolve against it.
        @url = URI.join(@url, location).to_s
        remaining -= 1
      end

      @url
    rescue Net::HTTPBadResponse
      # A malformed response can safely be ignored: report the last URL that
      # was reached instead of nil, regardless of which hop failed.
      @url
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module DoctorScrape
  module Scraper

    # Common behaviour for all scrapers: fetch a page, let the subclass
    # +parse+ it into +data+, and collect any errors raised on the way.
    # Subclasses are expected to implement +parse+.
    class Base
      attr_reader :data, :errors

      # @param url [String] address of the page to scrape
      def initialize(url)
        @url    = url
        # Start with an empty list so errors/errors? work before #scrape.
        @errors = []
        @data   = DoctorScrape::Data.new url: @url
      end

      # Fetches and parses the page, recording any StandardError in +errors+.
      # Other exceptions (e.g. interrupts) are no longer swallowed.
      #
      # @return [Boolean] true when no error was recorded
      def scrape
        @errors = []

        begin
          fetch && parse
        rescue => error
          @errors << error
        end

        !errors?
      end

      # Opens +url+ once and memoizes the result.
      # URI.open is used where open-uri provides it, since Kernel#open no
      # longer opens URLs on Ruby 3.0+; older rubies fall back to Kernel#open.
      def fetch
        @body ||= URI.respond_to?(:open) ? URI.open(url) : open(url)
      end

      # @return the fetched body parsed with Nokogiri::HTML (memoized)
      def doc
        @doc ||= Nokogiri::HTML @body
      end

      # @return [String] the URL that is fetched; subclasses may override
      def url
        @url
      end

      # @return [Boolean] whether the last #scrape recorded any error
      def errors?
        errors.any?
      end

      protected

      # Absolute URL of the href on the first element matching +selector+.
      #
      # @param selector [String] selector passed to +doc.at+
      # @return [String, nil] nil when nothing matches or there is no href
      def link(selector)
        node = doc.at(selector)
        href = node && node['href']
        href ? URI.parse(@url).merge(href).to_s : nil
      end

      # PDF location taken from the first ".standard" link on the page.
      def pdf_from_link
        link ".standard:nth-child(1) a"
      end

    end

  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module DoctorScrape::Scraper

  # Scraper that reads the "dc.*" rows of the item display table.
  class Bora < Base

    # Copies every "dc.*" row into +data+, using the last lowercase segment
    # of the key as the attribute name (a later row with the same segment
    # overwrites an earlier one). Then sets the permalink from +data.uri+
    # and the PDF from the page link.
    def parse
      doc.search("table.itemDisplayTable tr").each do |el|
        key, value = el.children[0..1].map(&:text)
        next unless key =~ /^dc\./

        # Keys whose final segment is not purely lowercase do not match;
        # skip them rather than aborting the whole parse on nil.
        match = key.match(/\.(?<attr>[a-z]+)$/)
        next unless match

        @data.public_send("#{match[:attr]}=", value)
      end

      @data.permalink = @data.uri
      @data.pdf       = pdf_from_link
    end

    # The URL to fetch: the record URL with "mode=full" appended, joined
    # with "&" when the URL already carries a query string.
    def url
      separator = @url.to_s.include?('?') ? '&' : '?'
      "#{@url}#{separator}mode=full"
    end
  end

end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module DoctorScrape::Scraper

  # Meta-tag scraper using the meta tag names listed in #parse.
  class Diva < Meta

    # Fills +data+ from the page's meta tags, one attribute per tag name.
    def parse
      # attribute on data => name of the meta tag, assigned in this order
      [
        [:title,     "DC.Title"],
        [:author,    "DC.Creator"],
        [:issued,    "DC.Date"],
        [:permalink, "DC.Identifier.url"]
      ].each do |attribute, tag_name|
        @data.public_send("#{attribute}=", text(tag_name))
      end

      @data.pdf = text("citation_pdf_url")
    end

  end

end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module DoctorScrape::Scraper

  # Scraper that reads label/value rows from the second table in
  # #main_container.
  class Duo < Base

    # Walks the table rows, storing the value of each recognised label,
    # then picks up the PDF link and the abstract paragraph.
    def parse
      # label pattern => setter on data; first matching pattern wins
      setters = [
        [/Tittel/,    :title=],
        [/Forfatter/, :author=],
        [/Publisert/, :issued=],
        [/Permanent/, :permalink=]
      ]

      doc.search("#main_container table:nth-child(2) tr").each do |row|
        cells = row.children.search("td")[0..1]
        label, value = cells.map { |cell| cell.text.strip.gsub(/\r\n/, '') }

        _, setter = setters.find { |pattern, _| pattern === label }
        @data.public_send(setter, value) if setter
      end

      @data.pdf      = link("#main_container table:nth-child(2) + p a")
      @data.abstract = text("#main_container p:last")
    end

    protected

    # Stripped text of the first element matching +selector+, or nil.
    def text(selector)
      node = doc.at(selector)
      return nil unless node

      node.text.strip
    end

  end

end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module DoctorScrape::Scraper

  # Reads the content from the <meta> tags of the HTML document.
  class Meta < Base

    # Fills +data+ from the document's meta tags; the PDF falls back to the
    # page link when no meta tag provides one.
    def parse
      @data.title     = text("DC.title")
      @data.author    = text("DC.creator")
      @data.issued    = text("DCTERMS.issued")
      @data.permalink = doc.xpath("//meta[@scheme='DCTERMS.URI']/@content").text
      @data.pdf       = pdf_from_meta || pdf_from_link
      @data.abstract  = abstract
    end

    protected

    # The longest of the DCTERMS.abstract values, or nil when there is none.
    def abstract
      candidates = meta("DCTERMS.abstract").map(&:text)
      candidates.sort { |left, right| right.size <=> left.size }.first
    end

    # Content of the named meta tag with HTML entities unescaped and each
    # CRLF replaced by a space.
    def text(name)
      value = meta(name).text
      return unless value

      CGI.unescapeHTML(value).gsub(/\r\n/, ' ')
    end

    # Content attributes of all meta tags with the given name.
    def meta(name)
      doc.xpath("//meta[@name='#{name}']/@content")
    end

    # First meta content that mentions "pdf" and contains "http", or nil.
    def pdf_from_meta
      contents = doc.xpath("//meta[contains(@content, 'pdf')]/@content").map(&:text)
      value = contents.find { |content| content =~ /http/ }
      # sometimes the value contains Fulltext, doh
      value && value.gsub(/Fulltext /, '')
    end

  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'rss'
|
3
|
+
|
4
|
+
module DoctorScrape

  # Searches for dissertation URLs.
  class Search

    class << self
      # Queries the NORA search for doctoral theses and returns the links of
      # the RSS items.
      #
      # @param options [Hash]
      #   :limit   page size (default 50)
      #   :from    first year (default 2007)
      #   :to      last year (default: current year)
      #   :resolve when truthy, follow redirects and return scrapers
      # @return [Array] item links, or scrapers when :resolve is set;
      #   empty when the feed could not be parsed
      def nora(options={})
        feed = open_url(nora_url(options)) { |rss| RSS::Parser.parse rss }
        return [] if feed.nil?

        items = feed.items.map(&:link)
        options[:resolve] ? resolve_scrapers(items) : items
      end

      # @param urls [Array<String>]
      # @return [Array] each URL after following its redirects
      def resolve_urls(urls)
        urls.map { |url| RedirectFollower.new(url).resolve }
      end

      # @param urls [Array<String>]
      # @return [Array] a scraper for each resolved URL
      def resolve_scrapers(urls)
        resolve_urls(urls).map { |url| Scraper.for url }
      end

      private

      # Builds the NORA result URL. Values are form-encoded, so the space in
      # "Doctoral thesis" becomes "+" and caller-supplied values are escaped.
      def nora_url(options)
        params = {
          "PAGESIZE"      => options[:limit]  || 50,
          "FROM"          => options[:from]   || 2007,
          "TO"            => options[:to]     || Time.now.year,
          "SEARCHMODE"    => "TOPIC",
          "DOCUMENTTYPES" => "Doctoral thesis",
          "RESULTMODE"    => "rss",
        }

        "http://www.ub.uio.no/nora/result.html?#{URI.encode_www_form(params)}"
      end

      # Opens a URL via open-uri. Kernel#open no longer opens URLs on
      # Ruby 3.0+, so URI.open is preferred whenever it is available.
      def open_url(url, &block)
        URI.respond_to?(:open) ? URI.open(url, &block) : open(url, &block)
      end

    end
  end

end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'doctor_scrape/version'
|
3
|
+
require 'mechanize'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'ostruct'
|
6
|
+
require 'open-uri'
|
7
|
+
|
8
|
+
module DoctorScrape
  autoload :Data,             'doctor_scrape/data'
  autoload :Search,           'doctor_scrape/search'
  autoload :RedirectFollower, 'doctor_scrape/redirect_follower'

  # Namespace for the site-specific scrapers, which are loaded on first use.
  module Scraper
    autoload :Base,     'doctor_scrape/scraper/base'
    autoload :Bora,     'doctor_scrape/scraper/bora'
    autoload :Duo,      'doctor_scrape/scraper/duo'
    autoload :Diva,     'doctor_scrape/scraper/diva'
    autoload :Meta,     'doctor_scrape/scraper/meta'
    autoload :Unknown,  'doctor_scrape/scraper/unknown'

    # Picks the scraper class for a URL based on its host/path.
    # Dots in host names are escaped and patterns are anchored to the start
    # of the string, so look-alike hosts fall through to Unknown.
    #
    # TODO: move this logic into the scraper classes
    #
    # @param url [String] URL of a dissertation record
    # @return a new scraper instance for +url+ (Unknown when nothing matches)
    def self.for(url)
      case url
      when %r{\Ahttp://(www\.)?duo\.uio\.no}                  then Duo.new url
      when %r{\Ahttps?://bora\.hib\.no}                       then Bora.new url
      when %r{\Ahttp://.+bibsys\.no},
           %r{idtjeneste\.nb\.no/URN:NBN:no-bibsys_brage}     then Meta.new url
      when %r{\Ahttp://ntnu\.diva-portal\.org/smash/record\.jsf},
           %r{urn=urn:nbn:no:ntnu:diva}                       then Diva.new url
      else
        Unknown.new url
      end
    end
  end

end
|