harvestdor 0.0.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +25 -0
- data/.travis.yml +14 -0
- data/.yardopts +3 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +5 -0
- data/README.rdoc +162 -0
- data/Rakefile +50 -0
- data/harvestdor.gemspec +41 -0
- data/lib/harvestdor/errors.rb +12 -0
- data/lib/harvestdor/oai_harvest.rb +115 -0
- data/lib/harvestdor/purl_xml.rb +200 -0
- data/lib/harvestdor/version.rb +3 -0
- data/lib/harvestdor.rb +121 -0
- data/spec/config/oai.yml +37 -0
- data/spec/harvestdor_client_spec.rb +135 -0
- data/spec/harvestdor_spec.rb +23 -0
- data/spec/oai_harvest_spec.rb +220 -0
- data/spec/oai_integration_spec.rb +125 -0
- data/spec/purl_xml_spec.rb +194 -0
- data/spec/spec_helper.rb +21 -0
- metadata +211 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 3036c2f23661946012d909179b67effa433c2dd9
|
4
|
+
data.tar.gz: 149e27d97f4c9d48ca46772d550537aca8a55362
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 4406cda66fcb2a726564ba912c60364433b9209ac03218fc2fbf70200fe679f8c964fa416abc0f163eecdabc4db51ad7b57de148f38cf4773b84a4de3fde854e
|
7
|
+
data.tar.gz: b18c647163a3970ae02cb3ee79ba0df04f4d5920ccbc5740a83b19b2958297ce2b1e0c761128293ae7bbb51633e419a6689b839f33f34f602a5292b2a078a4db
|
data/.gitignore
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
.travis
|
7
|
+
.rvmrc
|
8
|
+
Gemfile.lock
|
9
|
+
InstalledFiles
|
10
|
+
_yardoc
|
11
|
+
coverage
|
12
|
+
doc/
|
13
|
+
lib/bundler/man
|
14
|
+
pkg
|
15
|
+
rdoc
|
16
|
+
spec/reports
|
17
|
+
spec/test_logs
|
18
|
+
test/tmp
|
19
|
+
test/version_tmp
|
20
|
+
tmp
|
21
|
+
logs
|
22
|
+
.DS_Store
|
23
|
+
*.tmproj
|
24
|
+
tmtags
|
25
|
+
.idea/*
|
data/.travis.yml
ADDED
data/.yardopts
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,5 @@
|
|
1
|
+
Copyright (c) 20XX-2012. The Board of Trustees of the Leland Stanford Junior University. All rights reserved.
|
2
|
+
|
3
|
+
Redistribution and use of this distribution in source and binary forms, with or without modification, are permitted provided that: The above copyright notice and this permission notice appear in all copies and supporting documentation; The name, identifiers, and trademarks of The Board of Trustees of the Leland Stanford Junior University are not used in advertising or publicity without the express prior written permission of The Board of Trustees of the Leland Stanford Junior University; Recipients acknowledge that this distribution is made available as a research courtesy, "as is", potentially with defects, without any obligation on the part of The Board of Trustees of the Leland Stanford Junior University to provide support, services, or repair;
|
4
|
+
|
5
|
+
THE BOARD OF TRUSTEES OF THE LELAND STANFORD JUNIOR UNIVERSITY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, WITH REGARD TO THIS SOFTWARE, INCLUDING WITHOUT LIMITATION ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, AND IN NO EVENT SHALL THE BOARD OF TRUSTEES OF THE LELAND STANFORD JUNIOR UNIVERSITY BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, TORT (INCLUDING NEGLIGENCE) OR STRICT LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
= Harvestdor
|
2
|
+
{<img src="https://travis-ci.org/sul-dlss/harvestdor.svg?branch=master" alt="Build Status" />}[https://travis-ci.org/sul-dlss/harvestdor] {<img src="https://gemnasium.com/sul-dlss/harvestdor.svg" alt="Dependency Status" />}[https://gemnasium.com/sul-dlss/harvestdor]
|
3
|
+
|
4
|
+
A Gem to harvest metadata from DOR.
|
5
|
+
|
6
|
+
== Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
gem 'harvestdor'
|
11
|
+
|
12
|
+
And then execute:
|
13
|
+
|
14
|
+
$ bundle
|
15
|
+
|
16
|
+
Or install it yourself as:
|
17
|
+
|
18
|
+
$ gem install harvestdor
|
19
|
+
|
20
|
+
== Usage
|
21
|
+
|
22
|
+
=== Configuration
|
23
|
+
|
24
|
+
==== Possible configuration options (with default values unless otherwise indicated)
|
25
|
+
|
26
|
+
client = Harvestdor::Client.new({ # Example with all possible options
|
27
|
+
:log_dir => File.join(File.dirname(__FILE__), "..", "logs"),
|
28
|
+
:log_name => 'harvestdor.log',
|
29
|
+
:purl => 'http://purl.stanford.edu',
|
30
|
+
:http_options => { 'ssl' => {
|
31
|
+
'verify' => false
|
32
|
+
},
|
33
|
+
'request' => {
|
34
|
+
'timeout' => 60, # open/read timeout (seconds)
|
35
|
+
'open_timeout' => 60 # connection open timeout (seconds)
|
36
|
+
}
|
37
|
+
},
|
38
|
+
:oai_repository_url => 'https://dor-oaiprovider-prod.stanford.edu/oai', # The OAI repository to connect to
|
39
|
+
:oai_client_debug => false,
|
40
|
+
:default_metadata_prefix => 'mods',
|
41
|
+
:default_from_date => '2012-12-01', # default value is nil
|
42
|
+
:default_until_date => '2014-12-01', # default value is nil
|
43
|
+
:default_set => 'is_governed_by_hy787xj5878', # default value is nil
|
44
|
+
})
|
45
|
+
|
46
|
+
==== Option 1: use a yaml file
|
47
|
+
|
48
|
+
for contents of yml -- see spec/config/oai.yml
|
49
|
+
|
50
|
+
client = Harvestdor::Client.new({:config_yml_path => path_to_my_yml})
|
51
|
+
client.druids_via_oai do |druid|
|
52
|
+
# do stuff with the druid, e.g.
|
53
|
+
# cm = client.content_metadata(druid)
|
54
|
+
# mods = client.mods(druid)
|
55
|
+
# create solr doc from mods and cm
|
56
|
+
# write solr doc to your app's index
|
57
|
+
end
|
58
|
+
|
59
|
+
==== Option 2: pass in non-default configurations as a hash
|
60
|
+
|
61
|
+
client = Harvestdor::Client.new({:oai_repository_url => 'http://my_oai.org', :default_from_date => '2012-12-01'})
|
62
|
+
client.druids_via_oai do |druid|
|
63
|
+
# do stuff with the druid, e.g.
|
64
|
+
# cm = client.content_metadata(druid)
|
65
|
+
# mods = client.mods(druid)
|
66
|
+
# create solr doc from mods and cm
|
67
|
+
# write solr doc to your app's index
|
68
|
+
end
|
69
|
+
|
70
|
+
==== Option 3: set the attributes explicitly in your code
|
71
|
+
|
72
|
+
client = Harvestdor::Client.new
|
73
|
+
client.config.oai_repository_url = 'http://my_oai.org'
|
74
|
+
client.oai_records do |rec|
|
75
|
+
# do stuff with the OAI rec, e.g.
|
76
|
+
# manipulate metadata into solr doc
|
77
|
+
# manipulate stuff from record.about into solr doc
|
78
|
+
# write solr doc to your app's index
|
79
|
+
end
|
80
|
+
|
81
|
+
==== Option 4: pass the OAI parameters as arguments to a Harvestdor::Client harvest method (oai_headers, oai_records, druids_via_oai)
|
82
|
+
|
83
|
+
client = Harvestdor::Client.new({:oai_repository_url => 'http://my_oai.org'})
|
84
|
+
client.oai_headers(:metadata_prefix => 'foo', :from => '2012-11-27', :set => 'is_governed_by_hy787xj5878') do | id |
|
85
|
+
# do stuff with the druid
|
86
|
+
end
|
87
|
+
|
88
|
+
=== OAI Harvesting
|
89
|
+
|
90
|
+
Harvestdor::Client.druids_via_oai gets enumerated druids for the records in your specified set / date range (druids are not preceded by 'druid:')
|
91
|
+
|
92
|
+
Harvestdor::Client.oai_records gets enumerated OAI record objects in your specified set / date range, with the metadata format you indicated
|
93
|
+
|
94
|
+
You can also get these as arrays:
|
95
|
+
|
96
|
+
druid_array = client.druids_via_oai(:metadata_prefix => 'foo', :from => '(last_harvested_date)', :set => 'asdfasdf')
|
97
|
+
|
98
|
+
=== XML from PURL pages
|
99
|
+
|
100
|
+
You can get, for example, the contentMetadata for a druid:
|
101
|
+
|
102
|
+
it "#content_metadata retrieves contentMetadata as a Nokogiri::XML::Document" do
|
103
|
+
cm = Harvestdor.content_metadata('bb375wb8869', 'http://purl-test.stanford.edu')
|
104
|
+
cm.should be_kind_of(Nokogiri::XML::Document)
|
105
|
+
cm.root.name.should == 'contentMetadata'
|
106
|
+
cm.root.attributes['objectId'].text.should == @druid
|
107
|
+
end
|
108
|
+
|
109
|
+
Or the MODS metadata:
|
110
|
+
|
111
|
+
it "#mods returns a Nokogiri::XML::Document from the purl mods" do
|
112
|
+
x = Harvestdor.mods('bb375wb8869', 'http://purl-test.stanford.edu')
|
113
|
+
x.should be_kind_of(Nokogiri::XML::Document)
|
114
|
+
x.root.name.should == 'mods'
|
115
|
+
x.root.namespace.href.should == Harvestdor::MODS_NAMESPACE
|
116
|
+
end
|
117
|
+
|
118
|
+
Similarly for
|
119
|
+
# mods
|
120
|
+
# public_xml (all of it)
|
121
|
+
# content_metadata
|
122
|
+
# identity_metadata
|
123
|
+
# rights_metadata
|
124
|
+
# rdf
|
125
|
+
# dc
|
126
|
+
|
127
|
+
You can also do this from a Harvestdor::Client object, and it will use the purl from the Client.config:
|
128
|
+
|
129
|
+
client = Harvestdor::Client.new({:purl => 'http://thisone.org'})
|
130
|
+
client.identity_metadata('bb375wb8869')
|
131
|
+
|
132
|
+
|
133
|
+
=== TODO: Last Harvested Datestamp (Incremental Harvests)
|
134
|
+
|
135
|
+
Harvestdor::Client.last_datestamp
|
136
|
+
|
137
|
+
persist this information with your app for incremental harvests
|
138
|
+
|
139
|
+
|
140
|
+
== Contributing
|
141
|
+
|
142
|
+
# Fork it
|
143
|
+
# Create your feature branch (`git checkout -b my-new-feature`)
|
144
|
+
# Write code and tests.
|
145
|
+
# Commit your changes (`git commit -am 'Added some feature'`)
|
146
|
+
# Push to the branch (`git push origin my-new-feature`)
|
147
|
+
# Create new Pull Request
|
148
|
+
|
149
|
+
== Releases
|
150
|
+
|
151
|
+
* <b>0.0.13</b> Updated to work with Faraday 0.9, releases via rubygems instead of sul-gems
|
152
|
+
* <b>0.0.11</b> better error handling, and better testing for errors
|
153
|
+
* <b>0.0.10</b> tweak specs to test that unnec fetching isn't done.
|
154
|
+
* <b>0.0.9</b> allows public xml to be passed as Nokogiri::XML::Document to content_metadata, etc. to avoid unnec fetching
|
155
|
+
* <b>0.0.8</b> avoid undefined method 'size' from scrub_oai_args when using a non-nil default date param
|
156
|
+
* <b>0.0.7</b> add oai client timeout overrides, update README
|
157
|
+
* <b>0.0.6</b> refactoring oai_harvest for greater simplicity and passing errors through, add oai_record (get_record OAI request)
|
158
|
+
* <b>0.0.5</b> don't send empty string arguments to OAI server so you can get actual results
|
159
|
+
* <b>0.0.4</b> add integration spec and get it working with actual OAI server
|
160
|
+
* <b>0.0.3</b> add method to get mods from purl
|
161
|
+
* <b>0.0.2</b> tidy up README
|
162
|
+
* <b>0.0.1</b> initial commit
|
data/Rakefile
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Rake tasks for the harvestdor gem: spec runs (with and without integration
# specs), YARD documentation, and a :ci task that combines specs and docs.
require "bundler/gem_tasks"

require 'rake'
require 'bundler'

require 'rspec/core/rake_task'

# Activate the bundle; exit with Bundler's own status code when gems are missing.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end

task :default => :ci

desc "run continuous integration suite (tests, coverage, docs)"
task :ci => [:rspec, :doc]

task :spec => :rspec

desc "run specs EXCEPT integration specs"
RSpec::Core::RakeTask.new(:spec_fast) do |spec|
  spec.rspec_opts = ["-c", "-f progress", "--tty", "-t ~integration", "-r ./spec/spec_helper.rb"]
end

RSpec::Core::RakeTask.new(:rspec) do |spec|
  spec.rspec_opts = ["-c", "-f progress", "--tty", "-r ./spec/spec_helper.rb"]
end

# Use yard to build docs.
# The requires live inside the begin block so that a missing YARD gem raises
# LoadError here and is turned into the fallback :doc task below, instead of
# aborting the whole Rakefile at load time.
begin
  require 'yard'
  require 'yard/rake/yardoc_task'

  project_root = File.expand_path(File.dirname(__FILE__))
  doc_dest_dir = File.join(project_root, 'doc')

  YARD::Rake::YardocTask.new(:doc) do |yt|
    yt.files = Dir.glob(File.join(project_root, 'lib', '**', '*.rb')) +
               [ File.join(project_root, 'README.rdoc') ]
    yt.options = ['--output-dir', doc_dest_dir, '--readme', 'README.rdoc', '--title', 'Harvestdor Gem Documentation']
  end
rescue LoadError
  desc "Generate YARD Documentation"
  task :doc do
    abort "Please install the YARD gem to generate rdoc."
  end
end
|
50
|
+
|
data/harvestdor.gemspec
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for harvestdor. The version constant is loaded from
# lib/harvestdor/version.rb, so lib/ is put on the load path first.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'harvestdor/version'

Gem::Specification.new do |spec|
  spec.name          = "harvestdor"
  spec.version       = Harvestdor::VERSION
  spec.authors       = ["Naomi Dushay"]
  spec.email         = ["ndushay@stanford.edu"]
  spec.description   = %q{Harvest DOR object metadata via a relationship (e.g. hydra:isGovernedBy rdf:resource="info:fedora/druid:hy787xj5878") and dates}
  spec.summary       = %q{Harvest DOR object metadata}
  spec.homepage      = "https://consul.stanford.edu/display/chimera/Chimera+project"

  # Package whatever git tracks; executables and test files are derived from that list.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^spec/})
  spec.require_paths = ["lib"]

  # Runtime dependencies
  spec.add_dependency 'oai', '~> 0.3.0'
  spec.add_dependency 'faraday', '>= 0.9.0'
  spec.add_dependency 'confstruct'
  spec.add_dependency 'nokogiri'

  # Development dependencies
  # Bundler installs these when you check out the harvestdor source from git and run 'bundle install'.
  # They are not pulled in for projects that merely depend on harvestdor.
  # Order: build (rake), docs (rdoc, yard), tests (rspec, simplecov, simplecov-rcov).
  %w[rake rdoc yard rspec simplecov simplecov-rcov].each do |dev_gem|
    spec.add_development_dependency dev_gem
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module Harvestdor
  # Exception classes raised when a purl page, or an expected piece of its
  # public XML, cannot be retrieved. All descend directly from StandardError.
  module Errors
    class MissingPurlPage < StandardError; end
    class MissingMods < StandardError; end
    class MissingPublicXml < StandardError; end
    class MissingContentMetadata < StandardError; end
    class MissingIdentityMetadata < StandardError; end
    class MissingRightsMetadata < StandardError; end
    class MissingRDF < StandardError; end
    class MissingDC < StandardError; end
  end
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
require 'oai'
|
2
|
+
|
3
|
+
module Harvestdor

  # OAI harvesting methods for Harvestdor::Client: perform an OAI harvest and
  # iterate over (or collect) the results. These methods rely on #config,
  # #oai_client and #logger, which are defined elsewhere on the class.
  class Client

    # Harvest OAI records (list_records request) for the given OAI params.
    # @param [Hash] oai_args optional OAI params (:metadata_prefix, :from, :until, :set) to be used in lieu of config default values
    # @yield [OAI::Record] each harvested record, when a block is given
    # @return [Array<OAI::Record>] all harvested records, when no block is given
    def oai_records oai_args = {}
      return to_enum(:oai_records, oai_args).to_a unless block_given?

      harvest(:list_records, scrub_oai_args(oai_args)) do |oai_rec|
        yield oai_rec
      end
    end

    # Harvest OAI headers (list_identifiers request) for the given OAI params.
    # @param [Hash] oai_args optional OAI params (:metadata_prefix, :from, :until, :set) to be used in lieu of config default values
    # @yield [OAI::Header] each harvested header, when a block is given
    # @return [Array<OAI::Header>] all harvested headers, when no block is given
    def oai_headers oai_args = {}
      return to_enum(:oai_headers, oai_args).to_a unless block_given?

      harvest(:list_identifiers, scrub_oai_args(oai_args)) do |oai_hdr|
        yield oai_hdr
      end
    end

    # Harvest OAI headers and convert each one to a druid via Harvestdor.druid.
    # @param [Hash] oai_args optional OAI params (:metadata_prefix, :from, :until, :set) to be used in lieu of config default values
    # @yield [String] each druid, when a block is given
    # @return [Array<String>] all druids, when no block is given
    def druids_via_oai oai_args = {}
      return to_enum(:druids_via_oai, oai_args).to_a unless block_given?

      harvest(:list_identifiers, scrub_oai_args(oai_args)) do |oai_hdr|
        yield Harvestdor.druid(oai_hdr)
      end
    end

    # get a single OAI record using a get_record OAI request
    # @param [String] druid (which will be turned into OAI identifier)
    # @param [String] md_prefix the OAI metadata prefix determining which metadata will be in the retrieved OAI::Record object;
    #   config.default_metadata_prefix is only consulted when nil (or false) is passed explicitly
    # @return [OAI::Record] record object retrieved from OAI server
    def oai_record druid, md_prefix = 'mods'
      prefix = md_prefix || config.default_metadata_prefix
      oai_client.get_record({:identifier => "oai:searchworks.stanford.edu/druid:#{druid}", :metadata_prefix => prefix}).record
    end

    protected #---------------------------------------------------------------------

    # Merge caller-supplied OAI params with the configured defaults and drop blank values.
    # A key present in oai_args always wins over the config default, even when its value is
    # nil or '' (in which case the param is omitted from the request altogether).
    # @param [Hash] oai_args Hash of OAI params (metadata_prefix, from, until, set) to be used in lieu of config default values
    # @return [Hash] OAI params (metadata_prefix, from, until, set) cleaned up for making harvest request
    def scrub_oai_args oai_args = {}
      merged_args = {
        :metadata_prefix => oai_args.key?(:metadata_prefix) ? oai_args[:metadata_prefix] : config.default_metadata_prefix,
        :from            => oai_args.key?(:from) ? oai_args[:from] : config.default_from_date,
        :until           => oai_args.key?(:until) ? oai_args[:until] : config.default_until_date,
        :set             => oai_args.key?(:set) ? oai_args[:set] : config.default_set
      }
      # build a new hash rather than deleting from the one being iterated;
      # empty strings are dropped so they are not sent to the OAI server
      merged_args.reject { |_key, value| value.nil? || value == '' }
    end

    # harvest OAI headers or OAI records, passing each record/header retrieved to the block
    # follows resumption tokens (i.e. all chunks are visited)
    # @param [Symbol] verb :list_identifiers or :list_records
    # @param [Hash] oai_args OAI params (metadata_prefix, from, until, set) used for request
    # @raise [Faraday::TimeoutError] re-raised after logging when the provider does not respond
    # @raise [OAI::Exception] re-raised after logging
    # TODO: This could be moved into ruby-oai?
    def harvest (verb, oai_args, &block)
      response = oai_client.send verb, oai_args
      while response && response.entries.size > 0
        response.entries.each(&block)

        token = response.resumption_token
        break if token.nil? || token.empty?

        response = oai_client.send(verb, :resumption_token => token)
      end
    rescue Faraday::TimeoutError => e
      # Faraday::TimeoutError is the constant available on every Faraday >= 0.9;
      # the Faraday::Error::TimeoutError alias only exists on older releases.
      logger.error "No response from OAI Provider"
      logger.error e
      raise e
    rescue OAI::Exception => e
      # possibly unnecessary after ruby-oai 0.0.14
      logger.error "Received unexpected OAI::Exception"
      logger.error e
      raise e
    end

  end # class Client

end # module Harvestdor
|
98
|
+
|
99
|
+
module OAI
  class Client
    # Monkey patch of the OAI client's HTTP GET so that request timeouts can be set.
    # Issues the GET through @http_client and returns the response body.
    #
    # FIXME: the timeouts (in seconds) always come from Harvestdor::Client.default_config,
    # not from the configuration of the Harvestdor::Client instance in use.
    # NOTE(review): the README shows the timeouts nested under http_options['request'];
    # confirm that http_options.timeout / http_options.open_timeout resolve to real values.
    def get(uri)
      reply = @http_client.get do |request|
        request.url uri
        http_defaults = Harvestdor::Client.default_config.http_options
        request.options[:timeout] = http_defaults.timeout # open/read timeout
        request.options[:open_timeout] = http_defaults.open_timeout # connection open timeout
      end
      reply.body
    end
  end
end
|
@@ -0,0 +1,200 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module Harvestdor
  # Code to retrieve Purl public xml pieces: module-level methods that take a purl url,
  # plus Harvestdor::Client instance methods that use the purl url from the client config.

  RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
  OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
  MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'

  # the MODS metadata for this fedora object, from the purl server
  # @param [String] druid e.g. ab123cd4567
  # @param [String] purl_url url for the purl server.  default is Harvestdor::PURL_DEFAULT
  # @return [Nokogiri::XML::Document] the MODS for the fedora object
  # @raise [Harvestdor::Errors::MissingMods] when the purl server answers with an HTTP error
  def self.mods druid, purl_url = Harvestdor::PURL_DEFAULT
    begin
      # URI#open (from open-uri) instead of Kernel#open: Kernel#open no longer fetches URLs
      # on Ruby 3.0+ and would treat a string starting with "|" as a command to run.
      # The block form closes the underlying IO once it has been parsed.
      URI.parse("#{purl_url}/#{druid}.mods").open { |io| Nokogiri::XML(io, nil, 'UTF-8') }
    rescue OpenURI::HTTPError
      raise Harvestdor::Errors::MissingMods.new(druid)
    end
  end

  # the public xml for this fedora object, from the purl page
  # @param [String] druid e.g. ab123cd4567; a Nokogiri::XML::Document is returned as is, without fetching
  # @param [String] purl_url url for the purl server.  default is Harvestdor::PURL_DEFAULT
  # @return [Nokogiri::XML::Document] the public xml for the fedora object
  # @raise [Harvestdor::Errors::MissingPurlPage] when the purl server answers with an HTTP error
  # @raise [Harvestdor::Errors::MissingPublicXml] when the retrieved document has no content
  def self.public_xml druid, purl_url = Harvestdor::PURL_DEFAULT
    return druid if druid.instance_of?(Nokogiri::XML::Document)
    begin
      ng_doc = URI.parse("#{purl_url}/#{druid}.xml").open { |io| Nokogiri::XML(io) }
      raise Harvestdor::Errors::MissingPublicXml.new(druid) if !ng_doc || ng_doc.children.empty?
      ng_doc
    rescue OpenURI::HTTPError
      raise Harvestdor::Errors::MissingPurlPage.new(druid)
    end
  end

  # the contentMetadata for this fedora object, from the purl xml
  # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
  #  a Nokogiri::XML::Document containing the public_xml for an object
  # @param [String] purl_url url for the purl server.  default is Harvestdor::PURL_DEFAULT
  # @return [Nokogiri::XML::Document] the contentMetadata for the fedora object
  # @raise [Harvestdor::Errors::MissingContentMetadata] when it cannot be extracted
  def self.content_metadata object, purl_url = Harvestdor::PURL_DEFAULT
    public_xml_part(object, purl_url, Harvestdor::Errors::MissingContentMetadata, '/publicObject/contentMetadata')
  end

  # the identityMetadata for this fedora object, from the purl xml
  # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
  #  a Nokogiri::XML::Document containing the public_xml for an object
  # @param [String] purl_url url for the purl server.  default is Harvestdor::PURL_DEFAULT
  # @return [Nokogiri::XML::Document] the identityMetadata for the fedora object
  # @raise [Harvestdor::Errors::MissingIdentityMetadata] when it cannot be extracted
  def self.identity_metadata object, purl_url = Harvestdor::PURL_DEFAULT
    public_xml_part(object, purl_url, Harvestdor::Errors::MissingIdentityMetadata, '/publicObject/identityMetadata')
  end

  # the rightsMetadata for this fedora object, from the purl xml
  # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
  #  a Nokogiri::XML::Document containing the public_xml for an object
  # @param [String] purl_url url for the purl server.  default is Harvestdor::PURL_DEFAULT
  # @return [Nokogiri::XML::Document] the rightsMetadata for the fedora object
  # @raise [Harvestdor::Errors::MissingRightsMetadata] when it cannot be extracted
  def self.rights_metadata object, purl_url = Harvestdor::PURL_DEFAULT
    public_xml_part(object, purl_url, Harvestdor::Errors::MissingRightsMetadata, '/publicObject/rightsMetadata')
  end

  # the RDF for this fedora object, from the purl xml
  # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
  #  a Nokogiri::XML::Document containing the public_xml for an object
  # @param [String] purl_url url for the purl server.  default is Harvestdor::PURL_DEFAULT
  # @return [Nokogiri::XML::Document] the RDF for the fedora object
  # @raise [Harvestdor::Errors::MissingRDF] when it cannot be extracted
  def self.rdf object, purl_url = Harvestdor::PURL_DEFAULT
    public_xml_part(object, purl_url, Harvestdor::Errors::MissingRDF,
                    '/publicObject/rdf:RDF', {'rdf' => Harvestdor::RDF_NAMESPACE})
  end

  # the Dublin Core for this fedora object, from the purl xml
  # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
  #  a Nokogiri::XML::Document containing the public_xml for an object
  # @param [String] purl_url url for the purl server.  default is Harvestdor::PURL_DEFAULT
  # @return [Nokogiri::XML::Document] the dc for the fedora object
  # @raise [Harvestdor::Errors::MissingDC] when it cannot be extracted
  def self.dc object, purl_url = Harvestdor::PURL_DEFAULT
    public_xml_part(object, purl_url, Harvestdor::Errors::MissingDC,
                    '/publicObject/dc:dc', {'dc' => Harvestdor::OAI_DC_NAMESPACE})
  end


  class Client

    # the MODS metadata for this fedora object, from the purl server
    # @param [String] druid e.g. ab123cd4567, in the purl url
    # @return [Nokogiri::XML::Document] the MODS metadata for the fedora object
    def mods druid
      Harvestdor.mods(druid, config.purl)
    end

    # the public xml for this fedora object, from the purl xml
    # @param [String] druid e.g. ab123cd4567, in the purl url
    # @return [Nokogiri::XML::Document] the public xml for the fedora object
    def public_xml druid
      Harvestdor.public_xml(druid, config.purl)
    end

    # the contentMetadata for this fedora object, from the purl xml
    # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
    #  a Nokogiri::XML::Document containing the public_xml for an object
    # @return [Nokogiri::XML::Document] the contentMetadata for the fedora object
    def content_metadata object
      Harvestdor.content_metadata(object, config.purl)
    end

    # the identityMetadata for this fedora object, from the purl xml
    # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
    #  a Nokogiri::XML::Document containing the public_xml for an object
    # @return [Nokogiri::XML::Document] the identityMetadata for the fedora object
    def identity_metadata object
      Harvestdor.identity_metadata(object, config.purl)
    end

    # the rightsMetadata for this fedora object, from the purl xml
    # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
    #  a Nokogiri::XML::Document containing the public_xml for an object
    # @return [Nokogiri::XML::Document] the rightsMetadata for the fedora object
    def rights_metadata object
      Harvestdor.rights_metadata(object, config.purl)
    end

    # the RDF for this fedora object, from the purl xml
    # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
    #  a Nokogiri::XML::Document containing the public_xml for an object
    # @return [Nokogiri::XML::Document] the RDF for the fedora object
    def rdf object
      Harvestdor.rdf(object, config.purl)
    end

    # the Dublin Core for this fedora object, from the purl xml
    # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
    #  a Nokogiri::XML::Document containing the public_xml for an object
    # @return [Nokogiri::XML::Document] the dc for the fedora object
    def dc object
      Harvestdor.dc(object, config.purl)
    end

  end # class Client

  # NOTE: a bare `protected` has no effect on `def self.` methods, so pub_xml has always
  # been publicly callable; it is left public here for backward compatibility.

  # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
  #  a Nokogiri::XML::Document containing the public_xml for an object
  # @param [String] purl_url url for the purl server.  default is Harvestdor::PURL_DEFAULT
  # @return [Nokogiri::XML::Document] the public xml for a DOR object
  # @raise [RuntimeError] when object is neither exactly a String nor exactly a Nokogiri::XML::Document
  def self.pub_xml(object, purl_url = Harvestdor::PURL_DEFAULT)
    if object.instance_of?(String)
      # it's a druid
      Harvestdor.public_xml(object, purl_url)
    elsif object.instance_of?(Nokogiri::XML::Document)
      object
    else
      raise "expected String or Nokogiri::XML::Document for first argument, got #{object.class}"
    end
  end

  # Shared implementation for content_metadata, identity_metadata, rights_metadata, rdf and dc:
  # re-parses the nodes selected from the public xml into a document of their own.
  # @param [Object] object a druid String or the public xml as Nokogiri::XML::Document
  # @param [String] purl_url url for the purl server
  # @param [Class] error_class error raised (with object.inspect as message) when extraction fails
  # @param [Array] xpath_args the xpath expression, optionally followed by a namespace Hash
  # @return [Nokogiri::XML::Document] the selected piece of the public xml
  def self.public_xml_part(object, purl_url, error_class, *xpath_args)
    # resolved outside the begin block so fetch/argument errors are not relabelled as error_class
    pub_xml_ng_doc = pub_xml(object, purl_url)
    begin
      # preserve namespaces, etc for the node
      ng_doc = Nokogiri::XML(pub_xml_ng_doc.root.xpath(*xpath_args).to_xml)
      raise error_class.new(object.inspect) if !ng_doc || ng_doc.children.empty?
      ng_doc
    rescue
      # any failure while selecting or re-parsing is reported as the piece being missing
      raise error_class.new(object.inspect)
    end
  end
  private_class_method :public_xml_part

end # module Harvestdor
|