harvestdor 0.0.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +25 -0
- data/.travis.yml +14 -0
- data/.yardopts +3 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +5 -0
- data/README.rdoc +162 -0
- data/Rakefile +50 -0
- data/harvestdor.gemspec +41 -0
- data/lib/harvestdor/errors.rb +12 -0
- data/lib/harvestdor/oai_harvest.rb +115 -0
- data/lib/harvestdor/purl_xml.rb +200 -0
- data/lib/harvestdor/version.rb +3 -0
- data/lib/harvestdor.rb +121 -0
- data/spec/config/oai.yml +37 -0
- data/spec/harvestdor_client_spec.rb +135 -0
- data/spec/harvestdor_spec.rb +23 -0
- data/spec/oai_harvest_spec.rb +220 -0
- data/spec/oai_integration_spec.rb +125 -0
- data/spec/purl_xml_spec.rb +194 -0
- data/spec/spec_helper.rb +21 -0
- metadata +211 -0
data/lib/harvestdor.rb
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'harvestdor/errors'
|
2
|
+
require 'harvestdor/oai_harvest'
|
3
|
+
require 'harvestdor/purl_xml'
|
4
|
+
require 'harvestdor/version'
|
5
|
+
# external gems
|
6
|
+
require 'confstruct'
|
7
|
+
require 'oai'
|
8
|
+
# stdlib
|
9
|
+
require 'logger'
|
10
|
+
require 'open-uri'
|
11
|
+
require 'yaml'
|
12
|
+
|
13
|
+
# Namespace for harvesting from an OAI provider: configuration defaults,
# the configurable Client, and a helper for extracting druids from OAI identifiers.
module Harvestdor

  LOG_NAME_DEFAULT = "harvestdor.log"
  LOG_DIR_DEFAULT = File.join(File.dirname(__FILE__), "..", "logs")
  PURL_DEFAULT = 'http://purl.stanford.edu'
  # Options handed to Faraday.new when the OAI http client is built.
  # NOTE(review): SSL certificate verification is switched off by default here;
  #   confirm that is still intended before relying on it outside trusted networks.
  HTTP_OPTIONS_DEFAULT = { 'ssl' => {
                             'verify' => false
                           },
                           'request' => {
                             'timeout' => 60,     # open/read timeout (seconds)
                             'open_timeout' => 60 # connection open timeout (seconds)
                           }
                         }
  OAI_CLIENT_DEBUG_DEFAULT = false
  OAI_REPOSITORY_URL_DEFAULT = 'https://dor-oaiprovider-prod.stanford.edu/oai'
  DEFAULT_METADATA_PREFIX = 'mods'
  DEFAULT_FROM_DATE = nil
  DEFAULT_UNTIL_DATE = nil
  DEFAULT_SET = nil

  class Client

    # Default values used for the construction of Harvestdor::Client objects.
    # Memoized on the class, so every call returns the same configuration object.
    # @return [Confstruct::Configuration] the class-wide default configuration
    def self.default_config
      @class_config ||= Confstruct::Configuration.new({
        :log_dir => LOG_DIR_DEFAULT,
        :log_name => LOG_NAME_DEFAULT,
        :purl => PURL_DEFAULT,
        :http_options => HTTP_OPTIONS_DEFAULT,
        :oai_repository_url => OAI_REPOSITORY_URL_DEFAULT,
        :oai_client_debug => OAI_CLIENT_DEBUG_DEFAULT,
        :default_metadata_prefix => DEFAULT_METADATA_PREFIX,
        :default_from_date => DEFAULT_FROM_DATE,
        :default_until_date => DEFAULT_UNTIL_DATE,
        :default_set => DEFAULT_SET
      })
    end

    # Initialize a new instance of Harvestdor::Client.
    #
    # Settings are layered: class defaults first, then the YAML file named by
    # :config_yml_path (if given), then the remaining entries of the options
    # hash, and finally anything set in the optional block, which is yielded
    # the config object.
    #
    # @param [Hash] options configuration overrides; may also contain
    #   :config_yml_path, the path of a YAML file of settings
    # @yield [config] the client's configuration, for direct modification
    # @example
    #   client = Harvestdor::Client.new({ # Example with all possible options
    #     :log_dir => File.join(File.dirname(__FILE__), "..", "logs"),
    #     :log_name => 'harvestdor.log',
    #     :purl => 'http://purl.stanford.edu',
    #     :http_options => { 'ssl' => {
    #                          'verify' => false
    #                        },
    #                        'request' => {
    #                          'timeout' => 30, # open/read timeout (seconds)
    #                          'open_timeout' => 30 # connection open timeout (seconds)
    #                        }
    #                      },
    #     :oai_repository_url => 'https://dor-oaiprovider-prod.stanford.edu/oai', # The OAI repository to connect to
    #     :oai_client_debug => false,
    #     :default_metadata_prefix => 'mods',
    #     :default_from_date => '2012-12-01',
    #     :default_until_date => '2014-12-01',
    #     :default_set => nil,
    #   })
    def initialize(options = {})
      config.configure(YAML.load_file(options[:config_yml_path])) if options[:config_yml_path]
      config.configure options
      yield(config) if block_given?
    end

    # @return [Confstruct::Configuration] this client's configuration, seeded
    #   from the class defaults on first access
    def config
      @config ||= Confstruct::Configuration.new(self.class.default_config)
    end

    # @return [OAI::Client] an instantiated OAI::Client object, based on config options
    def oai_client
      @oai_client ||= OAI::Client.new config.oai_repository_url, :debug => config.oai_client_debug, :http => oai_http_client
    end

    # @return [Logger] the logger for this client's configured log_dir and log_name
    def logger
      @logger ||= self.class.logger(config.log_dir, config.log_name)
    end

    protected #---------------------------------------------------------------------

    # Memoized Faraday connection used by the OAI client.
    # The "Constructing" message is logged only when the connection is actually
    # built, not on every access to the memoized value.
    def oai_http_client
      @oai_http_client ||= begin
        logger.info "Constructing OAI http client with faraday options #{config.http_options.to_hash.inspect}"
        Faraday.new config.oai_repository_url, config.http_options.to_hash
      end
    end

    # Memoized, lazily initialized loggers, one per log file path.
    # Memoizing per path (rather than keeping a single class-wide logger) means
    # a client configured with a different log_dir or log_name writes to its own
    # file instead of silently reusing the first logger that was created.
    # Note: being a class method, this is not affected by the `protected`
    # marker above and remains publicly callable.
    # @param [String] log_dir directory in which the log file lives (created if missing)
    # @param [String] log_name name of log file
    # @return [Logger] a daily-rotating logger for the given file
    def self.logger(log_dir, log_name)
      Dir.mkdir(log_dir) unless File.directory?(log_dir)
      log_path = File.join(log_dir, log_name)
      @loggers ||= {}
      @loggers[log_path] ||= Logger.new(log_path, 'daily')
    end

  end # class Client

  # @param [Object] arg OAI::Header object or OAI::Record object or String (oai identifier)
  # @return [String] the druid part of an OAI identifier in an OAI header, e.g. bb134cc1324
  def self.druid(arg)
    oai_id = case arg
             when OAI::Header then arg.identifier
             when OAI::Record then arg.header.identifier
             else arg
             end
    # everything after the last 'druid:' marker is the druid itself
    oai_id.split('druid:').last
  end

end # module Harvestdor
|
data/spec/config/oai.yml
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# log_dir: directory for log file (default logs, relative to harvestdor gem path)
|
2
|
+
log_dir: spec/test_logs
|
3
|
+
|
4
|
+
# log_name: name of log file (default: harvestdor.log)
|
5
|
+
|
6
|
+
# purl: url for the DOR purl server (used to get ContentMetadata, etc.)
|
7
|
+
# purl: http://purl-test.stanford.edu
|
8
|
+
|
9
|
+
# ---------- OAI harvesting parameters -----------
|
10
|
+
|
11
|
+
# oai_client_debug: true for OAI::Client debug mode (default: false)
|
12
|
+
|
13
|
+
# oai_repository_url: URL of the OAI data provider
|
14
|
+
oai_repository_url: https://dor-oaiprovider-test.stanford.edu/oai
|
15
|
+
|
16
|
+
# default_metadata_prefix: default metadata prefix to be used for harvesting (default: mods)
|
17
|
+
# can be overridden on calls to harvest_ids and harvest_records
|
18
|
+
default_metadata_prefix: mods
|
19
|
+
|
20
|
+
# default_from_date: default from date for harvest (default: nil)
|
21
|
+
# can be overridden on calls to harvest_ids and harvest_records
|
22
|
+
default_from_date: '2012-11-01'
|
23
|
+
|
24
|
+
# default_until_date: default until date for harvest (default: nil)
|
25
|
+
# can be overridden on calls to harvest_ids and harvest_records
|
26
|
+
|
27
|
+
# default_set: default set for harvest (default: nil)
|
28
|
+
# can be overridden on calls to harvest_ids and harvest_records
|
29
|
+
|
30
|
+
# Additional options to pass to Faraday http client (https://github.com/technoweenie/faraday)
|
31
|
+
# timeouts are in seconds; timeout -> open/read, open_timeout -> connection open
|
32
|
+
http_options:
|
33
|
+
ssl:
|
34
|
+
verify: false
|
35
|
+
request:
|
36
|
+
timeout: 121
|
37
|
+
open_timeout: 122
|
@@ -0,0 +1,135 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
# Specs for Harvestdor::Client configuration layering (defaults, yml file,
# hash arguments), logging, and construction of the OAI client.
describe Harvestdor::Client do

  # Shared fixtures: a client configured only from the spec yml file, plus the
  # parsed yml itself so expectations can be checked against the file contents.
  before(:all) do
    @config_yml_path = File.join(File.dirname(__FILE__), "config", "oai.yml")
    @client_via_yml_only = Harvestdor::Client.new({:config_yml_path => @config_yml_path})
    require 'yaml'
    @yaml = YAML.load_file(@config_yml_path)
  end

  describe "initialization" do
    before(:all) do
      @from_date = '2012-11-29'
      @repo_url = 'http://my_oai_repo.org/oai'
    end
    context "attributes passed in hash argument" do
      before(:all) do
        @some_args = Harvestdor::Client.new({:default_from_date => @from_date, :oai_repository_url => @repo_url}).config
      end
      it "should set the attributes to the passed values" do
        expect(@some_args.oai_repository_url).to eql(@repo_url)
        expect(@some_args.default_from_date).to eql(@from_date)
      end
      it "should keep the defaults for attributes not in the hash argument" do
        expect(@some_args.log_name).to eql(Harvestdor::LOG_NAME_DEFAULT)
        expect(@some_args.log_dir).to eql(Harvestdor::LOG_DIR_DEFAULT)
        expect(@some_args.purl).to eql(Harvestdor::PURL_DEFAULT)
        expect(@some_args.http_options).to eql(Confstruct::Configuration.new(Harvestdor::HTTP_OPTIONS_DEFAULT))
        expect(@some_args.oai_client_debug).to eql(Harvestdor::OAI_CLIENT_DEBUG_DEFAULT)
        expect(@some_args.default_metadata_prefix).to eql(Harvestdor::DEFAULT_METADATA_PREFIX)
        expect(@some_args.default_until_date).to eql(Harvestdor::DEFAULT_UNTIL_DATE)
        expect(@some_args.default_set).to eql(Harvestdor::DEFAULT_SET)
      end
    end

    context "config_yml_path in hash argument" do
      before(:all) do
        @config_via_yml_only = @client_via_yml_only.config
      end
      it "should set attributes in yml file over defaults" do
        expect(@config_via_yml_only.log_dir).to eql(@yaml['log_dir'])
        expect(@config_via_yml_only.oai_repository_url).to eql(@yaml['oai_repository_url'])
        expect(@config_via_yml_only.default_from_date).to eql(@yaml['default_from_date'])
        expect(@config_via_yml_only.default_metadata_prefix).to eql(@yaml['default_metadata_prefix'])
        expect(@config_via_yml_only.http_options.request.timeout).to eql(@yaml['http_options']['request']['timeout'])
      end
      it "should keep the defaults for attributes not present in yml file nor a config yml file" do
        expect(@config_via_yml_only.log_name).to eql(Harvestdor::LOG_NAME_DEFAULT)
        expect(@config_via_yml_only.purl).to eql(Harvestdor::PURL_DEFAULT)
        expect(@config_via_yml_only.default_until_date).to eql(Harvestdor::DEFAULT_UNTIL_DATE)
        expect(@config_via_yml_only.default_set).to eql(Harvestdor::DEFAULT_SET)
      end
      context "and some hash arguments" do
        before(:all) do
          @config_via_yml_plus = Harvestdor::Client.new({:config_yml_path => @config_yml_path,
                                    :default_from_date => @from_date, :oai_repository_url => @repo_url}).config
        end
        it "should favor hash arg attribute values over yml file values" do
          expect(@config_via_yml_plus.oai_repository_url).to eql(@repo_url)
          expect(@config_via_yml_plus.default_from_date).to eql(@from_date)
        end
        it "should favor yml file values over defaults" do
          expect(@config_via_yml_plus.log_dir).to eql(@yaml['log_dir'])
          expect(@config_via_yml_plus.default_metadata_prefix).to eql(@yaml['default_metadata_prefix'])
          # timeout is nested under 'request' in the yml file; reading it one
          # level too high would compare nil with nil and always pass
          expect(@config_via_yml_plus.http_options.request.timeout).to eql(@yaml['http_options']['request']['timeout'])
        end
        it "should keep the defaults for attributes not present in yml file" do
          expect(@config_via_yml_plus.log_name).to eql(Harvestdor::LOG_NAME_DEFAULT)
          expect(@config_via_yml_plus.default_until_date).to eql(Harvestdor::DEFAULT_UNTIL_DATE)
          expect(@config_via_yml_plus.default_set).to eql(Harvestdor::DEFAULT_SET)
        end
      end
    end

    context "without hash arguments" do
      it "should keep the defaults for all attributes" do
        no_args = Harvestdor::Client.new.config
        expect(no_args.log_name).to eql(Harvestdor::LOG_NAME_DEFAULT)
        expect(no_args.log_dir).to eql(Harvestdor::LOG_DIR_DEFAULT)
        expect(no_args.purl).to eql(Harvestdor::PURL_DEFAULT)
        expect(no_args.http_options).to eql(Confstruct::Configuration.new(Harvestdor::HTTP_OPTIONS_DEFAULT))
        expect(no_args.oai_client_debug).to eql(Harvestdor::OAI_CLIENT_DEBUG_DEFAULT)
        expect(no_args.oai_repository_url).to eql(Harvestdor::OAI_REPOSITORY_URL_DEFAULT)
        expect(no_args.default_metadata_prefix).to eql(Harvestdor::DEFAULT_METADATA_PREFIX)
        expect(no_args.default_from_date).to eql(Harvestdor::DEFAULT_FROM_DATE)
        expect(no_args.default_until_date).to eql(Harvestdor::DEFAULT_UNTIL_DATE)
        expect(no_args.default_set).to eql(Harvestdor::DEFAULT_SET)
      end
    end
  end # initialize client

  it "should allow direct setting of configuration attributes" do
    conf = Harvestdor::Client.new.config
    expect(conf.log_dir).to eql(Harvestdor::LOG_DIR_DEFAULT)
    conf['log_dir'] = 'my_log_dir'
    expect(conf.log_dir).to eql('my_log_dir')
  end

  describe "logging" do
    it "should write the log file to the directory indicated by log_dir" do
      @client_via_yml_only.logger.info("harvestdor_client_spec logging test message")
      # File.exist? rather than the deprecated File.exists? (removed in Ruby 3.2)
      expect(File.exist?(File.join(@yaml['log_dir'], Harvestdor::LOG_NAME_DEFAULT))).to eql(true)
    end
  end

  context "oai_client" do
    before(:all) do
      @client = Harvestdor::Client.new
      @default_oai_client = Harvestdor::Client.new.oai_client
    end

    it "oai_client should return an OAI::Client object based on config data" do
      expect(@default_oai_client).to be_an_instance_of(OAI::Client)
    end

    it "oai_client should have an http_client" do
      expect(@default_oai_client.instance_variable_get(:@http_client)).to be_an_instance_of(Faraday::Connection)
    end

    context "oai_http_client (protected method)" do
      before(:all) do
        @http_client = @client.send(:oai_http_client)
      end
      it "should be a Faraday object" do
        expect(@http_client).to be_an_instance_of(Faraday::Connection)
      end
      it "should have the oai_provider url from config" do
        uri_obj = @http_client.url_prefix
        # escape host/path so '.' and other regexp metacharacters match literally
        expect(@client.config.oai_repository_url).to match(Regexp.new(Regexp.escape(uri_obj.host + uri_obj.path)))
      end
    end
  end # context oai_client

end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
# Specs for Harvestdor.druid: the druid must be extracted from a raw OAI
# identifier string, from an OAI::Header, and from an OAI::Record.
describe Harvestdor do

  context "#druid" do
    # identifier shared by all examples; its druid part is 'foo'
    let(:oai_id) { 'oai:searchworks.stanford.edu/druid:foo' }
    # a header carrying that identifier
    let(:oai_header) { OAI::Header.new(nil).tap { |hdr| hdr.identifier = oai_id } }

    it "should return the druid part of an oai identifier" do
      Harvestdor.druid(oai_id).should == 'foo'
    end

    it "should work with OAI::Header as argument" do
      Harvestdor.druid(oai_header).should == 'foo'
    end

    it "should work with OAI::Record as argument" do
      record = OAI::Record.new(nil)
      record.header = oai_header
      Harvestdor.druid(record).should == 'foo'
    end
  end

end
|
@@ -0,0 +1,220 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Unit specs for the OAI harvesting methods of Harvestdor::Client.
# All OAI traffic is stubbed: the client's oai_client is made to return test
# doubles that respond to #entries and #resumption_token.
describe 'Harvestdor::Client oai harvesting' do
  before(:all) do
    @harvestdor_client = Harvestdor::Client.new
    @oai_arg_defaults = {:metadata_prefix => @harvestdor_client.config.default_metadata_prefix,
                         :from => @harvestdor_client.config.default_from_date,
                         :until => @harvestdor_client.config.default_until_date,
                         :set => @harvestdor_client.config.default_set }
  end

  describe "druids_via_oai" do
    before(:each) do
      # kept in an instance variable so individual examples can re-stub
      # #entries on the very double the client will receive
      @oai_response = double('oai_response')
      @oai_response.stub(:entries).and_return(['foo', 'bar'])
      @oai_response.stub(:resumption_token).and_return('')
      @harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
        @oai_response
      }
    end
    it "should return druids" do
      header1 = OAI::Header.new(nil)
      header1.identifier = 'oai:searchworks.stanford.edu/druid:foo'
      header2 = OAI::Header.new(nil)
      header2.identifier = 'oai:searchworks.stanford.edu/druid:bar'
      # feed real headers through the stubbed response so the
      # header -> druid extraction is actually exercised
      @oai_response.stub(:entries).and_return([header1, header2])
      @harvestdor_client.druids_via_oai.should == ['foo', 'bar']
    end
    it "should have results viewable as an array" do
      @harvestdor_client.druids_via_oai.should be_an_instance_of(Array)
    end
    it "should have enumerable results" do
      @harvestdor_client.druids_via_oai.should respond_to(:each, :count)
    end
    it "should yield to a passed block" do
      expect { |b| @harvestdor_client.druids_via_oai(&b) }.to yield_successive_args('foo', 'bar')
    end
  end

  describe "oai_records" do
    before(:each) do
      @oai_response = double('oai_response')
      @oai_response.stub(:entries).and_return([1, 2])
      @oai_response.stub(:resumption_token).and_return('')
      @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
        @oai_response
      }
    end
    it "should return OAI::Record objects" do
      header1 = OAI::Header.new(nil)
      header1.identifier = 'oai:searchworks.stanford.edu/druid:foo'
      oai_rec1 = OAI::Record.new(nil)
      oai_rec1.header = header1
      header2 = OAI::Header.new(nil)
      header2.identifier = 'oai:searchworks.stanford.edu/druid:bar'
      oai_rec2 = OAI::Record.new(nil)
      oai_rec2.header = header2
      @oai_response.stub(:entries).and_return([oai_rec1, oai_rec2])
      @harvestdor_client.oai_records.should == [oai_rec1, oai_rec2]
    end
    it "should have results viewable as an array" do
      @harvestdor_client.oai_records.should be_an_instance_of(Array)
    end
    it "should have enumerable results" do
      @harvestdor_client.oai_records.should respond_to(:each, :count)
    end
    it "should yield to a passed block" do
      expect { |b| @harvestdor_client.oai_records(&b) }.to yield_successive_args(1, 2)
    end
  end

  describe "oai_headers" do
    before(:each) do
      @oai_response = double('oai_response')
      @oai_response.stub(:entries).and_return([1, 2])
      @oai_response.stub(:resumption_token).and_return('')
      @harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
        @oai_response
      }
    end
    it "should return OAI::Header objects" do
      header1 = OAI::Header.new(nil)
      header1.identifier = 'oai:searchworks.stanford.edu/druid:foo'
      header2 = OAI::Header.new(nil)
      header2.identifier = 'oai:searchworks.stanford.edu/druid:bar'
      @oai_response.stub(:entries).and_return([header1, header2])
      @harvestdor_client.oai_headers.should == [header1, header2]
    end
    it "should have results viewable as an array" do
      @harvestdor_client.oai_headers.should be_an_instance_of(Array)
    end
    it "should have enumerable results" do
      @harvestdor_client.oai_headers.should respond_to(:each, :count)
    end
    it "should yield to a passed block" do
      expect { |b| @harvestdor_client.oai_headers(&b) }.to yield_successive_args(1, 2)
    end
  end

  describe "oai_record (single record request)" do
    it "should return OAI::Record object" do
      oai_rec = OAI::Record.new(nil)
      oai_resp = double('oai_response')
      oai_resp.stub(:record).and_return(oai_rec)
      @harvestdor_client.oai_client.stub(:get_record) {
        oai_resp
      }
      @harvestdor_client.oai_record('druid').should == oai_rec
      @harvestdor_client.oai_record('druid', 'mods').should == oai_rec
    end
  end

  describe "scrub_oai_args" do
    before(:all) do
      # expected args are the client defaults minus nil/empty values;
      # built with reject so the hash is not modified while being iterated
      @expected_oai_args = @oai_arg_defaults.reject { |_key, value|
        value.nil? || value.size == 0
      }
    end
    it "should use client's default values for OAI arguments if they are not present in the method param hash" do
      @harvestdor_client.send(:scrub_oai_args).should == @expected_oai_args
    end
    it "should use OAI arguments from the method param hash if they are present" do
      passed_options = {:metadata_prefix => 'mods', :from => '2012-11-30'}
      @harvestdor_client.send(:scrub_oai_args, passed_options).should == @expected_oai_args.merge(passed_options)
    end
    it "should use nil value for option when it is passed in options hash" do
      client = Harvestdor::Client.new({:default_from_date => '2012-01-01'})
      client.config.default_from_date.should == '2012-01-01'
      passed_options = {:from => nil}
      client.send(:scrub_oai_args, passed_options)[:from].should == nil
    end
  end

  describe "harvest" do
    it "should perform a list_records OAI request when first arg is :list_records" do
      oai_response = double('oai_response')
      oai_response.stub(:entries).and_return([])
      @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
        oai_response
      }
      @harvestdor_client.oai_client.should_receive(:list_records)
      @harvestdor_client.send(:harvest, :list_records, {})
    end

    it "should perform a list_identifiers OAI request when first arg is :list_identifiers" do
      oai_response = double('oai_response')
      oai_response.stub(:entries).and_return([])
      @harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
        oai_response
      }
      @harvestdor_client.oai_client.should_receive(:list_identifiers)
      @harvestdor_client.send(:harvest, :list_identifiers, {})
    end

    it "should use passed OAI arguments" do
      oai_response = double('oai_response')
      oai_response.stub(:entries).and_return([])
      @harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
        oai_response
      }
      oai_options_hash = {:metadata_prefix => 'mods', :from => '2012-11-30'}
      @harvestdor_client.oai_client.should_receive(:list_identifiers).with(oai_options_hash)
      @harvestdor_client.send(:harvest, :list_identifiers, oai_options_hash)
    end

    it "should yield to a passed block" do
      oai_response = double('oai_response')
      oai_response.stub(:entries).and_return([1, 2])
      oai_response.stub(:resumption_token).and_return('')
      @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
        oai_response
      }
      expect { |b| @harvestdor_client.send(:harvest, :list_records, {}, &b) }.to yield_successive_args(1, 2)
    end

    context "resumption tokens" do
      it "should stop processing when no records/headers are received" do
        oai_response = double('oai_response')
        oai_response.stub(:entries).and_return([])
        @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
          oai_response
        }

        i = 0
        @harvestdor_client.send(:harvest, :list_records, {}) { |record| i += 1 }
        i.should == 0
      end

      it "should stop processing when the resumption token is empty" do
        oai_response_with_token = double('oai_response')
        oai_response_with_token.stub(:entries).and_return([1,2,3,4,5])
        oai_response_with_token.stub(:resumption_token).and_return('')
        @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
          oai_response_with_token
        }

        i = 0
        @harvestdor_client.send(:harvest, :list_records, {}) { |record| i += 1 }
        i.should == 5
      end

      it "should stop processing when there was no resumption token" do
        oai_response_with_token = double('oai_response')
        oai_response_with_token.stub(:entries).and_return([1,2,3,4,5])
        oai_response_with_token.stub(:resumption_token).and_return(nil)
        @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
          oai_response_with_token
        }

        i = 0
        @harvestdor_client.send(:harvest, :list_records, {}) { |record| i += 1 }
        expect(i).to eql(5)
      end
    end # resumption tokens
  end

end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
# Integration specs (tagged :integration => true) that talk to the real test
# and production OAI providers, using the settings in spec/config/oai.yml.
describe 'Harvestdor::Client OAI Harvesting Integration Tests', :integration => true do

  before(:all) do
    @config_yml_path = File.join(File.dirname(__FILE__), "config", "oai.yml")
  end

  context "test OAI server" do
    before(:all) do
      # debug flag is a boolean, matching the documented oai_client_debug setting
      @test_hclient ||= Harvestdor::Client.new({:config_yml_path => @config_yml_path, :oai_client_debug => true, :oai_repository_url => 'https://dor-oaiprovider-test.stanford.edu/oai'})
    end
    context "withOUT resumption tokens" do
      before(:all) do
        @oai_args = {:metadata_prefix => 'mods', :from => nil, :until => nil, :set => 'is_governed_by_hy787xj5878'}
      end
      it "should be able to harvest headers" do
        headers = @test_hclient.oai_headers(@oai_args)
        headers.should be_an_instance_of(Array)
        headers.size.should > 0
        headers.size.should < 50  # no resumption token
        headers.first.should be_an_instance_of(OAI::Header)
      end
      it "should be able to harvest records" do
        records = @test_hclient.oai_records(@oai_args)
        records.should be_an_instance_of(Array)
        records.size.should > 0
        records.size.should < 50  # no resumption token
        records.first.should be_an_instance_of(OAI::Record)
      end
    end
    context "with resumption tokens" do
      before(:all) do
        @oai_args = {:metadata_prefix => 'mods', :from => nil, :until => nil, :set => 'is_member_of_kh678dr8608'}
      end
      it "should be able to harvest headers" do
        skip "need to find small set > 50 on test"
        headers = @test_hclient.oai_headers(@oai_args)
        headers.should be_an_instance_of(Array)
        headers.size.should > 50
        headers.first.should be_an_instance_of(OAI::Header)
      end
      it "should be able to harvest records" do
        # skipped for the same reason as the headers example above; the body
        # calls oai_records, the method every other records example uses
        skip "need to find small set > 50 on test"
        records = @test_hclient.oai_records(@oai_args)
        records.should be_an_instance_of(Array)
        records.size.should > 50
        records.first.should be_an_instance_of(OAI::Record)
      end
    end
    context "oai_record (single record request)" do
      before(:all) do
        @rec = @test_hclient.oai_record('jt959wc5586')
      end
      it "should get a single OAI::Record object" do
        @rec.should be_an_instance_of(OAI::Record)
      end
      it "should keep utf-8 encoded characters intact" do
        xml = Nokogiri::XML(@rec.metadata.to_s)
        xml.remove_namespaces!
        xml.root.xpath('/metadata/mods/titleInfo/subTitle').text.should =~ /^recueil complet des débats législatifs & politiques des chambres françaises/
      end
    end
  end

  context "production OAI server" do
    before(:all) do
      @prod_hclient ||= Harvestdor::Client.new({:config_yml_path => @config_yml_path, :oai_repository_url => 'https://dor-oaiprovider-prod.stanford.edu/oai'})
    end
    context "withOUT resumption tokens" do
      before(:all) do
        # Reid-Dennis: 47 objects
        @oai_args = {:metadata_prefix => 'mods', :from => nil, :until => nil, :set => 'is_governed_by_sd064kn5856'}
      end
      it "should be able to harvest headers" do
        headers = @prod_hclient.oai_headers(@oai_args)
        headers.should be_an_instance_of(Array)
        headers.size.should > 0
        headers.size.should < 50  # no resumption token
        headers.first.should be_an_instance_of(OAI::Header)
      end
      it "should be able to harvest records" do
        records = @prod_hclient.oai_records(@oai_args)
        records.should be_an_instance_of(Array)
        records.size.should > 0
        records.size.should < 50  # no resumption token
        records.first.should be_an_instance_of(OAI::Record)
      end
    end
    context "with resumption tokens" do
      before(:all) do
        # Archives Parlementaires - 8x objects
        @oai_args = {:metadata_prefix => 'mods', :from => nil, :until => nil, :set => 'is_member_of_collection_jh957jy1101'}
      end
      it "should be able to harvest headers" do
        headers = @prod_hclient.oai_headers(@oai_args)
        headers.should be_an_instance_of(Array)
        headers.size.should > 50
        headers.first.should be_an_instance_of(OAI::Header)
      end
      it "should be able to harvest records" do
        pending "the request always seems to time out"
        records = @prod_hclient.oai_records(@oai_args)
        records.should be_an_instance_of(Array)
        records.size.should > 50
        records.first.should be_an_instance_of(OAI::Record)
      end
    end
    context "oai_record (single record request)" do
      before(:all) do
        @rec = @prod_hclient.oai_record('jt959wc5586')
      end
      it "should get a single OAI::Record object" do
        @rec.should be_an_instance_of(OAI::Record)
      end
      it "should keep utf-8 encoded characters intact" do
        xml = Nokogiri::XML(@rec.metadata.to_s)
        xml.remove_namespaces!
        xml.root.xpath('/metadata/mods/titleInfo/subTitle').text.should =~ /^recueil complet des débats législatifs & politiques des chambres françaises/
      end
    end
  end

end
|