harvestdor 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +25 -0
- data/.travis.yml +14 -0
- data/.yardopts +3 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +5 -0
- data/README.rdoc +162 -0
- data/Rakefile +50 -0
- data/harvestdor.gemspec +41 -0
- data/lib/harvestdor/errors.rb +12 -0
- data/lib/harvestdor/oai_harvest.rb +115 -0
- data/lib/harvestdor/purl_xml.rb +200 -0
- data/lib/harvestdor/version.rb +3 -0
- data/lib/harvestdor.rb +121 -0
- data/spec/config/oai.yml +37 -0
- data/spec/harvestdor_client_spec.rb +135 -0
- data/spec/harvestdor_spec.rb +23 -0
- data/spec/oai_harvest_spec.rb +220 -0
- data/spec/oai_integration_spec.rb +125 -0
- data/spec/purl_xml_spec.rb +194 -0
- data/spec/spec_helper.rb +21 -0
- metadata +211 -0
data/lib/harvestdor.rb
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'harvestdor/errors'
require 'harvestdor/oai_harvest'
require 'harvestdor/purl_xml'
require 'harvestdor/version'
# external gems
require 'confstruct'
require 'oai'
# stdlib
require 'fileutils'
require 'logger'
require 'open-uri'
require 'yaml'
|
12
|
+
|
13
|
+
module Harvestdor
|
14
|
+
|
15
|
+
# --- logging defaults ---
# Name of the log file, and the directory it is written to
# (a "logs" directory one level above the directory containing this file).
LOG_NAME_DEFAULT = 'harvestdor.log'
LOG_DIR_DEFAULT = File.join(File.dirname(__FILE__), '..', 'logs')

# --- purl defaults ---
PURL_DEFAULT = 'http://purl.stanford.edu'

# --- http client defaults ---
# Option hash handed to the http client used for OAI requests.
# NOTE(review): 'ssl' => 'verify' is false here, i.e. certificate verification is
# presumably disabled unless the caller overrides http_options -- confirm this is intended.
HTTP_OPTIONS_DEFAULT = {
  'ssl' => { 'verify' => false },
  'request' => {
    'timeout' => 60,     # open/read timeout (seconds)
    'open_timeout' => 60 # connection open timeout (seconds)
  }
}

# --- OAI harvesting defaults ---
OAI_CLIENT_DEBUG_DEFAULT = false
OAI_REPOSITORY_URL_DEFAULT = 'https://dor-oaiprovider-prod.stanford.edu/oai'
DEFAULT_METADATA_PREFIX = 'mods'
# nil means "no default restriction" for the from date, until date and set
DEFAULT_FROM_DATE = nil
DEFAULT_UNTIL_DATE = nil
DEFAULT_SET = nil
|
32
|
+
|
33
|
+
class Client

  # Default values for the construction of Harvestdor::Client objects.
  # Built once from the module-level default constants and memoized on the class.
  # @return [Confstruct::Configuration] the shared default configuration
  def self.default_config
    @class_config ||= Confstruct::Configuration.new({
      :log_dir => LOG_DIR_DEFAULT,
      :log_name => LOG_NAME_DEFAULT,
      :purl => PURL_DEFAULT,
      :http_options => HTTP_OPTIONS_DEFAULT,
      :oai_repository_url => OAI_REPOSITORY_URL_DEFAULT,
      :oai_client_debug => OAI_CLIENT_DEBUG_DEFAULT,
      :default_metadata_prefix => DEFAULT_METADATA_PREFIX,
      :default_from_date => DEFAULT_FROM_DATE,
      :default_until_date => DEFAULT_UNTIL_DATE,
      :default_set => DEFAULT_SET
    })
  end

  # Memoized, lazily initialized loggers, one per log file path.
  #
  # Earlier versions kept a single logger for the whole class, so a client
  # configured with a different log_dir/log_name silently wrote to whichever
  # file had been opened first.  Loggers are now keyed by their full path.
  #
  # This is a class method, so it is public regardless of any visibility
  # keyword placed near it; it is kept public for backward compatibility.
  #
  # @param [String] log_dir directory in which the log file lives (created, including parents, if missing)
  # @param [String] log_name name of log file
  # @return [Logger] logger writing to log_dir/log_name, rotated daily
  def self.logger(log_dir, log_name)
    # mkdir_p (unlike Dir.mkdir) copes with missing parent directories
    FileUtils.mkdir_p(log_dir)
    log_path = File.join(log_dir, log_name)
    @loggers ||= {}
    @loggers[log_path] ||= Logger.new(log_path, 'daily')
  end

  # Initialize a new instance of Harvestdor::Client
  #
  # Settings are applied in this order, later ones winning: class defaults,
  # the yml file named by :config_yml_path (if given), the options hash,
  # and finally the optional block, which is yielded the config object.
  #
  # @param [Hash] options configuration overrides; may include :config_yml_path
  # @yield [config] the client's Confstruct::Configuration, for direct modification
  # @example
  #   client = Harvestdor::Client.new({ # Example with all possible options
  #     :log_dir => File.join(File.dirname(__FILE__), "..", "logs"),
  #     :log_name => 'harvestdor.log',
  #     :purl => 'http://purl.stanford.edu',
  #     :http_options => { 'ssl' => {
  #                           'verify' => false
  #                         },
  #                         'request' => {
  #                           'timeout' => 30, # open/read timeout (seconds)
  #                           'open_timeout' => 30 # connection open timeout (seconds)
  #                         }
  #                       },
  #     :oai_repository_url => 'https://dor-oaiprovider-prod.stanford.edu/oai', # The OAI repository to connect to
  #     :oai_client_debug => false,
  #     :default_metadata_prefix => 'mods',
  #     :default_from_date => '2012-12-01',
  #     :default_until_date => '2014-12-01',
  #     :default_set => nil,
  #   })
  def initialize(options = {})
    config.configure(YAML.load_file(options[:config_yml_path])) if options[:config_yml_path]
    config.configure options
    yield(config) if block_given?
  end

  # @return [Confstruct::Configuration] this client's configuration, seeded from default_config
  def config
    @config ||= Confstruct::Configuration.new(self.class.default_config)
  end

  # @return [OAI::Client] an instantiated OAI::Client object, based on config options
  def oai_client
    @oai_client ||= OAI::Client.new config.oai_repository_url, :debug => config.oai_client_debug, :http => oai_http_client
  end

  # @return [Logger] logger for the log_dir and log_name in this client's config
  #   (memoized on first use; later config changes do not switch the log file)
  def logger
    @logger ||= self.class.logger(config.log_dir, config.log_name)
  end

  protected #---------------------------------------------------------------------

  # Http connection handed to the OAI client, built from config.http_options.
  # The construction is logged once, when the connection is actually created,
  # rather than on every call.
  # NOTE(review): Faraday is not required by this file; presumably it is loaded
  # through the oai gem -- confirm.
  # @return [Faraday::Connection] memoized connection for config.oai_repository_url
  def oai_http_client
    @oai_http_client ||= begin
      faraday_options = config.http_options.to_hash
      logger.info "Constructing OAI http client with faraday options #{faraday_options.inspect}"
      Faraday.new config.oai_repository_url, faraday_options
    end
  end

end # class Client
|
108
|
+
|
109
|
+
# Extract the druid from an OAI identifier.
# @param [Object] arg OAI::Header object or OAI::Record object or String (oai identifier)
# @return [String] the text after the last 'druid:' in the identifier, e.g. bb134cc1324
def self.druid(arg)
  # Reduce every accepted argument type to the identifier string first
  identifier =
    case arg
    when OAI::Header then arg.identifier
    when OAI::Record then arg.header.identifier
    else arg
    end
  identifier.split('druid:').last
end
|
120
|
+
|
121
|
+
end # module Harvestdor
|
data/spec/config/oai.yml
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# log_dir: directory for log file (default: logs, relative to harvestdor gem path)
|
2
|
+
log_dir: spec/test_logs
|
3
|
+
|
4
|
+
# log_name: name of log file (default: harvestdor.log)
|
5
|
+
|
6
|
+
# purl: url for the DOR purl server (used to get ContentMetadata, etc.)
|
7
|
+
# purl: http://purl-test.stanford.edu
|
8
|
+
|
9
|
+
# ---------- OAI harvesting parameters -----------
|
10
|
+
|
11
|
+
# oai_client_debug: true for OAI::Client debug mode (default: false)
|
12
|
+
|
13
|
+
# oai_repository_url: URL of the OAI data provider
|
14
|
+
oai_repository_url: https://dor-oaiprovider-test.stanford.edu/oai
|
15
|
+
|
16
|
+
# default_metadata_prefix: default metadata prefix to be used for harvesting (default: mods)
|
17
|
+
# can be overridden on calls to harvest_ids and harvest_records
|
18
|
+
default_metadata_prefix: mods
|
19
|
+
|
20
|
+
# default_from_date: default from date for harvest (default: nil)
|
21
|
+
# can be overridden on calls to harvest_ids and harvest_records
|
22
|
+
default_from_date: '2012-11-01'
|
23
|
+
|
24
|
+
# default_until_date: default until date for harvest (default: nil)
|
25
|
+
# can be overridden on calls to harvest_ids and harvest_records
|
26
|
+
|
27
|
+
# default_set: default set for harvest (default: nil)
|
28
|
+
# can be overridden on calls to harvest_ids and harvest_records
|
29
|
+
|
30
|
+
# Additional options to pass to Faraday http client (https://github.com/technoweenie/faraday)
|
31
|
+
# timeouts are in seconds; timeout -> open/read, open_timeout -> connection open
|
32
|
+
http_options:
|
33
|
+
ssl:
|
34
|
+
verify: false
|
35
|
+
request:
|
36
|
+
timeout: 121
|
37
|
+
open_timeout: 122
|
@@ -0,0 +1,135 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe Harvestdor::Client do
|
4
|
+
|
5
|
+
before(:all) do
|
6
|
+
@config_yml_path = File.join(File.dirname(__FILE__), "config", "oai.yml")
|
7
|
+
@client_via_yml_only = Harvestdor::Client.new({:config_yml_path => @config_yml_path})
|
8
|
+
require 'yaml'
|
9
|
+
@yaml = YAML.load_file(@config_yml_path)
|
10
|
+
end
|
11
|
+
|
12
|
+
describe "initialization" do
|
13
|
+
before(:all) do
|
14
|
+
@from_date = '2012-11-29'
|
15
|
+
@repo_url = 'http://my_oai_repo.org/oai'
|
16
|
+
end
|
17
|
+
context "attributes passed in hash argument" do
|
18
|
+
before(:all) do
|
19
|
+
@some_args = Harvestdor::Client.new({:default_from_date => @from_date, :oai_repository_url => @repo_url}).config
|
20
|
+
end
|
21
|
+
it "should set the attributes to the passed values" do
|
22
|
+
expect(@some_args.oai_repository_url).to eql(@repo_url)
|
23
|
+
expect(@some_args.default_from_date).to eql(@from_date)
|
24
|
+
end
|
25
|
+
it "should keep the defaults for attributes not in the hash argument" do
|
26
|
+
expect(@some_args.log_name).to eql(Harvestdor::LOG_NAME_DEFAULT)
|
27
|
+
expect(@some_args.log_dir).to eql(Harvestdor::LOG_DIR_DEFAULT)
|
28
|
+
expect(@some_args.purl).to eql(Harvestdor::PURL_DEFAULT)
|
29
|
+
expect(@some_args.http_options).to eql(Confstruct::Configuration.new(Harvestdor::HTTP_OPTIONS_DEFAULT))
|
30
|
+
expect(@some_args.oai_client_debug).to eql(Harvestdor::OAI_CLIENT_DEBUG_DEFAULT)
|
31
|
+
expect(@some_args.default_metadata_prefix).to eql(Harvestdor::DEFAULT_METADATA_PREFIX)
|
32
|
+
expect(@some_args.default_until_date).to eql(Harvestdor::DEFAULT_UNTIL_DATE)
|
33
|
+
expect(@some_args.default_set).to eql(Harvestdor::DEFAULT_SET)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
context "config_yml_path in hash argument" do
|
38
|
+
before(:all) do
|
39
|
+
@config_via_yml_only = @client_via_yml_only.config
|
40
|
+
end
|
41
|
+
it "should set attributes in yml file over defaults" do
|
42
|
+
expect(@config_via_yml_only.log_dir).to eql(@yaml['log_dir'])
|
43
|
+
expect(@config_via_yml_only.oai_repository_url).to eql(@yaml['oai_repository_url'])
|
44
|
+
expect(@config_via_yml_only.default_from_date).to eql(@yaml['default_from_date'])
|
45
|
+
expect(@config_via_yml_only.default_metadata_prefix).to eql(@yaml['default_metadata_prefix'])
|
46
|
+
expect(@config_via_yml_only.http_options.request.timeout).to eql(@yaml['http_options']['request']['timeout'])
|
47
|
+
end
|
48
|
+
it "should keep the defaults for attributes not present in yml file nor a config yml file" do
|
49
|
+
expect(@config_via_yml_only.log_name).to eql(Harvestdor::LOG_NAME_DEFAULT)
|
50
|
+
expect(@config_via_yml_only.purl).to eql(Harvestdor::PURL_DEFAULT)
|
51
|
+
expect(@config_via_yml_only.default_until_date).to eql(Harvestdor::DEFAULT_UNTIL_DATE)
|
52
|
+
expect(@config_via_yml_only.default_set).to eql(Harvestdor::DEFAULT_SET)
|
53
|
+
end
|
54
|
+
context "and some hash arguments" do
|
55
|
+
before(:all) do
|
56
|
+
@config_via_yml_plus = Harvestdor::Client.new({:config_yml_path => @config_yml_path,
|
57
|
+
:default_from_date => @from_date, :oai_repository_url => @repo_url}).config
|
58
|
+
end
|
59
|
+
it "should favor hash arg attribute values over yml file values" do
|
60
|
+
expect(@config_via_yml_plus.oai_repository_url).to eql(@repo_url)
|
61
|
+
expect(@config_via_yml_plus.default_from_date).to eql(@from_date)
|
62
|
+
end
|
63
|
+
# The timeouts sit under http_options -> request in the yml file, so the
# comparison has to go through `request` on both sides; the previous
# http_options.timeout / ['http_options']['timeout'] lookup matched neither
# structure and so did not check the yml value.
it "should favor yml file values over defaults" do
  expect(@config_via_yml_plus.log_dir).to eql(@yaml['log_dir'])
  expect(@config_via_yml_plus.default_metadata_prefix).to eql(@yaml['default_metadata_prefix'])
  expect(@config_via_yml_plus.http_options.request.timeout).to eql(@yaml['http_options']['request']['timeout'])
end
|
68
|
+
it "should keep the defaults for attributes not present in yml file" do
|
69
|
+
expect(@config_via_yml_plus.log_name).to eql(Harvestdor::LOG_NAME_DEFAULT)
|
70
|
+
expect(@config_via_yml_plus.default_until_date).to eql(Harvestdor::DEFAULT_UNTIL_DATE)
|
71
|
+
expect(@config_via_yml_plus.default_set).to eql(Harvestdor::DEFAULT_SET)
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
context "without hash arguments" do
|
77
|
+
it "should keep the defaults for all attributes" do
|
78
|
+
no_args = Harvestdor::Client.new.config
|
79
|
+
expect(no_args.log_name).to eql(Harvestdor::LOG_NAME_DEFAULT)
|
80
|
+
expect(no_args.log_dir).to eql(Harvestdor::LOG_DIR_DEFAULT)
|
81
|
+
expect(no_args.purl).to eql(Harvestdor::PURL_DEFAULT)
|
82
|
+
expect(no_args.http_options).to eql(Confstruct::Configuration.new(Harvestdor::HTTP_OPTIONS_DEFAULT))
|
83
|
+
expect(no_args.oai_client_debug).to eql(Harvestdor::OAI_CLIENT_DEBUG_DEFAULT)
|
84
|
+
expect(no_args.oai_repository_url).to eql(Harvestdor::OAI_REPOSITORY_URL_DEFAULT)
|
85
|
+
expect(no_args.default_metadata_prefix).to eql(Harvestdor::DEFAULT_METADATA_PREFIX)
|
86
|
+
expect(no_args.default_from_date).to eql(Harvestdor::DEFAULT_FROM_DATE)
|
87
|
+
expect(no_args.default_until_date).to eql(Harvestdor::DEFAULT_UNTIL_DATE)
|
88
|
+
expect(no_args.default_set).to eql(Harvestdor::DEFAULT_SET)
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end # initialize client
|
92
|
+
|
93
|
+
it "should allow direct setting of configuration attributes" do
|
94
|
+
conf = Harvestdor::Client.new.config
|
95
|
+
expect(conf.log_dir).to eql(Harvestdor::LOG_DIR_DEFAULT)
|
96
|
+
conf['log_dir'] = 'my_log_dir'
|
97
|
+
expect(conf.log_dir).to eql('my_log_dir')
|
98
|
+
end
|
99
|
+
|
100
|
+
describe "logging" do
|
101
|
+
# Writes one message through the yml-configured client, then checks that the
# default-named log file exists in the yml's log_dir.
# File.exist? replaces File.exists?, which is deprecated and removed in Ruby 3.2.
it "should write the log file to the directory indicated by log_dir" do
  @client_via_yml_only.logger.info("harvestdor_client_spec logging test message")
  expect(File.exist?(File.join(@yaml['log_dir'], Harvestdor::LOG_NAME_DEFAULT))).to eql(true)
end
|
105
|
+
end
|
106
|
+
|
107
|
+
context "oai_client" do
|
108
|
+
before(:all) do
|
109
|
+
@client = Harvestdor::Client.new
|
110
|
+
@default_oai_client = Harvestdor::Client.new.oai_client
|
111
|
+
end
|
112
|
+
|
113
|
+
it "oai_client should return an OAI::Client object based on config data" do
|
114
|
+
expect(@default_oai_client).to be_an_instance_of(OAI::Client)
|
115
|
+
end
|
116
|
+
|
117
|
+
it "oai_client should have an http_client" do
|
118
|
+
expect(@default_oai_client.instance_variable_get(:@http_client)).to be_an_instance_of(Faraday::Connection)
|
119
|
+
end
|
120
|
+
|
121
|
+
context "oai_http_client (protected method)" do
|
122
|
+
before(:all) do
|
123
|
+
@http_client = @client.send(:oai_http_client)
|
124
|
+
end
|
125
|
+
it "should be a Faraday object" do
|
126
|
+
expect(@http_client).to be_an_instance_of(Faraday::Connection)
|
127
|
+
end
|
128
|
+
it "should have the oai_provider url from config" do
|
129
|
+
uri_obj = @http_client.url_prefix
|
130
|
+
expect(@client.config.oai_repository_url).to match(Regexp.new(uri_obj.host + uri_obj.path))
|
131
|
+
end
|
132
|
+
end
|
133
|
+
end # context oai_client
|
134
|
+
|
135
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe Harvestdor do

  # Harvestdor.druid must accept a raw OAI identifier String, an OAI::Header,
  # or an OAI::Record, and return the part after 'druid:' in each case.
  # Uses the expect syntax, matching the Harvestdor::Client spec.
  context "#druid" do
    let(:oai_identifier) { 'oai:searchworks.stanford.edu/druid:foo' }
    let(:oai_header) do
      OAI::Header.new(nil).tap { |header| header.identifier = oai_identifier }
    end

    it "should return the druid part of an oai identifier" do
      expect(Harvestdor.druid(oai_identifier)).to eq('foo')
    end
    it "should work with OAI::Header as argument" do
      expect(Harvestdor.druid(oai_header)).to eq('foo')
    end
    it "should work with OAI::Record as argument" do
      oai_rec = OAI::Record.new(nil)
      oai_rec.header = oai_header
      expect(Harvestdor.druid(oai_rec)).to eq('foo')
    end
  end

end
|
@@ -0,0 +1,220 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Harvestdor::Client oai harvesting' do
|
4
|
+
before(:all) do
|
5
|
+
@harvestdor_client = Harvestdor::Client.new
|
6
|
+
@oai_arg_defaults = {:metadata_prefix => @harvestdor_client.config.default_metadata_prefix,
|
7
|
+
:from => @harvestdor_client.config.default_from_date,
|
8
|
+
:until => @harvestdor_client.config.default_until_date,
|
9
|
+
:set => @harvestdor_client.config.default_set }
|
10
|
+
end
|
11
|
+
|
12
|
+
describe "druids_via_oai" do
|
13
|
+
before(:each) do
|
14
|
+
oai_response = double('oai_response')
|
15
|
+
oai_response.stub(:entries).and_return(['foo', 'bar'])
|
16
|
+
oai_response.stub(:resumption_token).and_return('')
|
17
|
+
@harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
|
18
|
+
oai_response
|
19
|
+
}
|
20
|
+
end
|
21
|
+
# Previously the double carrying the OAI::Header entries was built but never
# handed to the stubbed list_identifiers call, so the example ran against the
# before(:each) stub (plain 'foo'/'bar' strings).  Re-stub list_identifiers
# here so the headers are what the client actually receives.
it "should return druids" do
  header1 = OAI::Header.new(nil)
  header1.identifier = 'oai:searchworks.stanford.edu/druid:foo'
  header2 = OAI::Header.new(nil)
  header2.identifier = 'oai:searchworks.stanford.edu/druid:bar'
  oai_response = double('oai_response')
  oai_response.stub(:entries).and_return([header1, header2])
  # empty token, as in the before(:each) stub
  oai_response.stub(:resumption_token).and_return('')
  @harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
    oai_response
  }
  @harvestdor_client.druids_via_oai.should == ['foo', 'bar']
end
|
30
|
+
it "should have results viewable as an array" do
|
31
|
+
@harvestdor_client.druids_via_oai.should be_an_instance_of(Array)
|
32
|
+
end
|
33
|
+
it "should have enumerable results" do
|
34
|
+
@harvestdor_client.druids_via_oai.should respond_to(:each, :count)
|
35
|
+
end
|
36
|
+
it "should yield to a passed block" do
|
37
|
+
expect { |b| @harvestdor_client.druids_via_oai(&b) }.to yield_successive_args('foo', 'bar')
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
describe "oai_records" do
|
42
|
+
before(:each) do
|
43
|
+
@oai_response = double('oai_response')
|
44
|
+
@oai_response.stub(:entries).and_return([1, 2])
|
45
|
+
@oai_response.stub(:resumption_token).and_return('')
|
46
|
+
@harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
|
47
|
+
@oai_response
|
48
|
+
}
|
49
|
+
end
|
50
|
+
it "should return OAI::Record objects" do
|
51
|
+
header1 = OAI::Header.new(nil)
|
52
|
+
header1.identifier = 'oai:searchworks.stanford.edu/druid:foo'
|
53
|
+
oai_rec1 = OAI::Record.new(nil)
|
54
|
+
oai_rec1.header = header1
|
55
|
+
header2 = OAI::Header.new(nil)
|
56
|
+
header2.identifier = 'oai:searchworks.stanford.edu/druid:bar'
|
57
|
+
oai_rec2 = OAI::Record.new(nil)
|
58
|
+
oai_rec2.header = header2
|
59
|
+
@oai_response.stub(:entries).and_return([oai_rec1, oai_rec2])
|
60
|
+
@harvestdor_client.oai_records.should == [oai_rec1, oai_rec2]
|
61
|
+
end
|
62
|
+
it "should have results viewable as an array" do
|
63
|
+
@harvestdor_client.oai_records.should be_an_instance_of(Array)
|
64
|
+
end
|
65
|
+
it "should have enumerable results" do
|
66
|
+
@harvestdor_client.oai_records.should respond_to(:each, :count)
|
67
|
+
end
|
68
|
+
it "should yield to a passed block" do
|
69
|
+
expect { |b| @harvestdor_client.oai_records(&b) }.to yield_successive_args(1, 2)
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
describe "oai_headers" do
|
74
|
+
before(:each) do
|
75
|
+
@oai_response = double('oai_response')
|
76
|
+
@oai_response.stub(:entries).and_return([1, 2])
|
77
|
+
@oai_response.stub(:resumption_token).and_return('')
|
78
|
+
@harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
|
79
|
+
@oai_response
|
80
|
+
}
|
81
|
+
end
|
82
|
+
it "should return OAI::Header objects" do
|
83
|
+
header1 = OAI::Header.new(nil)
|
84
|
+
header1.identifier = 'oai:searchworks.stanford.edu/druid:foo'
|
85
|
+
header2 = OAI::Header.new(nil)
|
86
|
+
header2.identifier = 'oai:searchworks.stanford.edu/druid:bar'
|
87
|
+
@oai_response.stub(:entries).and_return([header1, header2])
|
88
|
+
@harvestdor_client.oai_headers.should == [header1, header2]
|
89
|
+
end
|
90
|
+
it "should have results viewable as an array" do
|
91
|
+
@harvestdor_client.oai_headers.should be_an_instance_of(Array)
|
92
|
+
end
|
93
|
+
it "should have enumerable results" do
|
94
|
+
@harvestdor_client.oai_headers.should respond_to(:each, :count)
|
95
|
+
end
|
96
|
+
it "should yield to a passed block" do
|
97
|
+
expect { |b| @harvestdor_client.oai_headers(&b) }.to yield_successive_args(1, 2)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
describe "oai_record (single record request)" do
|
102
|
+
it "should return OAI::Record object" do
|
103
|
+
oai_rec = OAI::Record.new(nil)
|
104
|
+
oai_resp = double('oai_response')
|
105
|
+
oai_resp.stub(:record).and_return(oai_rec)
|
106
|
+
@harvestdor_client.oai_client.stub(:get_record) {
|
107
|
+
oai_resp
|
108
|
+
}
|
109
|
+
@harvestdor_client.oai_record('druid').should == oai_rec
|
110
|
+
@harvestdor_client.oai_record('druid', 'mods').should == oai_rec
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
describe "scrub_oai_args" do
|
115
|
+
# Expected arguments are the client's OAI defaults minus any nil or empty
# values.  Hash#reject builds the filtered copy directly instead of deleting
# keys from a hash while iterating over it.
before(:all) do
  @expected_oai_args = @oai_arg_defaults.reject { |_key, value| value.nil? || value.size == 0 }
end
|
122
|
+
it "should use client's default values for OAI arguments if they are not present in the method param hash" do
|
123
|
+
@harvestdor_client.send(:scrub_oai_args).should == @expected_oai_args
|
124
|
+
end
|
125
|
+
it "should use OAI arguments from the method param hash if they are present" do
|
126
|
+
passed_options = {:metadata_prefix => 'mods', :from => '2012-11-30'}
|
127
|
+
@harvestdor_client.send(:scrub_oai_args, passed_options).should == @expected_oai_args.merge(passed_options)
|
128
|
+
end
|
129
|
+
it "should use nil value for option when it is passed in options hash" do
|
130
|
+
client = Harvestdor::Client.new({:default_from_date => '2012-01-01'})
|
131
|
+
client.config.default_from_date.should == '2012-01-01'
|
132
|
+
passed_options = {:from => nil}
|
133
|
+
client.send(:scrub_oai_args, passed_options)[:from].should == nil
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
describe "harvest" do
|
138
|
+
it "should perform a list_records OAI request when first arg is true" do
|
139
|
+
oai_response = double('oai_response')
|
140
|
+
oai_response.stub(:entries).and_return([])
|
141
|
+
@harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
|
142
|
+
oai_response
|
143
|
+
}
|
144
|
+
@harvestdor_client.oai_client.should_receive(:list_records)
|
145
|
+
@harvestdor_client.send(:harvest, :list_records, {})
|
146
|
+
end
|
147
|
+
|
148
|
+
it "should perform a list_identifiers OAI request when first arg is false" do
|
149
|
+
oai_response = double('oai_response')
|
150
|
+
oai_response.stub(:entries).and_return([])
|
151
|
+
@harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
|
152
|
+
oai_response
|
153
|
+
}
|
154
|
+
@harvestdor_client.oai_client.should_receive(:list_identifiers)
|
155
|
+
@harvestdor_client.send(:harvest, :list_identifiers, {})
|
156
|
+
end
|
157
|
+
|
158
|
+
it "should use passed OAI arguments" do
|
159
|
+
oai_response = double('oai_response')
|
160
|
+
oai_response.stub(:entries).and_return([])
|
161
|
+
@harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
|
162
|
+
oai_response
|
163
|
+
}
|
164
|
+
oai_options_hash = {:metadata_prefix => 'mods', :from => '2012-11-30'}
|
165
|
+
@harvestdor_client.oai_client.should_receive(:list_identifiers).with(oai_options_hash)
|
166
|
+
@harvestdor_client.send(:harvest, :list_identifiers, oai_options_hash)
|
167
|
+
end
|
168
|
+
|
169
|
+
it "should yield to a passed block" do
|
170
|
+
oai_response = double('oai_response')
|
171
|
+
oai_response.stub(:entries).and_return([1, 2])
|
172
|
+
oai_response.stub(:resumption_token).and_return('')
|
173
|
+
@harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
|
174
|
+
oai_response
|
175
|
+
}
|
176
|
+
expect { |b| @harvestdor_client.send(:harvest, :list_records, {}, &b) }.to yield_successive_args(1, 2)
|
177
|
+
end
|
178
|
+
|
179
|
+
context "resumption tokens" do
|
180
|
+
it "should stop processing when no records/headers are received" do
|
181
|
+
oai_response = double('oai_response')
|
182
|
+
oai_response.stub(:entries).and_return([])
|
183
|
+
@harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
|
184
|
+
oai_response
|
185
|
+
}
|
186
|
+
|
187
|
+
i = 0
|
188
|
+
@harvestdor_client.send(:harvest, :list_records, {}) { |record| i += 1 }
|
189
|
+
i.should == 0
|
190
|
+
end
|
191
|
+
|
192
|
+
it "should stop processing when the resumption token is empty" do
|
193
|
+
oai_response_with_token = double('oai_response')
|
194
|
+
oai_response_with_token.stub(:entries).and_return([1,2,3,4,5])
|
195
|
+
oai_response_with_token.stub(:resumption_token).and_return('')
|
196
|
+
@harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
|
197
|
+
oai_response_with_token
|
198
|
+
}
|
199
|
+
|
200
|
+
i = 0
|
201
|
+
@harvestdor_client.send(:harvest, :list_records, {}) { |record| i += 1 }
|
202
|
+
i.should == 5
|
203
|
+
end
|
204
|
+
|
205
|
+
it "should stop processing when there was no resumption token" do
|
206
|
+
oai_response_with_token = double('oai_response')
|
207
|
+
oai_response_with_token.stub(:entries).and_return([1,2,3,4,5])
|
208
|
+
oai_response_with_token.stub(:resumption_token).and_return(nil)
|
209
|
+
@harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
|
210
|
+
oai_response_with_token
|
211
|
+
}
|
212
|
+
|
213
|
+
i = 0
|
214
|
+
@harvestdor_client.send(:harvest, :list_records, {}) { |record| i += 1 }
|
215
|
+
expect(i).to eql(5)
|
216
|
+
end
|
217
|
+
end # resumption tokens
|
218
|
+
end
|
219
|
+
|
220
|
+
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe 'Harvestdor::Client OAI Harvesting Integration Tests', :integration => true do
|
5
|
+
|
6
|
+
before(:all) do
|
7
|
+
@config_yml_path = File.join(File.dirname(__FILE__), "config", "oai.yml")
|
8
|
+
end
|
9
|
+
|
10
|
+
context "test OAI server" do
|
11
|
+
before(:all) do
|
12
|
+
@test_hclient ||= Harvestdor::Client.new({:config_yml_path => @config_yml_path, :oai_client_debug => 'true', :oai_repository_url => 'https://dor-oaiprovider-test.stanford.edu/oai'})
|
13
|
+
end
|
14
|
+
context "withOUT resumption tokens" do
|
15
|
+
before(:all) do
|
16
|
+
@oai_args = {:metadata_prefix => 'mods', :from => nil, :until => nil, :set => 'is_governed_by_hy787xj5878'}
|
17
|
+
end
|
18
|
+
it "should be able to harvest headers" do
|
19
|
+
headers = @test_hclient.oai_headers(@oai_args)
|
20
|
+
headers.should be_an_instance_of(Array)
|
21
|
+
headers.size.should > 0
|
22
|
+
headers.size.should < 50 # no resumption token
|
23
|
+
headers.first.should be_an_instance_of(OAI::Header)
|
24
|
+
end
|
25
|
+
it "should be able to harvest records" do
|
26
|
+
records = @test_hclient.oai_records(@oai_args)
|
27
|
+
records.should be_an_instance_of(Array)
|
28
|
+
records.size.should > 0
|
29
|
+
records.size.should < 50 # no resumption token
|
30
|
+
records.first.should be_an_instance_of(OAI::Record)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
context "with resumption tokens" do
|
34
|
+
before(:all) do
|
35
|
+
@oai_args = {:metadata_prefix => 'mods', :from => nil, :until => nil, :set => 'is_member_of_kh678dr8608'}
|
36
|
+
end
|
37
|
+
it "should be able to harvest headers" do
|
38
|
+
skip "need to find small set > 50 on test"
|
39
|
+
headers = @test_hclient.oai_headers(@oai_args)
|
40
|
+
headers.should be_an_instance_of(Array)
|
41
|
+
headers.size.should > 50
|
42
|
+
headers.first.should be_an_instance_of(OAI::Header)
|
43
|
+
end
|
44
|
+
# Skipped until a suitable set (more than 50 objects) exists on the test server.
# Uses `skip`, like the matching headers example, so no request is made, and
# calls oai_records, the method every other records example in these specs uses.
# NOTE(review): this previously called harvest_records -- confirm no such method is still supported.
it "should be able to harvest records" do
  skip "need to find small set > 50 on test"
  records = @test_hclient.oai_records(@oai_args)
  records.should be_an_instance_of(Array)
  records.size.should > 50
  records.first.should be_an_instance_of(OAI::Record)
end
|
51
|
+
end
|
52
|
+
context "oai_record (single record request)" do
|
53
|
+
before(:all) do
|
54
|
+
@rec = @test_hclient.oai_record('jt959wc5586')
|
55
|
+
end
|
56
|
+
it "should get a single OAI::Record object" do
|
57
|
+
@rec.should be_an_instance_of(OAI::Record)
|
58
|
+
end
|
59
|
+
it "should keep utf-8 encoded characters intact" do
|
60
|
+
xml = Nokogiri::XML(@rec.metadata.to_s)
|
61
|
+
xml.remove_namespaces!
|
62
|
+
xml.root.xpath('/metadata/mods/titleInfo/subTitle').text.should =~ /^recueil complet des débats législatifs & politiques des chambres françaises/
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
context "production OAI server" do
|
68
|
+
before(:all) do
|
69
|
+
@prod_hclient ||= Harvestdor::Client.new({:config_yml_path => @config_yml_path, :oai_repository_url => 'https://dor-oaiprovider-prod.stanford.edu/oai'})
|
70
|
+
end
|
71
|
+
context "withOUT resumption tokens" do
|
72
|
+
before(:all) do
|
73
|
+
# Reid-Dennis: 47 objects
|
74
|
+
@oai_args = {:metadata_prefix => 'mods', :from => nil, :until => nil, :set => 'is_governed_by_sd064kn5856'}
|
75
|
+
end
|
76
|
+
it "should be able to harvest headers" do
|
77
|
+
headers = @prod_hclient.oai_headers(@oai_args)
|
78
|
+
headers.should be_an_instance_of(Array)
|
79
|
+
headers.size.should > 0
|
80
|
+
headers.size.should < 50 # no resumption token
|
81
|
+
headers.first.should be_an_instance_of(OAI::Header)
|
82
|
+
end
|
83
|
+
it "should be able to harvest records" do
|
84
|
+
records = @prod_hclient.oai_records(@oai_args)
|
85
|
+
records.should be_an_instance_of(Array)
|
86
|
+
records.size.should > 0
|
87
|
+
records.size.should < 50 # no resumption token
|
88
|
+
records.first.should be_an_instance_of(OAI::Record)
|
89
|
+
end
|
90
|
+
end
|
91
|
+
context "with resumption tokens" do
|
92
|
+
before(:all) do
|
93
|
+
# Archives Parlementaires - 8x objects
|
94
|
+
@oai_args = {:metadata_prefix => 'mods', :from => nil, :until => nil, :set => 'is_member_of_collection_jh957jy1101'}
|
95
|
+
end
|
96
|
+
it "should be able to harvest headers" do
|
97
|
+
headers = @prod_hclient.oai_headers(@oai_args)
|
98
|
+
headers.should be_an_instance_of(Array)
|
99
|
+
headers.size.should > 50
|
100
|
+
headers.first.should be_an_instance_of(OAI::Header)
|
101
|
+
end
|
102
|
+
it "should be able to harvest records" do
|
103
|
+
pending "the request always seems to time out"
|
104
|
+
records = @prod_hclient.oai_records(@oai_args)
|
105
|
+
records.should be_an_instance_of(Array)
|
106
|
+
records.size.should > 50
|
107
|
+
records.first.should be_an_instance_of(OAI::Record)
|
108
|
+
end
|
109
|
+
end
|
110
|
+
context "oai_record (single record request)" do
|
111
|
+
before(:all) do
|
112
|
+
@rec = @prod_hclient.oai_record('jt959wc5586')
|
113
|
+
end
|
114
|
+
it "should get a single OAI::Record object" do
|
115
|
+
@rec.should be_an_instance_of(OAI::Record)
|
116
|
+
end
|
117
|
+
it "should keep utf-8 encoded characters intact" do
|
118
|
+
xml = Nokogiri::XML(@rec.metadata.to_s)
|
119
|
+
xml.remove_namespaces!
|
120
|
+
xml.root.xpath('/metadata/mods/titleInfo/subTitle').text.should =~ /^recueil complet des débats législatifs & politiques des chambres françaises/
|
121
|
+
end
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
end
|