harvestdor 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/harvestdor.rb ADDED
@@ -0,0 +1,121 @@
1
+ require 'harvestdor/errors'
2
+ require 'harvestdor/oai_harvest'
3
+ require 'harvestdor/purl_xml'
4
+ require 'harvestdor/version'
5
+ # external gems
6
+ require 'confstruct'
7
+ require 'oai'
8
+ # stdlib
9
+ require 'logger'
10
+ require 'open-uri'
11
+ require 'yaml'
12
+
13
+ module Harvestdor
14
+
15
+ LOG_NAME_DEFAULT = "harvestdor.log"
16
+ LOG_DIR_DEFAULT = File.join(File.dirname(__FILE__), "..", "logs")
17
+ PURL_DEFAULT = 'http://purl.stanford.edu'
18
+ HTTP_OPTIONS_DEFAULT = { 'ssl' => {
19
+ 'verify' => false
20
+ },
21
+ 'request' => {
22
+ 'timeout' => 60, # open/read timeout (seconds)
23
+ 'open_timeout' => 60 # connection open timeout (seconds)
24
+ }
25
+ }
26
+ OAI_CLIENT_DEBUG_DEFAULT = false
27
+ OAI_REPOSITORY_URL_DEFAULT = 'https://dor-oaiprovider-prod.stanford.edu/oai'
28
+ DEFAULT_METADATA_PREFIX = 'mods'
29
+ DEFAULT_FROM_DATE = nil
30
+ DEFAULT_UNTIL_DATE = nil
31
+ DEFAULT_SET = nil
32
+
33
+ class Client
34
+
35
+ # Set default values for the construction of Harvestdor::Client objects
36
+ def self.default_config
37
+ @class_config ||= Confstruct::Configuration.new({
38
+ :log_dir => LOG_DIR_DEFAULT,
39
+ :log_name => LOG_NAME_DEFAULT,
40
+ :purl => PURL_DEFAULT,
41
+ :http_options => HTTP_OPTIONS_DEFAULT,
42
+ :oai_repository_url => OAI_REPOSITORY_URL_DEFAULT,
43
+ :oai_client_debug => OAI_CLIENT_DEBUG_DEFAULT,
44
+ :default_metadata_prefix => DEFAULT_METADATA_PREFIX,
45
+ :default_from_date => DEFAULT_FROM_DATE,
46
+ :default_until_date => DEFAULT_UNTIL_DATE,
47
+ :default_set => DEFAULT_SET
48
+ })
49
+ end
50
+
51
+ # Initialize a new instance of Harvestdor::Client
52
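+ # @param [Hash] options configuration overrides; an optional :config_yml_path names a YAML file that is loaded first, so the other options take precedence over it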
53
+ # @example
54
+ # client = Harvestdor::Client.new({ # Example with all possible options
55
+ # :log_dir => File.join(File.dirname(__FILE__), "..", "logs"),
56
+ # :log_name => 'harvestdor.log',
57
+ # :purl => 'http://purl.stanford.edu',
58
+ # :http_options => { 'ssl' => {
59
+ # 'verify' => false
60
+ # },
61
+ # 'request' => {
62
+ # 'timeout' => 30, # open/read timeout (seconds)
63
+ # 'open_timeout' => 30 # connection open timeout (seconds)
64
+ # }
65
+ # },
66
+ # :oai_repository_url => 'https://dor-oaiprovider-prod.stanford.edu/oai', # The OAI repository to connect to
67
+ # :oai_client_debug => false,
68
+ # :default_metadata_prefix => 'mods',
69
+ # :default_from_date => '2012-12-01',
70
+ # :default_until_date => '2014-12-01',
71
+ # :default_set => nil,
72
+ # })
73
+ def initialize options = {}
74
+ config.configure(YAML.load_file(options[:config_yml_path])) if options[:config_yml_path]
75
+ config.configure options
76
+ yield(config) if block_given?
77
+ end
78
+
79
+ def config
80
+ @config ||= Confstruct::Configuration.new(self.class.default_config)
81
+ end
82
+
83
+ # @return [OAI::Client] an instantiated OAI::Client object, based on config options
84
+ def oai_client
85
+ @oai_client ||= OAI::Client.new config.oai_repository_url, :debug => config.oai_client_debug, :http => oai_http_client
86
+ end
87
+
88
+ def logger
89
+ @logger ||= self.class.logger(config.log_dir, config.log_name)
90
+ end
91
+
92
+ protected #---------------------------------------------------------------------
93
+
94
+ def oai_http_client
95
+ logger.info "Constructing OAI http client with faraday options #{config.http_options.to_hash.inspect}"
96
+ @oai_http_client ||= Faraday.new config.oai_repository_url, config.http_options.to_hash
97
+ end
98
+
99
+ # Global, memoized, lazily initialized logger; it is created on the first call, so that call's log_dir and log_name are used for all later calls
100
+ # @param [String] log_dir directory in which the log file is written (created if it does not exist)
101
+ # @param [String] log_name name of log file
102
+ def self.logger(log_dir, log_name)
103
+ Dir.mkdir(log_dir) unless File.directory?(log_dir)
104
+ @logger ||= Logger.new(File.join(log_dir, log_name), 'daily')
105
+ end
106
+
107
+ end # class Client
108
+
109
+ # @param [Object] arg OAI::Header object or OAI::Record object or String (oai identifier)
110
+ # @return [String] the druid part of an OAI identifier in an OAI header, e.g. bb134cc1324
111
+ def self.druid(arg)
112
+ oai_id = arg
113
+ if arg.is_a?(OAI::Header)
114
+ oai_id = arg.identifier
115
+ elsif arg.is_a?(OAI::Record)
116
+ oai_id = arg.header.identifier
117
+ end
118
+ oai_id.split('druid:').last
119
+ end
120
+
121
+ end # module Harvestdor
@@ -0,0 +1,37 @@
1
+ # log_dir: directory for log file (default logs, relative to harvestdor gem path)
2
+ log_dir: spec/test_logs
3
+
4
+ # log_name: name of log file (default: harvestdor.log)
5
+
6
+ # purl: url for the DOR purl server (used to get ContentMetadata, etc.)
7
+ # purl: http://purl-test.stanford.edu
8
+
9
+ # ---------- OAI harvesting parameters -----------
10
+
11
+ # oai_client_debug: true for OAI::Client debug mode (default: false)
12
+
13
+ # oai_repository_url: URL of the OAI data provider
14
+ oai_repository_url: https://dor-oaiprovider-test.stanford.edu/oai
15
+
16
+ # default_metadata_prefix: default metadata prefix to be used for harvesting (default: mods)
17
+ # can be overridden on calls to harvest_ids and harvest_records
18
+ default_metadata_prefix: mods
19
+
20
+ # default_from_date: default from date for harvest (default: nil)
21
+ # can be overridden on calls to harvest_ids and harvest_records
22
+ default_from_date: '2012-11-01'
23
+
24
+ # default_until_date: default until date for harvest (default: nil)
25
+ # can be overridden on calls to harvest_ids and harvest_records
26
+
27
+ # default_set: default set for harvest (default: nil)
28
+ # can be overridden on calls to harvest_ids and harvest_records
29
+
30
+ # Additional options to pass to Faraday http client (https://github.com/technoweenie/faraday)
31
+ # timeouts are in seconds; timeout -> open/read, open_timeout -> connection open
32
+ http_options:
33
+ ssl:
34
+ verify: false
35
+ request:
36
+ timeout: 121
37
+ open_timeout: 122
@@ -0,0 +1,135 @@
1
+ require "spec_helper"
2
+
3
+ describe Harvestdor::Client do
4
+
5
+ before(:all) do
6
+ @config_yml_path = File.join(File.dirname(__FILE__), "config", "oai.yml")
7
+ @client_via_yml_only = Harvestdor::Client.new({:config_yml_path => @config_yml_path})
8
+ require 'yaml'
9
+ @yaml = YAML.load_file(@config_yml_path)
10
+ end
11
+
12
+ describe "initialization" do
13
+ before(:all) do
14
+ @from_date = '2012-11-29'
15
+ @repo_url = 'http://my_oai_repo.org/oai'
16
+ end
17
+ context "attributes passed in hash argument" do
18
+ before(:all) do
19
+ @some_args = Harvestdor::Client.new({:default_from_date => @from_date, :oai_repository_url => @repo_url}).config
20
+ end
21
+ it "should set the attributes to the passed values" do
22
+ expect(@some_args.oai_repository_url).to eql(@repo_url)
23
+ expect(@some_args.default_from_date).to eql(@from_date)
24
+ end
25
+ it "should keep the defaults for attributes not in the hash argument" do
26
+ expect(@some_args.log_name).to eql(Harvestdor::LOG_NAME_DEFAULT)
27
+ expect(@some_args.log_dir).to eql(Harvestdor::LOG_DIR_DEFAULT)
28
+ expect(@some_args.purl).to eql(Harvestdor::PURL_DEFAULT)
29
+ expect(@some_args.http_options).to eql(Confstruct::Configuration.new(Harvestdor::HTTP_OPTIONS_DEFAULT))
30
+ expect(@some_args.oai_client_debug).to eql(Harvestdor::OAI_CLIENT_DEBUG_DEFAULT)
31
+ expect(@some_args.default_metadata_prefix).to eql(Harvestdor::DEFAULT_METADATA_PREFIX)
32
+ expect(@some_args.default_until_date).to eql(Harvestdor::DEFAULT_UNTIL_DATE)
33
+ expect(@some_args.default_set).to eql(Harvestdor::DEFAULT_SET)
34
+ end
35
+ end
36
+
37
+ context "config_yml_path in hash argument" do
38
+ before(:all) do
39
+ @config_via_yml_only = @client_via_yml_only.config
40
+ end
41
+ it "should set attributes in yml file over defaults" do
42
+ expect(@config_via_yml_only.log_dir).to eql(@yaml['log_dir'])
43
+ expect(@config_via_yml_only.oai_repository_url).to eql(@yaml['oai_repository_url'])
44
+ expect(@config_via_yml_only.default_from_date).to eql(@yaml['default_from_date'])
45
+ expect(@config_via_yml_only.default_metadata_prefix).to eql(@yaml['default_metadata_prefix'])
46
+ expect(@config_via_yml_only.http_options.request.timeout).to eql(@yaml['http_options']['request']['timeout'])
47
+ end
48
+ it "should keep the defaults for attributes not present in yml file nor a config yml file" do
49
+ expect(@config_via_yml_only.log_name).to eql(Harvestdor::LOG_NAME_DEFAULT)
50
+ expect(@config_via_yml_only.purl).to eql(Harvestdor::PURL_DEFAULT)
51
+ expect(@config_via_yml_only.default_until_date).to eql(Harvestdor::DEFAULT_UNTIL_DATE)
52
+ expect(@config_via_yml_only.default_set).to eql(Harvestdor::DEFAULT_SET)
53
+ end
54
+ context "and some hash arguments" do
55
+ before(:all) do
56
+ @config_via_yml_plus = Harvestdor::Client.new({:config_yml_path => @config_yml_path,
57
+ :default_from_date => @from_date, :oai_repository_url => @repo_url}).config
58
+ end
59
+ it "should favor hash arg attribute values over yml file values" do
60
+ expect(@config_via_yml_plus.oai_repository_url).to eql(@repo_url)
61
+ expect(@config_via_yml_plus.default_from_date).to eql(@from_date)
62
+ end
63
+ it "should favor yml file values over defaults" do
64
+ expect(@config_via_yml_plus.log_dir).to eql(@yaml['log_dir'])
65
+ expect(@config_via_yml_plus.default_metadata_prefix).to eql(@yaml['default_metadata_prefix'])
66
+ expect(@config_via_yml_plus.http_options.timeout).to eql(@yaml['http_options']['timeout'])
67
+ end
68
+ it "should keep the defaults for attributes not present in yml file" do
69
+ expect(@config_via_yml_plus.log_name).to eql(Harvestdor::LOG_NAME_DEFAULT)
70
+ expect(@config_via_yml_plus.default_until_date).to eql(Harvestdor::DEFAULT_UNTIL_DATE)
71
+ expect(@config_via_yml_plus.default_set).to eql(Harvestdor::DEFAULT_SET)
72
+ end
73
+ end
74
+ end
75
+
76
+ context "without hash arguments" do
77
+ it "should keep the defaults for all attributes" do
78
+ no_args = Harvestdor::Client.new.config
79
+ expect(no_args.log_name).to eql(Harvestdor::LOG_NAME_DEFAULT)
80
+ expect(no_args.log_dir).to eql(Harvestdor::LOG_DIR_DEFAULT)
81
+ expect(no_args.purl).to eql(Harvestdor::PURL_DEFAULT)
82
+ expect(no_args.http_options).to eql(Confstruct::Configuration.new(Harvestdor::HTTP_OPTIONS_DEFAULT))
83
+ expect(no_args.oai_client_debug).to eql(Harvestdor::OAI_CLIENT_DEBUG_DEFAULT)
84
+ expect(no_args.oai_repository_url).to eql(Harvestdor::OAI_REPOSITORY_URL_DEFAULT)
85
+ expect(no_args.default_metadata_prefix).to eql(Harvestdor::DEFAULT_METADATA_PREFIX)
86
+ expect(no_args.default_from_date).to eql(Harvestdor::DEFAULT_FROM_DATE)
87
+ expect(no_args.default_until_date).to eql(Harvestdor::DEFAULT_UNTIL_DATE)
88
+ expect(no_args.default_set).to eql(Harvestdor::DEFAULT_SET)
89
+ end
90
+ end
91
+ end # initialize client
92
+
93
+ it "should allow direct setting of configuration attributes" do
94
+ conf = Harvestdor::Client.new.config
95
+ expect(conf.log_dir).to eql(Harvestdor::LOG_DIR_DEFAULT)
96
+ conf['log_dir'] = 'my_log_dir'
97
+ expect(conf.log_dir).to eql('my_log_dir')
98
+ end
99
+
100
+ describe "logging" do
101
+ it "should write the log file to the directory indicated by log_dir" do
102
+ @client_via_yml_only.logger.info("harvestdor_client_spec logging test message")
103
+ expect(File.exists?(File.join(@yaml['log_dir'], Harvestdor::LOG_NAME_DEFAULT))).to eql(true)
104
+ end
105
+ end
106
+
107
+ context "oai_client" do
108
+ before(:all) do
109
+ @client = Harvestdor::Client.new
110
+ @default_oai_client = Harvestdor::Client.new.oai_client
111
+ end
112
+
113
+ it "oai_client should return an OAI::Client object based on config data" do
114
+ expect(@default_oai_client).to be_an_instance_of(OAI::Client)
115
+ end
116
+
117
+ it "oai_client should have an http_client" do
118
+ expect(@default_oai_client.instance_variable_get(:@http_client)).to be_an_instance_of(Faraday::Connection)
119
+ end
120
+
121
+ context "oai_http_client (protected method)" do
122
+ before(:all) do
123
+ @http_client = @client.send(:oai_http_client)
124
+ end
125
+ it "should be a Faraday object" do
126
+ expect(@http_client).to be_an_instance_of(Faraday::Connection)
127
+ end
128
+ it "should have the oai_provider url from config" do
129
+ uri_obj = @http_client.url_prefix
130
+ expect(@client.config.oai_repository_url).to match(Regexp.new(uri_obj.host + uri_obj.path))
131
+ end
132
+ end
133
+ end # context oai_client
134
+
135
+ end
@@ -0,0 +1,23 @@
1
+ require "spec_helper"
2
+
3
+ describe Harvestdor do
4
+
5
+ context "#druid" do
6
+ it "should return the druid part of an oai identifier" do
7
+ Harvestdor.druid('oai:searchworks.stanford.edu/druid:foo').should == 'foo'
8
+ end
9
+ it "should work with OAI::Header as argument" do
10
+ header = OAI::Header.new(nil)
11
+ header.identifier = 'oai:searchworks.stanford.edu/druid:foo'
12
+ Harvestdor.druid(header).should == 'foo'
13
+ end
14
+ it "should work with OAI::Record as argument" do
15
+ oai_rec = OAI::Record.new(nil)
16
+ header = OAI::Header.new(nil)
17
+ header.identifier = 'oai:searchworks.stanford.edu/druid:foo'
18
+ oai_rec.header = header
19
+ Harvestdor.druid(oai_rec).should == 'foo'
20
+ end
21
+ end
22
+
23
+ end
@@ -0,0 +1,220 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'Harvestdor::Client oai harvesting' do
4
+ before(:all) do
5
+ @harvestdor_client = Harvestdor::Client.new
6
+ @oai_arg_defaults = {:metadata_prefix => @harvestdor_client.config.default_metadata_prefix,
7
+ :from => @harvestdor_client.config.default_from_date,
8
+ :until => @harvestdor_client.config.default_until_date,
9
+ :set => @harvestdor_client.config.default_set }
10
+ end
11
+
12
+ describe "druids_via_oai" do
13
+ before(:each) do
14
+ oai_response = double('oai_response')
15
+ oai_response.stub(:entries).and_return(['foo', 'bar'])
16
+ oai_response.stub(:resumption_token).and_return('')
17
+ @harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
18
+ oai_response
19
+ }
20
+ end
21
+ it "should return druids" do
22
+ header1 = OAI::Header.new(nil)
23
+ header1.identifier = 'oai:searchworks.stanford.edu/druid:foo'
24
+ header2 = OAI::Header.new(nil)
25
+ header2.identifier = 'oai:searchworks.stanford.edu/druid:bar'
26
+ oai_response = double('oai_response')
27
+ oai_response.stub(:entries).and_return([header1, header2])
28
+ @harvestdor_client.druids_via_oai.should == ['foo', 'bar']
29
+ end
30
+ it "should have results viewable as an array" do
31
+ @harvestdor_client.druids_via_oai.should be_an_instance_of(Array)
32
+ end
33
+ it "should have enumerable results" do
34
+ @harvestdor_client.druids_via_oai.should respond_to(:each, :count)
35
+ end
36
+ it "should yield to a passed block" do
37
+ expect { |b| @harvestdor_client.druids_via_oai(&b) }.to yield_successive_args('foo', 'bar')
38
+ end
39
+ end
40
+
41
+ describe "oai_records" do
42
+ before(:each) do
43
+ @oai_response = double('oai_response')
44
+ @oai_response.stub(:entries).and_return([1, 2])
45
+ @oai_response.stub(:resumption_token).and_return('')
46
+ @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
47
+ @oai_response
48
+ }
49
+ end
50
+ it "should return OAI::Record objects" do
51
+ header1 = OAI::Header.new(nil)
52
+ header1.identifier = 'oai:searchworks.stanford.edu/druid:foo'
53
+ oai_rec1 = OAI::Record.new(nil)
54
+ oai_rec1.header = header1
55
+ header2 = OAI::Header.new(nil)
56
+ header2.identifier = 'oai:searchworks.stanford.edu/druid:bar'
57
+ oai_rec2 = OAI::Record.new(nil)
58
+ oai_rec2.header = header2
59
+ @oai_response.stub(:entries).and_return([oai_rec1, oai_rec2])
60
+ @harvestdor_client.oai_records.should == [oai_rec1, oai_rec2]
61
+ end
62
+ it "should have results viewable as an array" do
63
+ @harvestdor_client.oai_records.should be_an_instance_of(Array)
64
+ end
65
+ it "should have enumerable results" do
66
+ @harvestdor_client.oai_records.should respond_to(:each, :count)
67
+ end
68
+ it "should yield to a passed block" do
69
+ expect { |b| @harvestdor_client.oai_records(&b) }.to yield_successive_args(1, 2)
70
+ end
71
+ end
72
+
73
+ describe "oai_headers" do
74
+ before(:each) do
75
+ @oai_response = double('oai_response')
76
+ @oai_response.stub(:entries).and_return([1, 2])
77
+ @oai_response.stub(:resumption_token).and_return('')
78
+ @harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
79
+ @oai_response
80
+ }
81
+ end
82
+ it "should return OAI::Header objects" do
83
+ header1 = OAI::Header.new(nil)
84
+ header1.identifier = 'oai:searchworks.stanford.edu/druid:foo'
85
+ header2 = OAI::Header.new(nil)
86
+ header2.identifier = 'oai:searchworks.stanford.edu/druid:bar'
87
+ @oai_response.stub(:entries).and_return([header1, header2])
88
+ @harvestdor_client.oai_headers.should == [header1, header2]
89
+ end
90
+ it "should have results viewable as an array" do
91
+ @harvestdor_client.oai_headers.should be_an_instance_of(Array)
92
+ end
93
+ it "should have enumerable results" do
94
+ @harvestdor_client.oai_headers.should respond_to(:each, :count)
95
+ end
96
+ it "should yield to a passed block" do
97
+ expect { |b| @harvestdor_client.oai_headers(&b) }.to yield_successive_args(1, 2)
98
+ end
99
+ end
100
+
101
+ describe "oai_record (single record request)" do
102
+ it "should return OAI::Record object" do
103
+ oai_rec = OAI::Record.new(nil)
104
+ oai_resp = double('oai_response')
105
+ oai_resp.stub(:record).and_return(oai_rec)
106
+ @harvestdor_client.oai_client.stub(:get_record) {
107
+ oai_resp
108
+ }
109
+ @harvestdor_client.oai_record('druid').should == oai_rec
110
+ @harvestdor_client.oai_record('druid', 'mods').should == oai_rec
111
+ end
112
+ end
113
+
114
+ describe "scrub_oai_args" do
115
+ before(:all) do
116
+ @expected_oai_args = @oai_arg_defaults.dup
117
+ @expected_oai_args.each { |k, v|
118
+ @expected_oai_args.delete(k) if v.nil? || v.size == 0
119
+ }
120
+
121
+ end
122
+ it "should use client's default values for OAI arguments if they are not present in the method param hash" do
123
+ @harvestdor_client.send(:scrub_oai_args).should == @expected_oai_args
124
+ end
125
+ it "should use OAI arguments from the method param hash if they are present" do
126
+ passed_options = {:metadata_prefix => 'mods', :from => '2012-11-30'}
127
+ @harvestdor_client.send(:scrub_oai_args, passed_options).should == @expected_oai_args.merge(passed_options)
128
+ end
129
+ it "should use nil value for option when it is passed in options hash" do
130
+ client = Harvestdor::Client.new({:default_from_date => '2012-01-01'})
131
+ client.config.default_from_date.should == '2012-01-01'
132
+ passed_options = {:from => nil}
133
+ client.send(:scrub_oai_args, passed_options)[:from].should == nil
134
+ end
135
+ end
136
+
137
+ describe "harvest" do
138
+ it "should perform a list_records OAI request when first arg is true" do
139
+ oai_response = double('oai_response')
140
+ oai_response.stub(:entries).and_return([])
141
+ @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
142
+ oai_response
143
+ }
144
+ @harvestdor_client.oai_client.should_receive(:list_records)
145
+ @harvestdor_client.send(:harvest, :list_records, {})
146
+ end
147
+
148
+ it "should perform a list_identifiers OAI request when first arg is false" do
149
+ oai_response = double('oai_response')
150
+ oai_response.stub(:entries).and_return([])
151
+ @harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
152
+ oai_response
153
+ }
154
+ @harvestdor_client.oai_client.should_receive(:list_identifiers)
155
+ @harvestdor_client.send(:harvest, :list_identifiers, {})
156
+ end
157
+
158
+ it "should use passed OAI arguments" do
159
+ oai_response = double('oai_response')
160
+ oai_response.stub(:entries).and_return([])
161
+ @harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
162
+ oai_response
163
+ }
164
+ oai_options_hash = {:metadata_prefix => 'mods', :from => '2012-11-30'}
165
+ @harvestdor_client.oai_client.should_receive(:list_identifiers).with(oai_options_hash)
166
+ @harvestdor_client.send(:harvest, :list_identifiers, oai_options_hash)
167
+ end
168
+
169
+ it "should yield to a passed block" do
170
+ oai_response = double('oai_response')
171
+ oai_response.stub(:entries).and_return([1, 2])
172
+ oai_response.stub(:resumption_token).and_return('')
173
+ @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
174
+ oai_response
175
+ }
176
+ expect { |b| @harvestdor_client.send(:harvest, :list_records, {}, &b) }.to yield_successive_args(1, 2)
177
+ end
178
+
179
+ context "resumption tokens" do
180
+ it "should stop processing when no records/headers are received" do
181
+ oai_response = double('oai_response')
182
+ oai_response.stub(:entries).and_return([])
183
+ @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
184
+ oai_response
185
+ }
186
+
187
+ i = 0
188
+ @harvestdor_client.send(:harvest, :list_records, {}) { |record| i += 1 }
189
+ i.should == 0
190
+ end
191
+
192
+ it "should stop processing when the resumption token is empty" do
193
+ oai_response_with_token = double('oai_response')
194
+ oai_response_with_token.stub(:entries).and_return([1,2,3,4,5])
195
+ oai_response_with_token.stub(:resumption_token).and_return('')
196
+ @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
197
+ oai_response_with_token
198
+ }
199
+
200
+ i = 0
201
+ @harvestdor_client.send(:harvest, :list_records, {}) { |record| i += 1 }
202
+ i.should == 5
203
+ end
204
+
205
+ it "should stop processing when there was no resumption token" do
206
+ oai_response_with_token = double('oai_response')
207
+ oai_response_with_token.stub(:entries).and_return([1,2,3,4,5])
208
+ oai_response_with_token.stub(:resumption_token).and_return(nil)
209
+ @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
210
+ oai_response_with_token
211
+ }
212
+
213
+ i = 0
214
+ @harvestdor_client.send(:harvest, :list_records, {}) { |record| i += 1 }
215
+ expect(i).to eql(5)
216
+ end
217
+ end # resumption tokens
218
+ end
219
+
220
+ end
@@ -0,0 +1,125 @@
1
+ # encoding: utf-8
2
+ require 'spec_helper'
3
+
4
+ describe 'Harvestdor::Client OAI Harvesting Integration Tests', :integration => true do
5
+
6
+ before(:all) do
7
+ @config_yml_path = File.join(File.dirname(__FILE__), "config", "oai.yml")
8
+ end
9
+
10
+ context "test OAI server" do
11
+ before(:all) do
12
+ @test_hclient ||= Harvestdor::Client.new({:config_yml_path => @config_yml_path, :oai_client_debug => 'true', :oai_repository_url => 'https://dor-oaiprovider-test.stanford.edu/oai'})
13
+ end
14
+ context "withOUT resumption tokens" do
15
+ before(:all) do
16
+ @oai_args = {:metadata_prefix => 'mods', :from => nil, :until => nil, :set => 'is_governed_by_hy787xj5878'}
17
+ end
18
+ it "should be able to harvest headers" do
19
+ headers = @test_hclient.oai_headers(@oai_args)
20
+ headers.should be_an_instance_of(Array)
21
+ headers.size.should > 0
22
+ headers.size.should < 50 # no resumption token
23
+ headers.first.should be_an_instance_of(OAI::Header)
24
+ end
25
+ it "should be able to harvest records" do
26
+ records = @test_hclient.oai_records(@oai_args)
27
+ records.should be_an_instance_of(Array)
28
+ records.size.should > 0
29
+ records.size.should < 50 # no resumption token
30
+ records.first.should be_an_instance_of(OAI::Record)
31
+ end
32
+ end
33
+ context "with resumption tokens" do
34
+ before(:all) do
35
+ @oai_args = {:metadata_prefix => 'mods', :from => nil, :until => nil, :set => 'is_member_of_kh678dr8608'}
36
+ end
37
+ it "should be able to harvest headers" do
38
+ skip "need to find small set > 50 on test"
39
+ headers = @test_hclient.oai_headers(@oai_args)
40
+ headers.should be_an_instance_of(Array)
41
+ headers.size.should > 50
42
+ headers.first.should be_an_instance_of(OAI::Header)
43
+ end
44
+ it "should be able to harvest records" do
45
+ pending "need to find small set > 50 on test"
46
+ records = @test_hclient.harvest_records(@oai_args)
47
+ records.should be_an_instance_of(Array)
48
+ records.size.should > 50
49
+ records.first.should be_an_instance_of(OAI::Record)
50
+ end
51
+ end
52
+ context "oai_record (single record request)" do
53
+ before(:all) do
54
+ @rec = @test_hclient.oai_record('jt959wc5586')
55
+ end
56
+ it "should get a single OAI::Record object" do
57
+ @rec.should be_an_instance_of(OAI::Record)
58
+ end
59
+ it "should keep utf-8 encoded characters intact" do
60
+ xml = Nokogiri::XML(@rec.metadata.to_s)
61
+ xml.remove_namespaces!
62
+ xml.root.xpath('/metadata/mods/titleInfo/subTitle').text.should =~ /^recueil complet des débats législatifs & politiques des chambres françaises/
63
+ end
64
+ end
65
+ end
66
+
67
+ context "production OAI server" do
68
+ before(:all) do
69
+ @prod_hclient ||= Harvestdor::Client.new({:config_yml_path => @config_yml_path, :oai_repository_url => 'https://dor-oaiprovider-prod.stanford.edu/oai'})
70
+ end
71
+ context "withOUT resumption tokens" do
72
+ before(:all) do
73
+ # Reid-Dennis: 47 objects
74
+ @oai_args = {:metadata_prefix => 'mods', :from => nil, :until => nil, :set => 'is_governed_by_sd064kn5856'}
75
+ end
76
+ it "should be able to harvest headers" do
77
+ headers = @prod_hclient.oai_headers(@oai_args)
78
+ headers.should be_an_instance_of(Array)
79
+ headers.size.should > 0
80
+ headers.size.should < 50 # no resumption token
81
+ headers.first.should be_an_instance_of(OAI::Header)
82
+ end
83
+ it "should be able to harvest records" do
84
+ records = @prod_hclient.oai_records(@oai_args)
85
+ records.should be_an_instance_of(Array)
86
+ records.size.should > 0
87
+ records.size.should < 50 # no resumption token
88
+ records.first.should be_an_instance_of(OAI::Record)
89
+ end
90
+ end
91
+ context "with resumption tokens" do
92
+ before(:all) do
93
+ # Archives Parlementaires - 8x objects
94
+ @oai_args = {:metadata_prefix => 'mods', :from => nil, :until => nil, :set => 'is_member_of_collection_jh957jy1101'}
95
+ end
96
+ it "should be able to harvest headers" do
97
+ headers = @prod_hclient.oai_headers(@oai_args)
98
+ headers.should be_an_instance_of(Array)
99
+ headers.size.should > 50
100
+ headers.first.should be_an_instance_of(OAI::Header)
101
+ end
102
+ it "should be able to harvest records" do
103
+ pending "the request always seems to time out"
104
+ records = @prod_hclient.oai_records(@oai_args)
105
+ records.should be_an_instance_of(Array)
106
+ records.size.should > 50
107
+ records.first.should be_an_instance_of(OAI::Record)
108
+ end
109
+ end
110
+ context "oai_record (single record request)" do
111
+ before(:all) do
112
+ @rec = @prod_hclient.oai_record('jt959wc5586')
113
+ end
114
+ it "should get a single OAI::Record object" do
115
+ @rec.should be_an_instance_of(OAI::Record)
116
+ end
117
+ it "should keep utf-8 encoded characters intact" do
118
+ xml = Nokogiri::XML(@rec.metadata.to_s)
119
+ xml.remove_namespaces!
120
+ xml.root.xpath('/metadata/mods/titleInfo/subTitle').text.should =~ /^recueil complet des débats législatifs & politiques des chambres françaises/
121
+ end
122
+ end
123
+ end
124
+
125
+ end