harvestdor 0.0.13

Sign up to get free protection for your applications and to get access to all the features.
data/lib/harvestdor.rb ADDED
@@ -0,0 +1,121 @@
1
require 'harvestdor/errors'
require 'harvestdor/oai_harvest'
require 'harvestdor/purl_xml'
require 'harvestdor/version'
# external gems
require 'confstruct'
require 'oai'
# stdlib
require 'fileutils'
require 'logger'
require 'open-uri'
require 'yaml'
12
+
13
# Top-level namespace of the harvestdor gem: default configuration values,
# the configurable Client class, and the druid-extraction helper.
module Harvestdor

  LOG_NAME_DEFAULT = "harvestdor.log"
  LOG_DIR_DEFAULT = File.join(File.dirname(__FILE__), "..", "logs")
  PURL_DEFAULT = 'http://purl.stanford.edu'
  # Options handed to the Faraday http client.
  # NOTE(review): SSL certificate verification is disabled by default here;
  # confirm this is intentional before relying on it outside trusted networks.
  HTTP_OPTIONS_DEFAULT = { 'ssl' => {
                              'verify' => false
                            },
                            'request' => {
                              'timeout' => 60, # open/read timeout (seconds)
                              'open_timeout' => 60 # connection open timeout (seconds)
                            }
                          }
  OAI_CLIENT_DEBUG_DEFAULT = false
  OAI_REPOSITORY_URL_DEFAULT = 'https://dor-oaiprovider-prod.stanford.edu/oai'
  DEFAULT_METADATA_PREFIX = 'mods'
  DEFAULT_FROM_DATE = nil
  DEFAULT_UNTIL_DATE = nil
  DEFAULT_SET = nil

  # Configurable client; holds a per-instance configuration, a lazily built
  # OAI::Client and a logger.
  class Client

    # Set default values for the construction of Harvestdor::Client objects
    # @return [Confstruct::Configuration] the memoized, class-level defaults
    def self.default_config
      @class_config ||= Confstruct::Configuration.new({
        :log_dir => LOG_DIR_DEFAULT,
        :log_name => LOG_NAME_DEFAULT,
        :purl => PURL_DEFAULT,
        :http_options => HTTP_OPTIONS_DEFAULT,
        :oai_repository_url => OAI_REPOSITORY_URL_DEFAULT,
        :oai_client_debug => OAI_CLIENT_DEBUG_DEFAULT,
        :default_metadata_prefix => DEFAULT_METADATA_PREFIX,
        :default_from_date => DEFAULT_FROM_DATE,
        :default_until_date => DEFAULT_UNTIL_DATE,
        :default_set => DEFAULT_SET
      })
    end

    # Global, memoized, lazy initialized instance of a logger.
    # The first call decides which file the shared logger writes to; later
    # calls still make sure their log_dir exists but return the same logger.
    #
    # Kept above the `protected` marker on purpose: a bare `protected` only
    # applies to instance methods, and the instance-level #logger calls this
    # method with an explicit receiver, so it has to be public.
    #
    # @param [String] log_dir directory in which to put the log file
    # @param [String] log_name name of log file
    # @return [Logger] the shared logger
    def self.logger(log_dir, log_name)
      # mkdir_p also creates missing parent directories and is a no-op when
      # the directory already exists (plain Dir.mkdir raises without parents).
      FileUtils.mkdir_p(log_dir)
      @logger ||= Logger.new(File.join(log_dir, log_name), 'daily')
    end

    # Initialize a new instance of Harvestdor::Client
    # Values are applied in this order, later ones winning: class defaults,
    # the yml file named by :config_yml_path (if any), the options hash, and
    # finally the optional block, which is yielded the configuration object.
    # @param [Hash] options
    # @example
    #   client = Harvestdor::Client.new({ # Example with all possible options
    #     :log_dir => File.join(File.dirname(__FILE__), "..", "logs"),
    #     :log_name => 'harvestdor.log',
    #     :purl => 'http://purl.stanford.edu',
    #     :http_options => { 'ssl' => {
    #                          'verify' => false
    #                        },
    #                        'request' => {
    #                          'timeout' => 30, # open/read timeout (seconds)
    #                          'open_timeout' => 30 # connection open timeout (seconds)
    #                        }
    #                      },
    #     :oai_repository_url => 'https://dor-oaiprovider-prod.stanford.edu/oai', # The OAI repository to connect to
    #     :oai_client_debug => false,
    #     :default_metadata_prefix => 'mods',
    #     :default_from_date => '2012-12-01',
    #     :default_until_date => '2014-12-01',
    #     :default_set => nil,
    #   })
    def initialize(options = {})
      yml_path = options[:config_yml_path]
      config.configure(YAML.load_file(yml_path)) if yml_path
      config.configure options
      yield(config) if block_given?
    end

    # @return [Confstruct::Configuration] this instance's configuration,
    #   created on first access from the class-level defaults
    def config
      @config ||= Confstruct::Configuration.new(self.class.default_config)
    end

    # @return [OAI::Client] an instantiated OAI::Client object, based on config options
    def oai_client
      @oai_client ||= OAI::Client.new config.oai_repository_url, :debug => config.oai_client_debug, :http => oai_http_client
    end

    # @return [Logger] the shared logger, set up from this instance's log_dir and log_name
    def logger
      @logger ||= self.class.logger(config.log_dir, config.log_name)
    end

    protected #---------------------------------------------------------------------

    # Memoized Faraday connection for the configured OAI repository.
    # The log message is written only when the connection is actually built,
    # not on every access to the memoized object.
    # @return [Faraday::Connection]
    def oai_http_client
      @oai_http_client ||= begin
        http_options = config.http_options.to_hash
        logger.info "Constructing OAI http client with faraday options #{http_options.inspect}"
        Faraday.new config.oai_repository_url, http_options
      end
    end

  end # class Client

  # @param [Object] arg OAI::Header object or OAI::Record object or String (oai identifier)
  # @return [String] the druid part of an OAI identifier in an OAI header, e.g. bb134cc1324
  def self.druid(arg)
    oai_id = case arg
             when OAI::Header then arg.identifier
             when OAI::Record then arg.header.identifier
             else arg
             end
    # everything after the last 'druid:' marker is the bare druid
    oai_id.split('druid:').last
  end

end # module Harvestdor
@@ -0,0 +1,37 @@
1
+ # log_dir: directory for log file (default logs, relative to harvestdor gem path)
2
+ log_dir: spec/test_logs
3
+
4
+ # log_name: name of log file (default: harvestdor.log)
5
+
6
+ # purl: url for the DOR purl server (used to get ContentMetadata, etc.)
7
+ # purl: http://purl-test.stanford.edu
8
+
9
+ # ---------- OAI harvesting parameters -----------
10
+
11
+ # oai_client_debug: true for OAI::Client debug mode (default: false)
12
+
13
+ # oai_repository_url: URL of the OAI data provider
14
+ oai_repository_url: https://dor-oaiprovider-test.stanford.edu/oai
15
+
16
+ # default_metadata_prefix: default metadata prefix to be used for harvesting (default: mods)
17
+ # can be overridden on calls to harvest_ids and harvest_records
18
+ default_metadata_prefix: mods
19
+
20
+ # default_from_date: default from date for harvest (default: nil)
21
+ # can be overridden on calls to harvest_ids and harvest_records
22
+ default_from_date: '2012-11-01'
23
+
24
+ # default_until_date: default until date for harvest (default: nil)
25
+ # can be overridden on calls to harvest_ids and harvest_records
26
+
27
+ # default_set: default set for harvest (default: nil)
28
+ # can be overridden on calls to harvest_ids and harvest_records
29
+
30
+ # Additional options to pass to Faraday http client (https://github.com/technoweenie/faraday)
31
+ # timeouts are in seconds; timeout -> open/read, open_timeout -> connection open
32
+ http_options:
33
+ ssl:
34
+ verify: false
35
+ request:
36
+ timeout: 121
37
+ open_timeout: 122
@@ -0,0 +1,135 @@
1
+ require "spec_helper"
2
+
3
+ describe Harvestdor::Client do
4
+
5
+ before(:all) do
6
+ @config_yml_path = File.join(File.dirname(__FILE__), "config", "oai.yml")
7
+ @client_via_yml_only = Harvestdor::Client.new({:config_yml_path => @config_yml_path})
8
+ require 'yaml'
9
+ @yaml = YAML.load_file(@config_yml_path)
10
+ end
11
+
12
+ describe "initialization" do
13
+ before(:all) do
14
+ @from_date = '2012-11-29'
15
+ @repo_url = 'http://my_oai_repo.org/oai'
16
+ end
17
+ context "attributes passed in hash argument" do
18
+ before(:all) do
19
+ @some_args = Harvestdor::Client.new({:default_from_date => @from_date, :oai_repository_url => @repo_url}).config
20
+ end
21
+ it "should set the attributes to the passed values" do
22
+ expect(@some_args.oai_repository_url).to eql(@repo_url)
23
+ expect(@some_args.default_from_date).to eql(@from_date)
24
+ end
25
+ it "should keep the defaults for attributes not in the hash argument" do
26
+ expect(@some_args.log_name).to eql(Harvestdor::LOG_NAME_DEFAULT)
27
+ expect(@some_args.log_dir).to eql(Harvestdor::LOG_DIR_DEFAULT)
28
+ expect(@some_args.purl).to eql(Harvestdor::PURL_DEFAULT)
29
+ expect(@some_args.http_options).to eql(Confstruct::Configuration.new(Harvestdor::HTTP_OPTIONS_DEFAULT))
30
+ expect(@some_args.oai_client_debug).to eql(Harvestdor::OAI_CLIENT_DEBUG_DEFAULT)
31
+ expect(@some_args.default_metadata_prefix).to eql(Harvestdor::DEFAULT_METADATA_PREFIX)
32
+ expect(@some_args.default_until_date).to eql(Harvestdor::DEFAULT_UNTIL_DATE)
33
+ expect(@some_args.default_set).to eql(Harvestdor::DEFAULT_SET)
34
+ end
35
+ end
36
+
37
+ context "config_yml_path in hash argument" do
38
+ before(:all) do
39
+ @config_via_yml_only = @client_via_yml_only.config
40
+ end
41
+ it "should set attributes in yml file over defaults" do
42
+ expect(@config_via_yml_only.log_dir).to eql(@yaml['log_dir'])
43
+ expect(@config_via_yml_only.oai_repository_url).to eql(@yaml['oai_repository_url'])
44
+ expect(@config_via_yml_only.default_from_date).to eql(@yaml['default_from_date'])
45
+ expect(@config_via_yml_only.default_metadata_prefix).to eql(@yaml['default_metadata_prefix'])
46
+ expect(@config_via_yml_only.http_options.request.timeout).to eql(@yaml['http_options']['request']['timeout'])
47
+ end
48
+ it "should keep the defaults for attributes not present in yml file nor a config yml file" do
49
+ expect(@config_via_yml_only.log_name).to eql(Harvestdor::LOG_NAME_DEFAULT)
50
+ expect(@config_via_yml_only.purl).to eql(Harvestdor::PURL_DEFAULT)
51
+ expect(@config_via_yml_only.default_until_date).to eql(Harvestdor::DEFAULT_UNTIL_DATE)
52
+ expect(@config_via_yml_only.default_set).to eql(Harvestdor::DEFAULT_SET)
53
+ end
54
+ context "and some hash arguments" do
55
+ before(:all) do
56
+ @config_via_yml_plus = Harvestdor::Client.new({:config_yml_path => @config_yml_path,
57
+ :default_from_date => @from_date, :oai_repository_url => @repo_url}).config
58
+ end
59
+ it "should favor hash arg attribute values over yml file values" do
60
+ expect(@config_via_yml_plus.oai_repository_url).to eql(@repo_url)
61
+ expect(@config_via_yml_plus.default_from_date).to eql(@from_date)
62
+ end
63
it "should favor yml file values over defaults" do
  expect(@config_via_yml_plus.log_dir).to eql(@yaml['log_dir'])
  expect(@config_via_yml_plus.default_metadata_prefix).to eql(@yaml['default_metadata_prefix'])
  # timeout is nested under http_options -> request in both the config and the
  # yml file; without the `request` level the expected (yml) value is nil, so
  # the yml timeout is never actually verified
  expect(@config_via_yml_plus.http_options.request.timeout).to eql(@yaml['http_options']['request']['timeout'])
end
68
+ it "should keep the defaults for attributes not present in yml file" do
69
+ expect(@config_via_yml_plus.log_name).to eql(Harvestdor::LOG_NAME_DEFAULT)
70
+ expect(@config_via_yml_plus.default_until_date).to eql(Harvestdor::DEFAULT_UNTIL_DATE)
71
+ expect(@config_via_yml_plus.default_set).to eql(Harvestdor::DEFAULT_SET)
72
+ end
73
+ end
74
+ end
75
+
76
+ context "without hash arguments" do
77
+ it "should keep the defaults for all attributes" do
78
+ no_args = Harvestdor::Client.new.config
79
+ expect(no_args.log_name).to eql(Harvestdor::LOG_NAME_DEFAULT)
80
+ expect(no_args.log_dir).to eql(Harvestdor::LOG_DIR_DEFAULT)
81
+ expect(no_args.purl).to eql(Harvestdor::PURL_DEFAULT)
82
+ expect(no_args.http_options).to eql(Confstruct::Configuration.new(Harvestdor::HTTP_OPTIONS_DEFAULT))
83
+ expect(no_args.oai_client_debug).to eql(Harvestdor::OAI_CLIENT_DEBUG_DEFAULT)
84
+ expect(no_args.oai_repository_url).to eql(Harvestdor::OAI_REPOSITORY_URL_DEFAULT)
85
+ expect(no_args.default_metadata_prefix).to eql(Harvestdor::DEFAULT_METADATA_PREFIX)
86
+ expect(no_args.default_from_date).to eql(Harvestdor::DEFAULT_FROM_DATE)
87
+ expect(no_args.default_until_date).to eql(Harvestdor::DEFAULT_UNTIL_DATE)
88
+ expect(no_args.default_set).to eql(Harvestdor::DEFAULT_SET)
89
+ end
90
+ end
91
+ end # initialize client
92
+
93
+ it "should allow direct setting of configuration attributes" do
94
+ conf = Harvestdor::Client.new.config
95
+ expect(conf.log_dir).to eql(Harvestdor::LOG_DIR_DEFAULT)
96
+ conf['log_dir'] = 'my_log_dir'
97
+ expect(conf.log_dir).to eql('my_log_dir')
98
+ end
99
+
100
+ describe "logging" do
101
it "should write the log file to the directory indicated by log_dir" do
  @client_via_yml_only.logger.info("harvestdor_client_spec logging test message")
  # File.exists? is deprecated and no longer available from Ruby 3.2 on;
  # File.exist? is the supported spelling
  expect(File.exist?(File.join(@yaml['log_dir'], Harvestdor::LOG_NAME_DEFAULT))).to eql(true)
end
105
+ end
106
+
107
+ context "oai_client" do
108
+ before(:all) do
109
+ @client = Harvestdor::Client.new
110
+ @default_oai_client = Harvestdor::Client.new.oai_client
111
+ end
112
+
113
+ it "oai_client should return an OAI::Client object based on config data" do
114
+ expect(@default_oai_client).to be_an_instance_of(OAI::Client)
115
+ end
116
+
117
+ it "oai_client should have an http_client" do
118
+ expect(@default_oai_client.instance_variable_get(:@http_client)).to be_an_instance_of(Faraday::Connection)
119
+ end
120
+
121
+ context "oai_http_client (protected method)" do
122
+ before(:all) do
123
+ @http_client = @client.send(:oai_http_client)
124
+ end
125
+ it "should be a Faraday object" do
126
+ expect(@http_client).to be_an_instance_of(Faraday::Connection)
127
+ end
128
+ it "should have the oai_provider url from config" do
129
+ uri_obj = @http_client.url_prefix
130
+ expect(@client.config.oai_repository_url).to match(Regexp.new(uri_obj.host + uri_obj.path))
131
+ end
132
+ end
133
+ end # context oai_client
134
+
135
+ end
@@ -0,0 +1,23 @@
1
+ require "spec_helper"
2
+
3
+ describe Harvestdor do
4
+
5
+ context "#druid" do
6
+ it "should return the druid part of an oai identifier" do
7
+ Harvestdor.druid('oai:searchworks.stanford.edu/druid:foo').should == 'foo'
8
+ end
9
+ it "should work with OAI::Header as argument" do
10
+ header = OAI::Header.new(nil)
11
+ header.identifier = 'oai:searchworks.stanford.edu/druid:foo'
12
+ Harvestdor.druid(header).should == 'foo'
13
+ end
14
+ it "should work with OAI::Record as argument" do
15
+ oai_rec = OAI::Record.new(nil)
16
+ header = OAI::Header.new(nil)
17
+ header.identifier = 'oai:searchworks.stanford.edu/druid:foo'
18
+ oai_rec.header = header
19
+ Harvestdor.druid(oai_rec).should == 'foo'
20
+ end
21
+ end
22
+
23
+ end
@@ -0,0 +1,220 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'Harvestdor::Client oai harvesting' do
4
+ before(:all) do
5
+ @harvestdor_client = Harvestdor::Client.new
6
+ @oai_arg_defaults = {:metadata_prefix => @harvestdor_client.config.default_metadata_prefix,
7
+ :from => @harvestdor_client.config.default_from_date,
8
+ :until => @harvestdor_client.config.default_until_date,
9
+ :set => @harvestdor_client.config.default_set }
10
+ end
11
+
12
+ describe "druids_via_oai" do
13
+ before(:each) do
14
+ oai_response = double('oai_response')
15
+ oai_response.stub(:entries).and_return(['foo', 'bar'])
16
+ oai_response.stub(:resumption_token).and_return('')
17
+ @harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
18
+ oai_response
19
+ }
20
+ end
21
it "should return druids" do
  header1 = OAI::Header.new(nil)
  header1.identifier = 'oai:searchworks.stanford.edu/druid:foo'
  header2 = OAI::Header.new(nil)
  header2.identifier = 'oai:searchworks.stanford.edu/druid:bar'
  oai_response = double('oai_response')
  oai_response.stub(:entries).and_return([header1, header2])
  oai_response.stub(:resumption_token).and_return('')
  # install the header-based response; otherwise the string entries stubbed in
  # the surrounding before block are what the client sees and the headers
  # built above are never exercised
  @harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
    oai_response
  }
  @harvestdor_client.druids_via_oai.should == ['foo', 'bar']
end
30
+ it "should have results viewable as an array" do
31
+ @harvestdor_client.druids_via_oai.should be_an_instance_of(Array)
32
+ end
33
+ it "should have enumerable results" do
34
+ @harvestdor_client.druids_via_oai.should respond_to(:each, :count)
35
+ end
36
+ it "should yield to a passed block" do
37
+ expect { |b| @harvestdor_client.druids_via_oai(&b) }.to yield_successive_args('foo', 'bar')
38
+ end
39
+ end
40
+
41
+ describe "oai_records" do
42
+ before(:each) do
43
+ @oai_response = double('oai_response')
44
+ @oai_response.stub(:entries).and_return([1, 2])
45
+ @oai_response.stub(:resumption_token).and_return('')
46
+ @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
47
+ @oai_response
48
+ }
49
+ end
50
+ it "should return OAI::Record objects" do
51
+ header1 = OAI::Header.new(nil)
52
+ header1.identifier = 'oai:searchworks.stanford.edu/druid:foo'
53
+ oai_rec1 = OAI::Record.new(nil)
54
+ oai_rec1.header = header1
55
+ header2 = OAI::Header.new(nil)
56
+ header2.identifier = 'oai:searchworks.stanford.edu/druid:bar'
57
+ oai_rec2 = OAI::Record.new(nil)
58
+ oai_rec2.header = header2
59
+ @oai_response.stub(:entries).and_return([oai_rec1, oai_rec2])
60
+ @harvestdor_client.oai_records.should == [oai_rec1, oai_rec2]
61
+ end
62
+ it "should have results viewable as an array" do
63
+ @harvestdor_client.oai_records.should be_an_instance_of(Array)
64
+ end
65
+ it "should have enumerable results" do
66
+ @harvestdor_client.oai_records.should respond_to(:each, :count)
67
+ end
68
+ it "should yield to a passed block" do
69
+ expect { |b| @harvestdor_client.oai_records(&b) }.to yield_successive_args(1, 2)
70
+ end
71
+ end
72
+
73
+ describe "oai_headers" do
74
+ before(:each) do
75
+ @oai_response = double('oai_response')
76
+ @oai_response.stub(:entries).and_return([1, 2])
77
+ @oai_response.stub(:resumption_token).and_return('')
78
+ @harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
79
+ @oai_response
80
+ }
81
+ end
82
+ it "should return OAI::Header objects" do
83
+ header1 = OAI::Header.new(nil)
84
+ header1.identifier = 'oai:searchworks.stanford.edu/druid:foo'
85
+ header2 = OAI::Header.new(nil)
86
+ header2.identifier = 'oai:searchworks.stanford.edu/druid:bar'
87
+ @oai_response.stub(:entries).and_return([header1, header2])
88
+ @harvestdor_client.oai_headers.should == [header1, header2]
89
+ end
90
+ it "should have results viewable as an array" do
91
+ @harvestdor_client.oai_headers.should be_an_instance_of(Array)
92
+ end
93
+ it "should have enumerable results" do
94
+ @harvestdor_client.oai_headers.should respond_to(:each, :count)
95
+ end
96
+ it "should yield to a passed block" do
97
+ expect { |b| @harvestdor_client.oai_headers(&b) }.to yield_successive_args(1, 2)
98
+ end
99
+ end
100
+
101
+ describe "oai_record (single record request)" do
102
+ it "should return OAI::Record object" do
103
+ oai_rec = OAI::Record.new(nil)
104
+ oai_resp = double('oai_response')
105
+ oai_resp.stub(:record).and_return(oai_rec)
106
+ @harvestdor_client.oai_client.stub(:get_record) {
107
+ oai_resp
108
+ }
109
+ @harvestdor_client.oai_record('druid').should == oai_rec
110
+ @harvestdor_client.oai_record('druid', 'mods').should == oai_rec
111
+ end
112
+ end
113
+
114
+ describe "scrub_oai_args" do
115
+ before(:all) do
116
+ @expected_oai_args = @oai_arg_defaults.dup
117
+ @expected_oai_args.each { |k, v|
118
+ @expected_oai_args.delete(k) if v.nil? || v.size == 0
119
+ }
120
+
121
+ end
122
+ it "should use client's default values for OAI arguments if they are not present in the method param hash" do
123
+ @harvestdor_client.send(:scrub_oai_args).should == @expected_oai_args
124
+ end
125
+ it "should use OAI arguments from the method param hash if they are present" do
126
+ passed_options = {:metadata_prefix => 'mods', :from => '2012-11-30'}
127
+ @harvestdor_client.send(:scrub_oai_args, passed_options).should == @expected_oai_args.merge(passed_options)
128
+ end
129
+ it "should use nil value for option when it is passed in options hash" do
130
+ client = Harvestdor::Client.new({:default_from_date => '2012-01-01'})
131
+ client.config.default_from_date.should == '2012-01-01'
132
+ passed_options = {:from => nil}
133
+ client.send(:scrub_oai_args, passed_options)[:from].should == nil
134
+ end
135
+ end
136
+
137
+ describe "harvest" do
138
# The first argument to harvest is the OAI verb as a symbol, not a boolean.
it "should perform a list_records OAI request when first arg is :list_records" do
  oai_response = double('oai_response')
  oai_response.stub(:entries).and_return([])
  @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
    oai_response
  }
  @harvestdor_client.oai_client.should_receive(:list_records)
  @harvestdor_client.send(:harvest, :list_records, {})
end
147
+
148
# The first argument to harvest is the OAI verb as a symbol, not a boolean.
it "should perform a list_identifiers OAI request when first arg is :list_identifiers" do
  oai_response = double('oai_response')
  oai_response.stub(:entries).and_return([])
  @harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
    oai_response
  }
  @harvestdor_client.oai_client.should_receive(:list_identifiers)
  @harvestdor_client.send(:harvest, :list_identifiers, {})
end
157
+
158
+ it "should use passed OAI arguments" do
159
+ oai_response = double('oai_response')
160
+ oai_response.stub(:entries).and_return([])
161
+ @harvestdor_client.oai_client.stub(:list_identifiers).with(an_instance_of(Hash)) {
162
+ oai_response
163
+ }
164
+ oai_options_hash = {:metadata_prefix => 'mods', :from => '2012-11-30'}
165
+ @harvestdor_client.oai_client.should_receive(:list_identifiers).with(oai_options_hash)
166
+ @harvestdor_client.send(:harvest, :list_identifiers, oai_options_hash)
167
+ end
168
+
169
+ it "should yield to a passed block" do
170
+ oai_response = double('oai_response')
171
+ oai_response.stub(:entries).and_return([1, 2])
172
+ oai_response.stub(:resumption_token).and_return('')
173
+ @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
174
+ oai_response
175
+ }
176
+ expect { |b| @harvestdor_client.send(:harvest, :list_records, {}, &b) }.to yield_successive_args(1, 2)
177
+ end
178
+
179
+ context "resumption tokens" do
180
+ it "should stop processing when no records/headers are received" do
181
+ oai_response = double('oai_response')
182
+ oai_response.stub(:entries).and_return([])
183
+ @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
184
+ oai_response
185
+ }
186
+
187
+ i = 0
188
+ @harvestdor_client.send(:harvest, :list_records, {}) { |record| i += 1 }
189
+ i.should == 0
190
+ end
191
+
192
+ it "should stop processing when the resumption token is empty" do
193
+ oai_response_with_token = double('oai_response')
194
+ oai_response_with_token.stub(:entries).and_return([1,2,3,4,5])
195
+ oai_response_with_token.stub(:resumption_token).and_return('')
196
+ @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
197
+ oai_response_with_token
198
+ }
199
+
200
+ i = 0
201
+ @harvestdor_client.send(:harvest, :list_records, {}) { |record| i += 1 }
202
+ i.should == 5
203
+ end
204
+
205
+ it "should stop processing when there was no resumption token" do
206
+ oai_response_with_token = double('oai_response')
207
+ oai_response_with_token.stub(:entries).and_return([1,2,3,4,5])
208
+ oai_response_with_token.stub(:resumption_token).and_return(nil)
209
+ @harvestdor_client.oai_client.stub(:list_records).with(an_instance_of(Hash)) {
210
+ oai_response_with_token
211
+ }
212
+
213
+ i = 0
214
+ @harvestdor_client.send(:harvest, :list_records, {}) { |record| i += 1 }
215
+ expect(i).to eql(5)
216
+ end
217
+ end # resumption tokens
218
+ end
219
+
220
+ end
@@ -0,0 +1,125 @@
1
+ # encoding: utf-8
2
+ require 'spec_helper'
3
+
4
+ describe 'Harvestdor::Client OAI Harvesting Integration Tests', :integration => true do
5
+
6
+ before(:all) do
7
+ @config_yml_path = File.join(File.dirname(__FILE__), "config", "oai.yml")
8
+ end
9
+
10
+ context "test OAI server" do
11
+ before(:all) do
12
+ @test_hclient ||= Harvestdor::Client.new({:config_yml_path => @config_yml_path, :oai_client_debug => 'true', :oai_repository_url => 'https://dor-oaiprovider-test.stanford.edu/oai'})
13
+ end
14
+ context "withOUT resumption tokens" do
15
+ before(:all) do
16
+ @oai_args = {:metadata_prefix => 'mods', :from => nil, :until => nil, :set => 'is_governed_by_hy787xj5878'}
17
+ end
18
+ it "should be able to harvest headers" do
19
+ headers = @test_hclient.oai_headers(@oai_args)
20
+ headers.should be_an_instance_of(Array)
21
+ headers.size.should > 0
22
+ headers.size.should < 50 # no resumption token
23
+ headers.first.should be_an_instance_of(OAI::Header)
24
+ end
25
+ it "should be able to harvest records" do
26
+ records = @test_hclient.oai_records(@oai_args)
27
+ records.should be_an_instance_of(Array)
28
+ records.size.should > 0
29
+ records.size.should < 50 # no resumption token
30
+ records.first.should be_an_instance_of(OAI::Record)
31
+ end
32
+ end
33
+ context "with resumption tokens" do
34
+ before(:all) do
35
+ @oai_args = {:metadata_prefix => 'mods', :from => nil, :until => nil, :set => 'is_member_of_kh678dr8608'}
36
+ end
37
+ it "should be able to harvest headers" do
38
+ skip "need to find small set > 50 on test"
39
+ headers = @test_hclient.oai_headers(@oai_args)
40
+ headers.should be_an_instance_of(Array)
41
+ headers.size.should > 50
42
+ headers.first.should be_an_instance_of(OAI::Header)
43
+ end
44
it "should be able to harvest records" do
  pending "need to find small set > 50 on test"
  # oai_records is the record-harvesting call used by every other example in
  # these specs; a misspelled method name would make this pending example fail
  # with NoMethodError instead of for the documented reason
  records = @test_hclient.oai_records(@oai_args)
  records.should be_an_instance_of(Array)
  records.size.should > 50
  records.first.should be_an_instance_of(OAI::Record)
end
51
+ end
52
+ context "oai_record (single record request)" do
53
+ before(:all) do
54
+ @rec = @test_hclient.oai_record('jt959wc5586')
55
+ end
56
+ it "should get a single OAI::Record object" do
57
+ @rec.should be_an_instance_of(OAI::Record)
58
+ end
59
+ it "should keep utf-8 encoded characters intact" do
60
+ xml = Nokogiri::XML(@rec.metadata.to_s)
61
+ xml.remove_namespaces!
62
+ xml.root.xpath('/metadata/mods/titleInfo/subTitle').text.should =~ /^recueil complet des débats législatifs & politiques des chambres françaises/
63
+ end
64
+ end
65
+ end
66
+
67
+ context "production OAI server" do
68
+ before(:all) do
69
+ @prod_hclient ||= Harvestdor::Client.new({:config_yml_path => @config_yml_path, :oai_repository_url => 'https://dor-oaiprovider-prod.stanford.edu/oai'})
70
+ end
71
+ context "withOUT resumption tokens" do
72
+ before(:all) do
73
+ # Reid-Dennis: 47 objects
74
+ @oai_args = {:metadata_prefix => 'mods', :from => nil, :until => nil, :set => 'is_governed_by_sd064kn5856'}
75
+ end
76
+ it "should be able to harvest headers" do
77
+ headers = @prod_hclient.oai_headers(@oai_args)
78
+ headers.should be_an_instance_of(Array)
79
+ headers.size.should > 0
80
+ headers.size.should < 50 # no resumption token
81
+ headers.first.should be_an_instance_of(OAI::Header)
82
+ end
83
+ it "should be able to harvest records" do
84
+ records = @prod_hclient.oai_records(@oai_args)
85
+ records.should be_an_instance_of(Array)
86
+ records.size.should > 0
87
+ records.size.should < 50 # no resumption token
88
+ records.first.should be_an_instance_of(OAI::Record)
89
+ end
90
+ end
91
+ context "with resumption tokens" do
92
+ before(:all) do
93
+ # Archives Parlementaires - 8x objects
94
+ @oai_args = {:metadata_prefix => 'mods', :from => nil, :until => nil, :set => 'is_member_of_collection_jh957jy1101'}
95
+ end
96
+ it "should be able to harvest headers" do
97
+ headers = @prod_hclient.oai_headers(@oai_args)
98
+ headers.should be_an_instance_of(Array)
99
+ headers.size.should > 50
100
+ headers.first.should be_an_instance_of(OAI::Header)
101
+ end
102
+ it "should be able to harvest records" do
103
+ pending "the request always seems to time out"
104
+ records = @prod_hclient.oai_records(@oai_args)
105
+ records.should be_an_instance_of(Array)
106
+ records.size.should > 50
107
+ records.first.should be_an_instance_of(OAI::Record)
108
+ end
109
+ end
110
+ context "oai_record (single record request)" do
111
+ before(:all) do
112
+ @rec = @prod_hclient.oai_record('jt959wc5586')
113
+ end
114
+ it "should get a single OAI::Record object" do
115
+ @rec.should be_an_instance_of(OAI::Record)
116
+ end
117
+ it "should keep utf-8 encoded characters intact" do
118
+ xml = Nokogiri::XML(@rec.metadata.to_s)
119
+ xml.remove_namespaces!
120
+ xml.root.xpath('/metadata/mods/titleInfo/subTitle').text.should =~ /^recueil complet des débats législatifs & politiques des chambres françaises/
121
+ end
122
+ end
123
+ end
124
+
125
+ end