harvestdor-indexer 1.0.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,47 +2,100 @@
2
2
  http_interactions:
3
3
  - request:
4
4
  method: get
5
- uri: http://127.0.0.1:3000/
5
+ uri: http://purl.stanford.edu/yg867hg1375.xml
6
6
  body:
7
7
  encoding: US-ASCII
8
8
  string: ''
9
9
  headers:
10
+ Accept-Encoding:
11
+ - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
10
12
  Accept:
11
- - '*/*'
13
+ - "*/*"
12
14
  User-Agent:
13
15
  - Ruby
14
16
  response:
15
17
  status:
16
18
  code: 200
17
- message: OK
19
+ message: ''
18
20
  headers:
19
- X-Frame-Options:
20
- - SAMEORIGIN
21
- X-Xss-Protection:
22
- - 1; mode=block
23
- X-Content-Type-Options:
24
- - nosniff
25
- Content-Type:
26
- - text/html; charset=utf-8
21
+ Date:
22
+ - Wed, 17 Dec 2014 19:39:37 GMT
23
+ Server:
24
+ - Apache/2.2.15 (Red Hat)
25
+ X-Powered-By:
26
+ - Phusion Passenger (mod_rails/mod_rack) 3.0.19
27
+ X-Ua-Compatible:
28
+ - IE=Edge,chrome=1
27
29
  Etag:
28
- - '"444bcb3a3fcf8389296c49467f27e1d6"'
30
+ - '"67aa6d1481ba1537ae63af5aaf493f84"'
29
31
  Cache-Control:
30
32
  - max-age=0, private, must-revalidate
31
- X-Meta-Request-Version:
32
- - 0.3.4
33
33
  X-Request-Id:
34
- - 4fef0b48-9ee9-4ef8-811e-78b8c0f643f0
34
+ - f2e753d56bf896cde6e941be0f51d05a
35
35
  X-Runtime:
36
- - '0.005477'
37
- Connection:
38
- - close
39
- Server:
40
- - thin 1.6.2 codename Doc Brown
36
+ - '0.007983'
37
+ X-Rack-Cache:
38
+ - miss
39
+ Status:
40
+ - '200'
41
+ Content-Length:
42
+ - '2180'
43
+ Content-Type:
44
+ - application/xml; charset=utf-8
41
45
  body:
42
- encoding: US-ASCII
43
- string: ok
44
- http_version:
45
- recorded_at: Wed, 12 Nov 2014 19:34:02 GMT
46
+ encoding: UTF-8
47
+ string: |
48
+ <publicObject id="druid:yg867hg1375" published="2013-11-11T15:34:32-08:00">
49
+ <identityMetadata>
50
+ <objectId>druid:yg867hg1375</objectId>
51
+ <objectCreator>DOR</objectCreator>
52
+ <objectLabel>Francis E. Stafford photographs, 1909-1933</objectLabel>
53
+ <objectType>collection</objectType>
54
+ <adminPolicy>druid:vb546ms7107</adminPolicy>
55
+ <otherId name="catkey">9615156</otherId>
56
+ <otherId name="uuid">8f1feb20-4b29-11e3-8e31-0050569b3c3c</otherId>
57
+ <tag>Remediated By : 3.25.3</tag>
58
+ </identityMetadata>
59
+ <xml/>
60
+ <rightsMetadata>
61
+ <access type="discover">
62
+ <machine>
63
+ <world/>
64
+ </machine>
65
+ </access>
66
+ <access type="read">
67
+ <machine>
68
+ <world/>
69
+ </machine>
70
+ </access>
71
+ <use>
72
+ <human type="useAndReproduction"/>
73
+ <human type="creativeCommons"/>
74
+ <machine type="creativeCommons"/>
75
+ </use>
76
+ <copyright>
77
+ <human/>
78
+ </copyright>
79
+ </rightsMetadata>
80
+ <rdf:RDF xmlns:fedora-model="info:fedora/fedora-system:def/model#" xmlns:hydra="http://projecthydra.org/ns/relations#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
81
+ <rdf:Description rdf:about="info:fedora/druid:yg867hg1375">
82
+ </rdf:Description>
83
+ </rdf:RDF>
84
+ <oai_dc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:srw_dc="info:srw/schema/1/dc-schema" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
85
+ <dc:title>Francis E. Stafford photographs, 1909-1933</dc:title>
86
+ <dc:contributor>Stafford, Francis E., 1884-1938</dc:contributor>
87
+ <dc:type>Collection</dc:type>
88
+ <dc:date>1909-1933</dc:date>
89
+ <dc:language>und</dc:language>
90
+ <dc:format>3 oversize boxes.</dc:format>
91
+ <dc:description>Photographs of scenes in China, mainly between 1909 and 1915.</dc:description>
92
+ <dc:rights>Closed. Digital use copies available.</dc:rights>
93
+ <dc:description type="biographical/historical">American missionary in China, 1909-1915 and 1932-1933.</dc:description>
94
+ <dc:coverage>China</dc:coverage>
95
+ </oai_dc:dc>
96
+ </publicObject>
97
+ http_version:
98
+ recorded_at: Wed, 17 Dec 2014 19:39:38 GMT
46
99
  - request:
47
100
  method: get
48
101
  uri: http://127.0.0.1:3000/collections/yg867hg1375
@@ -74,9 +127,9 @@ http_interactions:
74
127
  X-Meta-Request-Version:
75
128
  - 0.3.4
76
129
  X-Request-Id:
77
- - 26936654-94cb-4135-b867-53f9ba72d1b5
130
+ - 125a9964-6326-4114-9f59-fb533551d554
78
131
  X-Runtime:
79
- - '0.011706'
132
+ - '0.011086'
80
133
  Connection:
81
134
  - close
82
135
  Server:
@@ -95,5 +148,5 @@ http_interactions:
95
148
  B: Photographs of China''s natural landscapes, urban scenes, cultural landmarks,
96
149
  social customs, and people."}],"counts":{"collections":1,"items":5,"total_count":6}}'
97
150
  http_version:
98
- recorded_at: Wed, 12 Nov 2014 19:34:02 GMT
151
+ recorded_at: Wed, 12 Nov 2014 19:34:03 GMT
99
152
  recorded_with: VCR 2.9.3
@@ -4,7 +4,7 @@ Coveralls.wear!
4
4
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
5
5
  $LOAD_PATH.unshift(File.dirname(__FILE__))
6
6
 
7
- require 'harvestdor-indexer'
7
+ require 'harvestdor/indexer'
8
8
 
9
9
  require 'vcr'
10
10
 
@@ -0,0 +1,174 @@
1
+ require 'spec_helper'
2
+
3
+ describe Harvestdor::Indexer::Resource do
4
+
5
+ before(:all) do
6
+ VCR.use_cassette('before_all_call') do
7
+ @config_yml_path = File.join(File.dirname(__FILE__), "..", "config", "ap.yml")
8
+ require 'yaml'
9
+ @config = YAML.load_file(@config_yml_path)
10
+ @fake_druid = 'oo000oo0000'
11
+
12
+ @indexer = Harvestdor::Indexer.new(@config)
13
+ @hdor_client = @indexer.send(:harvestdor_client)
14
+ @whitelist_path = File.join(File.dirname(__FILE__), "../config/ap_whitelist.txt")
15
+ end
16
+ end
17
+
18
+ let :resource do
19
+ Harvestdor::Indexer::Resource.new(@indexer, @fake_druid)
20
+ end
21
+
22
+ context "smods_rec method" do
23
+ before(:all) do
24
+ @ns_decl = "xmlns='#{Mods::MODS_NS}'"
25
+ @mods_xml = "<mods #{@ns_decl}><note>hi</note></mods>"
26
+ @ng_mods_xml = Nokogiri::XML(@mods_xml)
27
+ end
28
+ it "should call mods method on harvestdor_client" do
29
+ expect(@hdor_client).to receive(:mods).with(@fake_druid).and_return(@ng_mods_xml)
30
+ resource.smods_rec
31
+ end
32
+ it "should return Stanford::Mods::Record object" do
33
+ expect(@hdor_client).to receive(:mods).with(@fake_druid).and_return(@ng_mods_xml)
34
+ expect(resource.smods_rec).to be_an_instance_of(Stanford::Mods::Record)
35
+ end
36
+ it "should raise exception if MODS xml for the druid is empty" do
37
+ allow(@hdor_client).to receive(:mods).with(@fake_druid).and_return(Nokogiri::XML("<mods #{@ns_decl}/>"))
38
+ expect { resource.smods_rec }.to raise_error(RuntimeError, Regexp.new("^Empty MODS metadata for #{@fake_druid}: <"))
39
+ end
40
+ it "should raise exception if there is no MODS xml for the druid" do
41
+ VCR.use_cassette('exception_no_MODS_call') do
42
+ expect { resource.smods_rec }.to raise_error(Harvestdor::Errors::MissingMods)
43
+ end
44
+ end
45
+ end
46
+
47
+ context "public_xml related methods" do
48
+ before(:all) do
49
+ @id_md_xml = "<identityMetadata><objectId>druid:#{@fake_druid}</objectId></identityMetadata>"
50
+ @cntnt_md_xml = "<contentMetadata type='image' objectId='#{@fake_druid}'>foo</contentMetadata>"
51
+ @rights_md_xml = "<rightsMetadata><access type=\"discover\"><machine><world>bar</world></machine></access></rightsMetadata>"
52
+ @rdf_xml = "<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'><rdf:Description rdf:about=\"info:fedora/druid:#{@fake_druid}\">relationship!</rdf:Description></rdf:RDF>"
53
+ @pub_xml = "<publicObject id='druid:#{@fake_druid}'>#{@id_md_xml}#{@cntnt_md_xml}#{@rights_md_xml}#{@rdf_xml}</publicObject>"
54
+ @ng_pub_xml = Nokogiri::XML(@pub_xml)
55
+ end
56
+ context "#public_xml" do
57
+ it "should call public_xml method on harvestdor_client" do
58
+ expect(@hdor_client).to receive(:public_xml).with(@fake_druid).and_return(@ng_pub_xml)
59
+ resource.public_xml
60
+ end
61
+ it "retrieves entire public xml as a Nokogiri::XML::Document" do
62
+ expect(@hdor_client).to receive(:public_xml).with(@fake_druid).and_return(@ng_pub_xml)
63
+ px = resource.public_xml
64
+ expect(px).to be_kind_of(Nokogiri::XML::Document)
65
+ expect(px.root.name).to eq('publicObject')
66
+ expect(px.root.attributes['id'].text).to eq("druid:#{@fake_druid}")
67
+ end
68
+ it "raises exception if public xml for the druid is empty" do
69
+ expect(@hdor_client).to receive(:public_xml).with(@fake_druid).and_return(Nokogiri::XML("<publicObject/>"))
70
+ expect { resource.public_xml }.to raise_error(RuntimeError, Regexp.new("^Empty public xml for #{@fake_druid}: <"))
71
+ end
72
+ it "raises error if there is no public_xml page for the druid" do
73
+ expect(@hdor_client).to receive(:public_xml).with(@fake_druid).and_return(nil)
74
+ expect { resource.public_xml }.to raise_error(RuntimeError, "No public xml for #{@fake_druid}")
75
+ end
76
+ end
77
+ context "#content_metadata" do
78
+ it "returns a Nokogiri::XML::Document derived from the public xml if a druid is passed" do
79
+ allow(Harvestdor).to receive(:public_xml).with(@fake_druid, @indexer.config.harvestdor.purl).and_return(@ng_pub_xml)
80
+ cm = resource.content_metadata
81
+ expect(cm).to be_kind_of(Nokogiri::XML::Document)
82
+ expect(cm.root).not_to eq(nil)
83
+ expect(cm.root.name).to eq('contentMetadata')
84
+ expect(cm.root.attributes['objectId'].text).to eq(@fake_druid)
85
+ expect(cm.root.text.strip).to eq('foo')
86
+ end
87
+ it "raises RuntimeError if nil is returned by Harvestdor::Client.contentMetadata for the druid" do
88
+ expect(@hdor_client).to receive(:content_metadata).with(@fake_druid).and_return(nil)
89
+ expect { resource.content_metadata }.to raise_error(RuntimeError, "No contentMetadata for \"#{@fake_druid}\"")
90
+ end
91
+ end
92
+ context "#identity_metadata" do
93
+ it "returns a Nokogiri::XML::Document derived from the public xml if a druid is passed" do
94
+ allow(Harvestdor).to receive(:public_xml).with(@fake_druid, @indexer.config.harvestdor.purl).and_return(@ng_pub_xml)
95
+ im = resource.identity_metadata
96
+ expect(im).to be_kind_of(Nokogiri::XML::Document)
97
+ expect(im.root).not_to eq(nil)
98
+ expect(im.root.name).to eq('identityMetadata')
99
+ expect(im.root.text.strip).to eq("druid:#{@fake_druid}")
100
+ end
101
+ it "raises RuntimeError if nil is returned by Harvestdor::Client.identityMetadata for the druid" do
102
+ expect(@hdor_client).to receive(:identity_metadata).with(@fake_druid).and_return(nil)
103
+ expect { resource.identity_metadata }.to raise_error(RuntimeError, "No identityMetadata for \"#{@fake_druid}\"")
104
+ end
105
+ end
106
+ context "#rights_metadata" do
107
+ it "returns a Nokogiri::XML::Document derived from the public xml if a druid is passed" do
108
+ allow(Harvestdor).to receive(:public_xml).with(@fake_druid, @indexer.config.harvestdor.purl).and_return(@ng_pub_xml)
109
+ im = resource.rights_metadata
110
+ expect(im).to be_kind_of(Nokogiri::XML::Document)
111
+ expect(im.root).not_to eq(nil)
112
+ expect(im.root.name).to eq('rightsMetadata')
113
+ expect(im.root.text.strip).to eq("bar")
114
+ end
115
+ it "raises RuntimeError if nil is returned by Harvestdor::Client.rightsMetadata for the druid" do
116
+ expect(@hdor_client).to receive(:rights_metadata).with(@fake_druid).and_return(nil)
117
+ expect { resource.rights_metadata }.to raise_error(RuntimeError, "No rightsMetadata for \"#{@fake_druid}\"")
118
+ end
119
+ end
120
+ context "#rdf" do
121
+ it "returns a Nokogiri::XML::Document derived from the public xml if a druid is passed" do
122
+ allow(Harvestdor).to receive(:public_xml).with(@fake_druid, @indexer.config.harvestdor.purl).and_return(@ng_pub_xml)
123
+ im = resource.rdf
124
+ expect(im).to be_kind_of(Nokogiri::XML::Document)
125
+ expect(im.root).not_to eq(nil)
126
+ expect(im.root.name).to eq('RDF')
127
+ expect(im.root.text.strip).to eq("relationship!")
128
+ end
129
+ it "raises RuntimeError if nil is returned by Harvestdor::Client.rdf for the druid" do
130
+ expect(@hdor_client).to receive(:rdf).with(@fake_druid).and_return(nil)
131
+ expect { resource.rdf }.to raise_error(RuntimeError, "No RDF for \"#{@fake_druid}\"")
132
+ end
133
+ end
134
+
135
+ describe "#public_xml_or_druid" do
136
+ it "should return the public_xml, if the public_xml has been loaded" do
137
+ allow(resource).to receive(:public_xml?).and_return(true)
138
+ allow(resource).to receive(:public_xml).and_return(double)
139
+ expect(resource.public_xml_or_druid).to eq resource.public_xml
140
+ end
141
+ it "should return the druid, if the public_xml has not been loaded" do
142
+ allow(resource).to receive(:public_xml?).and_return(false)
143
+ expect(resource.public_xml_or_druid).to eq @fake_druid
144
+ end
145
+ end
146
+
147
+ describe "#identity_md_obj_label" do
148
+ it "should extract the objectLabel from the identity metadata" do
149
+ allow(resource).to receive(:identity_metadata).and_return(Nokogiri::XML("<identityMetadata><objectLabel>label</objectLabel></identityMetadata>"))
150
+ expect(resource.identity_md_obj_label).to eq "label"
151
+ end
152
+ end
153
+
154
+ describe "#collections" do
155
+ it "should extract the collection this resource is a member of and return Resource objects for those collections" do
156
+ allow(resource).to receive(:public_xml).and_return(Nokogiri::XML <<-EOF
157
+ <publicObject>
158
+ <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:fedora="info:fedora/fedora-system:def/relations-external#">
159
+ <rdf:Description>
160
+ <fedora:isMemberOfCollection rdf:resource="some:druid" />
161
+ </rdf:Description>
162
+ </rdf:RDF>
163
+ </publicObject>
164
+ EOF
165
+ )
166
+
167
+ expect(resource.collections.length).to eq 1
168
+ expect(resource.collections.first.druid).to eq "some:druid"
169
+ expect(resource.collections.first.indexer).to eq resource.indexer
170
+ end
171
+ end
172
+ end
173
+
174
+ end
@@ -0,0 +1,32 @@
1
+ require 'spec_helper'
2
+
3
+ describe Harvestdor::Indexer::Solr do
4
+ let :indexer do
5
+ double(logger: Logger.new("/dev/null"))
6
+ end
7
+
8
+ let :solr do
9
+ Harvestdor::Indexer::Solr.new indexer
10
+ end
11
+
12
+ # The method that sends the solr document to solr
13
+ describe "#add" do
14
+ let(:doc_hash) do
15
+ {
16
+ :id => "whatever",
17
+ :modsxml => 'whatever',
18
+ :title_display => 'title',
19
+ :pub_year_tisim => 'some year',
20
+ :author_person_display => 'author',
21
+ :format => 'Image',
22
+ :language => 'English'
23
+ }
24
+ end
25
+
26
+ it "sends an add request to the solr_client" do
27
+ expect(solr.client).to receive(:add).with(doc_hash)
28
+ solr.add(doc_hash)
29
+ end
30
+ end
31
+
32
+ end
@@ -5,61 +5,37 @@ describe Harvestdor::Indexer do
5
5
  before(:all) do
6
6
  VCR.use_cassette('before_all_call') do
7
7
  @config_yml_path = File.join(File.dirname(__FILE__), "..", "config", "ap.yml")
8
- @client_config_path = File.join(File.dirname(__FILE__), "../..", "config", "dor-fetcher-client.yml")
9
- @indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path)
10
8
  require 'yaml'
11
- @yaml = YAML.load_file(@config_yml_path)
9
+ @config = YAML.load_file(@config_yml_path)
10
+
11
+ @indexer = Harvestdor::Indexer.new(@config) do |config|
12
+ config.whitelist = ["druid:yg867hg1375"]
13
+ end
12
14
  @hdor_client = @indexer.send(:harvestdor_client)
13
- @fake_druid = 'oo000oo0000'
14
- @blacklist_path = File.join(File.dirname(__FILE__), "../config/ap_blacklist.txt")
15
+ @fake_druid = 'druid:oo000oo0000'
15
16
  @whitelist_path = File.join(File.dirname(__FILE__), "../config/ap_whitelist.txt")
16
17
  end
17
18
  end
18
-
19
- # The method that sends the solr document to solr
20
- describe "#solr_add" do
21
- before(:each) do
22
- doc_hash = {
23
- :modsxml => 'whatever',
24
- :title_display => 'title',
25
- :pub_year_tisim => 'some year',
26
- :author_person_display => 'author',
27
- :format => 'Image',
28
- :language => 'English'
29
- }
30
- end
31
- it "sends an add request to the solr_client" do
32
- expect(@indexer.solr_client).to receive(:add)
33
- @indexer.solr_add(@doc_hash, "abc123")
34
- end
35
- end
36
-
19
+
37
20
  describe "access methods" do
38
21
  it "initializes success count" do
39
- @indexer.success_count.should == 0
22
+ expect(@indexer.metrics.success_count).to eq(0)
40
23
  end
41
24
  it "initializes error count" do
42
- @indexer.error_count.should == 0
43
- end
44
- it "initializes max_retries" do
45
- expect(@indexer.max_retries).to eql(10)
46
- end
47
- it "allows overriding of max_retries" do
48
- @indexer.max_retries=6
49
- @indexer.max_retries.should == 6
25
+ expect(@indexer.metrics.error_count).to eq(0)
50
26
  end
51
27
  end
52
28
 
53
29
  describe "logging" do
54
30
  it "should write the log file to the directory indicated by log_dir" do
55
31
  @indexer.logger.info("indexer_spec logging test message")
56
- File.exists?(File.join(@yaml['log_dir'], @yaml['log_name'])).should == true
32
+ expect(File.exists?(File.join(@config['harvestdor']['log_dir'], @config['harvestdor']['log_name']))).to eq(true)
57
33
  end
58
34
  end
59
35
 
60
36
  it "should initialize the harvestdor_client from the config" do
61
37
  expect(@hdor_client).to be_an_instance_of(Harvestdor::Client)
62
- expect(@hdor_client.config.default_set).to eq(@yaml['default_set'])
38
+ expect(@hdor_client.config.default_set).to eq(@config['harvestdor']['default_set'])
63
39
  end
64
40
 
65
41
  context "harvest_and_index" do
@@ -69,63 +45,34 @@ describe Harvestdor::Indexer do
69
45
  }
70
46
  end
71
47
  it "should call dor_fetcher_client.druid_array and then call :add on rsolr connection" do
72
- @indexer.should_receive(:druids).and_return([@fake_druid])
73
- @indexer.solr_client.should_receive(:add).with(@doc_hash)
74
- @indexer.solr_client.should_receive(:commit)
48
+ allow_any_instance_of(Harvestdor::Indexer::Resource).to receive(:collection?).and_return(false)
49
+ expect(@indexer).to receive(:druids).and_return([@fake_druid])
50
+ expect(@indexer.solr).to receive(:add).with(@doc_hash)
51
+ expect(@indexer.solr).to receive(:commit!)
75
52
  @indexer.harvest_and_index
76
53
  end
77
54
 
78
55
  it "should only call :commit on rsolr connection once" do
79
56
  VCR.use_cassette('single_rsolr_connection_call') do
80
- indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path)
81
- hdor_client = indexer.send(:harvestdor_client)
82
- indexer.dor_fetcher_client.should_receive(:druid_array).and_return(["druid:yg867hg1375", "druid:jf275fd6276", "druid:nz353cp1092", "druid:tc552kq0798", "druid:th998nk0722", "druid:ww689vs6534"])
83
- indexer.solr_client.should_receive(:add).exactly(6).times
84
- indexer.solr_client.should_receive(:commit).once
85
- indexer.harvest_and_index
57
+ hdor_client = @indexer.send(:harvestdor_client)
58
+ expect(@indexer.dor_fetcher_client).to receive(:druid_array).and_return(["druid:yg867hg1375", "druid:jf275fd6276", "druid:nz353cp1092", "druid:tc552kq0798", "druid:th998nk0722", "druid:ww689vs6534"])
59
+ expect(@indexer.solr).to receive(:add).exactly(6).times
60
+ expect(@indexer.solr).to receive(:commit!).once
61
+ @indexer.harvest_and_index
86
62
  end
87
63
  end
88
64
 
89
- it "should not process druids in blacklist" do
90
- VCR.use_cassette('ignore_druids_in_blacklist_call') do
91
- lambda{
92
- indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:blacklist => @blacklist_path})
93
- hdor_client = indexer.send(:harvestdor_client)
94
- indexer.dor_fetcher_client.should_receive(:druid_array).and_return(["druid:yg867hg1375", "druid:jf275fd6276", "druid:nz353cp1092", "druid:tc552kq0798", "druid:th998nk0722", "druid:ww689vs6534"])
95
- indexer.solr_client.should_receive(:add).with(hash_including({:id => 'druid:nz353cp1092'}))
96
- indexer.solr_client.should_not_receive(:add).with(hash_including({:id => 'druid:jf275fd6276'}))
97
- indexer.solr_client.should_not_receive(:add).with(hash_including({:id => 'druid:tc552kq0798'}))
98
- indexer.solr_client.should_receive(:add).with(hash_including({:id => 'druid:th998nk0722'}))
99
- indexer.solr_client.should_receive(:commit)
100
- indexer.harvest_and_index
101
- }
102
- end
103
- end
104
- it "should not process druid if it is in both blacklist and whitelist" do
105
- VCR.use_cassette('ignore_druids_in_blacklist_and_whitelist_call') do
106
- lambda{
107
- indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:blacklist => @blacklist_path, :whitelist => @whitelist_path})
108
- hdor_client = indexer.send(:harvestdor_client)
109
- indexer.dor_fetcher_client.should_not_receive(:druid_array)
110
- indexer.solr_client.should_receive(:add).with(hash_including({:id => 'druid:yg867hg1375'}))
111
- indexer.solr_client.should_not_receive(:add).with(hash_including({:id => 'druid:jf275fd6276'}))
112
- indexer.solr_client.should_receive(:commit)
113
- indexer.harvest_and_index
114
- }
115
- end
116
- end
117
65
  it "should only process druids in whitelist if it exists" do
118
66
  VCR.use_cassette('process_druids_whitelist_call') do
119
- lambda{
120
- indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:whitelist => @whitelist_path})
121
- hdor_client = indexer.send(:harvestdor_client)
122
- indexer.dor_fetcher_client.should_not_receive(:druid_array)
123
- indexer.solr_client.should_receive(:add).with(hash_including({:id => 'druid:yg867hg1375'}))
124
- indexer.solr_client.should_receive(:add).with(hash_including({:id => 'druid:jf275fd6276'}))
125
- indexer.solr_client.should_receive(:add).with(hash_including({:id => 'druid:nz353cp1092'}))
126
- indexer.solr_client.should_receive(:commit)
127
- indexer.harvest_and_index
67
+ indexer = Harvestdor::Indexer.new(@config.merge(:whitelist => @whitelist_path))
68
+ hdor_client = indexer.send(:harvestdor_client)
69
+ added = []
70
+ allow(indexer.solr).to receive(:add) { |hash|
71
+ added << hash[:id]
128
72
  }
73
+ expect(indexer.solr).to receive(:commit!)
74
+ indexer.harvest_and_index
75
+ expect(added).to match_array ["druid:tc552kq0798", "druid:th998nk0722", "druid:ww689vs6534", "druid:yg867hg1375", 'druid:jf275fd6276', 'druid:nz353cp1092']
129
76
  end
130
77
  end
131
78
 
@@ -137,227 +84,36 @@ describe Harvestdor::Indexer do
137
84
  expect(@indexer.dor_fetcher_client).to be_an_instance_of(DorFetcher::Client)
138
85
  end
139
86
 
140
- it "should strip off is_member_of_collection_ and is_governed_by_ and return only the druid" do
141
- expect(@indexer.strip_default_set_string()).to eq("yg867hg1375")
142
- end
143
-
144
87
  it "druids method should call druid_array and get_collection methods on fetcher_client" do
145
88
  VCR.use_cassette('get_collection_druids_call') do
146
- expect(@indexer.druids).to eq(["druid:yg867hg1375", "druid:jf275fd6276", "druid:nz353cp1092", "druid:tc552kq0798", "druid:th998nk0722", "druid:ww689vs6534"])
89
+ expect(@indexer.resources.map(&:druid)).to match_array ["druid:yg867hg1375", "druid:jf275fd6276", "druid:nz353cp1092", "druid:tc552kq0798", "druid:th998nk0722", "druid:ww689vs6534"]
147
90
  end
148
91
  end
149
92
 
150
93
  it "should get the configuration of the dor-fetcher client from included yml file" do
151
- expect(@indexer.dor_fetcher_client.service_url).to eq(@indexer.client_config["dor_fetcher_service_url"])
94
+ expect(@indexer.dor_fetcher_client.service_url).to eq("http://127.0.0.1:3000")
152
95
  end
153
96
 
154
97
  end # ending replacing OAI context
155
-
156
- context "smods_rec method" do
157
- before(:all) do
158
- @fake_druid = 'oo000oo0000'
159
- @ns_decl = "xmlns='#{Mods::MODS_NS}'"
160
- @mods_xml = "<mods #{@ns_decl}><note>hi</note></mods>"
161
- @ng_mods_xml = Nokogiri::XML(@mods_xml)
162
- end
163
- it "should call mods method on harvestdor_client" do
164
- @hdor_client.should_receive(:mods).with(@fake_druid).and_return(@ng_mods_xml)
165
- @indexer.smods_rec(@fake_druid)
166
- end
167
- it "should return Stanford::Mods::Record object" do
168
- @hdor_client.should_receive(:mods).with(@fake_druid).and_return(@ng_mods_xml)
169
- @indexer.smods_rec(@fake_druid).should be_an_instance_of(Stanford::Mods::Record)
170
- end
171
- it "should raise exception if MODS xml for the druid is empty" do
172
- @hdor_client.stub(:mods).with(@fake_druid).and_return(Nokogiri::XML("<mods #{@ns_decl}/>"))
173
- expect { @indexer.smods_rec(@fake_druid) }.to raise_error(RuntimeError, Regexp.new("^Empty MODS metadata for #{@fake_druid}: <"))
174
- end
175
- it "should raise exception if there is no MODS xml for the druid" do
176
- VCR.use_cassette('exception_no_MODS_call') do
177
- expect { @indexer.smods_rec(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingMods)
178
- end
179
- end
180
- end
181
-
182
- context "public_xml related methods" do
183
- before(:all) do
184
- @id_md_xml = "<identityMetadata><objectId>druid:#{@fake_druid}</objectId></identityMetadata>"
185
- @cntnt_md_xml = "<contentMetadata type='image' objectId='#{@fake_druid}'>foo</contentMetadata>"
186
- @rights_md_xml = "<rightsMetadata><access type=\"discover\"><machine><world>bar</world></machine></access></rightsMetadata>"
187
- @rdf_xml = "<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'><rdf:Description rdf:about=\"info:fedora/druid:#{@fake_druid}\">relationship!</rdf:Description></rdf:RDF>"
188
- @pub_xml = "<publicObject id='druid:#{@fake_druid}'>#{@id_md_xml}#{@cntnt_md_xml}#{@rights_md_xml}#{@rdf_xml}</publicObject>"
189
- @ng_pub_xml = Nokogiri::XML(@pub_xml)
190
- end
191
- context "#public_xml" do
192
- it "should call public_xml method on harvestdor_client" do
193
- @hdor_client.should_receive(:public_xml).with(@fake_druid).and_return(@ng_pub_xml)
194
- @indexer.public_xml @fake_druid
195
- end
196
- it "retrieves entire public xml as a Nokogiri::XML::Document" do
197
- @hdor_client.should_receive(:public_xml).with(@fake_druid).and_return(@ng_pub_xml)
198
- px = @indexer.public_xml @fake_druid
199
- px.should be_kind_of(Nokogiri::XML::Document)
200
- px.root.name.should == 'publicObject'
201
- px.root.attributes['id'].text.should == "druid:#{@fake_druid}"
202
- end
203
- it "raises exception if public xml for the druid is empty" do
204
- @hdor_client.should_receive(:public_xml).with(@fake_druid).and_return(Nokogiri::XML("<publicObject/>"))
205
- expect { @indexer.public_xml(@fake_druid) }.to raise_error(RuntimeError, Regexp.new("^Empty public xml for #{@fake_druid}: <"))
206
- end
207
- it "raises error if there is no public_xml page for the druid" do
208
- @hdor_client.should_receive(:public_xml).with(@fake_druid).and_return(nil)
209
- expect { @indexer.public_xml(@fake_druid) }.to raise_error(RuntimeError, "No public xml for #{@fake_druid}")
210
- end
211
- end
212
- context "#content_metadata" do
213
- it "returns a Nokogiri::XML::Document derived from the public xml if a druid is passed" do
214
- Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(@ng_pub_xml)
215
- cm = @indexer.content_metadata(@fake_druid)
216
- cm.should be_kind_of(Nokogiri::XML::Document)
217
- cm.root.should_not == nil
218
- cm.root.name.should == 'contentMetadata'
219
- cm.root.attributes['objectId'].text.should == @fake_druid
220
- cm.root.text.strip.should == 'foo'
221
- end
222
- it "if passed a Nokogiri::XML::Document of the public xml, it does no fetch" do
223
- URI::HTTP.any_instance.should_not_receive(:open)
224
- @hdor_client.should_receive(:content_metadata).and_call_original
225
- cm = @indexer.content_metadata(@ng_pub_xml)
226
- cm.should be_kind_of(Nokogiri::XML::Document)
227
- cm.root.should_not == nil
228
- cm.root.name.should == 'contentMetadata'
229
- cm.root.attributes['objectId'].text.should == @fake_druid
230
- cm.root.text.strip.should == 'foo'
231
- end
232
- it "raises RuntimeError if nil is returned by Harvestdor::Client.contentMetadata for the druid" do
233
- @hdor_client.should_receive(:content_metadata).with(@fake_druid).and_return(nil)
234
- expect { @indexer.content_metadata(@fake_druid) }.to raise_error(RuntimeError, "No contentMetadata for \"#{@fake_druid}\"")
235
- end
236
- end
237
- context "#identity_metadata" do
238
- it "returns a Nokogiri::XML::Document derived from the public xml if a druid is passed" do
239
- Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(@ng_pub_xml)
240
- im = @indexer.identity_metadata(@fake_druid)
241
- im.should be_kind_of(Nokogiri::XML::Document)
242
- im.root.should_not == nil
243
- im.root.name.should == 'identityMetadata'
244
- im.root.text.strip.should == "druid:#{@fake_druid}"
245
- end
246
- it "if passed a Nokogiri::XML::Document of the public xml, it does no fetch" do
247
- URI::HTTP.any_instance.should_not_receive(:open)
248
- @hdor_client.should_receive(:identity_metadata).and_call_original
249
- im = @indexer.identity_metadata(@ng_pub_xml)
250
- im.should be_kind_of(Nokogiri::XML::Document)
251
- im.root.should_not == nil
252
- im.root.name.should == 'identityMetadata'
253
- im.root.text.strip.should == "druid:#{@fake_druid}"
254
- end
255
- it "raises RuntimeError if nil is returned by Harvestdor::Client.identityMetadata for the druid" do
256
- @hdor_client.should_receive(:identity_metadata).with(@fake_druid).and_return(nil)
257
- expect { @indexer.identity_metadata(@fake_druid) }.to raise_error(RuntimeError, "No identityMetadata for \"#{@fake_druid}\"")
258
- end
259
- end
260
- context "#rights_metadata" do
261
- it "returns a Nokogiri::XML::Document derived from the public xml if a druid is passed" do
262
- Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(@ng_pub_xml)
263
- im = @indexer.rights_metadata(@fake_druid)
264
- im.should be_kind_of(Nokogiri::XML::Document)
265
- im.root.should_not == nil
266
- im.root.name.should == 'rightsMetadata'
267
- im.root.text.strip.should == "bar"
268
- end
269
- it "raises RuntimeError if nil is returned by Harvestdor::Client.rightsMetadata for the druid" do
270
- @hdor_client.should_receive(:rights_metadata).with(@fake_druid).and_return(nil)
271
- expect { @indexer.rights_metadata(@fake_druid) }.to raise_error(RuntimeError, "No rightsMetadata for \"#{@fake_druid}\"")
272
- end
273
- end
274
- context "#rdf" do
275
- it "returns a Nokogiri::XML::Document derived from the public xml if a druid is passed" do
276
- Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(@ng_pub_xml)
277
- im = @indexer.rdf(@fake_druid)
278
- im.should be_kind_of(Nokogiri::XML::Document)
279
- im.root.should_not == nil
280
- im.root.name.should == 'RDF'
281
- im.root.text.strip.should == "relationship!"
282
- end
283
- it "raises RuntimeError if nil is returned by Harvestdor::Client.rdf for the druid" do
284
- @hdor_client.should_receive(:rdf).with(@fake_druid).and_return(nil)
285
- expect { @indexer.rdf(@fake_druid) }.to raise_error(RuntimeError, "No RDF for \"#{@fake_druid}\"")
286
- end
287
- end
288
- end
289
-
290
- context "blacklist" do
291
- it "should be an Array with an entry for each non-empty line in the file" do
292
- @indexer.send(:load_blacklist, @blacklist_path)
293
- @indexer.send(:blacklist).should be_an_instance_of(Array)
294
- @indexer.send(:blacklist).size.should == 2
295
- end
296
- it "should be empty Array if there was no blacklist config setting" do
297
- VCR.use_cassette('empty_array_no_blacklist_config_call') do
298
- indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path)
299
- expect(indexer.blacklist).to eq([])
300
- end
301
- end
302
- context "load_blacklist" do
303
- it "knows what is in the blacklist" do
304
- VCR.use_cassette('know_what_is_in_blacklist_call') do
305
- indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:blacklist => @blacklist_path})
306
- expect(indexer.blacklist).to eq(["druid:jf275fd6276", "druid:tc552kq0798"])
307
- end
308
- end
309
- it "should not be called if there was no blacklist config setting" do
310
- VCR.use_cassette('no_blacklist_config_call') do
311
- lambda{
312
- indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path)
313
-
314
- indexer.should_not_receive(:load_blacklist)
315
-
316
- hdor_client = indexer.send(:harvestdor_client)
317
- indexer.dor_fetcher_client.should_receive(:druid_array).and_return([@fake_druid])
318
- indexer.solr_client.should_receive(:add)
319
- indexer.solr_client.should_receive(:commit)
320
- indexer.harvest_and_index
321
- }
322
- end
323
- end
324
- it "should only try to load a blacklist once" do
325
- VCR.use_cassette('load_blacklist_once_call') do
326
- indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:blacklist => @blacklist_path})
327
- indexer.send(:blacklist)
328
- File.any_instance.should_not_receive(:open)
329
- indexer.send(:blacklist)
330
- end
331
- end
332
- it "should log an error message and throw RuntimeError if it can't find the indicated blacklist file" do
333
- VCR.use_cassette('no_blacklist_found_call') do
334
- exp_msg = 'Unable to find list of druids at bad_path'
335
- indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:blacklist => 'bad_path'})
336
- indexer.logger.should_receive(:fatal).with(exp_msg)
337
- expect { indexer.send(:load_blacklist, 'bad_path') }.to raise_error(exp_msg)
338
- end
339
- end
340
- end
341
- end # blacklist
342
98
 
343
99
  context "whitelist" do
344
100
  it "knows what is in the whitelist" do
345
101
  VCR.use_cassette('know_what_is_in_whitelist_call') do
346
102
  lambda{
347
- indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:whitelist => @whitelist_path})
103
+ indexer = Harvestdor::Indexer.new({:whitelist => @whitelist_path})
348
104
  expect(indexer.whitelist).to eq(["druid:yg867hg1375", "druid:jf275fd6276", "druid:nz353cp1092"])
349
105
  }
350
106
  end
351
107
  end
352
108
  it "should be an Array with an entry for each non-empty line in the file" do
353
109
  @indexer.send(:load_whitelist, @whitelist_path)
354
- @indexer.send(:whitelist).should be_an_instance_of(Array)
355
- @indexer.send(:whitelist).size.should == 3
110
+ expect(@indexer.send(:whitelist)).to be_an_instance_of(Array)
111
+ expect(@indexer.send(:whitelist).size).to eq(3)
356
112
  end
357
113
  it "should be empty Array if there was no whitelist config setting" do
358
114
  VCR.use_cassette('empty_array_no_whitelist_config_call') do
359
115
  lambda{
360
- indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path)
116
+ indexer = Harvestdor::Indexer.new()
361
117
  expect(indexer.whitelist).to eq([])
362
118
  }
363
119
  end
@@ -366,31 +122,31 @@ describe Harvestdor::Indexer do
366
122
  it "should not be called if there was no whitelist config setting" do
367
123
  VCR.use_cassette('no_whitelist_config_call') do
368
124
  lambda{
369
- indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path)
125
+ indexer = Harvestdor::Indexer.new()
370
126
 
371
- indexer.should_not_receive(:load_whitelist)
127
+ expect(indexer).not_to receive(:load_whitelist)
372
128
 
373
129
  hdor_client = indexer.send(:harvestdor_client)
374
- indexer.dor_fetcher_client.should_receive(:druid_array).and_return([@fake_druid])
375
- indexer.solr_client.should_receive(:add)
376
- indexer.solr_client.should_receive(:commit)
130
+ expect(indexer.dor_fetcher_client).to receive(:druid_array).and_return([@fake_druid])
131
+ expect(indexer.solr_client).to receive(:add)
132
+ expect(indexer.solr_client).to receive(:commit)
377
133
  indexer.harvest_and_index
378
134
  }
379
135
  end
380
136
  end
381
137
  it "should only try to load a whitelist once" do
382
138
  VCR.use_cassette('load_whitelist_once_call') do
383
- indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:whitelist => @whitelist_path})
139
+ indexer = Harvestdor::Indexer.new({:whitelist => @whitelist_path})
384
140
  indexer.send(:whitelist)
385
- File.any_instance.should_not_receive(:open)
141
+ expect_any_instance_of(File).not_to receive(:open)
386
142
  indexer.send(:whitelist)
387
143
  end
388
144
  end
389
145
  it "should log an error message and throw RuntimeError if it can't find the indicated whitelist file" do
390
146
  VCR.use_cassette('cant_find_whitelist_call') do
391
147
  exp_msg = 'Unable to find list of druids at bad_path'
392
- indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:whitelist => 'bad_path'})
393
- indexer.logger.should_receive(:fatal).with(exp_msg)
148
+ indexer = Harvestdor::Indexer.new(@config.merge(:whitelist => 'bad_path'))
149
+ expect(indexer.logger).to receive(:fatal).with(exp_msg)
394
150
  expect { indexer.send(:load_whitelist, 'bad_path') }.to raise_error(exp_msg)
395
151
  end
396
152
  end
@@ -399,15 +155,15 @@ describe Harvestdor::Indexer do
399
155
 
400
156
  it "solr_client should initialize the rsolr client using the options from the config" do
401
157
  VCR.use_cassette('rsolr_client_config_call') do
402
- indexer = Harvestdor::Indexer.new(nil, @client_config_path, Confstruct::Configuration.new(:solr => { :url => 'http://localhost:2345', :a => 1 }) )
403
- RSolr.should_receive(:connect).with(hash_including(:a => 1, :url => 'http://localhost:2345')).and_return('foo')
404
- indexer.solr_client
158
+ indexer = Harvestdor::Indexer.new(Confstruct::Configuration.new(:solr => { :url => 'http://localhost:2345', :a => 1 }) )
159
+ expect(RSolr).to receive(:connect).with(hash_including(:a => 1, :url => 'http://localhost:2345')).and_return('foo')
160
+ indexer.solr
405
161
  end
406
162
  end
407
163
 
408
164
  context "skip heartbeat" do
409
165
  it "allows me to use a fake url for dor-fetcher-client" do
410
- expect {Harvestdor::Indexer.new(@config_yml_path, @client_config_path)}.not_to raise_error
166
+ expect {Harvestdor::Indexer.new()}.not_to raise_error
411
167
  end
412
168
  end
413
169
  end