gdor-indexer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +31 -0
- data/.hound.yml +2 -0
- data/.rubocop.yml +3 -0
- data/.rubocop_todo.yml +131 -0
- data/.yardopts +3 -0
- data/Capfile +26 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +5 -0
- data/README.md +67 -0
- data/Rakefile +57 -0
- data/VERSION +1 -0
- data/bin/indexer +71 -0
- data/config/deploy.rb +31 -0
- data/config/deploy/dev.rb +41 -0
- data/config/deploy/fetcher.rb +6 -0
- data/config/deploy/prod.rb +41 -0
- data/config/deploy/stage.rb +41 -0
- data/gdor-indexer.gemspec +43 -0
- data/lib/gdor/indexer.rb +327 -0
- data/lib/gdor/indexer/mods_fields.rb +114 -0
- data/lib/gdor/indexer/nokogiri_xml_node_mixin.rb +42 -0
- data/lib/gdor/indexer/public_xml_fields.rb +81 -0
- data/lib/gdor/indexer/solr_doc_builder.rb +85 -0
- data/lib/gdor/indexer/solr_doc_hash.rb +112 -0
- data/lib/gdor/indexer/version.rb +5 -0
- data/spec/config/walters_integration_spec.yml +44 -0
- data/spec/spec_helper.rb +26 -0
- data/spec/unit/gdor_mods_fields_spec.rb +812 -0
- data/spec/unit/indexer_spec.rb +411 -0
- data/spec/unit/public_xml_fields_spec.rb +286 -0
- data/spec/unit/solr_doc_builder_spec.rb +128 -0
- data/spec/unit/solr_doc_hash_spec.rb +399 -0
- data/spec/vcr_cassettes/no_coll_druid_in_druid_array_call.yml +745 -0
- metadata +411 -0
@@ -0,0 +1,411 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe GDor::Indexer do
|
4
|
+
before(:all) do
|
5
|
+
@config_yml_path = File.join(File.dirname(__FILE__), '..', 'config', 'walters_integration_spec.yml')
|
6
|
+
require 'yaml'
|
7
|
+
@yaml = YAML.load_file(@config_yml_path)
|
8
|
+
@ns_decl = "xmlns='#{Mods::MODS_NS}'"
|
9
|
+
@fake_druid = 'oo000oo0000'
|
10
|
+
@coll_druid_from_test_config = 'ww121ss5000'
|
11
|
+
@mods_xml = "<mods #{@ns_decl}><note>Indexer test</note></mods>"
|
12
|
+
@ng_mods_xml = Nokogiri::XML("<mods #{@ns_decl}><note>Indexer test</note></mods>")
|
13
|
+
@pub_xml = "<publicObject id='druid#{@fake_druid}'></publicObject>"
|
14
|
+
@ng_pub_xml = Nokogiri::XML("<publicObject id='druid#{@fake_druid}'></publicObject>")
|
15
|
+
end
|
16
|
+
before(:each) do
|
17
|
+
@indexer = described_class.new(@config_yml_path) do |config|
|
18
|
+
config.whitelist = ['druid:ww121ss5000']
|
19
|
+
end
|
20
|
+
allow(@indexer.solr_client).to receive(:add)
|
21
|
+
end
|
22
|
+
|
23
|
+
let :resource do
|
24
|
+
r = Harvestdor::Indexer::Resource.new(double, @fake_druid)
|
25
|
+
allow(r).to receive(:collections).and_return []
|
26
|
+
allow(r).to receive(:mods).and_return Nokogiri::XML(@mods_xml)
|
27
|
+
allow(r).to receive(:public_xml).and_return Nokogiri::XML(@pub_xml)
|
28
|
+
allow(r).to receive(:public_xml?).and_return true
|
29
|
+
allow(r).to receive(:content_metadata).and_return nil
|
30
|
+
allow(r).to receive(:collection?).and_return false
|
31
|
+
r
|
32
|
+
end
|
33
|
+
|
34
|
+
let :collection do
|
35
|
+
r = Harvestdor::Indexer::Resource.new(double, @coll_druid_from_test_config)
|
36
|
+
allow(r).to receive(:collections).and_return []
|
37
|
+
allow(r).to receive(:mods).and_return Nokogiri::XML(@mods_xml)
|
38
|
+
allow(r).to receive(:public_xml).and_return Nokogiri::XML(@pub_xml)
|
39
|
+
allow(r).to receive(:public_xml?).and_return true
|
40
|
+
allow(r).to receive(:content_metadata).and_return nil
|
41
|
+
allow(r).to receive(:identity_md_obj_label).and_return ''
|
42
|
+
allow(r).to receive(:collection?).and_return true
|
43
|
+
r
|
44
|
+
end
|
45
|
+
|
46
|
+
context 'logging' do
|
47
|
+
it 'writes the log file to the directory indicated by log_dir' do
|
48
|
+
@indexer.logger.info('walters_integration_spec logging test message')
|
49
|
+
expect(File).to exist(File.join(@yaml['harvestdor']['log_dir'], @yaml['harvestdor']['log_name']))
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
# Examples for #harvest_and_index: reporting, per-resource indexing, and the
# optional Solr commit.
describe '#harvest_and_index' do
  before :each do
    # Stub harvesting, the Solr client and the reporting methods so that each
    # example only exercises the collaborator it sets an expectation on.
    allow(@indexer.harvestdor).to receive(:each_resource)
    allow(@indexer).to receive(:solr_client).and_return(double(commit!: nil))
    allow(@indexer).to receive(:log_results)
    allow(@indexer).to receive(:email_results)
  end

  it 'logs and emails results' do
    expect(@indexer).to receive(:log_results)
    expect(@indexer).to receive(:email_results)

    @indexer.harvest_and_index
  end

  it 'indexes each resource' do
    # Minimal stand-in for the harvestdor client: it yields the resources it
    # was constructed with and exposes a logger.
    fake_harvestdor_class = Class.new do
      def initialize(*items)
        @items = items
      end

      def each_resource(_opts = {})
        @items.each { |item| yield item }
      end

      def logger
        Logger.new(STDERR)
      end
    end
    allow(@indexer).to receive(:harvestdor).and_return(fake_harvestdor_class.new(collection, resource))

    expect(@indexer).to receive(:index).with(collection)
    expect(@indexer).to receive(:index).with(resource)

    @indexer.harvest_and_index
  end

  it 'sends a solr commit' do
    expect(@indexer.solr_client).to receive(:commit!)
    @indexer.harvest_and_index
  end

  it 'does not commit if nocommit is set' do
    expect(@indexer.solr_client).not_to receive(:commit!)
    # The positional `true` is the nocommit flag named in the description.
    @indexer.harvest_and_index(true)
  end
end
|
95
|
+
|
96
|
+
# Examples for #index: the resource is routed to the collection or the item
# document builder (the `collection` fixture stubs collection? => true, the
# `resource` fixture stubs it => false).
describe '#index' do
  it 'indexes collections as collections' do
    expect(@indexer).to receive(:collection_solr_document).with(collection)
    @indexer.index collection
  end

  it 'indexes other resources as items' do
    expect(@indexer).to receive(:item_solr_document).with(resource)
    @indexer.index resource
  end
end
|
107
|
+
|
108
|
+
# Examples for #index_with_exception_handling: a failure inside #index must be
# logged, recorded against the druid, and still propagate to the caller.
describe '#index_with_exception_handling' do
  it 'captures, logs, and re-raises any exception thrown by the indexing process' do
    # Raising with a bare String produces a RuntimeError.
    expect(@indexer).to receive(:index).with(resource).and_raise 'xyz'
    expect(@indexer.logger).to receive(:error)
    expect { @indexer.index_with_exception_handling(resource) }.to raise_error RuntimeError
    expect(@indexer.druids_failed_to_ix).to include resource.druid
  end
end
|
116
|
+
|
117
|
+
context '#item_solr_document' do
|
118
|
+
context 'unmerged' do
|
119
|
+
it 'calls Harvestdor::Indexer.solr_add' do
|
120
|
+
doc_hash = @indexer.item_solr_document(resource)
|
121
|
+
expect(doc_hash).to include id: @fake_druid
|
122
|
+
end
|
123
|
+
it 'calls validate_item' do
|
124
|
+
expect_any_instance_of(GDor::Indexer::SolrDocHash).to receive(:validate_item).and_return([])
|
125
|
+
@indexer.item_solr_document resource
|
126
|
+
end
|
127
|
+
it 'calls GDor::Indexer::SolrDocBuilder.validate_mods' do
|
128
|
+
allow_any_instance_of(GDor::Indexer::SolrDocHash).to receive(:validate_item).and_return([])
|
129
|
+
expect_any_instance_of(GDor::Indexer::SolrDocHash).to receive(:validate_mods).and_return([])
|
130
|
+
@indexer.item_solr_document resource
|
131
|
+
end
|
132
|
+
it 'calls add_coll_info' do
|
133
|
+
expect(@indexer).to receive(:add_coll_info)
|
134
|
+
@indexer.item_solr_document resource
|
135
|
+
end
|
136
|
+
it 'has fields populated from the collection record' do
|
137
|
+
sdb = double
|
138
|
+
allow(sdb).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new)
|
139
|
+
allow(sdb).to receive(:display_type)
|
140
|
+
allow(sdb).to receive(:file_ids)
|
141
|
+
allow(sdb.doc_hash).to receive(:validate_mods).and_return([])
|
142
|
+
allow(GDor::Indexer::SolrDocBuilder).to receive(:new).and_return(sdb)
|
143
|
+
allow(resource).to receive(:collections).and_return([double(druid: 'foo', bare_druid: 'foo', identity_md_obj_label: 'bar')])
|
144
|
+
doc_hash = @indexer.item_solr_document resource
|
145
|
+
expect(doc_hash).to include druid: @fake_druid, collection: ['foo'], collection_with_title: ['foo-|-bar']
|
146
|
+
end
|
147
|
+
it 'has fields populated from the MODS' do
|
148
|
+
title = 'fake title in mods'
|
149
|
+
ng_mods = Nokogiri::XML("<mods #{@ns_decl}><titleInfo><title>#{title}</title></titleInfo></mods>")
|
150
|
+
allow(resource).to receive(:mods).and_return(ng_mods)
|
151
|
+
doc_hash = @indexer.item_solr_document resource
|
152
|
+
expect(doc_hash).to include id: @fake_druid, title_display: title
|
153
|
+
end
|
154
|
+
it 'populates url_fulltext field with purl page url' do
|
155
|
+
doc_hash = @indexer.item_solr_document resource
|
156
|
+
expect(doc_hash).to include id: @fake_druid, url_fulltext: "#{@yaml['harvestdor']['purl']}/#{@fake_druid}"
|
157
|
+
end
|
158
|
+
it 'populates druid and access_facet fields' do
|
159
|
+
doc_hash = @indexer.item_solr_document resource
|
160
|
+
expect(doc_hash).to include id: @fake_druid, druid: @fake_druid, access_facet: 'Online'
|
161
|
+
end
|
162
|
+
it 'populates display_type field by calling display_type method' do
|
163
|
+
expect_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:display_type).and_return('foo')
|
164
|
+
doc_hash = @indexer.item_solr_document resource
|
165
|
+
expect(doc_hash).to include id: @fake_druid, display_type: 'foo'
|
166
|
+
end
|
167
|
+
it 'populates file_id field by calling file_ids method' do
|
168
|
+
expect_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:file_ids).at_least(1).times.and_return(['foo'])
|
169
|
+
doc_hash = @indexer.item_solr_document resource
|
170
|
+
expect(doc_hash).to include id: @fake_druid, file_id: ['foo']
|
171
|
+
end
|
172
|
+
it 'populates building_facet field with Stanford Digital Repository' do
|
173
|
+
doc_hash = @indexer.item_solr_document resource
|
174
|
+
expect(doc_hash).to include id: @fake_druid, building_facet: 'Stanford Digital Repository'
|
175
|
+
end
|
176
|
+
end # unmerged item
|
177
|
+
end # item_solr_document
|
178
|
+
|
179
|
+
# Examples for #collection_solr_document: validation hooks and the fields
# added to the Solr document built for a collection resource.
context '#collection_solr_document' do
  context 'unmerged' do
    it 'calls validate_collection' do
      doc_hash = GDor::Indexer::SolrDocHash.new
      allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(doc_hash) # speed up the test
      expect(doc_hash).to receive(:validate_collection).and_return([])
      # The return value is deliberately unused: the expectation above is the assertion.
      @indexer.collection_solr_document collection
    end

    it 'calls validate_mods' do
      doc_hash = GDor::Indexer::SolrDocHash.new
      allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(doc_hash) # speed up the test
      expect(doc_hash).to receive(:validate_mods).and_return([])
      # The return value is deliberately unused: the expectation above is the assertion.
      @indexer.collection_solr_document collection
    end

    it 'populates druid and access_facet fields' do
      doc_hash = @indexer.collection_solr_document collection
      expect(doc_hash).to include druid: @coll_druid_from_test_config, access_facet: 'Online'
    end

    it 'populates url_fulltext field with purl page url' do
      doc_hash = @indexer.collection_solr_document collection
      expect(doc_hash).to include url_fulltext: "#{@yaml['harvestdor']['purl']}/#{@coll_druid_from_test_config}"
    end

    it "collection_type should be 'Digital Collection'" do
      allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new) # speed up the test

      doc_hash = @indexer.collection_solr_document collection
      expect(doc_hash).to include collection_type: 'Digital Collection'
    end

    context 'add format_main_ssim Archive/Manuscript' do
      it 'no other values' do
        allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new)

        doc_hash = @indexer.collection_solr_document collection
        expect(doc_hash).to include format_main_ssim: 'Archive/Manuscript'
      end

      it 'other values present' do
        allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new({ format_main_ssim: %w(Image Video) }))

        doc_hash = @indexer.collection_solr_document collection
        expect(doc_hash).to include format_main_ssim: ['Image', 'Video', 'Archive/Manuscript']
      end

      it 'already has values Archive/Manuscript' do
        allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new({ format_main_ssim: 'Archive/Manuscript' }))

        doc_hash = @indexer.collection_solr_document collection
        expect(doc_hash).to include format_main_ssim: ['Archive/Manuscript']
      end
    end

    it 'populates building_facet field with Stanford Digital Repository' do
      doc_hash = @indexer.collection_solr_document collection
      expect(doc_hash).to include building_facet: 'Stanford Digital Repository'
    end
  end # unmerged collection
end # collection_solr_document
|
233
|
+
|
234
|
+
context '#add_coll_info and supporting methods' do
|
235
|
+
before(:each) do
|
236
|
+
@coll_druids_array = [collection]
|
237
|
+
end
|
238
|
+
|
239
|
+
it 'adds no collection field values to doc_hash if there are none' do
|
240
|
+
doc_hash = GDor::Indexer::SolrDocHash.new({})
|
241
|
+
@indexer.add_coll_info(doc_hash, nil)
|
242
|
+
expect(doc_hash[:collection]).to be_nil
|
243
|
+
expect(doc_hash[:collection_with_title]).to be_nil
|
244
|
+
expect(doc_hash[:display_type]).to be_nil
|
245
|
+
end
|
246
|
+
|
247
|
+
context 'collection field' do
|
248
|
+
it 'is added field to doc hash' do
|
249
|
+
doc_hash = GDor::Indexer::SolrDocHash.new({})
|
250
|
+
@indexer.add_coll_info(doc_hash, @coll_druids_array)
|
251
|
+
expect(doc_hash[:collection]).to match_array [@coll_druid_from_test_config]
|
252
|
+
end
|
253
|
+
it 'adds two values to doc_hash when object belongs to two collections' do
|
254
|
+
coll_druid1 = 'oo111oo2222'
|
255
|
+
coll_druid2 = 'oo333oo4444'
|
256
|
+
doc_hash = GDor::Indexer::SolrDocHash.new({})
|
257
|
+
@indexer.add_coll_info(doc_hash, [double(druid: coll_druid1, bare_druid: coll_druid1, public_xml: @ng_pub_xml, identity_md_obj_label: ''), double(druid: coll_druid2, bare_druid: coll_druid2, public_xml: @ng_pub_xml, identity_md_obj_label: '')])
|
258
|
+
expect(doc_hash[:collection]).to match_array [coll_druid1, coll_druid2]
|
259
|
+
end
|
260
|
+
end
|
261
|
+
|
262
|
+
context 'collection_with_title field' do
|
263
|
+
it 'is added to doc_hash' do
|
264
|
+
coll_druid = 'oo000oo1234'
|
265
|
+
doc_hash = GDor::Indexer::SolrDocHash.new({})
|
266
|
+
@indexer.add_coll_info(doc_hash, [double(druid: coll_druid, bare_druid: coll_druid, public_xml: @ng_pub_xml, identity_md_obj_label: 'zzz')])
|
267
|
+
expect(doc_hash[:collection_with_title]).to match_array ["#{coll_druid}-|-zzz"]
|
268
|
+
end
|
269
|
+
it 'adds two values to doc_hash when object belongs to two collections' do
|
270
|
+
coll_druid1 = 'oo111oo2222'
|
271
|
+
coll_druid2 = 'oo333oo4444'
|
272
|
+
doc_hash = GDor::Indexer::SolrDocHash.new({})
|
273
|
+
@indexer.add_coll_info(doc_hash, [double(druid: coll_druid1, bare_druid: coll_druid1, public_xml: @ng_pub_xml, identity_md_obj_label: 'foo'), double(druid: coll_druid2, bare_druid: coll_druid2, public_xml: @ng_pub_xml, identity_md_obj_label: 'bar')])
|
274
|
+
expect(doc_hash[:collection_with_title]).to match_array ["#{coll_druid1}-|-foo", "#{coll_druid2}-|-bar"]
|
275
|
+
end
|
276
|
+
# other tests show it uses druid when coll rec isn't merged
|
277
|
+
end
|
278
|
+
|
279
|
+
context '#coll_display_types_from_items' do
|
280
|
+
before(:each) do
|
281
|
+
@indexer.coll_display_types_from_items(collection)
|
282
|
+
end
|
283
|
+
it 'gets single item display_type for single collection (and no dups)' do
|
284
|
+
allow(@indexer).to receive(:identity_md_obj_label)
|
285
|
+
doc_hash = GDor::Indexer::SolrDocHash.new({ display_type: 'image' })
|
286
|
+
@indexer.add_coll_info(doc_hash, @coll_druids_array)
|
287
|
+
doc_hash = GDor::Indexer::SolrDocHash.new({ display_type: 'image' })
|
288
|
+
@indexer.add_coll_info(doc_hash, @coll_druids_array)
|
289
|
+
expect(@indexer.coll_display_types_from_items(collection)).to match_array ['image']
|
290
|
+
end
|
291
|
+
it 'gets multiple formats from multiple items for single collection' do
|
292
|
+
allow(@indexer).to receive(:identity_md_obj_label)
|
293
|
+
doc_hash = GDor::Indexer::SolrDocHash.new({ display_type: 'image' })
|
294
|
+
@indexer.add_coll_info(doc_hash, @coll_druids_array)
|
295
|
+
doc_hash = GDor::Indexer::SolrDocHash.new({ display_type: 'file' })
|
296
|
+
@indexer.add_coll_info(doc_hash, @coll_druids_array)
|
297
|
+
expect(@indexer.coll_display_types_from_items(collection)).to match_array %w(image file)
|
298
|
+
end
|
299
|
+
end # coll_display_types_from_items
|
300
|
+
end # add_coll_info
|
301
|
+
|
302
|
+
context '#num_found_in_solr' do
|
303
|
+
before :each do
|
304
|
+
@unmerged_collection_response = { 'response' => { 'numFound' => '1', 'docs' => [{ 'id' => 'dm212rn7381', 'url_fulltext' => ['https://purl.stanford.edu/dm212rn7381'] }] } }
|
305
|
+
@item_response = { 'response' => { 'numFound' => '265', 'docs' => [{ 'id' => 'dm212rn7381' }] } }
|
306
|
+
end
|
307
|
+
|
308
|
+
it 'counts the items and the collection object in the solr index after indexing' do
|
309
|
+
allow(@indexer.solr_client.client).to receive(:get) do |_wt, params|
|
310
|
+
if params[:params][:fq].include?('id:"dm212rn7381"')
|
311
|
+
@unmerged_collection_response
|
312
|
+
else
|
313
|
+
@item_response
|
314
|
+
end
|
315
|
+
end
|
316
|
+
expect(@indexer.num_found_in_solr(collection: 'dm212rn7381')).to eq(266)
|
317
|
+
end
|
318
|
+
end # num_found_in_solr
|
319
|
+
|
320
|
+
# Examples for #email_report_body: the report text is matched against the
# pieces of information it is expected to contain.
context '#email_report_body' do
  before :each do
    @indexer.config.notification = 'notification-list@example.com'
    # Stub the Solr count and the harvested resources so the report can be
    # built without external services.
    allow(@indexer).to receive(:num_found_in_solr).and_return(500)
    allow(@indexer.harvestdor).to receive(:resources).and_return([collection])
    allow(collection).to receive(:items).and_return([1, 2, 3])
    allow(collection).to receive(:identity_md_obj_label).and_return('testcoll title')
  end

  subject do
    @indexer.email_report_body
  end

  # Regexp arguments are parenthesized so Ruby does not have to guess whether
  # `/` starts a regexp literal or is the division operator.
  it 'email body includes coll id' do
    expect(subject).to match(/testcoll indexed coll record is: ww121ss5000/)
  end

  it 'email body includes coll title' do
    expect(subject).to match(/coll title: testcoll title/)
  end

  it 'email body includes failed to index druids' do
    @indexer.instance_variable_set(:@druids_failed_to_ix, %w(a b))
    expect(subject).to match(/records that may have failed to index \(merged recs as druids, not ckeys\): \na\nb\n\n/)
  end

  it 'email body includes validation messages' do
    @indexer.instance_variable_set(:@validation_messages, ['this is a validation message'])
    expect(subject).to match(/this is a validation message/)
  end

  it 'email includes reference to full log' do
    expect(subject).to match(/full log is at gdor_indexer\/shared\/spec\/test_logs\/testcoll\.log/)
  end
end
|
355
|
+
|
356
|
+
# Examples for #email_results: checks the recipient, subject and body that
# are handed to #send_email.
describe '#email_results' do
  before :each do
    @indexer.config.notification = 'notification-list@example.com'
    # Never deliver real mail, and use a fixed report body.
    allow(@indexer).to receive(:send_email)
    allow(@indexer).to receive(:email_report_body).and_return('Report Body')
  end

  it 'has an appropriate subject' do
    expect(@indexer).to receive(:send_email) do |_to, opts|
      # Parenthesized so the regexp literal is not parsed ambiguously.
      expect(opts[:subject]).to match(/is finished/)
    end

    @indexer.email_results
  end

  it 'sends the email to the notification list' do
    expect(@indexer).to receive(:send_email) do |to, _opts|
      expect(to).to eq @indexer.config.notification
    end

    @indexer.email_results
  end

  it 'has the report body' do
    expect(@indexer).to receive(:send_email) do |_to, opts|
      expect(opts[:body]).to eq 'Report Body'
    end

    @indexer.email_results
  end
end
|
387
|
+
|
388
|
+
describe '#send_email' do
|
389
|
+
it 'sends an email to the right list' do
|
390
|
+
expect_any_instance_of(Mail::Message).to receive(:deliver!) do |mail|
|
391
|
+
expect(mail.to).to match_array ['notification-list@example.com']
|
392
|
+
end
|
393
|
+
@indexer.send_email 'notification-list@example.com', {}
|
394
|
+
end
|
395
|
+
|
396
|
+
it 'has the appropriate options set' do
|
397
|
+
expect_any_instance_of(Mail::Message).to receive(:deliver!) do |mail|
|
398
|
+
expect(mail.subject).to eq 'Subject'
|
399
|
+
expect(mail.from).to match_array ['rspec']
|
400
|
+
expect(mail.body).to eq 'Body'
|
401
|
+
end
|
402
|
+
@indexer.send_email 'notification-list@example.com', { from: 'rspec', subject: 'Subject', body: 'Body' }
|
403
|
+
end
|
404
|
+
end
|
405
|
+
|
406
|
+
# context "skip heartbeat" do
|
407
|
+
# it "allows me to use a fake url for dor-fetcher-client" do
|
408
|
+
# expect {GDor::Indexer.new(@config_yml_path)}.not_to raise_error
|
409
|
+
# end
|
410
|
+
# end
|
411
|
+
end
|
@@ -0,0 +1,286 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe GDor::Indexer::PublicXmlFields do
|
4
|
+
before(:all) do
|
5
|
+
@fake_druid = 'oo000oo0000'
|
6
|
+
@ns_decl = "xmlns='#{Mods::MODS_NS}'"
|
7
|
+
@mods_xml = "<mods #{@ns_decl}><note>public_xml_fields tests</note></mods>"
|
8
|
+
@empty_pub_xml = "<publicObject id='druid:#{@fake_druid}'></publicObject>"
|
9
|
+
end
|
10
|
+
|
11
|
+
let :logger do
|
12
|
+
Logger.new(StringIO.new)
|
13
|
+
end
|
14
|
+
|
15
|
+
# Builds a GDor::Indexer::SolrDocBuilder around a resource whose public XML
# is stubbed with the given markup.
#
# @param m [String] public XML markup, parsed with Nokogiri before stubbing
# @return [GDor::Indexer::SolrDocBuilder] builder for the stubbed resource
def sdb_for_pub_xml(m)
  resource = Harvestdor::Indexer::Resource.new(double, @fake_druid)
  allow(resource).to receive(:public_xml).and_return(Nokogiri::XML(m))
  # Stub mods with a parsed document, matching the public_xml stub above,
  # instead of handing back the raw String.
  allow(resource).to receive(:mods).and_return(Nokogiri::XML(@mods_xml))
  GDor::Indexer::SolrDocBuilder.new(resource, logger)
end
|
21
|
+
|
22
|
+
# Builds a GDor::Indexer::SolrDocBuilder around a resource whose
# contentMetadata is stubbed with the root element of the given markup.
#
# @param m [String] contentMetadata markup; its parsed root node is returned
#   by the stubbed content_metadata
# @return [GDor::Indexer::SolrDocBuilder] builder for the stubbed resource
def sdb_for_content_md(m)
  resource = Harvestdor::Indexer::Resource.new(double, @fake_druid)
  allow(resource).to receive(:content_metadata).and_return(Nokogiri::XML(m).root)
  # Stub public_xml and mods with parsed documents, the way sdb_for_pub_xml
  # stubs public_xml, instead of handing back raw Strings.
  allow(resource).to receive(:public_xml).and_return(Nokogiri::XML(@empty_pub_xml))
  allow(resource).to receive(:mods).and_return(Nokogiri::XML(@mods_xml))
  GDor::Indexer::SolrDocBuilder.new(resource, logger)
end
|
29
|
+
|
30
|
+
# NOTE:
|
31
|
+
# "Doubles, stubs, and message expectations are all cleaned out after each example."
|
32
|
+
# per https://www.relishapp.com/rspec/rspec-mocks/docs/scope
|
33
|
+
|
34
|
+
context 'contentMetadata fields and methods' do
|
35
|
+
before(:all) do
|
36
|
+
@content_md_start = "<contentMetadata objectId='#{@fake_druid}'>"
|
37
|
+
@content_md_end = '</contentMetadata>'
|
38
|
+
@cntnt_md_type = 'image'
|
39
|
+
@cntnt_md_xml = "<contentMetadata type='#{@cntnt_md_type}' objectId='#{@fake_druid}'>#{@content_md_end}"
|
40
|
+
@pub_xml = "<publicObject id='druid:#{@fake_druid}'>#{@cntnt_md_xml}</publicObject>"
|
41
|
+
@ng_pub_xml = Nokogiri::XML(@pub_xml)
|
42
|
+
end
|
43
|
+
|
44
|
+
context 'dor_content_type' do
|
45
|
+
it 'is the value of the type attribute on <contentMetadata> element' do
|
46
|
+
val = 'foo'
|
47
|
+
cntnt_md = "<contentMetadata type='#{val}'>#{@content_md_end}"
|
48
|
+
sdb = sdb_for_content_md(cntnt_md)
|
49
|
+
expect(sdb.send(:dor_content_type)).to eq(val)
|
50
|
+
end
|
51
|
+
it 'logs an error message if there is no content type' do
|
52
|
+
cntnt_md = "#{@content_md_start}#{@content_md_end}"
|
53
|
+
sdb = sdb_for_content_md(cntnt_md)
|
54
|
+
expect(sdb.logger).to receive(:error).with("#{@fake_druid} has no DOR content type (<contentMetadata> element may be missing type attribute)")
|
55
|
+
sdb.send(:dor_content_type)
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
context 'display_type' do
|
60
|
+
let :sdb do
|
61
|
+
sdb_for_pub_xml @empty_pub_xml
|
62
|
+
end
|
63
|
+
|
64
|
+
it "'image' for dor_content_type 'image'" do
|
65
|
+
allow(sdb).to receive(:dor_content_type).and_return('image')
|
66
|
+
expect(sdb.display_type).to eq('image')
|
67
|
+
end
|
68
|
+
it "'image' for dor_content_type 'manuscript'" do
|
69
|
+
allow(sdb).to receive(:dor_content_type).and_return('manuscript')
|
70
|
+
expect(sdb.display_type).to eq('image')
|
71
|
+
end
|
72
|
+
it "'image' for dor_content_type 'map'" do
|
73
|
+
allow(sdb).to receive(:dor_content_type).and_return('map')
|
74
|
+
expect(sdb.display_type).to eq('image')
|
75
|
+
end
|
76
|
+
it "'file' for dor_content_type 'media'" do
|
77
|
+
allow(sdb).to receive(:dor_content_type).and_return('media')
|
78
|
+
expect(sdb.display_type).to eq('file')
|
79
|
+
end
|
80
|
+
it "'book' for dor_content_type 'book'" do
|
81
|
+
allow(sdb).to receive(:dor_content_type).and_return('book')
|
82
|
+
expect(sdb.display_type).to eq('book')
|
83
|
+
end
|
84
|
+
it "'file' for unrecognized dor_content_type" do
|
85
|
+
allow(sdb).to receive(:dor_content_type).and_return('foo')
|
86
|
+
expect(sdb.display_type).to eq('file')
|
87
|
+
end
|
88
|
+
end # display_type
|
89
|
+
|
90
|
+
context '#file_ids' do
|
91
|
+
context 'file display_type' do
|
92
|
+
context 'contentMetadata type=file, resource type=file' do
|
93
|
+
it 'is id attrib of file element in single resource element with type=file' do
|
94
|
+
m = '<contentMetadata type="file" objectId="xh812jt9999">
|
95
|
+
<resource type="file" sequence="1" id="xh812jt9999_1">
|
96
|
+
<label>John A. Blume Earthquake Engineering Center Technical Report 180</label>
|
97
|
+
<file id="TR180_Shahi.pdf" mimetype="application/pdf" size="4949212" />
|
98
|
+
</resource></contentMetadata>'
|
99
|
+
sdb = sdb_for_content_md(m)
|
100
|
+
expect(sdb.file_ids).to match_array ['TR180_Shahi.pdf']
|
101
|
+
end
|
102
|
+
it 'is id attrib of file elements in multiple resource elements with type=file' do
|
103
|
+
m = '<contentMetadata objectId="jt108hm9275" type="file">
|
104
|
+
<resource id="jt108hm9275_1" sequence="1" type="file">
|
105
|
+
<label>Access to Energy newsletter, 1973-1994</label>
|
106
|
+
<file id="ATE.PDF" mimetype="application/pdf" size="16297305" />
|
107
|
+
</resource>
|
108
|
+
<resource id="jt108hm9275_8" sequence="8" type="file">
|
109
|
+
<label>Computer Forum Festschrift for Edward Feigenbaum, 2006 (part 6)</label>
|
110
|
+
<file id="SC0524_2013-047_b8_811.mp4" mimetype="video/mp4" size="860912776" />
|
111
|
+
</resource>
|
112
|
+
<resource id="jt108hm9275_9" sequence="9" type="file">
|
113
|
+
<label>Stanford AI Lab (SAILDART) files</label>
|
114
|
+
<file id="SAILDART.zip" mimetype="application/zip" size="472230479" />
|
115
|
+
</resource>
|
116
|
+
<resource id="jt108hm9275_10" sequence="10" type="file">
|
117
|
+
<label>WTDS Interview: Douglas C. Engelbart, 2006 Apr 13</label>
|
118
|
+
<file id="DougEngelbart041306.wav" mimetype="audio/x-wav" size="273705910" />
|
119
|
+
</resource></contentMetadata>'
|
120
|
+
sdb = sdb_for_content_md(m)
|
121
|
+
expect(sdb.file_ids).to match_array ['ATE.PDF', 'SC0524_2013-047_b8_811.mp4', 'SAILDART.zip', 'DougEngelbart041306.wav']
|
122
|
+
end
|
123
|
+
end # contentMetadata type=file, resource type=file
|
124
|
+
it 'contentMetadata type=geo, resource type=object' do
|
125
|
+
m = '<contentMetadata objectId="druid:qk786js7484" type="geo">
|
126
|
+
<resource id="druid:qk786js7484_1" sequence="1" type="object">
|
127
|
+
<label>Data</label>
|
128
|
+
<file id="data.zip" mimetype="application/zip" role="master" size="10776648" />
|
129
|
+
</resource>
|
130
|
+
<resource id="druid:qk786js7484_2" sequence="2" type="preview">
|
131
|
+
<label>Preview</label>
|
132
|
+
<file id="preview.jpg" mimetype="image/jpeg" role="master" size="140661">
|
133
|
+
<imageData height="846" width="919"/>
|
134
|
+
</file>
|
135
|
+
</resource></contentMetadata>'
|
136
|
+
sdb = sdb_for_content_md(m)
|
137
|
+
expect(sdb.file_ids).to match_array ['data.zip', 'preview.jpg']
|
138
|
+
end
|
139
|
+
|
140
|
+
# FIXME: non-file resource types
|
141
|
+
end # file display_type
|
142
|
+
context 'image display_type' do
|
143
|
+
context 'contentMetadata type=image' do
|
144
|
+
it 'resource type=image should be id attrib of file elements' do
|
145
|
+
m = '<contentMetadata objectId="rg759wj0953" type="image">
|
146
|
+
<resource id="rg759wj0953_1" sequence="1" type="image">
|
147
|
+
<label>Image 1</label>
|
148
|
+
<file id="rg759wj0953_00_0003.jp2" mimetype="image/jp2" size="13248250">
|
149
|
+
<imageData width="6254" height="11236"/>
|
150
|
+
</file>
|
151
|
+
</resource>
|
152
|
+
<resource id="rg759wj0953_2" sequence="2" type="image">
|
153
|
+
<label>Image 2</label>
|
154
|
+
<file id="rg759wj0953_00_00_0001.jp2" mimetype="image/jp2" size="8484503">
|
155
|
+
<imageData width="7266" height="6188"/>
|
156
|
+
</file>
|
157
|
+
</resource></contentMetadata>'
|
158
|
+
sdb = sdb_for_content_md m
|
159
|
+
expect(sdb.file_ids).to match_array ['rg759wj0953_00_0003.jp2', 'rg759wj0953_00_00_0001.jp2']
|
160
|
+
end
|
161
|
+
it 'resource type=object should be ignored' do
|
162
|
+
m = '<contentMetadata objectId="ny981gz0831" type="image">
|
163
|
+
<resource id="ny981gz0831_1" sequence="1" type="object">
|
164
|
+
<label>File 1</label>
|
165
|
+
<file id="da39a3ee5e6b4b0d3255bfef95601890afd80709.dderr" mimetype="application/x-symlink" size="26634" />
|
166
|
+
<file id="da39a3ee5e6b4b0d3255bfef95601890afd80709.img" mimetype="application/x-symlink" size="368640" />
|
167
|
+
<file id="da39a3ee5e6b4b0d3255bfef95601890afd80709.img.sha" mimetype="application/x-symlink" size="173" />
|
168
|
+
</resource></contentMetadata>'
|
169
|
+
sdb = sdb_for_content_md(m)
|
170
|
+
expect(sdb.file_ids).to be_nil
|
171
|
+
end
|
172
|
+
end # contentMetadata type=image
|
173
|
+
context 'contentMetadata type=map, resource type=image' do
|
174
|
+
it 'is id attrib of file elements' do
|
175
|
+
m = '<contentMetadata objectId="druid:rf935xg1061" type="map">
|
176
|
+
<resource id="0001" sequence="1" type="image">
|
177
|
+
<file id="rf935xg1061_00_0001.jp2" mimetype="image/jp2" size="20204910">
|
178
|
+
<imageData height="7248" width="14787"/>
|
179
|
+
</file>
|
180
|
+
</resource>
|
181
|
+
<resource id="0002" sequence="2" type="image">
|
182
|
+
<file id="rf935xg1061_00_0002.jp2" mimetype="image/jp2" size="20209446">
|
183
|
+
<imageData height="7248" width="14787"/>
|
184
|
+
</file>
|
185
|
+
</resource></contentMetadata>'
|
186
|
+
sdb = sdb_for_content_md(m)
|
187
|
+
expect(sdb.file_ids).to match_array ['rf935xg1061_00_0001.jp2', 'rf935xg1061_00_0002.jp2']
|
188
|
+
end
|
189
|
+
end # contentMetadata type=map, resource type=image
|
190
|
+
context 'contentMetadata type=manuscript' do
|
191
|
+
it 'resource type=image' do
|
192
|
+
m = '<contentMetadata objectId="druid:my191bb7431" type="manuscript">
|
193
|
+
<resource id="manuscript-image-1" sequence="1" type="image">
|
194
|
+
<label>Front Outer Board</label>
|
195
|
+
<file format="JPEG2000" id="T0000001.jp2" mimetype="image/jp2" size="7553958">
|
196
|
+
<imageData height="4578" width="3442"/>
|
197
|
+
</file>
|
198
|
+
</resource>
|
199
|
+
<resource id="manuscript-image-343" sequence="343" type="image">
|
200
|
+
<label>Spine</label>
|
201
|
+
<file format="JPEG2000" id="T0000343.jp2" mimetype="image/jp2" size="1929355">
|
202
|
+
<imageData height="4611" width="986"/>
|
203
|
+
</file>
|
204
|
+
</resource>
|
205
|
+
</contentMetadata>'
|
206
|
+
sdb = sdb_for_content_md(m)
|
207
|
+
expect(sdb.file_ids).to match_array ['T0000001.jp2', 'T0000343.jp2']
|
208
|
+
end
|
209
|
+
it 'resource type=page should be ignored' do
|
210
|
+
m = '<contentMetadata objectId="druid:Bodley342" type="manuscript">
|
211
|
+
<resource type="page" sequence="1" id="image-1">
|
212
|
+
<label>1</label>
|
213
|
+
<file mimetype="image/jp2" format="JPEG2000" size="1319924" id="asn0001-M.jp2">
|
214
|
+
<imageData height="3466" width="2405"/>
|
215
|
+
</file>
|
216
|
+
</resource>
|
217
|
+
<resource type="page" sequence="453" id="image-453">
|
218
|
+
<label>453</label>
|
219
|
+
<file mimetype="image/jp2" format="JPEG2000" size="1457066" id="asn0452-M.jp2">
|
220
|
+
<imageData height="3431" width="2431"/>
|
221
|
+
</file>
|
222
|
+
</resource></contentMetadata>'
|
223
|
+
sdb = sdb_for_content_md(m)
|
224
|
+
expect(sdb.file_ids).to be_nil
|
225
|
+
end
|
226
|
+
end # contentMetadata type=manuscript
|
227
|
+
end # image display_type
|
228
|
+
|
229
|
+
it 'is nil for book display_type' do
|
230
|
+
m = '<contentMetadata type="book" objectId="xm901jg3836">
|
231
|
+
<resource type="image" sequence="1" id="xm901jg3836_1">
|
232
|
+
<label>Item 1</label>
|
233
|
+
<file id="xm901jg3836_00_0002.jp2" mimetype="image/jp2" size="1152852">
|
234
|
+
<imageData width="2091" height="2905"/>
|
235
|
+
</file>
|
236
|
+
</resource>
|
237
|
+
<resource type="image" sequence="608" id="xm901jg3836_608">
|
238
|
+
<label>Item 608</label>
|
239
|
+
<file id="xm901jg3836_00_0609.jp2" mimetype="image/jp2" size="1152297">
|
240
|
+
<imageData width="2090" height="2905"/>
|
241
|
+
</file>
|
242
|
+
</resource></contentMetadata>'
|
243
|
+
sdb = sdb_for_content_md(m)
|
244
|
+
expect(sdb.file_ids).to be_nil
|
245
|
+
end
|
246
|
+
it 'is id attrib of file elements for media display_type' do
|
247
|
+
m = '<contentMetadata objectId="jy496kh1727" type="media">
|
248
|
+
<resource sequence="1" id="jy496kh1727_1" type="audio">
|
249
|
+
<label>Tape 1, Pass 1</label>
|
250
|
+
<file id="jy496kh1727_sl.mp3" mimetype="audio/mpeg" size="57010677" />
|
251
|
+
</resource>
|
252
|
+
<resource sequence="2" id="jy496kh1727_2" type="image">
|
253
|
+
<label>Image of media (1 of 3)</label>
|
254
|
+
<file id="jy496kh1727_img_1.jp2" mimetype="image/jp2" size="1277821">
|
255
|
+
<imageData width="2659" height="2535"/>
|
256
|
+
</file>
|
257
|
+
</resource></contentMetadata>'
|
258
|
+
sdb = sdb_for_content_md(m)
|
259
|
+
expect(sdb.file_ids).to match_array ['jy496kh1727_sl.mp3', 'jy496kh1727_img_1.jp2']
|
260
|
+
end
|
261
|
+
it 'is nil if there are no <resource> elements in the contentMetadata' do
|
262
|
+
m = '<contentMetadata objectId="jy496kh1727" type="file"></contentMetadata>'
|
263
|
+
sdb = sdb_for_content_md(m)
|
264
|
+
expect(sdb.file_ids).to be_nil
|
265
|
+
end
|
266
|
+
it 'is nil if there are no <file> elements in the contentMetadata' do
|
267
|
+
m = '<contentMetadata objectId="jy496kh1727" type="file">
|
268
|
+
<resource sequence="1" id="jy496kh1727_1" type="file">
|
269
|
+
<label>Tape 1, Pass 1</label>
|
270
|
+
</resource>
|
271
|
+
<resource sequence="2" id="jy496kh1727_2" type="image">
|
272
|
+
<label>Image of media (1 of 3)</label>
|
273
|
+
</resource></contentMetadata>'
|
274
|
+
sdb = sdb_for_content_md(m)
|
275
|
+
expect(sdb.file_ids).to be_nil
|
276
|
+
end
|
277
|
+
it 'is nil if there are no id elements on file elements' do
|
278
|
+
m = "#{@content_md_start}<resource type='image'><file/></resource>#{@content_md_end}"
|
279
|
+
sdb = sdb_for_content_md(m)
|
280
|
+
expect(sdb.file_ids).to be_nil
|
281
|
+
end
|
282
|
+
|
283
|
+
# TODO: multiple file elements in a single resource element
|
284
|
+
end # file_ids
|
285
|
+
end # contentMetadata fields and methods
|
286
|
+
end
|