gdor-indexer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +31 -0
- data/.hound.yml +2 -0
- data/.rubocop.yml +3 -0
- data/.rubocop_todo.yml +131 -0
- data/.yardopts +3 -0
- data/Capfile +26 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +5 -0
- data/README.md +67 -0
- data/Rakefile +57 -0
- data/VERSION +1 -0
- data/bin/indexer +71 -0
- data/config/deploy.rb +31 -0
- data/config/deploy/dev.rb +41 -0
- data/config/deploy/fetcher.rb +6 -0
- data/config/deploy/prod.rb +41 -0
- data/config/deploy/stage.rb +41 -0
- data/gdor-indexer.gemspec +43 -0
- data/lib/gdor/indexer.rb +327 -0
- data/lib/gdor/indexer/mods_fields.rb +114 -0
- data/lib/gdor/indexer/nokogiri_xml_node_mixin.rb +42 -0
- data/lib/gdor/indexer/public_xml_fields.rb +81 -0
- data/lib/gdor/indexer/solr_doc_builder.rb +85 -0
- data/lib/gdor/indexer/solr_doc_hash.rb +112 -0
- data/lib/gdor/indexer/version.rb +5 -0
- data/spec/config/walters_integration_spec.yml +44 -0
- data/spec/spec_helper.rb +26 -0
- data/spec/unit/gdor_mods_fields_spec.rb +812 -0
- data/spec/unit/indexer_spec.rb +411 -0
- data/spec/unit/public_xml_fields_spec.rb +286 -0
- data/spec/unit/solr_doc_builder_spec.rb +128 -0
- data/spec/unit/solr_doc_hash_spec.rb +399 -0
- data/spec/vcr_cassettes/no_coll_druid_in_druid_array_call.yml +745 -0
- metadata +411 -0
@@ -0,0 +1,411 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe GDor::Indexer do
|
4
|
+
# One-time fixtures shared by the examples in this spec: the path to the test
# config, its parsed YAML, and minimal MODS / public XML in both raw-string
# and parsed (Nokogiri) form.
before(:all) do
  require 'yaml'
  @config_yml_path = File.join(File.dirname(__FILE__), '..', 'config', 'walters_integration_spec.yml')
  @yaml = YAML.load_file(@config_yml_path)

  @ns_decl = "xmlns='#{Mods::MODS_NS}'"
  @fake_druid = 'oo000oo0000'
  # Same bare druid as the 'druid:ww121ss5000' whitelist entry set in before(:each).
  @coll_druid_from_test_config = 'ww121ss5000'

  # Parse the raw strings instead of repeating the literals, so the string and
  # parsed fixtures cannot drift apart.
  @mods_xml = "<mods #{@ns_decl}><note>Indexer test</note></mods>"
  @ng_mods_xml = Nokogiri::XML(@mods_xml)
  # The id carries the 'druid:' prefix, matching the publicObject fixture used
  # by the public_xml_fields spec.
  @pub_xml = "<publicObject id='druid:#{@fake_druid}'></publicObject>"
  @ng_pub_xml = Nokogiri::XML(@pub_xml)
end
|
16
|
+
# Build a fresh indexer for every example, restricted to the single test
# collection, and stub out document adds on its Solr client.
before do
  @indexer = described_class.new(@config_yml_path) { |cfg| cfg.whitelist = ['druid:ww121ss5000'] }
  allow(@indexer.solr_client).to receive(:add)
end
|
22
|
+
|
23
|
+
# An item-level Harvestdor resource for @fake_druid whose metadata accessors
# are stubbed with the minimal fixtures; it belongs to no collections.
let(:resource) do
  Harvestdor::Indexer::Resource.new(double, @fake_druid).tap do |item|
    allow(item).to receive(:collections).and_return([])
    allow(item).to receive(:mods).and_return(Nokogiri::XML(@mods_xml))
    allow(item).to receive(:public_xml).and_return(Nokogiri::XML(@pub_xml))
    allow(item).to receive(:public_xml?).and_return(true)
    allow(item).to receive(:content_metadata).and_return(nil)
    allow(item).to receive(:collection?).and_return(false)
  end
end
|
33
|
+
|
34
|
+
# A collection-level Harvestdor resource for the test collection druid, with
# the same stubbed metadata as the item resource plus an empty object label.
let(:collection) do
  Harvestdor::Indexer::Resource.new(double, @coll_druid_from_test_config).tap do |coll|
    allow(coll).to receive(:collections).and_return([])
    allow(coll).to receive(:mods).and_return(Nokogiri::XML(@mods_xml))
    allow(coll).to receive(:public_xml).and_return(Nokogiri::XML(@pub_xml))
    allow(coll).to receive(:public_xml?).and_return(true)
    allow(coll).to receive(:content_metadata).and_return(nil)
    allow(coll).to receive(:identity_md_obj_label).and_return('')
    allow(coll).to receive(:collection?).and_return(true)
  end
end
|
45
|
+
|
46
|
+
# Logging a message must create the log file named by the harvestdor section
# of the test config.
context 'logging' do
  it 'writes the log file to the directory indicated by log_dir' do
    @indexer.logger.info('walters_integration_spec logging test message')

    harvestdor_cfg = @yaml['harvestdor']
    log_path = File.join(harvestdor_cfg['log_dir'], harvestdor_cfg['log_name'])
    expect(File).to exist(log_path)
  end
end
|
52
|
+
|
53
|
+
# #harvest_and_index: iterates the harvested resources, indexes each one,
# optionally commits to Solr, then logs and emails the results.
describe '#harvest_and_index' do
  before do
    # Default stubs: nothing is harvested, commits are no-ops, reporting is silent.
    allow(@indexer.harvestdor).to receive(:each_resource)
    allow(@indexer).to receive(:solr_client).and_return(double(commit!: nil))
    allow(@indexer).to receive(:log_results)
    allow(@indexer).to receive(:email_results)
  end

  it 'logs and email results' do
    expect(@indexer).to receive(:log_results)
    expect(@indexer).to receive(:email_results)

    @indexer.harvest_and_index
  end

  it 'indexs each resource' do
    # Minimal stand-in for the harvestdor object: yields the given items and
    # exposes a logger.
    fake_harvestdor_class = Class.new do
      def initialize(*items)
        @items = items
      end

      def each_resource(_opts = {})
        @items.each { |item| yield item }
      end

      def logger
        Logger.new(STDERR)
      end
    end
    allow(@indexer).to receive(:harvestdor).and_return(fake_harvestdor_class.new(collection, resource))

    expect(@indexer).to receive(:index).with(collection)
    expect(@indexer).to receive(:index).with(resource)

    @indexer.harvest_and_index
  end

  it 'sends a solr commit' do
    expect(@indexer.solr_client).to receive(:commit!)
    @indexer.harvest_and_index
  end

  it 'does not commit if nocommit is set' do
    expect(@indexer.solr_client).not_to receive(:commit!)
    @indexer.harvest_and_index(true)
  end
end
|
95
|
+
|
96
|
+
# #index dispatches on the kind of resource: collections and items are built
# by different document methods.
describe '#index' do
  it 'indexs collections as collections' do
    expect(@indexer).to receive(:collection_solr_document).with(collection)
    @indexer.index(collection)
  end

  it 'indexs other resources as items' do
    expect(@indexer).to receive(:item_solr_document).with(resource)
    @indexer.index(resource)
  end
end
|
107
|
+
|
108
|
+
# A failure inside #index must be logged, recorded against the druid, and
# still propagate to the caller.
describe '#index_with_exception_handling' do
  it 'capture,s log, and re-raise any exception thrown by the indexing process' do
    expect(@indexer).to receive(:index).with(resource).and_raise('xyz')
    expect(@indexer.logger).to receive(:error)

    expect { @indexer.index_with_exception_handling(resource) }.to raise_error(RuntimeError)
    expect(@indexer.druids_failed_to_ix).to include(resource.druid)
  end
end
|
116
|
+
|
117
|
+
# #item_solr_document: builds the Solr document hash for an item resource.
context '#item_solr_document' do
  context 'unmerged' do
    it 'calls Harvestdor::Indexer.solr_add' do
      solr_doc = @indexer.item_solr_document(resource)
      expect(solr_doc).to include(id: @fake_druid)
    end

    it 'calls validate_item' do
      expect_any_instance_of(GDor::Indexer::SolrDocHash).to receive(:validate_item).and_return([])
      @indexer.item_solr_document(resource)
    end

    it 'calls GDor::Indexer::SolrDocBuilder.validate_mods' do
      allow_any_instance_of(GDor::Indexer::SolrDocHash).to receive(:validate_item).and_return([])
      expect_any_instance_of(GDor::Indexer::SolrDocHash).to receive(:validate_mods).and_return([])
      @indexer.item_solr_document(resource)
    end

    it 'calls add_coll_info' do
      expect(@indexer).to receive(:add_coll_info)
      @indexer.item_solr_document(resource)
    end

    it 'has fields populated from the collection record' do
      # Replace the real builder with a double that hands back an empty doc hash,
      # so only the collection-derived fields are exercised.
      builder_hash = GDor::Indexer::SolrDocHash.new
      allow(builder_hash).to receive(:validate_mods).and_return([])
      sdb = double
      allow(sdb).to receive(:doc_hash).and_return(builder_hash)
      allow(sdb).to receive(:display_type)
      allow(sdb).to receive(:file_ids)
      allow(GDor::Indexer::SolrDocBuilder).to receive(:new).and_return(sdb)

      parent_coll = double(druid: 'foo', bare_druid: 'foo', identity_md_obj_label: 'bar')
      allow(resource).to receive(:collections).and_return([parent_coll])

      solr_doc = @indexer.item_solr_document(resource)
      expect(solr_doc).to include(druid: @fake_druid, collection: ['foo'], collection_with_title: ['foo-|-bar'])
    end

    it 'has fields populated from the MODS' do
      title = 'fake title in mods'
      titled_mods = Nokogiri::XML("<mods #{@ns_decl}><titleInfo><title>#{title}</title></titleInfo></mods>")
      allow(resource).to receive(:mods).and_return(titled_mods)

      solr_doc = @indexer.item_solr_document(resource)
      expect(solr_doc).to include(id: @fake_druid, title_display: title)
    end

    it 'populates url_fulltext field with purl page url' do
      solr_doc = @indexer.item_solr_document(resource)
      expect(solr_doc).to include(id: @fake_druid, url_fulltext: "#{@yaml['harvestdor']['purl']}/#{@fake_druid}")
    end

    it 'populates druid and access_facet fields' do
      solr_doc = @indexer.item_solr_document(resource)
      expect(solr_doc).to include(id: @fake_druid, druid: @fake_druid, access_facet: 'Online')
    end

    it 'populates display_type field by calling display_type method' do
      expect_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:display_type).and_return('foo')
      solr_doc = @indexer.item_solr_document(resource)
      expect(solr_doc).to include(id: @fake_druid, display_type: 'foo')
    end

    it 'populates file_id field by calling file_ids method' do
      expect_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:file_ids).at_least(1).times.and_return(['foo'])
      solr_doc = @indexer.item_solr_document(resource)
      expect(solr_doc).to include(id: @fake_druid, file_id: ['foo'])
    end

    it 'populates building_facet field with Stanford Digital Repository' do
      solr_doc = @indexer.item_solr_document(resource)
      expect(solr_doc).to include(id: @fake_druid, building_facet: 'Stanford Digital Repository')
    end
  end # unmerged item
end # item_solr_document
|
178
|
+
|
179
|
+
# #collection_solr_document: builds the Solr document hash for a collection
# resource. Several examples stub SolrDocBuilder#doc_hash with a bare
# SolrDocHash to skip the MODS-derived fields.
context '#collection_solr_document' do
  context 'unmerged' do
    it 'calls validate_collection' do
      doc_hash = GDor::Indexer::SolrDocHash.new
      allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(doc_hash) # speed up the test
      expect(doc_hash).to receive(:validate_collection).and_return([])
      # The return value is not inspected; the message expectation above is the assertion.
      @indexer.collection_solr_document collection
    end
    it 'calls GDor::Indexer::SolrDocBuilder.validate_mods' do
      doc_hash = GDor::Indexer::SolrDocHash.new
      allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(doc_hash) # speed up the test
      expect(doc_hash).to receive(:validate_mods).and_return([])
      # The return value is not inspected; the message expectation above is the assertion.
      @indexer.collection_solr_document collection
    end
    it 'populates druid and access_facet fields' do
      doc_hash = @indexer.collection_solr_document collection
      expect(doc_hash).to include druid: @coll_druid_from_test_config, access_facet: 'Online'
    end
    it 'populates url_fulltext field with purl page url' do
      doc_hash = @indexer.collection_solr_document collection
      expect(doc_hash).to include url_fulltext: "#{@yaml['harvestdor']['purl']}/#{@coll_druid_from_test_config}"
    end
    it "collection_type should be 'Digital Collection'" do
      allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new) # speed up the test

      doc_hash = @indexer.collection_solr_document collection
      expect(doc_hash).to include collection_type: 'Digital Collection'
    end
    context 'add format_main_ssim Archive/Manuscript' do
      it 'no other values' do
        allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new)

        doc_hash = @indexer.collection_solr_document collection
        expect(doc_hash).to include format_main_ssim: 'Archive/Manuscript'
      end
      it 'other values present' do
        allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new({ format_main_ssim: %w(Image Video) }))

        doc_hash = @indexer.collection_solr_document collection
        expect(doc_hash).to include format_main_ssim: ['Image', 'Video', 'Archive/Manuscript']
      end
      it 'already has values Archive/Manuscript' do
        allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new({ format_main_ssim: 'Archive/Manuscript' }))

        doc_hash = @indexer.collection_solr_document collection
        expect(doc_hash).to include format_main_ssim: ['Archive/Manuscript']
      end
    end
    it 'populates building_facet field with Stanford Digital Repository' do
      doc_hash = @indexer.collection_solr_document collection
      expect(doc_hash).to include building_facet: 'Stanford Digital Repository'
    end
  end # unmerged collection
end # collection_solr_document
|
233
|
+
|
234
|
+
# #add_coll_info: copies collection membership (druid and druid-|-title) into
# an item's doc hash; the examples at the end also check the display types
# reported per collection by #coll_display_types_from_items.
context '#add_coll_info and supporting methods' do
  before do
    @coll_druids_array = [collection]
  end

  it 'adds no collection field values to doc_hash if there are none' do
    item_hash = GDor::Indexer::SolrDocHash.new({})
    @indexer.add_coll_info(item_hash, nil)

    [:collection, :collection_with_title, :display_type].each do |field|
      expect(item_hash[field]).to be_nil
    end
  end

  context 'collection field' do
    it 'is added field to doc hash' do
      item_hash = GDor::Indexer::SolrDocHash.new({})
      @indexer.add_coll_info(item_hash, @coll_druids_array)
      expect(item_hash[:collection]).to match_array([@coll_druid_from_test_config])
    end

    it 'adds two values to doc_hash when object belongs to two collections' do
      coll_druid1 = 'oo111oo2222'
      coll_druid2 = 'oo333oo4444'
      first_coll = double(druid: coll_druid1, bare_druid: coll_druid1, public_xml: @ng_pub_xml, identity_md_obj_label: '')
      second_coll = double(druid: coll_druid2, bare_druid: coll_druid2, public_xml: @ng_pub_xml, identity_md_obj_label: '')

      item_hash = GDor::Indexer::SolrDocHash.new({})
      @indexer.add_coll_info(item_hash, [first_coll, second_coll])
      expect(item_hash[:collection]).to match_array([coll_druid1, coll_druid2])
    end
  end

  context 'collection_with_title field' do
    it 'is added to doc_hash' do
      coll_druid = 'oo000oo1234'
      titled_coll = double(druid: coll_druid, bare_druid: coll_druid, public_xml: @ng_pub_xml, identity_md_obj_label: 'zzz')

      item_hash = GDor::Indexer::SolrDocHash.new({})
      @indexer.add_coll_info(item_hash, [titled_coll])
      expect(item_hash[:collection_with_title]).to match_array(["#{coll_druid}-|-zzz"])
    end

    it 'adds two values to doc_hash when object belongs to two collections' do
      coll_druid1 = 'oo111oo2222'
      coll_druid2 = 'oo333oo4444'
      first_coll = double(druid: coll_druid1, bare_druid: coll_druid1, public_xml: @ng_pub_xml, identity_md_obj_label: 'foo')
      second_coll = double(druid: coll_druid2, bare_druid: coll_druid2, public_xml: @ng_pub_xml, identity_md_obj_label: 'bar')

      item_hash = GDor::Indexer::SolrDocHash.new({})
      @indexer.add_coll_info(item_hash, [first_coll, second_coll])
      expect(item_hash[:collection_with_title]).to match_array(["#{coll_druid1}-|-foo", "#{coll_druid2}-|-bar"])
    end
    # other tests show it uses druid when coll rec isn't merged
  end

  context '#coll_display_types_from_items' do
    before do
      @indexer.coll_display_types_from_items(collection)
    end

    it 'gets single item display_type for single collection (and no dups)' do
      allow(@indexer).to receive(:identity_md_obj_label)
      # Two separate items with the same display type must yield one value.
      first_item = GDor::Indexer::SolrDocHash.new({ display_type: 'image' })
      @indexer.add_coll_info(first_item, @coll_druids_array)
      second_item = GDor::Indexer::SolrDocHash.new({ display_type: 'image' })
      @indexer.add_coll_info(second_item, @coll_druids_array)

      expect(@indexer.coll_display_types_from_items(collection)).to match_array(['image'])
    end

    it 'gets multiple formats from multiple items for single collection' do
      allow(@indexer).to receive(:identity_md_obj_label)
      image_item = GDor::Indexer::SolrDocHash.new({ display_type: 'image' })
      @indexer.add_coll_info(image_item, @coll_druids_array)
      file_item = GDor::Indexer::SolrDocHash.new({ display_type: 'file' })
      @indexer.add_coll_info(file_item, @coll_druids_array)

      expect(@indexer.coll_display_types_from_items(collection)).to match_array(%w(image file))
    end
  end # coll_display_types_from_items
end # add_coll_info
|
301
|
+
|
302
|
+
# #num_found_in_solr: sums the numFound of the collection-record query and the
# item query (1 + 265 in the canned responses below).
context '#num_found_in_solr' do
  before do
    coll_doc = { 'id' => 'dm212rn7381', 'url_fulltext' => ['https://purl.stanford.edu/dm212rn7381'] }
    @unmerged_collection_response = { 'response' => { 'numFound' => '1', 'docs' => [coll_doc] } }
    @item_response = { 'response' => { 'numFound' => '265', 'docs' => [{ 'id' => 'dm212rn7381' }] } }
  end

  it 'counts the items and the collection object in the solr index after indexing' do
    # Answer the id-filtered query with the collection response, anything else
    # with the item response.
    allow(@indexer.solr_client.client).to receive(:get) do |_wt, params|
      asks_for_coll_record = params[:params][:fq].include?('id:"dm212rn7381"')
      asks_for_coll_record ? @unmerged_collection_response : @item_response
    end

    expect(@indexer.num_found_in_solr(collection: 'dm212rn7381')).to eq(266)
  end
end # num_found_in_solr
|
319
|
+
|
320
|
+
# #email_report_body: text of the notification email. The indexer is stubbed
# to report a single collection ('testcoll title') with three items and 500
# records found in Solr.
context '#email_report_body' do
  before :each do
    @indexer.config.notification = 'notification-list@example.com'
    allow(@indexer).to receive(:num_found_in_solr).and_return(500)
    allow(@indexer.harvestdor).to receive(:resources).and_return([collection])
    allow(collection).to receive(:items).and_return([1, 2, 3])
    allow(collection).to receive(:identity_md_obj_label).and_return('testcoll title')
  end

  subject do
    @indexer.email_report_body
  end

  # Regexp arguments are parenthesised so Ruby does not have to guess whether
  # the leading `/` starts a literal or is the division operator.
  it 'email body includes coll id' do
    expect(subject).to match(/testcoll indexed coll record is: ww121ss5000/)
  end

  it 'email body includes coll title' do
    expect(subject).to match(/coll title: testcoll title/)
  end

  it 'email body includes failed to index druids' do
    @indexer.instance_variable_set(:@druids_failed_to_ix, %w(a b))
    expect(subject).to match(/records that may have failed to index \(merged recs as druids, not ckeys\): \na\nb\n\n/)
  end

  it 'email body include validation messages' do
    @indexer.instance_variable_set(:@validation_messages, ['this is a validation message'])
    expect(subject).to match(/this is a validation message/)
  end

  it 'email includes reference to full log' do
    # %r{} avoids escaping every path separator; the pattern is unchanged.
    expect(subject).to match(%r{full log is at gdor_indexer/shared/spec/test_logs/testcoll\.log})
  end
end
|
355
|
+
|
356
|
+
# #email_results: hands the report to #send_email. Each example inspects one
# aspect of the (recipient, options) pair passed along.
describe '#email_results' do
  before :each do
    @indexer.config.notification = 'notification-list@example.com'
    allow(@indexer).to receive(:send_email)
    allow(@indexer).to receive(:email_report_body).and_return('Report Body')
  end

  it 'has an appropriate subject' do
    expect(@indexer).to receive(:send_email) do |_to, opts|
      # Parenthesised so the regexp literal is not ambiguous with division.
      expect(opts[:subject]).to match(/is finished/)
    end

    @indexer.email_results
  end

  it 'sends the email to the notification list' do
    expect(@indexer).to receive(:send_email) do |to, _opts|
      expect(to).to eq @indexer.config.notification
    end

    @indexer.email_results
  end

  it 'has the report body' do
    expect(@indexer).to receive(:send_email) do |_to, opts|
      expect(opts[:body]).to eq 'Report Body'
    end

    @indexer.email_results
  end
end
|
387
|
+
|
388
|
+
# #send_email: builds a Mail::Message and delivers it. Delivery is intercepted
# so the message can be inspected instead of being sent.
describe '#send_email' do
  it 'sends an email to the right list' do
    recipient = 'notification-list@example.com'
    expect_any_instance_of(Mail::Message).to receive(:deliver!) do |message|
      expect(message.to).to match_array(['notification-list@example.com'])
    end

    @indexer.send_email(recipient, {})
  end

  it 'has the appropriate options set' do
    mail_opts = { from: 'rspec', subject: 'Subject', body: 'Body' }
    expect_any_instance_of(Mail::Message).to receive(:deliver!) do |message|
      expect(message.subject).to eq('Subject')
      expect(message.from).to match_array(['rspec'])
      expect(message.body).to eq('Body')
    end

    @indexer.send_email('notification-list@example.com', mail_opts)
  end
end
|
405
|
+
|
406
|
+
# context "skip heartbeat" do
|
407
|
+
# it "allows me to use a fake url for dor-fetcher-client" do
|
408
|
+
# expect {GDor::Indexer.new(@config_yml_path)}.not_to raise_error
|
409
|
+
# end
|
410
|
+
# end
|
411
|
+
end
|
@@ -0,0 +1,286 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe GDor::Indexer::PublicXmlFields do
|
4
|
+
# Fixtures shared by every example: a fake druid, a minimal MODS record and an
# empty publicObject document, all as raw XML strings.
before :all do
  @ns_decl = "xmlns='#{Mods::MODS_NS}'"
  @mods_xml = "<mods #{@ns_decl}><note>public_xml_fields tests</note></mods>"
  @fake_druid = 'oo000oo0000'
  @empty_pub_xml = "<publicObject id='druid:#{@fake_druid}'></publicObject>"
end
|
10
|
+
|
11
|
+
# Logger writing to an in-memory buffer, so examples produce no log output.
let(:logger) { Logger.new(StringIO.new) }
|
14
|
+
|
15
|
+
# Builds a SolrDocBuilder for a resource whose public XML is the given string.
#
# @param m [String] public XML, parsed with Nokogiri before being stubbed in
# @return [GDor::Indexer::SolrDocBuilder] builder wrapping the stubbed resource
def sdb_for_pub_xml(m)
  resource = Harvestdor::Indexer::Resource.new(double, @fake_druid)
  allow(resource).to receive(:public_xml).and_return(Nokogiri::XML(m))
  # Return a parsed document, like the public_xml stub above, rather than the
  # raw XML string.
  allow(resource).to receive(:mods).and_return(Nokogiri::XML(@mods_xml))
  GDor::Indexer::SolrDocBuilder.new(resource, logger)
end
|
21
|
+
|
22
|
+
# Builds a SolrDocBuilder for a resource whose contentMetadata is the given
# string; public XML and MODS are stubbed with the minimal shared fixtures.
#
# @param m [String] contentMetadata XML; its root element is what the resource returns
# @return [GDor::Indexer::SolrDocBuilder] builder wrapping the stubbed resource
def sdb_for_content_md(m)
  resource = Harvestdor::Indexer::Resource.new(double, @fake_druid)
  allow(resource).to receive(:content_metadata).and_return(Nokogiri::XML(m).root)
  # Stub parsed documents (not raw strings), consistent with sdb_for_pub_xml,
  # which also parses the public XML it is given.
  allow(resource).to receive(:public_xml).and_return(Nokogiri::XML(@empty_pub_xml))
  allow(resource).to receive(:mods).and_return(Nokogiri::XML(@mods_xml))
  GDor::Indexer::SolrDocBuilder.new(resource, logger)
end
|
29
|
+
|
30
|
+
# NOTE:
|
31
|
+
# "Doubles, stubs, and message expectations are all cleaned out after each example."
|
32
|
+
# per https://www.relishapp.com/rspec/rspec-mocks/docs/scope
|
33
|
+
|
34
|
+
context 'contentMetadata fields and methods' do
|
35
|
+
# contentMetadata fixtures: opening/closing tags for hand-built documents, plus
# a complete 'image' contentMetadata element embedded in a publicObject.
before :all do
  @cntnt_md_type = 'image'
  @content_md_start = "<contentMetadata objectId='#{@fake_druid}'>"
  @content_md_end = '</contentMetadata>'
  @cntnt_md_xml = "<contentMetadata type='#{@cntnt_md_type}' objectId='#{@fake_druid}'>#{@content_md_end}"
  @pub_xml = "<publicObject id='druid:#{@fake_druid}'>#{@cntnt_md_xml}</publicObject>"
  @ng_pub_xml = Nokogiri::XML(@pub_xml)
end
|
43
|
+
|
44
|
+
# dor_content_type is invoked through #send and reads the type attribute of
# the <contentMetadata> element.
context 'dor_content_type' do
  it 'is the value of the type attribute on <contentMetadata> element' do
    content_type = 'foo'
    builder = sdb_for_content_md("<contentMetadata type='#{content_type}'>#{@content_md_end}")
    expect(builder.send(:dor_content_type)).to eq(content_type)
  end

  it 'logs an error message if there is no content type' do
    builder = sdb_for_content_md("#{@content_md_start}#{@content_md_end}")
    expected_msg = "#{@fake_druid} has no DOR content type (<contentMetadata> element may be missing type attribute)"
    expect(builder.logger).to receive(:error).with(expected_msg)
    builder.send(:dor_content_type)
  end
end
|
58
|
+
|
59
|
+
# display_type is derived from dor_content_type; the table below lists each
# recognised DOR content type with the display type expected for it.
context 'display_type' do
  let(:sdb) { sdb_for_pub_xml @empty_pub_xml }

  {
    'image' => 'image',
    'manuscript' => 'image',
    'map' => 'image',
    'media' => 'file',
    'book' => 'book'
  }.each do |dor_type, expected_display|
    it "'#{expected_display}' for dor_content_type '#{dor_type}'" do
      allow(sdb).to receive(:dor_content_type).and_return(dor_type)
      expect(sdb.display_type).to eq(expected_display)
    end
  end

  it "'file' for unrecognized dor_content_type" do
    allow(sdb).to receive(:dor_content_type).and_return('foo')
    expect(sdb.display_type).to eq('file')
  end
end # display_type
|
89
|
+
|
90
|
+
context '#file_ids' do
|
91
|
+
context 'file display_type' do
|
92
|
+
context 'contentMetadata type=file, resource type=file' do
|
93
|
+
it 'is id attrib of file element in single resource element with type=file' do
|
94
|
+
m = '<contentMetadata type="file" objectId="xh812jt9999">
|
95
|
+
<resource type="file" sequence="1" id="xh812jt9999_1">
|
96
|
+
<label>John A. Blume Earthquake Engineering Center Technical Report 180</label>
|
97
|
+
<file id="TR180_Shahi.pdf" mimetype="application/pdf" size="4949212" />
|
98
|
+
</resource></contentMetadata>'
|
99
|
+
sdb = sdb_for_content_md(m)
|
100
|
+
expect(sdb.file_ids).to match_array ['TR180_Shahi.pdf']
|
101
|
+
end
|
102
|
+
it 'is id attrib of file elements in multiple resource elements with type=file' do
|
103
|
+
m = '<contentMetadata objectId="jt108hm9275" type="file">
|
104
|
+
<resource id="jt108hm9275_1" sequence="1" type="file">
|
105
|
+
<label>Access to Energy newsletter, 1973-1994</label>
|
106
|
+
<file id="ATE.PDF" mimetype="application/pdf" size="16297305" />
|
107
|
+
</resource>
|
108
|
+
<resource id="jt108hm9275_8" sequence="8" type="file">
|
109
|
+
<label>Computer Forum Festschrift for Edward Feigenbaum, 2006 (part 6)</label>
|
110
|
+
<file id="SC0524_2013-047_b8_811.mp4" mimetype="video/mp4" size="860912776" />
|
111
|
+
</resource>
|
112
|
+
<resource id="jt108hm9275_9" sequence="9" type="file">
|
113
|
+
<label>Stanford AI Lab (SAILDART) files</label>
|
114
|
+
<file id="SAILDART.zip" mimetype="application/zip" size="472230479" />
|
115
|
+
</resource>
|
116
|
+
<resource id="jt108hm9275_10" sequence="10" type="file">
|
117
|
+
<label>WTDS Interview: Douglas C. Engelbart, 2006 Apr 13</label>
|
118
|
+
<file id="DougEngelbart041306.wav" mimetype="audio/x-wav" size="273705910" />
|
119
|
+
</resource></contentMetadata>'
|
120
|
+
sdb = sdb_for_content_md(m)
|
121
|
+
expect(sdb.file_ids).to match_array ['ATE.PDF', 'SC0524_2013-047_b8_811.mp4', 'SAILDART.zip', 'DougEngelbart041306.wav']
|
122
|
+
end
|
123
|
+
end # contentMetadata type=file, resource type=file
|
124
|
+
it 'contentMetadata type=geo, resource type=object' do
|
125
|
+
m = '<contentMetadata objectId="druid:qk786js7484" type="geo">
|
126
|
+
<resource id="druid:qk786js7484_1" sequence="1" type="object">
|
127
|
+
<label>Data</label>
|
128
|
+
<file id="data.zip" mimetype="application/zip" role="master" size="10776648" />
|
129
|
+
</resource>
|
130
|
+
<resource id="druid:qk786js7484_2" sequence="2" type="preview">
|
131
|
+
<label>Preview</label>
|
132
|
+
<file id="preview.jpg" mimetype="image/jpeg" role="master" size="140661">
|
133
|
+
<imageData height="846" width="919"/>
|
134
|
+
</file>
|
135
|
+
</resource></contentMetadata>'
|
136
|
+
sdb = sdb_for_content_md(m)
|
137
|
+
expect(sdb.file_ids).to match_array ['data.zip', 'preview.jpg']
|
138
|
+
end
|
139
|
+
|
140
|
+
# FIXME: non-file resource types
|
141
|
+
end # file display_type
|
142
|
+
context 'image display_type' do
|
143
|
+
context 'contentMetadata type=image' do
|
144
|
+
it 'resource type=image should be id attrib of file elements' do
|
145
|
+
m = '<contentMetadata objectId="rg759wj0953" type="image">
|
146
|
+
<resource id="rg759wj0953_1" sequence="1" type="image">
|
147
|
+
<label>Image 1</label>
|
148
|
+
<file id="rg759wj0953_00_0003.jp2" mimetype="image/jp2" size="13248250">
|
149
|
+
<imageData width="6254" height="11236"/>
|
150
|
+
</file>
|
151
|
+
</resource>
|
152
|
+
<resource id="rg759wj0953_2" sequence="2" type="image">
|
153
|
+
<label>Image 2</label>
|
154
|
+
<file id="rg759wj0953_00_00_0001.jp2" mimetype="image/jp2" size="8484503">
|
155
|
+
<imageData width="7266" height="6188"/>
|
156
|
+
</file>
|
157
|
+
</resource></contentMetadata>'
|
158
|
+
sdb = sdb_for_content_md m
|
159
|
+
expect(sdb.file_ids).to match_array ['rg759wj0953_00_0003.jp2', 'rg759wj0953_00_00_0001.jp2']
|
160
|
+
end
|
161
|
+
it 'resource type=object should be ignored' do
|
162
|
+
m = '<contentMetadata objectId="ny981gz0831" type="image">
|
163
|
+
<resource id="ny981gz0831_1" sequence="1" type="object">
|
164
|
+
<label>File 1</label>
|
165
|
+
<file id="da39a3ee5e6b4b0d3255bfef95601890afd80709.dderr" mimetype="application/x-symlink" size="26634" />
|
166
|
+
<file id="da39a3ee5e6b4b0d3255bfef95601890afd80709.img" mimetype="application/x-symlink" size="368640" />
|
167
|
+
<file id="da39a3ee5e6b4b0d3255bfef95601890afd80709.img.sha" mimetype="application/x-symlink" size="173" />
|
168
|
+
</resource></contentMetadata>'
|
169
|
+
sdb = sdb_for_content_md(m)
|
170
|
+
expect(sdb.file_ids).to be_nil
|
171
|
+
end
|
172
|
+
end # contentMetadata type=image
|
173
|
+
context 'contentMetadata type=map, resource type=image' do
|
174
|
+
it 'is id attrib of file elements' do
|
175
|
+
m = '<contentMetadata objectId="druid:rf935xg1061" type="map">
|
176
|
+
<resource id="0001" sequence="1" type="image">
|
177
|
+
<file id="rf935xg1061_00_0001.jp2" mimetype="image/jp2" size="20204910">
|
178
|
+
<imageData height="7248" width="14787"/>
|
179
|
+
</file>
|
180
|
+
</resource>
|
181
|
+
<resource id="0002" sequence="2" type="image">
|
182
|
+
<file id="rf935xg1061_00_0002.jp2" mimetype="image/jp2" size="20209446">
|
183
|
+
<imageData height="7248" width="14787"/>
|
184
|
+
</file>
|
185
|
+
</resource></contentMetadata>'
|
186
|
+
sdb = sdb_for_content_md(m)
|
187
|
+
expect(sdb.file_ids).to match_array ['rf935xg1061_00_0001.jp2', 'rf935xg1061_00_0002.jp2']
|
188
|
+
end
|
189
|
+
end # contentMetadata type=map, resource type=image
|
190
|
+
context 'contentMetadata type=manuscript' do
|
191
|
+
it 'resource type=image' do
|
192
|
+
m = '<contentMetadata objectId="druid:my191bb7431" type="manuscript">
|
193
|
+
<resource id="manuscript-image-1" sequence="1" type="image">
|
194
|
+
<label>Front Outer Board</label>
|
195
|
+
<file format="JPEG2000" id="T0000001.jp2" mimetype="image/jp2" size="7553958">
|
196
|
+
<imageData height="4578" width="3442"/>
|
197
|
+
</file>
|
198
|
+
</resource>
|
199
|
+
<resource id="manuscript-image-343" sequence="343" type="image">
|
200
|
+
<label>Spine</label>
|
201
|
+
<file format="JPEG2000" id="T0000343.jp2" mimetype="image/jp2" size="1929355">
|
202
|
+
<imageData height="4611" width="986"/>
|
203
|
+
</file>
|
204
|
+
</resource>
|
205
|
+
</contentMetadata>'
|
206
|
+
sdb = sdb_for_content_md(m)
|
207
|
+
expect(sdb.file_ids).to match_array ['T0000001.jp2', 'T0000343.jp2']
|
208
|
+
end
|
209
|
+
it 'resource type=page should be ignored' do
|
210
|
+
m = '<contentMetadata objectId="druid:Bodley342" type="manuscript">
|
211
|
+
<resource type="page" sequence="1" id="image-1">
|
212
|
+
<label>1</label>
|
213
|
+
<file mimetype="image/jp2" format="JPEG2000" size="1319924" id="asn0001-M.jp2">
|
214
|
+
<imageData height="3466" width="2405"/>
|
215
|
+
</file>
|
216
|
+
</resource>
|
217
|
+
<resource type="page" sequence="453" id="image-453">
|
218
|
+
<label>453</label>
|
219
|
+
<file mimetype="image/jp2" format="JPEG2000" size="1457066" id="asn0452-M.jp2">
|
220
|
+
<imageData height="3431" width="2431"/>
|
221
|
+
</file>
|
222
|
+
</resource></contentMetadata>'
|
223
|
+
sdb = sdb_for_content_md(m)
|
224
|
+
expect(sdb.file_ids).to be_nil
|
225
|
+
end
|
226
|
+
end # contentMetadata type=manuscript
|
227
|
+
end # image display_type
|
228
|
+
|
229
|
+
it 'is nil for book display_type' do
|
230
|
+
m = '<contentMetadata type="book" objectId="xm901jg3836">
|
231
|
+
<resource type="image" sequence="1" id="xm901jg3836_1">
|
232
|
+
<label>Item 1</label>
|
233
|
+
<file id="xm901jg3836_00_0002.jp2" mimetype="image/jp2" size="1152852">
|
234
|
+
<imageData width="2091" height="2905"/>
|
235
|
+
</file>
|
236
|
+
</resource>
|
237
|
+
<resource type="image" sequence="608" id="xm901jg3836_608">
|
238
|
+
<label>Item 608</label>
|
239
|
+
<file id="xm901jg3836_00_0609.jp2" mimetype="image/jp2" size="1152297">
|
240
|
+
<imageData width="2090" height="2905"/>
|
241
|
+
</file>
|
242
|
+
</resource></contentMetadata>'
|
243
|
+
sdb = sdb_for_content_md(m)
|
244
|
+
expect(sdb.file_ids).to be_nil
|
245
|
+
end
|
246
|
+
it 'is id attrib of file elements for media display_type' do
|
247
|
+
m = '<contentMetadata objectId="jy496kh1727" type="media">
|
248
|
+
<resource sequence="1" id="jy496kh1727_1" type="audio">
|
249
|
+
<label>Tape 1, Pass 1</label>
|
250
|
+
<file id="jy496kh1727_sl.mp3" mimetype="audio/mpeg" size="57010677" />
|
251
|
+
</resource>
|
252
|
+
<resource sequence="2" id="jy496kh1727_2" type="image">
|
253
|
+
<label>Image of media (1 of 3)</label>
|
254
|
+
<file id="jy496kh1727_img_1.jp2" mimetype="image/jp2" size="1277821">
|
255
|
+
<imageData width="2659" height="2535"/>
|
256
|
+
</file>
|
257
|
+
</resource></contentMetadata>'
|
258
|
+
sdb = sdb_for_content_md(m)
|
259
|
+
expect(sdb.file_ids).to match_array ['jy496kh1727_sl.mp3', 'jy496kh1727_img_1.jp2']
|
260
|
+
end
|
261
|
+
it 'is nil if there are no <resource> elements in the contentMetadata' do
|
262
|
+
m = '<contentMetadata objectId="jy496kh1727" type="file"></contentMetadata>'
|
263
|
+
sdb = sdb_for_content_md(m)
|
264
|
+
expect(sdb.file_ids).to be_nil
|
265
|
+
end
|
266
|
+
it 'is nil if there are no <file> elements in the contentMetadata' do
|
267
|
+
m = '<contentMetadata objectId="jy496kh1727" type="file">
|
268
|
+
<resource sequence="1" id="jy496kh1727_1" type="file">
|
269
|
+
<label>Tape 1, Pass 1</label>
|
270
|
+
</resource>
|
271
|
+
<resource sequence="2" id="jy496kh1727_2" type="image">
|
272
|
+
<label>Image of media (1 of 3)</label>
|
273
|
+
</resource></contentMetadata>'
|
274
|
+
sdb = sdb_for_content_md(m)
|
275
|
+
expect(sdb.file_ids).to be_nil
|
276
|
+
end
|
277
|
+
it 'is nil if there are no id elements on file elements' do
|
278
|
+
m = "#{@content_md_start}<resource type='image'><file/></resource>#{@content_md_end}"
|
279
|
+
sdb = sdb_for_content_md(m)
|
280
|
+
expect(sdb.file_ids).to be_nil
|
281
|
+
end
|
282
|
+
|
283
|
+
# TODO: multiple file elements in a single resource element
|
284
|
+
end # file_ids
|
285
|
+
end # contentMetadata fields and methods
|
286
|
+
end
|