RubyGems - gdor-indexer - Versions diffs - 0.1.0 - Mend

gdor-indexer 0.1.0

Files changed (35) hide show

checksums.yaml +7 -0
data/.gitignore +31 -0
data/.hound.yml +2 -0
data/.rubocop.yml +3 -0
data/.rubocop_todo.yml +131 -0
data/.yardopts +3 -0
data/Capfile +26 -0
data/Gemfile +12 -0
data/LICENSE.txt +5 -0
data/README.md +67 -0
data/Rakefile +57 -0
data/VERSION +1 -0
data/bin/indexer +71 -0
data/config/deploy.rb +31 -0
data/config/deploy/dev.rb +41 -0
data/config/deploy/fetcher.rb +6 -0
data/config/deploy/prod.rb +41 -0
data/config/deploy/stage.rb +41 -0
data/gdor-indexer.gemspec +43 -0
data/lib/gdor/indexer.rb +327 -0
data/lib/gdor/indexer/mods_fields.rb +114 -0
data/lib/gdor/indexer/nokogiri_xml_node_mixin.rb +42 -0
data/lib/gdor/indexer/public_xml_fields.rb +81 -0
data/lib/gdor/indexer/solr_doc_builder.rb +85 -0
data/lib/gdor/indexer/solr_doc_hash.rb +112 -0
data/lib/gdor/indexer/version.rb +5 -0
data/spec/config/walters_integration_spec.yml +44 -0
data/spec/spec_helper.rb +26 -0
data/spec/unit/gdor_mods_fields_spec.rb +812 -0
data/spec/unit/indexer_spec.rb +411 -0
data/spec/unit/public_xml_fields_spec.rb +286 -0
data/spec/unit/solr_doc_builder_spec.rb +128 -0
data/spec/unit/solr_doc_hash_spec.rb +399 -0
data/spec/vcr_cassettes/no_coll_druid_in_druid_array_call.yml +745 -0
metadata +411 -0

data/spec/unit/indexer_spec.rb ADDED Viewed

@@ -0,0 +1,411 @@
+require 'spec_helper'
+describe GDor::Indexer do
+  before(:all) do
+    @config_yml_path = File.join(File.dirname(__FILE__), '..', 'config', 'walters_integration_spec.yml')
+    require 'yaml'
+    @yaml = YAML.load_file(@config_yml_path)
+    @ns_decl = "xmlns='#{Mods::MODS_NS}'"
+    @fake_druid = 'oo000oo0000'
+    @coll_druid_from_test_config = 'ww121ss5000'
+    @mods_xml = "<mods #{@ns_decl}><note>Indexer test</note></mods>"
+    @ng_mods_xml = Nokogiri::XML("<mods #{@ns_decl}><note>Indexer test</note></mods>")
+    @pub_xml = "<publicObject id='druid#{@fake_druid}'></publicObject>"
+    @ng_pub_xml = Nokogiri::XML("<publicObject id='druid#{@fake_druid}'></publicObject>")
+  end
+  before(:each) do
+    @indexer = described_class.new(@config_yml_path) do |config|
+      config.whitelist = ['druid:ww121ss5000']
+    end
+    allow(@indexer.solr_client).to receive(:add)
+  end
+  let :resource do
+    r = Harvestdor::Indexer::Resource.new(double, @fake_druid)
+    allow(r).to receive(:collections).and_return []
+    allow(r).to receive(:mods).and_return Nokogiri::XML(@mods_xml)
+    allow(r).to receive(:public_xml).and_return Nokogiri::XML(@pub_xml)
+    allow(r).to receive(:public_xml?).and_return true
+    allow(r).to receive(:content_metadata).and_return nil
+    allow(r).to receive(:collection?).and_return false
+    r
+  end
+  let :collection do
+    r = Harvestdor::Indexer::Resource.new(double, @coll_druid_from_test_config)
+    allow(r).to receive(:collections).and_return []
+    allow(r).to receive(:mods).and_return Nokogiri::XML(@mods_xml)
+    allow(r).to receive(:public_xml).and_return Nokogiri::XML(@pub_xml)
+    allow(r).to receive(:public_xml?).and_return true
+    allow(r).to receive(:content_metadata).and_return nil
+    allow(r).to receive(:identity_md_obj_label).and_return ''
+    allow(r).to receive(:collection?).and_return true
+    r
+  end
+  context 'logging' do
+    it 'writes the log file to the directory indicated by log_dir' do
+      @indexer.logger.info('walters_integration_spec logging test message')
+      expect(File).to exist(File.join(@yaml['harvestdor']['log_dir'], @yaml['harvestdor']['log_name']))
+    end
+  end
+  describe '#harvest_and_index' do
+    before :each do
+      allow(@indexer.harvestdor).to receive(:each_resource)
+      allow(@indexer).to receive(:solr_client).and_return(double(commit!: nil))
+      allow(@indexer).to receive(:log_results)
+      allow(@indexer).to receive(:email_results)
+    end
+    it 'logs and email results' do
+      expect(@indexer).to receive(:log_results)
+      expect(@indexer).to receive(:email_results)
+      @indexer.harvest_and_index
+    end
+    it 'indexs each resource' do
+      allow(@indexer).to receive(:harvestdor).and_return(Class.new do
+        def initialize(*items)
+          @items = items
+        end
+        def each_resource(_opts = {})
+          @items.each { |x| yield x }
+        end
+        def logger
+          Logger.new(STDERR)
+        end
+      end.new(collection, resource))
+      expect(@indexer).to receive(:index).with(collection)
+      expect(@indexer).to receive(:index).with(resource)
+      @indexer.harvest_and_index
+    end
+    it 'sends a solr commit' do
+      expect(@indexer.solr_client).to receive(:commit!)
+      @indexer.harvest_and_index
+    end
+    it 'does not commit if nocommit is set' do
+      expect(@indexer.solr_client).to_not receive(:commit!)
+      @indexer.harvest_and_index(true)
+    end
+  end
+  describe '#index' do
+    it 'indexs collections as collections' do
+      expect(@indexer).to receive(:collection_solr_document).with(collection)
+      @indexer.index collection
+    end
+    it 'indexs other resources as items' do
+      expect(@indexer).to receive(:item_solr_document).with(resource)
+      @indexer.index resource
+    end
+  end
+  describe '#index_with_exception_handling' do
+    it 'captures, logs, and re-raises any exception thrown by the indexing process' do
+      # Raising a bare String from the stub produces a RuntimeError.
+      expect(@indexer).to receive(:index).with(resource).and_raise 'xyz'
+      expect(@indexer.logger).to receive(:error)
+      expect { @indexer.index_with_exception_handling(resource) }.to raise_error RuntimeError
+      # The failing druid must be recorded for the final report.
+      expect(@indexer.druids_failed_to_ix).to include resource.druid
+    end
+  end
+  context '#item_solr_document' do
+    context 'unmerged' do
+      it 'calls Harvestdor::Indexer.solr_add' do
+        doc_hash = @indexer.item_solr_document(resource)
+        expect(doc_hash).to include id: @fake_druid
+      end
+      it 'calls validate_item' do
+        expect_any_instance_of(GDor::Indexer::SolrDocHash).to receive(:validate_item).and_return([])
+        @indexer.item_solr_document resource
+      end
+      it 'calls GDor::Indexer::SolrDocBuilder.validate_mods' do
+        allow_any_instance_of(GDor::Indexer::SolrDocHash).to receive(:validate_item).and_return([])
+        expect_any_instance_of(GDor::Indexer::SolrDocHash).to receive(:validate_mods).and_return([])
+        @indexer.item_solr_document resource
+      end
+      it 'calls add_coll_info' do
+        expect(@indexer).to receive(:add_coll_info)
+        @indexer.item_solr_document resource
+      end
+      it 'has fields populated from the collection record' do
+        sdb = double
+        allow(sdb).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new)
+        allow(sdb).to receive(:display_type)
+        allow(sdb).to receive(:file_ids)
+        allow(sdb.doc_hash).to receive(:validate_mods).and_return([])
+        allow(GDor::Indexer::SolrDocBuilder).to receive(:new).and_return(sdb)
+        allow(resource).to receive(:collections).and_return([double(druid: 'foo', bare_druid: 'foo', identity_md_obj_label: 'bar')])
+        doc_hash = @indexer.item_solr_document resource
+        expect(doc_hash).to include druid: @fake_druid, collection: ['foo'], collection_with_title: ['foo-|-bar']
+      end
+      it 'has fields populated from the MODS' do
+        title = 'fake title in mods'
+        ng_mods = Nokogiri::XML("<mods #{@ns_decl}><titleInfo><title>#{title}</title></titleInfo></mods>")
+        allow(resource).to receive(:mods).and_return(ng_mods)
+        doc_hash = @indexer.item_solr_document resource
+        expect(doc_hash).to include id: @fake_druid, title_display: title
+      end
+      it 'populates url_fulltext field with purl page url' do
+        doc_hash = @indexer.item_solr_document resource
+        expect(doc_hash).to include id: @fake_druid, url_fulltext: "#{@yaml['harvestdor']['purl']}/#{@fake_druid}"
+      end
+      it 'populates druid and access_facet fields' do
+        doc_hash = @indexer.item_solr_document resource
+        expect(doc_hash).to include id: @fake_druid, druid: @fake_druid, access_facet: 'Online'
+      end
+      it 'populates display_type field by calling display_type method' do
+        expect_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:display_type).and_return('foo')
+        doc_hash = @indexer.item_solr_document resource
+        expect(doc_hash).to include id: @fake_druid, display_type: 'foo'
+      end
+      it 'populates file_id field by calling file_ids method' do
+        expect_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:file_ids).at_least(1).times.and_return(['foo'])
+        doc_hash = @indexer.item_solr_document resource
+        expect(doc_hash).to include id: @fake_druid, file_id: ['foo']
+      end
+      it 'populates building_facet field with Stanford Digital Repository' do
+        doc_hash = @indexer.item_solr_document resource
+        expect(doc_hash).to include id: @fake_druid, building_facet: 'Stanford Digital Repository'
+      end
+    end # unmerged item
+  end # item_solr_document
+  context '#collection_solr_document' do
+    context 'unmerged' do
+      it 'calls validate_collection' do
+        doc_hash = GDor::Indexer::SolrDocHash.new
+        allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(doc_hash) # speed up the test
+        expect(doc_hash).to receive(:validate_collection).and_return([])
+        doc_hash = @indexer.collection_solr_document collection
+      end
+      it 'calls GDor::Indexer::SolrDocBuilder.validate_mods' do
+        doc_hash = GDor::Indexer::SolrDocHash.new
+        allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(doc_hash) # speed up the test
+        expect(doc_hash).to receive(:validate_mods).and_return([])
+        doc_hash = @indexer.collection_solr_document collection
+      end
+      it 'populates druid and access_facet fields' do
+        doc_hash = @indexer.collection_solr_document collection
+        expect(doc_hash).to include druid: @coll_druid_from_test_config, access_facet: 'Online'
+      end
+      it 'populates url_fulltext field with purl page url' do
+        doc_hash = @indexer.collection_solr_document collection
+        expect(doc_hash).to include url_fulltext: "#{@yaml['harvestdor']['purl']}/#{@coll_druid_from_test_config}"
+      end
+      it "collection_type should be 'Digital Collection'" do
+        allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new) # speed up the test
+        doc_hash = @indexer.collection_solr_document collection
+        expect(doc_hash).to include collection_type: 'Digital Collection'
+      end
+      context 'add format_main_ssim Archive/Manuscript' do
+        it 'no other values' do
+          allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new)
+          doc_hash = @indexer.collection_solr_document collection
+          expect(doc_hash).to include format_main_ssim: 'Archive/Manuscript'
+        end
+        it 'other values present' do
+          allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new({ format_main_ssim: %w(Image Video) }))
+          doc_hash = @indexer.collection_solr_document collection
+          expect(doc_hash).to include format_main_ssim: ['Image', 'Video', 'Archive/Manuscript']
+        end
+        it 'already has values Archive/Manuscript' do
+          allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new({ format_main_ssim: 'Archive/Manuscript' }))
+          doc_hash = @indexer.collection_solr_document collection
+          expect(doc_hash).to include format_main_ssim: ['Archive/Manuscript']
+        end
+      end
+      it 'populates building_facet field with Stanford Digital Repository' do
+        doc_hash = @indexer.collection_solr_document collection
+        expect(doc_hash).to include building_facet: 'Stanford Digital Repository'
+      end
+    end # unmerged collection
+  end #  index_coll_obj_per_config
+  # Examples for #add_coll_info(doc_hash, collections): it is expected to add
+  # :collection and :collection_with_title values to the given doc hash, and the
+  # display types seen are later readable through #coll_display_types_from_items.
+  context '#add_coll_info and supporting methods' do
+    before(:each) do
+      # NOTE: despite the name, this array holds the stubbed collection resource,
+      # not druid strings.
+      @coll_druids_array = [collection]
+    end
+    it 'adds no collection field values to doc_hash if there are none' do
+      doc_hash = GDor::Indexer::SolrDocHash.new({})
+      @indexer.add_coll_info(doc_hash, nil)
+      expect(doc_hash[:collection]).to be_nil
+      expect(doc_hash[:collection_with_title]).to be_nil
+      expect(doc_hash[:display_type]).to be_nil
+    end
+    context 'collection field' do
+      it 'is added field to doc hash' do
+        doc_hash = GDor::Indexer::SolrDocHash.new({})
+        @indexer.add_coll_info(doc_hash, @coll_druids_array)
+        expect(doc_hash[:collection]).to match_array [@coll_druid_from_test_config]
+      end
+      it 'adds two values to doc_hash when object belongs to two collections' do
+        coll_druid1 = 'oo111oo2222'
+        coll_druid2 = 'oo333oo4444'
+        doc_hash = GDor::Indexer::SolrDocHash.new({})
+        @indexer.add_coll_info(doc_hash, [double(druid: coll_druid1, bare_druid: coll_druid1, public_xml: @ng_pub_xml, identity_md_obj_label: ''), double(druid: coll_druid2, bare_druid: coll_druid2, public_xml: @ng_pub_xml, identity_md_obj_label: '')])
+        expect(doc_hash[:collection]).to match_array [coll_druid1, coll_druid2]
+      end
+    end
+    context 'collection_with_title field' do
+      # Expected value format is "<bare druid>-|-<collection label>".
+      it 'is added to doc_hash' do
+        coll_druid = 'oo000oo1234'
+        doc_hash = GDor::Indexer::SolrDocHash.new({})
+        @indexer.add_coll_info(doc_hash, [double(druid: coll_druid, bare_druid: coll_druid, public_xml: @ng_pub_xml, identity_md_obj_label: 'zzz')])
+        expect(doc_hash[:collection_with_title]).to match_array ["#{coll_druid}-|-zzz"]
+      end
+      it 'adds two values to doc_hash when object belongs to two collections' do
+        coll_druid1 = 'oo111oo2222'
+        coll_druid2 = 'oo333oo4444'
+        doc_hash = GDor::Indexer::SolrDocHash.new({})
+        @indexer.add_coll_info(doc_hash, [double(druid: coll_druid1, bare_druid: coll_druid1, public_xml: @ng_pub_xml, identity_md_obj_label: 'foo'), double(druid: coll_druid2, bare_druid: coll_druid2, public_xml: @ng_pub_xml, identity_md_obj_label: 'bar')])
+        expect(doc_hash[:collection_with_title]).to match_array ["#{coll_druid1}-|-foo", "#{coll_druid2}-|-bar"]
+      end
+      # other tests show it uses druid when coll rec isn't merged
+    end
+    context '#coll_display_types_from_items' do
+      before(:each) do
+        # NOTE(review): presumably called here to initialise the indexer's
+        # per-collection display-type store before items are added - confirm
+        # against the implementation.
+        @indexer.coll_display_types_from_items(collection)
+      end
+      it 'gets single item display_type for single collection (and no dups)' do
+        allow(@indexer).to receive(:identity_md_obj_label)
+        # Two items with the same display type are added; only one value is expected back.
+        doc_hash = GDor::Indexer::SolrDocHash.new({ display_type: 'image' })
+        @indexer.add_coll_info(doc_hash, @coll_druids_array)
+        doc_hash = GDor::Indexer::SolrDocHash.new({ display_type: 'image' })
+        @indexer.add_coll_info(doc_hash, @coll_druids_array)
+        expect(@indexer.coll_display_types_from_items(collection)).to match_array ['image']
+      end
+      it 'gets multiple formats from multiple items for single collection' do
+        allow(@indexer).to receive(:identity_md_obj_label)
+        doc_hash = GDor::Indexer::SolrDocHash.new({ display_type: 'image' })
+        @indexer.add_coll_info(doc_hash, @coll_druids_array)
+        doc_hash = GDor::Indexer::SolrDocHash.new({ display_type: 'file' })
+        @indexer.add_coll_info(doc_hash, @coll_druids_array)
+        expect(@indexer.coll_display_types_from_items(collection)).to match_array %w(image file)
+      end
+    end # coll_display_types_from_items
+  end # add_coll_info
+  context '#num_found_in_solr' do
+    before :each do
+      @unmerged_collection_response = { 'response' => { 'numFound' => '1', 'docs' => [{ 'id' => 'dm212rn7381', 'url_fulltext' => ['https://purl.stanford.edu/dm212rn7381'] }] } }
+      @item_response = { 'response' => { 'numFound' => '265', 'docs' => [{ 'id' => 'dm212rn7381' }] } }
+    end
+    it 'counts the items and the collection object in the solr index after indexing' do
+      allow(@indexer.solr_client.client).to receive(:get) do |_wt, params|
+        if params[:params][:fq].include?('id:"dm212rn7381"')
+          @unmerged_collection_response
+        else
+          @item_response
+        end
+      end
+      expect(@indexer.num_found_in_solr(collection: 'dm212rn7381')).to eq(266)
+    end
+  end # num_found_in_solr
+  context '#email_report_body' do
+    before :each do
+      @indexer.config.notification = 'notification-list@example.com'
+      allow(@indexer).to receive(:num_found_in_solr).and_return(500)
+      allow(@indexer.harvestdor).to receive(:resources).and_return([collection])
+      allow(collection).to receive(:items).and_return([1, 2, 3])
+      allow(collection).to receive(:identity_md_obj_label).and_return('testcoll title')
+    end
+    subject do
+      @indexer.email_report_body
+    end
+    it 'email body includes coll id' do
+      expect(subject).to match /testcoll indexed coll record is: ww121ss5000/
+    end
+    it 'email body includes coll title' do
+      expect(subject).to match /coll title: testcoll title/
+    end
+    it 'email body includes failed to index druids' do
+      @indexer.instance_variable_set(:@druids_failed_to_ix, %w(a b))
+      expect(subject).to match /records that may have failed to index \(merged recs as druids, not ckeys\): \na\nb\n\n/
+    end
+    it 'email body include validation messages' do
+      @indexer.instance_variable_set(:@validation_messages, ['this is a validation message'])
+      expect(subject).to match /this is a validation message/
+    end
+    it 'email includes reference to full log' do
+      expect(subject).to match /full log is at gdor_indexer\/shared\/spec\/test_logs\/testcoll\.log/
+    end
+  end
+  describe '#email_results' do
+    before :each do
+      @indexer.config.notification = 'notification-list@example.com'
+      allow(@indexer).to receive(:send_email)
+      allow(@indexer).to receive(:email_report_body).and_return('Report Body')
+    end
+    it 'has an appropriate subject' do
+      expect(@indexer).to receive(:send_email) do |_to, opts|
+        expect(opts[:subject]).to match /is finished/
+      end
+      @indexer.email_results
+    end
+    it 'sends the email to the notification list' do
+      expect(@indexer).to receive(:send_email) do |to, _opts|
+        expect(to).to eq @indexer.config.notification
+      end
+      @indexer.email_results
+    end
+    it 'has the report body' do
+      expect(@indexer).to receive(:send_email) do |_to, opts|
+        expect(opts[:body]).to eq 'Report Body'
+      end
+      @indexer.email_results
+    end
+  end
+  # #send_email(to, opts) is checked by intercepting Mail::Message#deliver! on
+  # any instance and inspecting the message that was about to be delivered.
+  describe '#send_email' do
+    it 'sends an email to the right list' do
+      expect_any_instance_of(Mail::Message).to receive(:deliver!) do |mail|
+        expect(mail.to).to match_array ['notification-list@example.com']
+      end
+      @indexer.send_email 'notification-list@example.com', {}
+    end
+    it 'has the appropriate options set' do
+      expect_any_instance_of(Mail::Message).to receive(:deliver!) do |mail|
+        expect(mail.subject).to eq 'Subject'
+        expect(mail.from).to match_array ['rspec']
+        expect(mail.body).to eq 'Body'
+      end
+      @indexer.send_email 'notification-list@example.com', { from: 'rspec', subject: 'Subject', body: 'Body' }
+    end
+  end
+  # context "skip heartbeat" do
+  #   it "allows me to use a fake url for dor-fetcher-client" do
+  #     expect {GDor::Indexer.new(@config_yml_path)}.not_to raise_error
+  #   end
+  # end
+end

data/spec/unit/public_xml_fields_spec.rb ADDED Viewed

@@ -0,0 +1,286 @@
+require 'spec_helper'
+describe GDor::Indexer::PublicXmlFields do
+  # One-time fixtures: a fake druid, the MODS namespace declaration, a minimal
+  # MODS record and an empty publicObject document (both kept as Strings).
+  before(:all) do
+    @fake_druid = 'oo000oo0000'
+    @ns_decl = "xmlns='#{Mods::MODS_NS}'"
+    @mods_xml = "<mods #{@ns_decl}><note>public_xml_fields tests</note></mods>"
+    @empty_pub_xml = "<publicObject id='druid:#{@fake_druid}'></publicObject>"
+  end
+  # Logger writing to an in-memory buffer.
+  let(:logger) { Logger.new(StringIO.new) }
+  def sdb_for_pub_xml(m)
+    resource = Harvestdor::Indexer::Resource.new(double, @fake_druid)
+    allow(resource).to receive(:public_xml).and_return(Nokogiri::XML(m))
+    allow(resource).to receive(:mods).and_return(@mods_xml)
+    GDor::Indexer::SolrDocBuilder.new(resource, logger)
+  end
+  def sdb_for_content_md(m)
+    resource = Harvestdor::Indexer::Resource.new(double, @fake_druid)
+    allow(resource).to receive(:content_metadata).and_return(Nokogiri::XML(m).root)
+    allow(resource).to receive(:public_xml).and_return(@empty_pub_xml)
+    allow(resource).to receive(:mods).and_return(@mods_xml)
+    GDor::Indexer::SolrDocBuilder.new(resource, logger)
+  end
+  # NOTE:
+  # "Doubles, stubs, and message expectations are all cleaned out after each example."
+  # per https://www.relishapp.com/rspec/rspec-mocks/docs/scope
+  context 'contentMetadata fields and methods' do
+    # contentMetadata fixtures: opening/closing tags for building documents in
+    # the examples, plus a typed ('image') contentMetadata wrapped in a
+    # publicObject, as a String and as a parsed document.
+    before(:all) do
+      @content_md_start = "<contentMetadata objectId='#{@fake_druid}'>"
+      @content_md_end = '</contentMetadata>'
+      @cntnt_md_type = 'image'
+      @cntnt_md_xml = "<contentMetadata type='#{@cntnt_md_type}' objectId='#{@fake_druid}'>#{@content_md_end}"
+      @pub_xml = "<publicObject id='druid:#{@fake_druid}'>#{@cntnt_md_xml}</publicObject>"
+      @ng_pub_xml = Nokogiri::XML(@pub_xml)
+    end
+    # dor_content_type is invoked via #send in these examples.
+    context 'dor_content_type' do
+      it 'is the value of the type attribute on <contentMetadata> element' do
+        expected_type = 'foo'
+        builder = sdb_for_content_md("<contentMetadata type='#{expected_type}'>#{@content_md_end}")
+        expect(builder.send(:dor_content_type)).to eq(expected_type)
+      end
+      it 'logs an error message if there is no content type' do
+        builder = sdb_for_content_md("#{@content_md_start}#{@content_md_end}")
+        expect(builder.logger).to receive(:error).with("#{@fake_druid} has no DOR content type (<contentMetadata> element may be missing type attribute)")
+        builder.send(:dor_content_type)
+      end
+    end
+    context 'display_type' do
+      let :sdb do
+        sdb_for_pub_xml @empty_pub_xml
+      end
+      it "'image' for dor_content_type 'image'" do
+        allow(sdb).to receive(:dor_content_type).and_return('image')
+        expect(sdb.display_type).to eq('image')
+      end
+      it "'image' for dor_content_type 'manuscript'" do
+        allow(sdb).to receive(:dor_content_type).and_return('manuscript')
+        expect(sdb.display_type).to eq('image')
+      end
+      it "'image' for dor_content_type 'map'" do
+        allow(sdb).to receive(:dor_content_type).and_return('map')
+        expect(sdb.display_type).to eq('image')
+      end
+      it "'file' for dor_content_type 'media'" do
+        allow(sdb).to receive(:dor_content_type).and_return('media')
+        expect(sdb.display_type).to eq('file')
+      end
+      it "'book' for dor_content_type 'book'" do
+        allow(sdb).to receive(:dor_content_type).and_return('book')
+        expect(sdb.display_type).to eq('book')
+      end
+      it "'file' for unrecognized dor_content_type" do
+        allow(sdb).to receive(:dor_content_type).and_return('foo')
+        expect(sdb.display_type).to eq('file')
+      end
+    end # display_type
+    context '#file_ids' do
+      # contentMetadata types that map to the 'file' display type: file_ids is
+      # expected to be the id attributes of the <file> elements.
+      context 'file display_type' do
+        context 'contentMetadata type=file, resource type=file' do
+          it 'is id attrib of file element in single resource element with type=file' do
+            m = '<contentMetadata type="file" objectId="xh812jt9999">
+              <resource type="file" sequence="1" id="xh812jt9999_1">
+                <label>John A. Blume Earthquake Engineering Center Technical Report 180</label>
+                <file id="TR180_Shahi.pdf" mimetype="application/pdf" size="4949212" />
+              </resource></contentMetadata>'
+            sdb = sdb_for_content_md(m)
+            expect(sdb.file_ids).to match_array ['TR180_Shahi.pdf']
+          end
+          it 'is id attrib of file elements in multiple resource elements with type=file' do
+            m = '<contentMetadata objectId="jt108hm9275" type="file">
+              <resource id="jt108hm9275_1" sequence="1" type="file">
+               <label>Access to Energy newsletter, 1973-1994</label>
+               <file id="ATE.PDF" mimetype="application/pdf" size="16297305" />
+              </resource>
+              <resource id="jt108hm9275_8" sequence="8" type="file">
+               <label>Computer Forum Festschrift for Edward Feigenbaum, 2006 (part 6)</label>
+               <file id="SC0524_2013-047_b8_811.mp4" mimetype="video/mp4" size="860912776" />
+              </resource>
+              <resource id="jt108hm9275_9" sequence="9" type="file">
+                <label>Stanford AI Lab (SAILDART) files</label>
+                <file id="SAILDART.zip" mimetype="application/zip" size="472230479" />
+              </resource>
+              <resource id="jt108hm9275_10" sequence="10" type="file">
+                <label>WTDS Interview: Douglas C. Engelbart, 2006 Apr 13</label>
+                <file id="DougEngelbart041306.wav" mimetype="audio/x-wav" size="273705910" />
+              </resource></contentMetadata>'
+            sdb = sdb_for_content_md(m)
+            expect(sdb.file_ids).to match_array ['ATE.PDF', 'SC0524_2013-047_b8_811.mp4', 'SAILDART.zip', 'DougEngelbart041306.wav']
+          end
+        end # contentMetadata type=file, resource type=file
+        # Both the 'object' and the 'preview' resource contribute a file id here.
+        it 'contentMetadata type=geo, resource type=object' do
+          m = '<contentMetadata objectId="druid:qk786js7484" type="geo">
+            <resource id="druid:qk786js7484_1" sequence="1" type="object">
+              <label>Data</label>
+              <file id="data.zip" mimetype="application/zip" role="master" size="10776648" />
+            </resource>
+            <resource id="druid:qk786js7484_2" sequence="2" type="preview">
+              <label>Preview</label>
+              <file id="preview.jpg" mimetype="image/jpeg" role="master" size="140661">
+                <imageData height="846" width="919"/>
+              </file>
+            </resource></contentMetadata>'
+          sdb = sdb_for_content_md(m)
+          expect(sdb.file_ids).to match_array ['data.zip', 'preview.jpg']
+        end
+        # FIXME:  non-file resource types
+      end # file display_type
+      # contentMetadata types that map to the 'image' display type (image, map,
+      # manuscript): only <resource type="image"> elements contribute file ids;
+      # the 'object' and 'page' resources below yield nil.
+      context 'image display_type' do
+        context 'contentMetadata type=image' do
+          it 'resource type=image should be id attrib of file elements' do
+            m = '<contentMetadata objectId="rg759wj0953" type="image">
+              <resource id="rg759wj0953_1" sequence="1" type="image">
+                <label>Image 1</label>
+                <file id="rg759wj0953_00_0003.jp2" mimetype="image/jp2" size="13248250">
+                  <imageData width="6254" height="11236"/>
+                </file>
+              </resource>
+              <resource id="rg759wj0953_2" sequence="2" type="image">
+                <label>Image 2</label>
+                <file id="rg759wj0953_00_00_0001.jp2" mimetype="image/jp2" size="8484503">
+                  <imageData width="7266" height="6188"/>
+                </file>
+              </resource></contentMetadata>'
+            sdb = sdb_for_content_md m
+            expect(sdb.file_ids).to match_array ['rg759wj0953_00_0003.jp2', 'rg759wj0953_00_00_0001.jp2']
+          end
+          it 'resource type=object should be ignored' do
+            m = '<contentMetadata objectId="ny981gz0831" type="image">
+              <resource id="ny981gz0831_1" sequence="1" type="object">
+                <label>File 1</label>
+                <file id="da39a3ee5e6b4b0d3255bfef95601890afd80709.dderr" mimetype="application/x-symlink" size="26634" />
+                <file id="da39a3ee5e6b4b0d3255bfef95601890afd80709.img" mimetype="application/x-symlink" size="368640" />
+                <file id="da39a3ee5e6b4b0d3255bfef95601890afd80709.img.sha" mimetype="application/x-symlink" size="173" />
+              </resource></contentMetadata>'
+            sdb = sdb_for_content_md(m)
+            expect(sdb.file_ids).to be_nil
+          end
+        end # contentMetadata type=image
+        context 'contentMetadata type=map, resource type=image' do
+          it 'is id attrib of file elements' do
+            m = '<contentMetadata objectId="druid:rf935xg1061" type="map">
+              <resource id="0001" sequence="1" type="image">
+                <file id="rf935xg1061_00_0001.jp2" mimetype="image/jp2" size="20204910">
+                  <imageData height="7248" width="14787"/>
+                </file>
+              </resource>
+              <resource id="0002" sequence="2" type="image">
+                <file id="rf935xg1061_00_0002.jp2" mimetype="image/jp2" size="20209446">
+                  <imageData height="7248" width="14787"/>
+                </file>
+              </resource></contentMetadata>'
+            sdb = sdb_for_content_md(m)
+            expect(sdb.file_ids).to match_array ['rf935xg1061_00_0001.jp2', 'rf935xg1061_00_0002.jp2']
+          end
+        end # contentMetadata type=map, resource type=image
+        context 'contentMetadata type=manuscript' do
+          it 'resource type=image' do
+            m = '<contentMetadata objectId="druid:my191bb7431" type="manuscript">
+              <resource id="manuscript-image-1" sequence="1" type="image">
+                <label>Front Outer Board</label>
+                <file format="JPEG2000" id="T0000001.jp2" mimetype="image/jp2" size="7553958">
+                   <imageData height="4578" width="3442"/>
+                </file>
+              </resource>
+              <resource id="manuscript-image-343" sequence="343" type="image">
+                  <label>Spine</label>
+                  <file format="JPEG2000" id="T0000343.jp2" mimetype="image/jp2" size="1929355">
+                    <imageData height="4611" width="986"/>
+                  </file>
+                </resource>
+              </contentMetadata>'
+            sdb = sdb_for_content_md(m)
+            expect(sdb.file_ids).to match_array ['T0000001.jp2', 'T0000343.jp2']
+          end
+          it 'resource type=page should be ignored' do
+            m = '<contentMetadata objectId="druid:Bodley342" type="manuscript">
+              <resource type="page" sequence="1" id="image-1">
+                <label>1</label>
+                <file mimetype="image/jp2" format="JPEG2000" size="1319924" id="asn0001-M.jp2">
+                  <imageData height="3466" width="2405"/>
+                </file>
+              </resource>
+              <resource type="page" sequence="453" id="image-453">
+                <label>453</label>
+                <file mimetype="image/jp2" format="JPEG2000" size="1457066" id="asn0452-M.jp2">
+                  <imageData height="3431" width="2431"/>
+                </file>
+              </resource></contentMetadata>'
+            sdb = sdb_for_content_md(m)
+            expect(sdb.file_ids).to be_nil
+          end
+        end # contentMetadata type=manuscript
+      end # image display_type
+      # Book objects yield no file ids even though their image resources contain
+      # <file> elements with ids.
+      it 'is nil for book display_type' do
+        m = '<contentMetadata type="book" objectId="xm901jg3836">
+          <resource type="image" sequence="1" id="xm901jg3836_1">
+            <label>Item 1</label>
+            <file id="xm901jg3836_00_0002.jp2" mimetype="image/jp2" size="1152852">
+              <imageData width="2091" height="2905"/>
+            </file>
+          </resource>
+          <resource type="image" sequence="608" id="xm901jg3836_608">
+            <label>Item 608</label>
+            <file id="xm901jg3836_00_0609.jp2" mimetype="image/jp2" size="1152297">
+              <imageData width="2090" height="2905"/>
+            </file>
+          </resource></contentMetadata>'
+        sdb = sdb_for_content_md(m)
+        expect(sdb.file_ids).to be_nil
+      end
+      # Media objects: file ids are collected from both the audio and the image
+      # resource.
+      it 'is id attrib of file elements for media display_type' do
+        m = '<contentMetadata objectId="jy496kh1727" type="media">
+          <resource sequence="1" id="jy496kh1727_1" type="audio">
+            <label>Tape 1, Pass 1</label>
+            <file id="jy496kh1727_sl.mp3" mimetype="audio/mpeg" size="57010677" />
+          </resource>
+          <resource sequence="2" id="jy496kh1727_2" type="image">
+            <label>Image of media (1 of 3)</label>
+            <file id="jy496kh1727_img_1.jp2" mimetype="image/jp2" size="1277821">
+              <imageData width="2659" height="2535"/>
+            </file>
+          </resource></contentMetadata>'
+        sdb = sdb_for_content_md(m)
+        expect(sdb.file_ids).to match_array ['jy496kh1727_sl.mp3', 'jy496kh1727_img_1.jp2']
+      end
+      # Edge cases: file_ids is nil (not an empty array) when nothing can be
+      # collected - no <resource>, no <file>, or <file> without an id attribute.
+      it 'is nil if there are no <resource> elements in the contentMetadata' do
+        m = '<contentMetadata objectId="jy496kh1727" type="file"></contentMetadata>'
+        sdb = sdb_for_content_md(m)
+        expect(sdb.file_ids).to be_nil
+      end
+      it 'is nil if there are no <file> elements in the contentMetadata' do
+        m = '<contentMetadata objectId="jy496kh1727" type="file">
+          <resource sequence="1" id="jy496kh1727_1" type="file">
+            <label>Tape 1, Pass 1</label>
+          </resource>
+          <resource sequence="2" id="jy496kh1727_2" type="image">
+            <label>Image of media (1 of 3)</label>
+          </resource></contentMetadata>'
+        sdb = sdb_for_content_md(m)
+        expect(sdb.file_ids).to be_nil
+      end
+      it 'is nil if there are no id elements on file elements' do
+        # This document has no type attribute on <contentMetadata>.
+        m = "#{@content_md_start}<resource type='image'><file/></resource>#{@content_md_end}"
+        sdb = sdb_for_content_md(m)
+        expect(sdb.file_ids).to be_nil
+      end
+      # TODO:  multiple file elements in a single resource element
+    end # file_ids
+  end # contentMetadata fields and methods
+end