RubyGems - gdor-indexer - Versions diffs - 0.1.0 - Mend

gdor-indexer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

checksums.yaml +7 -0
data/.gitignore +31 -0
data/.hound.yml +2 -0
data/.rubocop.yml +3 -0
data/.rubocop_todo.yml +131 -0
data/.yardopts +3 -0
data/Capfile +26 -0
data/Gemfile +12 -0
data/LICENSE.txt +5 -0
data/README.md +67 -0
data/Rakefile +57 -0
data/VERSION +1 -0
data/bin/indexer +71 -0
data/config/deploy.rb +31 -0
data/config/deploy/dev.rb +41 -0
data/config/deploy/fetcher.rb +6 -0
data/config/deploy/prod.rb +41 -0
data/config/deploy/stage.rb +41 -0
data/gdor-indexer.gemspec +43 -0
data/lib/gdor/indexer.rb +327 -0
data/lib/gdor/indexer/mods_fields.rb +114 -0
data/lib/gdor/indexer/nokogiri_xml_node_mixin.rb +42 -0
data/lib/gdor/indexer/public_xml_fields.rb +81 -0
data/lib/gdor/indexer/solr_doc_builder.rb +85 -0
data/lib/gdor/indexer/solr_doc_hash.rb +112 -0
data/lib/gdor/indexer/version.rb +5 -0
data/spec/config/walters_integration_spec.yml +44 -0
data/spec/spec_helper.rb +26 -0
data/spec/unit/gdor_mods_fields_spec.rb +812 -0
data/spec/unit/indexer_spec.rb +411 -0
data/spec/unit/public_xml_fields_spec.rb +286 -0
data/spec/unit/solr_doc_builder_spec.rb +128 -0
data/spec/unit/solr_doc_hash_spec.rb +399 -0
data/spec/vcr_cassettes/no_coll_druid_in_druid_array_call.yml +745 -0
metadata +411 -0

data/lib/gdor/indexer/mods_fields.rb ADDED Viewed

@@ -0,0 +1,114 @@
+# A mixin to the GDor::Indexer::SolrDocBuilder class.
+# Methods for Solr field values determined from MODS
+module GDor::Indexer::ModsFields
+  # Create a Hash representing a Solr doc, with all MODS related fields populated.
+  # @return [Hash] Hash representing the Solr document
+  def doc_hash_from_mods
+    doc_hash = {
+      # title fields
+      title_245a_search: smods_rec.sw_short_title,
+      title_245_search: smods_rec.sw_full_title,
+      title_variant_search: smods_rec.sw_addl_titles,
+      title_sort: smods_rec.sw_sort_title,
+      title_245a_display: smods_rec.sw_short_title,
+      title_display: smods_rec.sw_title_display,
+      title_full_display: smods_rec.sw_full_title,
+      # author fields
+      author_1xx_search: smods_rec.sw_main_author,
+      author_7xx_search: smods_rec.sw_addl_authors,
+      author_person_facet: smods_rec.sw_person_authors,
+      author_other_facet: smods_rec.sw_impersonal_authors,
+      author_sort: smods_rec.sw_sort_author,
+      author_corp_display: smods_rec.sw_corporate_authors,
+      author_meeting_display: smods_rec.sw_meeting_authors,
+      author_person_display: smods_rec.sw_person_authors,
+      author_person_full_display: smods_rec.sw_person_authors,
+      # subject search fields
+      topic_search: smods_rec.topic_search,
+      geographic_search: smods_rec.geographic_search,
+      subject_other_search: smods_rec.subject_other_search,
+      subject_other_subvy_search: smods_rec.subject_other_subvy_search,
+      subject_all_search: smods_rec.subject_all_search,
+      topic_facet: smods_rec.topic_facet,
+      geographic_facet: smods_rec.geographic_facet,
+      era_facet: smods_rec.era_facet,
+      format_main_ssim: format_main_ssim,
+      format: format, # for backwards compatibility
+      language: smods_rec.sw_language_facet,
+      physical: smods_rec.term_values([:physical_description, :extent]),
+      summary_search: smods_rec.term_values(:abstract),
+      toc_search: smods_rec.term_values(:tableOfContents),
+      url_suppl: smods_rec.term_values([:related_item, :location, :url]),
+      # publication fields
+      pub_search: smods_rec.place,
+      pub_date_sort: smods_rec.pub_date_sort,
+      imprint_display: smods_rec.pub_date_display,
+      pub_date: smods_rec.pub_date_facet,
+      pub_date_display: smods_rec.pub_date_display, # pub_date_display may be deprecated
+      all_search: smods_rec.text.gsub(/\s+/, ' ')
+    }
+    # more pub date fields
+    pub_date_sort = doc_hash[:pub_date_sort]
+    if is_positive_int? pub_date_sort
+      doc_hash[:pub_year_tisim] = pub_date_sort # for date slider
+      # put the displayable year in the correct field, :creation_year_isi for example
+      doc_hash[date_type_sym] = smods_rec.pub_date_sort if date_type_sym
+    end
+    doc_hash
+  end
+  # select one or more format values from the controlled vocabulary here:
+  #   http://searchworks-solr-lb.stanford.edu:8983/solr/select?facet.field=format&rows=0&facet.sort=index
+  # via stanford-mods gem
+  # @return [Array<String>] value(s) in the SearchWorks controlled vocabulary, or []
+  def format
+    vals = smods_rec.format
+    if vals.empty?
+      logger.warn "#{druid} has no SearchWorks format from MODS - check <typeOfResource> and other implicated MODS elements"
+    end
+    vals
+  end
+  # call stanford-mods format_main to get results
+  # @return [Array<String>] value(s) in the SearchWorks controlled vocabulary, or []
+  def format_main_ssim
+    vals = smods_rec.format_main
+    if vals.empty?
+      logger.warn "#{druid} has no SearchWorks Resource Type from MODS - check <typeOfResource> and other implicated MODS elements"
+    end
+    vals
+  end
+  # call stanford-mods sw_genre to get results
+  # @return [Array<String>] value(s)
+  def genre_ssim
+    smods_rec.sw_genre
+  end
+  protected
+  # @return true if the string parses into an int, and if so, the int is >= 0
+  def is_positive_int?(str)
+    str.to_i >= 0
+  rescue
+    false
+  end
+  # determines particular flavor of displayable publication year field
+  # @return Solr field name as a symbol
+  def date_type_sym
+    vals = smods_rec.term_values([:origin_info, :dateIssued])
+    return :publication_year_isi if vals && vals.length > 0
+    vals = smods_rec.term_values([:origin_info, :dateCreated])
+    return :creation_year_isi if vals && vals.length > 0
+    nil
+  end
+end

data/lib/gdor/indexer/nokogiri_xml_node_mixin.rb ADDED Viewed

@@ -0,0 +1,42 @@
+# Monkey patch for Nokogiri to cache xpath contexts and make things faster under jRuby
+module Nokogiri
+  module XML
+    class Node
+      @context = nil
+      def xpath(*paths)
+        return NodeSet.new(document) unless document
+        paths, handler, ns, binds = extract_params(paths)
+        sets = paths.map do |path|
+          # if self.contexts[path]
+          #  ctx = self.contexts[path]
+          # else
+          if @context
+            ctx = @context
+          else
+            ctx = XPathContext.new(self)
+            @context = ctx
+          end
+          ctx.register_namespaces(ns)
+          path = path.gsub(/xmlns:/, ' :') unless Nokogiri.uses_libxml?
+          binds.each do |key, value|
+            ctx.register_variable key.to_s, value
+          end if binds
+          ctx.evaluate(path, handler)
+        end
+        return sets.first if sets.length == 1
+        NodeSet.new(document) do |combined|
+          sets.each do |set|
+            set.each do |node|
+              combined << node
+            end
+          end
+        end
+      end # def xpath
+    end # class Node
+  end # module XML
+end # module Nokogiri

data/lib/gdor/indexer/public_xml_fields.rb ADDED Viewed

@@ -0,0 +1,81 @@
+# A mixin to the GDor::Indexer::SolrDocBuilder class.
+# Methods for Solr field values determined from the DOR object's purl page public xml
+#
+# The including class must respond to `resource`, `druid` and `logger`;
+# none of them is defined in this module.
+module GDor::Indexer::PublicXmlFields
+  # value is used to tell SearchWorks UI app of specific display needs for objects
+  # a config file value for add_display_type can be used to prepend a string to
+  #  the value (e.g., Hydrus objects are a special display case); that prepending
+  #  is based on :add_display_type in a collection's config yml file and is NOT
+  #  done by this method
+  #
+  # information on DOR content types:
+  #   https://consul.stanford.edu/display/chimera/DOR+content+types%2C+resource+types+and+interpretive+metadata
+  # @return [String] 'book' for book content; 'image' for image, manuscript or map content;
+  #   'file' for anything else (including a missing content type)
+  def display_type
+    case dor_content_type
+    when 'book'
+      'book'
+    when 'image', 'manuscript', 'map'
+      'image'
+    else
+      'file'
+    end
+  end
+  # the @id attribute of resource/file elements that match the display_type, including extension
+  # Only the 'image' and 'file' display types yield ids; for 'image' only files
+  # inside resources of type "image" are considered.
+  # A nil result is not memoized, so it is recomputed on the next call.
+  # @return [Array<String>, nil] filenames, or nil when there are none
+  def file_ids
+    @file_ids ||= begin
+      ids = []
+      if content_md
+        id_xpath = case display_type
+                   when 'image' then './resource[@type="image"]/file/@id'
+                   when 'file' then './resource/file/@id'
+                   end
+        ids = content_md.xpath(id_xpath).map(&:text).reject(&:empty?) if id_xpath
+      end
+      ids unless ids.empty?
+    end
+  end
+  # Delegates to the resource.
+  # @return true if the identityMetadata has <objectType>collection</objectType>, false otherwise
+  def collection?
+    resource.collection?
+  end
+  # Delegates to the resource.
+  # @return the collections reported by the resource for this object
+  def collections
+    resource.collections
+  end
+  protected #---------------------------------------------------------------------
+  # the value of the type attribute for a DOR object's contentMetadata
+  #  more info about these values is here:
+  #    https://consul.stanford.edu/display/chimera/DOR+content+types%2C+resource+types+and+interpretive+metadata
+  #    https://consul.stanford.edu/display/chimera/Summary+of+Content+Types%2C+Resource+Types+and+their+behaviors
+  # Logs an error when the type is missing or empty. A nil value is not
+  # memoized, so the lookup (and the error message) repeats on later calls.
+  # @return [String, nil] the content type; nil when there is no contentMetadata
+  def dor_content_type
+    @dor_content_type ||= begin
+      dct = content_md ? content_md.xpath('@type').text : nil
+      logger.error "#{druid} has no DOR content type (<contentMetadata> element may be missing type attribute)" if !dct || dct.empty?
+      dct
+    end
+  end
+  # the contentMetadata for this object, derived from the public_xml
+  # @return [Nokogiri::XML::Element] containing the contentMetadata
+  def content_md
+    resource.content_metadata
+  end
+  # the identityMetadata for this object, derived from the public_xml
+  # @return [Nokogiri::XML::Element] containing the identityMetadata
+  def identity_md
+    resource.identity_metadata
+  end
+end # GDor::Indexer::PublicXmlFields module

data/lib/gdor/indexer/solr_doc_builder.rb ADDED Viewed

@@ -0,0 +1,85 @@
+require 'logger'
+require 'harvestdor'
+require 'stanford-mods'
+require 'gdor/indexer/solr_doc_hash'
+require 'gdor/indexer/mods_fields'
+require 'gdor/indexer/public_xml_fields'
+# Class to build the Hash representing a Solr document for a particular druid
+class GDor::Indexer::SolrDocBuilder
+  include GDor::Indexer::ModsFields
+  include GDor::Indexer::PublicXmlFields
+  # The druid of the item
+  attr_reader :resource
+  attr_reader :logger
+  # @param [Harvestdor::Indexer::Resource] resource used to get MODS and public_xml
+  # @param [Logger] logger for indexing messages
+  def initialize(resource, logger)
+    @resource = resource
+    @logger = logger
+  end
+  def druid
+    resource.druid
+  end
+  # Create a Hash representing the Solr doc to be written to Solr, based on MODS and public_xml
+  # @return [Hash] Hash representing the Solr document
+  def doc_hash
+    @doc_hash ||= begin
+      doc_hash = GDor::Indexer::SolrDocHash.new id: resource.bare_druid, modsxml: smods_rec.to_xml
+      hash_from_mods = doc_hash_from_mods # defined in gdor_mods_fields
+      doc_hash.merge!(hash_from_mods) if hash_from_mods
+      doc_hash
+    end
+  end
+  # @return [String] value with SIRSI/Symphony numeric catkey in it, or nil if none exists
+  # first we look for
+  #  identityMetadata/otherId[@name='catkey']
+  # if not found, we look for
+  #  identityMetadata/otherId[@name='barcode']
+  #   if found, we look for catkey in MODS
+  #     mods/recordInfo/recordIdentifier[@source="SIRSI"
+  #     and if found, remove the leading a
+  # otherwise, nil
+  def catkey
+    @catkey ||= begin
+      catkey = nil
+      node = public_xml.xpath("/publicObject/identityMetadata/otherId[@name='catkey']") if public_xml
+      catkey = node.first.content if node && node.first
+      unless catkey
+        # if there's a barcode in the identity metadata then look for a ckey in the MODS
+        node = public_xml.xpath("/publicObject/identityMetadata/otherId[@name='barcode']")
+        if node.first
+          rec_id = smods_rec.record_info.recordIdentifier
+          if rec_id && !rec_id.empty? && rec_id.first.source == 'SIRSI'
+            catkey = rec_id.first.text.delete('a') # need to ensure catkey is numeric only
+          else
+            logger.error("#{druid} has barcode #{node.first.content} in identityMetadata but no SIRSI catkey in mods")
+          end
+        end
+      end
+      catkey
+    end
+  end
+  # return the MODS for the druid as a Stanford::Mods::Record object
+  # @return [Stanford::Mods::Record] created from the MODS xml for the druid
+  def smods_rec
+    @smods_rec ||= begin
+      mods_rec = resource.smods_rec
+      mods_rec.druid = druid # why?
+      mods_rec
+    end
+  end
+  # the public_xml for the druid as a Nokogiri::XML::Document object
+  # @return [Nokogiri::XML::Document] containing the public xml for the druid
+  def public_xml
+    resource.public_xml
+  end
+end # GDor::Indexer::SolrDocBuilder class

data/lib/gdor/indexer/solr_doc_hash.rb ADDED Viewed

@@ -0,0 +1,112 @@
+require 'delegate'
+class GDor::Indexer
+  class SolrDocHash < SimpleDelegator
+    def initialize(hash = {})
+      super(hash)
+    end
+    # looks for non-empty existence of field when exp_val is nil;
+    # when exp_val is a String, looks for matching value as a String or as a member of an Array
+    # when exp_val is a Regexp, looks for String value that matches, or Array with a String member that matches
+    # @return true if the field is non-trivially present in the hash, false otherwise
+    def field_present?(field, exp_val = nil)
+      !!(if self.include?(field) && Array(self[field]).any? { |v| !v.blank? }
+           actual = Array(self[field])
+           case exp_val
+           when nil
+             true
+           when Regexp
+             actual.index { |s| exp_val.match(s) }
+           else
+             actual.include? exp_val
+           end
+      end)
+    end
+    # merge in field values from the new hash, with the following guarantees:
+    #  values for keys in new_hash will be a non-empty String or flat Array
+    #  keys will be removed from hash if all values are nil or empty
+    def combine(new_hash)
+      new_hash.select { |_key, value| Array(value).any? { |v| !v.blank? } }.each do |key, new_val|
+        if field_present? key
+          orig_val = self[key]
+          case orig_val
+          when Array
+            self[key] += Array(new_val)
+          else
+            self[key] = Array(orig_val) + Array(new_val)
+          end
+          self[key] = self[key].reject(&:blank?).uniq
+        else
+          self[key] = new_val
+        end
+      end
+      compact_blank_fields!
+      self
+    end
+    def compact_blank_fields!
+      keys.reject { |key| field_present? key }.each do |key|
+        delete key
+      end
+      self
+    end
+    def druid
+      self[:druid]
+    end
+    # validate fields that should be in hash for any item object in SearchWorks Solr
+    # @return [Array<String>] Array of messages suitable for notificaiton email and/or logs
+    def validate_item(config)
+      result = validate_gdor_fields(config)
+      result << "#{druid} missing collection\n" unless field_present?(:collection)
+      Array(self[:collection]).each do |collection_druid|
+        result << "#{druid} missing collection_with_title (or collection #{collection_druid} is missing title)\n" unless field_present?(:collection_with_title, Regexp.new("#{collection_druid}-\\|-.+"))
+      end
+      result << "#{druid} missing file_id(s)\n" unless field_present?(:file_id)
+      result
+    end
+    # validate fields that should be in hash for any collection object in SearchWorks Solr
+    # @return [Array<String>] Array of messages suitable for notificaiton email and/or logs
+    def validate_collection(config)
+      result = validate_gdor_fields(config)
+      result << "#{druid} missing collection_type 'Digital Collection'\n" unless field_present?(:collection_type, 'Digital Collection')
+      result << "#{druid} missing format_main_ssim 'Archive/Manuscript'\n" unless field_present?(:format_main_ssim, 'Archive/Manuscript')
+      result
+    end
+    # validate fields that should be in hash for every gryphonDOR object in SearchWorks Solr
+    # @return [Array<String>] Array of messages suitable for notificaiton email and/or logs
+    def validate_gdor_fields(config)
+      result = []
+      result << "#{druid} missing druid field\n" unless field_present?(:druid, druid)
+      result << "#{druid} missing url_fulltext for purl\n" unless field_present?(:url_fulltext, "#{config.harvestdor.purl}/#{druid}")
+      result << "#{druid} missing access_facet 'Online'\n" unless field_present?(:access_facet, 'Online')
+      result << "#{druid} missing or bad display_type, possibly caused by unrecognized @type attribute on <contentMetadata>\n" unless field_present?(:display_type, /(file)|(image)|(media)|(book)/)
+      result << "#{druid} missing building_facet 'Stanford Digital Repository'\n" unless field_present?(:building_facet, 'Stanford Digital Repository')
+      result
+    end
+    # validate fields that should be in doc hash for every unmerged gryphonDOR object in SearchWorks Solr
+    # @return [Array<String>] array of Strings indicating absence of required fields
+    def validate_mods(_config)
+      result = []
+      result << "#{druid} missing modsxml\n" unless field_present?(:modsxml)
+      result << "#{druid} missing resource type\n" unless field_present?(:format_main_ssim)
+      result << "#{druid} missing format\n" unless field_present?(:format) # for backwards compatibility
+      result << "#{druid} missing title\n" unless field_present?(:title_display)
+      result << "#{druid} missing pub year for date slider\n" unless field_present?(:pub_year_tisim)
+      result << "#{druid} missing author\n" unless field_present?(:author_person_display)
+      result << "#{druid} missing language\n" unless field_present?(:language)
+      result
+    end
+  end
+end

data/lib/gdor/indexer/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+module GDor
+  class Indexer
+    # Version string of the gdor-indexer gem
+    VERSION = '0.1.0'
+  end
+end

data/spec/config/walters_integration_spec.yml ADDED Viewed

@@ -0,0 +1,44 @@
+# Copy this file and change the following settings:
+# 1. whitelist
+# 2. dor_fetcher service_url
+# 3. harvestdor log_dir, log_name
+# 4. solr url
+# whitelist:  which objects will you index?
+# if this is missing, 0 records will be fetched from the Dor Fetcher service
+# the whitelist can be
+#   1. an array of druids inline here, e.g. ['druid:oo123oo1234', 'druid:oo234oo2345']
+#   2. a filename containing a list of druids (one per line)
+# if a druid is for a collection record (per the object's identityMetadata at purl page)
+#  then we process all the item druids in that collection (as if they were included individually in the whitelist)
+# if a druid is for an item object, then we process that druid
+# (the commented-out whitelist lines below are alternative examples; only one whitelist may be active)
+#whitelist: ['druid:dq441rn2614']
+#  for a filename, either give an absolute path or a path relative to where the command will be executed
+#whitelist: config/ap_whitelist.txt
+whitelist: ['druid:ms016pb9280']
+dor_fetcher:
+  # the base url of the DOR Fetcher service from which we get the item druids (per whitelist above)
+  #  do not include 'collections' at the end.
+  service_url: http://127.0.0.1:3000
+  # if skip_heartbeat is set to true, this will skip the check that the dorfetcher service is alive before making API calls
+  #   (useful for testing)
+  skip_heartbeat: true
+harvestdor:
+  # log_name: name of log file  (default: harvestdor.log)
+  log_name: testcoll.log
+  # log_dir:  directory for log file  (default logs, relative to harvestdor gem path)
+  log_dir: spec/test_logs
+  # purl: url for the DOR purl server (used to get ContentMetadata, etc.)
+  purl: https://purl.stanford.edu
+# ----------- SOLR index (that we're writing INTO) parameters ------------
+solr:
+  url: http://solr.baseurl.org
+  # timeouts are in seconds;  read_timeout -> open/read, open_timeout -> connection open
+  read_timeout: 60
+  open_timeout: 60
+  # max_retries: presumably the number of times a failed Solr request is retried - TODO confirm
+  max_retries: 10