RubyGems - stanford-mods - Versions diffs - 2.6.4 → 3.0.0.alpha1 - Mend

stanford-mods 2.6.4 → 3.0.0.alpha1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

checksums.yaml +4 -4
data/.github/workflows/ruby.yml +1 -1
data/lib/stanford-mods/{geo_spatial.rb → concerns/geo_spatial.rb} +3 -5
data/lib/stanford-mods/concerns/name.rb +57 -0
data/lib/stanford-mods/concerns/origin_info.rb +109 -0
data/lib/stanford-mods/{physical_location.rb → concerns/physical_location.rb} +2 -2
data/lib/stanford-mods/concerns/searchworks.rb +125 -0
data/lib/stanford-mods/concerns/searchworks_subjects.rb +126 -0
data/lib/stanford-mods/concerns/title.rb +79 -0
data/lib/stanford-mods/coordinate.rb +21 -3
data/lib/stanford-mods/date_parsing.rb +32 -289
data/lib/stanford-mods/imprint.rb +148 -325
data/lib/stanford-mods/record.rb +20 -0
data/lib/stanford-mods/version.rb +1 -1
data/lib/stanford-mods/{searchworks_languages.rb → vocabularies/searchworks_languages.rb} +0 -0
data/lib/stanford-mods.rb +12 -11
data/spec/fixtures/searchworks_imprint_data.rb +38 -39
data/spec/fixtures/searchworks_pub_date_data.rb +7 -7
data/spec/fixtures/spotlight_pub_date_data.rb +7 -7
data/spec/geo_spatial_spec.rb +1 -6
data/spec/imprint_spec.rb +238 -207
data/spec/name_spec.rb +26 -230
data/spec/origin_info_spec.rb +34 -300
data/spec/searchworks_basic_spec.rb +1 -3
data/spec/searchworks_pub_dates_spec.rb +0 -215
data/spec/searchworks_spec.rb +0 -21
data/spec/searchworks_subject_raw_spec.rb +106 -105
data/spec/searchworks_subject_spec.rb +19 -55
data/spec/searchworks_title_spec.rb +1 -1
data/stanford-mods.gemspec +1 -1
metadata +21 -17
data/lib/marc_countries.rb +0 -387
data/lib/stanford-mods/geo_utils.rb +0 -28
data/lib/stanford-mods/name.rb +0 -80
data/lib/stanford-mods/origin_info.rb +0 -489
data/lib/stanford-mods/searchworks.rb +0 -333
data/lib/stanford-mods/searchworks_subjects.rb +0 -196
data/spec/date_parsing_spec.rb +0 -905

data/lib/stanford-mods/searchworks.rb DELETED Viewed

@@ -1,333 +0,0 @@
-# encoding: UTF-8
-require 'stanford-mods/searchworks_languages'
-require 'stanford-mods/searchworks_subjects'
-require 'logger'
-require 'mods'
-# SearchWorks specific wranglings of MODS metadata as a mixin to the Stanford::Mods::Record object
-module Stanford
-  module Mods
-    class Record < ::Mods::Record
-      attr_writer :druid
-      attr_writer :logger
-      def druid
-        @druid || 'Unknown item'
-      end
-      def logger
-        @logger ||= Logger.new(STDOUT)
-      end
-      alias sw_logger logger
-      # include languages known to SearchWorks; try to error-correct when possible (e.g. when ISO-639 disagrees with the MARC standard)
-      def sw_language_facet
-        result = []
-        mods_ng_xml.language.each { |n|
-          # get languageTerm codes and add their translations to the result
-          n.code_term.each { |ct|
-            if ct.authority =~ /^iso639/
-              vals = ct.text.split(/[,|\ ]/).reject { |x| x.strip.empty? }
-              vals.each do |v|
-                if ISO_639.find(v.strip)
-                  iso639_val = ISO_639.find(v.strip).english_name
-                  if SEARCHWORKS_LANGUAGES.has_value?(iso639_val)
-                    result << iso639_val
-                  else
-                    result << SEARCHWORKS_LANGUAGES[v.strip]
-                  end
-                else
-                  logger.warn "Couldn't find english name for #{ct.text}"
-                end
-              end
-            else
-              vals = ct.text.split(/[,|\ ]/).reject { |x| x.strip.empty? }
-              vals.each do |v|
-                result << SEARCHWORKS_LANGUAGES[v.strip]
-              end
-            end
-          }
-          # add languageTerm text values
-          n.text_term.each { |tt|
-            val = tt.text.strip
-            result << val if !val.empty? && SEARCHWORKS_LANGUAGES.has_value?(val)
-          }
-          # add language values that aren't in languageTerm subelement
-          if n.languageTerm.empty?
-            result << n.text if SEARCHWORKS_LANGUAGES.has_value?(n.text)
-          end
-        }
-        result.uniq
-      end # language_facet
-      # ---- AUTHOR ----
-      # @return [String] value for author_1xx_search field
-      def sw_main_author
-        main_author_w_date
-      end
-      # @return [Array<String>] values for author_7xx_search field
-      def sw_addl_authors
-        additional_authors_w_dates
-      end
-      # @return [Array<String>] values for author_person_facet, author_person_display
-      def sw_person_authors
-        personal_names_w_dates
-      end
-      # return the display_value_w_date for all <mods><name> elements that do not have type='personal'
-      # @return [Array<String>] values for author_other_facet
-      def sw_impersonal_authors
-        mods_ng_xml.plain_name.select { |n| n.type_at != 'personal' }.map { |n| n.display_value_w_date }
-      end
-      # @return [Array<String>] values for author_corp_display
-      def sw_corporate_authors
-        mods_ng_xml.plain_name.select { |n| n.type_at == 'corporate' }.map { |n| n.display_value_w_date }
-      end
-      # @return [Array<String>] values for author_meeting_display
-      def sw_meeting_authors
-        mods_ng_xml.plain_name.select { |n| n.type_at == 'conference' }.map { |n| n.display_value_w_date }
-      end
-      # Returns a sortable version of the main_author:
-      #  main_author + sorting title
-      # which is the mods approximation of the value created for a marc record
-      # @return [String] value for author_sort field
-      def sw_sort_author
-        #  substitute java Character.MAX_CODE_POINT for nil main_author so missing main authors sort last
-        val = '' + (main_author_w_date ? main_author_w_date : "\u{10FFFF} ") + (sort_title ? sort_title : '')
-        val.gsub(/[[:punct:]]*/, '').strip
-      end
-      def main_author_w_date_test
-        result = nil
-        first_wo_role = nil
-        plain_name.each { |n|
-          first_wo_role ||= n if n.role.empty?
-          n.role.each { |r|
-            if r.authority.include?('marcrelator') &&
-              (r.value.include?('Creator') || r.value.include?('Author'))
-              result ||= n.display_value_w_date
-            end
-          }
-        }
-        result = first_wo_role.display_value_w_date if !result && first_wo_role
-        result
-      end
-      # ---- end AUTHOR ----
-      # ---- TITLE ----
-      # @return [String] value for title_245a_search field
-      def sw_short_title
-        short_titles ? short_titles.compact.reject(&:empty?).first : nil
-      end
-      # @return [Nokogiri::XML::NodeSet] title_info nodes, rejecting ones that just have blank text values
-      def present_title_info_nodes
-        mods_ng_xml.title_info.reject {|node| node.text.strip.empty?}
-      end
-      # @return [Nokogiri::XML::Node] the first titleInfo node if present, else nil
-      def first_title_info_node
-        present_title_info_nodes ? present_title_info_nodes.first : nil
-      end
-      # @return [String] the nonSort text portion of the titleInfo node as a string (if non-empty, else nil)
-      def nonSort_title
-        return unless first_title_info_node && first_title_info_node.nonSort
-        first_title_info_node.nonSort.text.strip.empty? ? nil : first_title_info_node.nonSort.text.strip
-      end
-      # @return [String] the text of the title subelement of the first titleInfo node (if non-empty, else nil)
-      def title
-        return unless first_title_info_node && first_title_info_node.title
-        first_title_info_node.title.text.strip.empty?   ? nil : first_title_info_node.title.text.strip
-      end
-      # Searchworks requires that the MODS has a '//titleInfo/title'
-      # @return [String] value for title_245_search, title_full_display
-      def sw_full_title
-        return nil if !first_title_info_node || !title
-        preSubTitle = nonSort_title ? [nonSort_title, title].compact.join(" ") : title
-        preSubTitle.sub!(/:$/, '')
-        subTitle = first_title_info_node.subTitle.text.strip
-        preParts = subTitle.empty? ? preSubTitle : preSubTitle + " : " + subTitle
-        preParts.sub!(/\.$/, '') if preParts # remove trailing period
-        partName   = first_title_info_node.partName.text.strip   unless first_title_info_node.partName.text.strip.empty?
-        partNumber = first_title_info_node.partNumber.text.strip unless first_title_info_node.partNumber.text.strip.empty?
-        partNumber.sub!(/,$/, '') if partNumber # remove trailing comma
-        if partNumber && partName
-          parts = partNumber + ", " + partName
-        elsif partNumber
-          parts = partNumber
-        elsif partName
-          parts = partName
-        end
-        parts.sub!(/\.$/, '') if parts
-        result = parts ? preParts + ". " + parts : preParts
-        return nil unless result
-        result += "." unless result =~ /[[:punct:]]$/
-        result.strip!
-        result = nil if result.empty?
-        result
-      end
-      # like sw_full_title without trailing \,/;:.
-      # spec from solrmarc-sw   sw_index.properties
-      #    title_display = custom, removeTrailingPunct(245abdefghijklmnopqrstuvwxyz, [\\\\,/;:], ([A-Za-z]{4}|[0-9]{3}|\\)|\\,))
-      # @return [String] value for title_display (like title_full_display without trailing punctuation)
-      def sw_title_display
-        result = sw_full_title
-        return nil unless result
-        result.sub(/[\.,;:\/\\]+$/, '').strip
-      end
-      # this includes all full titles except those matching the short title (or, when there is no short title, the display title)
-      # @return [Array<String>] values for title_variant_search
-      def sw_addl_titles
-        excluded_title = sw_short_title || sw_title_display
-        if excluded_title.present?
-          title_regex = Regexp.new(Regexp.escape(excluded_title))
-          full_titles.reject { |s| s =~ title_regex }.reject(&:blank?)
-        else
-          full_titles.reject(&:blank?)
-        end
-      end
-      # Returns a sortable version of the main title
-      # @return [String] value for title_sort field
-      def sw_sort_title
-        val = '' + (sw_full_title ? sw_full_title : '')
-        val.sub!(Regexp.new("^" + Regexp.escape(nonSort_title)), '') if nonSort_title
-        val.gsub!(/[[:punct:]]*/, '').strip
-        val.squeeze(" ").strip
-      end
-      # remove trailing commas
-      # @deprecated in favor of sw_title_display
-      def sw_full_title_without_commas
-        result = sw_full_title
-        result.sub!(/,$/, '') if result
-        result
-      end
-      # ---- end TITLE ----
-      # ---- SUBJECT ----
-      # see searchworks_subjects.rb
-      # ---- end SUBJECT ----
-      # ---- PUBLICATION (place, year) ----
-      # see origin_info.rb  (as all this information comes from top level originInfo element)
-      # ---- end PUBLICATION (place, year) ----
-      # select one or more format values from the controlled vocabulary per JVine Summer 2014
-      #   http://searchworks-solr-lb.stanford.edu:8983/solr/select?facet.field=format_main_ssim&rows=0&facet.sort=index
-      # https://github.com/sul-dlss/stanford-mods/issues/66 - For geodata, the
-      # resource type should be only Map and not include Software, multimedia.
-      # @return [Array<String>] values in the SearchWorks controlled vocabulary
-      def format_main
-        types = typeOfResource
-        return [] unless types
-        article_genres = ['article', 'Article',
-          'book chapter', 'Book chapter', 'Book Chapter',
-          'issue brief', 'Issue brief', 'Issue Brief',
-          'project report', 'Project report', 'Project Report',
-          'student project report', 'Student project report', 'Student Project report', 'Student Project Report',
-          'technical report', 'Technical report', 'Technical Report',
-          'working paper', 'Working paper', 'Working Paper'
-        ]
-        book_genres = ['conference publication', 'Conference publication', 'Conference Publication',
-          'instruction', 'Instruction',
-          'librettos', 'Librettos',
-          'thesis', 'Thesis'
-        ]
-        val = []
-        genres = term_values(:genre) || []
-        issuance = term_values([:origin_info, :issuance]) || []
-        frequency = term_values([:origin_info, :frequency]) || []
-        val << 'Dataset' if genres.include?('dataset') || genres.include?('Dataset')
-        types.each do |type|
-          val << 'Archive/Manuscript' if type.manuscript == 'yes'
-          case type.text
-            when 'cartographic'
-              val << 'Map'
-            when 'mixed material'
-              val << 'Archive/Manuscript'
-            when 'moving image'
-              val << 'Video'
-            when 'notated music'
-              val << 'Music score'
-            when 'software, multimedia'
-              val << 'Software/Multimedia' unless types.map(&:text).include?('cartographic') || (genres.include?('dataset') || genres.include?('Dataset'))
-            when 'sound recording-musical'
-              val << 'Music recording'
-            when 'sound recording-nonmusical', 'sound recording'
-              val << 'Sound recording'
-            when 'still image'
-              val << 'Image'
-            when 'text'
-              is_explicitly_a_book = type.manuscript != 'yes' && (issuance.include?('monographic') || !(genres & article_genres).empty? || !(genres & book_genres).empty?)
-              is_periodical = issuance.include?('continuing') || issuance.include?('serial') || frequency.any? { |x| !x.empty? }
-              is_archived_website = genres.any? { |x| x.casecmp('archived website') == 0 }
-              val << 'Book' if is_explicitly_a_book
-              val << 'Journal/Periodical' if is_periodical
-              val << 'Archived website' if is_archived_website
-              val << 'Book' unless is_explicitly_a_book || is_periodical || is_archived_website
-            when 'three dimensional object'
-              val << 'Object'
-          end
-        end
-        val.uniq
-      end
-      # @return [Array<String>] values for the genre facet in SearchWorks
-      def sw_genre
-        genres = term_values(:genre)
-        return [] unless genres
-        val = genres.map(&:to_s)
-        thesis_pub = ['thesis', 'Thesis']
-        val << 'Thesis/Dissertation' if (genres & thesis_pub).any?
-        conf_pub = ['conference publication', 'Conference publication', 'Conference Publication']
-        gov_pub  = ['government publication', 'Government publication', 'Government Publication']
-        tech_rpt = ['technical report', 'Technical report', 'Technical Report']
-        val << 'Conference proceedings' if (genres & conf_pub).any?
-        val << 'Government document' if (genres & gov_pub).any?
-        val << 'Technical report' if (genres & tech_rpt).any?
-        val.uniq
-      end
-      # @return [String] value with the numeric catkey in it, or nil if none exists
-      def catkey
-        catkey = term_values([:record_info, :recordIdentifier])
-        return nil unless catkey && !catkey.empty?
-        catkey.first.tr('a', '') # ensure catkey is numeric only
-      end
-    end # class Record
-  end # Module Mods
-end # Module Stanford

data/lib/stanford-mods/searchworks_subjects.rb DELETED Viewed

@@ -1,196 +0,0 @@
-# encoding: UTF-8
-require 'logger'
-require 'mods'
-# SearchWorks specific wranglings of MODS  *subject* metadata as a mixin to the Stanford::Mods::Record object
-module Stanford
-  module Mods
-    class Record < ::Mods::Record
-      # Values are the contents of:
-      #   subject/geographic
-      #   subject/hierarchicalGeographic
-      #   subject/geographicCode  (only include the translated value if it isn't already present from other mods geo fields)
-      # @param [String] sep - the separator string for joining hierarchicalGeographic sub elements
-      # @return [Array<String>] values for geographic_search Solr field for this document or [] if none
-      def sw_geographic_search(sep = ' ')
-        result = term_values([:subject, :geographic]) || []
-        # hierarchicalGeographic has sub elements
-        mods_ng_xml.subject.hierarchicalGeographic.each { |hg_node|
-          hg_vals = hg_node.element_children.map(&:text).reject(&:empty?)
-          result << hg_vals.join(sep) unless hg_vals.empty?
-        }
-        trans_code_vals = mods_ng_xml.subject.geographicCode.translated_value || []
-        trans_code_vals.each { |val|
-          result << val unless result.include?(val)
-        }
-        result
-      end
-      # Values are the contents of:
-      #   subject/name/namePart
-      #  "Values from namePart subelements should be concatenated in the order they appear (e.g. "Shakespeare, William, 1564-1616")"
-      # @param [String] sep - the separator string for joining namePart sub elements
-      # @return [Array<String>] values for names inside subject elements or [] if none
-      def sw_subject_names(sep = ', ')
-        mods_ng_xml.subject.name_el
-                   .select { |n_el| n_el.namePart }
-                   .map { |name_el_w_np| name_el_w_np.namePart.map(&:text).reject(&:empty?) }
-                   .reject(&:empty?)
-                   .map { |parts| parts.join(sep).strip }
-      end
-      # Values are the contents of:
-      #   subject/titleInfo/(subelements)
-      # @param [String] sep - the separator string for joining titleInfo sub elements
-      # @return [Array<String>] values for titles inside subject elements or [] if none
-      def sw_subject_titles(sep = ' ')
-        result = []
-        mods_ng_xml.subject.titleInfo.each { |ti_el|
-          parts = ti_el.element_children.map(&:text).reject(&:empty?)
-          result << parts.join(sep).strip unless parts.empty?
-        }
-        result
-      end
-      # Values are the contents of:
-      #   mods/subject/topic
-      # @return [Array<String>] values for the topic_search Solr field for this document or nil if none
-      def topic_search
-        @topic_search ||= begin
-          vals = []
-          vals.concat(subject_topics) if subject_topics
-          vals.empty? ? nil : vals
-        end
-      end
-      # Values are the contents of:
-      #   subject/topic
-      #   subject/name
-      #   subject/title
-      #   subject/occupation
-      #  with trailing comma, semicolon, and backslash (and any preceding spaces) removed
-      # @return [Array<String>] values for the topic_facet Solr field for this document or nil if none
-      def topic_facet
-        vals = subject_topics ? Array.new(subject_topics) : []
-        vals.concat(subject_names) if subject_names
-        vals.concat(subject_titles) if subject_titles
-        vals.concat(subject_occupations) if subject_occupations
-        vals.map! { |val| val.sub(/[\\,;]$/, '').strip }
-        vals.empty? ? nil : vals
-      end
-      # geographic_search values with trailing comma, semicolon, and backslash (and any preceding spaces) removed
-      # @return [Array<String>] values for the geographic_facet Solr field for this document or nil if none
-      def geographic_facet
-        geographic_search.map { |val| val.sub(/[\\,;]$/, '').strip } if geographic_search
-      end
-      # subject/temporal values with trailing comma, semicolon, and backslash (and any preceding spaces) removed
-      # @return [Array<String>] values for the era_facet Solr field for this document or nil if none
-      def era_facet
-        subject_temporal.map { |val| val.sub(/[\\,;]$/, '').strip } if subject_temporal
-      end
-      # Values are the contents of:
-      #   subject/geographic
-      #   subject/hierarchicalGeographic
-      #   subject/geographicCode  (only include the translated value if it isn't already present from other mods geo fields)
-      # @return [Array<String>] values for the geographic_search Solr field for this document or nil if none
-      def geographic_search
-        @geographic_search ||= begin
-          result = sw_geographic_search
-          # TODO:  this should go into stanford-mods ... but then we have to set that gem up with a Logger
-          # print a message for any unrecognized encodings
-          xvals = subject.geographicCode.translated_value
-          codes = term_values([:subject, :geographicCode])
-          if codes && codes.size > xvals.size
-            subject.geographicCode.each { |n|
-              next unless n.authority != 'marcgac' && n.authority != 'marccountry'
-              sw_logger.info("#{druid} has subject geographicCode element with untranslated encoding (#{n.authority}): #{n.to_xml}")
-            }
-          end
-          # FIXME:  stanford-mods should be returning [], not nil ...
-          return nil if !result || result.empty?
-          result
-        end
-      end
-      # Values are the contents of:
-      #   subject/name
-      #   subject/occupation  - no subelements
-      #   subject/titleInfo
-      # @return [Array<String>] values for the subject_other_search Solr field for this document or nil if none
-      def subject_other_search
-        @subject_other_search ||= begin
-          vals = subject_occupations ? Array.new(subject_occupations) : []
-          vals.concat(subject_names) if subject_names
-          vals.concat(subject_titles) if subject_titles
-          vals.empty? ? nil : vals
-        end
-      end
-      # Values are the contents of:
-      #   subject/temporal
-      #   subject/genre
-      # @return [Array<String>] values for the subject_other_subvy_search Solr field for this document or nil if none
-      def subject_other_subvy_search
-        @subject_other_subvy_search ||= begin
-          vals = subject_temporal ? Array.new(subject_temporal) : []
-          gvals = term_values([:subject, :genre])
-          vals.concat(gvals) if gvals
-          # print a message for any temporal encodings
-          subject.temporal.each { |n|
-            sw_logger.info("#{druid} has subject temporal element with untranslated encoding: #{n.to_xml}") unless n.encoding.empty?
-          }
-          vals.empty? ? nil : vals
-        end
-      end
-      # Values are the contents of:
-      #  all subject subelements except subject/cartographic plus  genre top level element
-      # @return [Array<String>] values for the subject_all_search Solr field for this document or nil if none
-      def subject_all_search
-        vals = topic_search ? Array.new(topic_search) : []
-        vals.concat(geographic_search) if geographic_search
-        vals.concat(subject_other_search) if subject_other_search
-        vals.concat(subject_other_subvy_search) if subject_other_subvy_search
-        vals.empty? ? nil : vals
-      end
-      protected #----------------------------------------------------------
-      # convenience method for subject/name/namePart values (to avoid parsing the mods for the same thing multiple times)
-      def subject_names
-        @subject_names ||= sw_subject_names
-      end
-      # convenience method for subject/occupation values (to avoid parsing the mods for the same thing multiple times)
-      def subject_occupations
-        @subject_occupations ||= term_values([:subject, :occupation])
-      end
-      # convenience method for subject/temporal values (to avoid parsing the mods for the same thing multiple times)
-      def subject_temporal
-        @subject_temporal ||= term_values([:subject, :temporal])
-      end
-      # convenience method for subject/titleInfo values (to avoid parsing the mods for the same thing multiple times)
-      def subject_titles
-        @subject_titles ||= sw_subject_titles
-      end
-      # convenience method for subject/topic values (to avoid parsing the mods for the same thing multiple times)
-      def subject_topics
-        @subject_topics ||= term_values([:subject, :topic])
-      end
-    end
-  end
-end