RubyGems - stanford-mods - Versions diffs - 1.3.3 → 1.3.4 - Mend

stanford-mods 1.3.3 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

checksums.yaml +4 -4
data/.gitignore +1 -0
data/.rspec +1 -0
data/.rubocop.yml +4 -0
data/Gemfile +1 -0
data/lib/stanford-mods.rb +5 -5
data/lib/stanford-mods/date_parsing.rb +245 -0
data/lib/stanford-mods/origin_info.rb +411 -0
data/lib/stanford-mods/searchworks.rb +23 -474
data/lib/stanford-mods/searchworks_subjects.rb +208 -0
data/lib/stanford-mods/version.rb +1 -1
data/spec/date_parsing_spec.rb +746 -0
data/spec/fixtures/spotlight_pub_date_data.rb +316 -0
data/spec/origin_info_spec.rb +449 -0
data/spec/searchworks_pub_dates_spec.rb +166 -163
data/spec/spec_helper.rb +16 -5
data/stanford-mods.gemspec +2 -0
metadata +25 -2

data/lib/stanford-mods/searchworks.rb CHANGED

@@ -1,5 +1,6 @@
 # encoding: UTF-8
 require 'stanford-mods/searchworks_languages'
+require 'stanford-mods/searchworks_subjects'
 require 'logger'
 require 'mods'
@@ -207,289 +208,11 @@ module Stanford
       # ---- end TITLE ----
       # ---- SUBJECT ----
-      # Values are the contents of:
-      #   subject/geographic
-      #   subject/hierarchicalGeographic
-      #   subject/geographicCode  (only include the translated value if it isn't already present from other mods geo fields)
-      # @param [String] sep - the separator string for joining hierarchicalGeographic sub elements
-      # @return [Array<String>] values for geographic_search Solr field for this document or [] if none
-      def sw_geographic_search(sep = ' ')
-        result = term_values([:subject, :geographic]) || []
-        # hierarchicalGeographic has sub elements
-        @mods_ng_xml.subject.hierarchicalGeographic.each { |hg_node|
-          hg_vals = []
-          hg_node.element_children.each { |e|
-            hg_vals << e.text unless e.text.empty?
-          }
-          result << hg_vals.join(sep) unless hg_vals.empty?
-        }
-        trans_code_vals = @mods_ng_xml.subject.geographicCode.translated_value
-        if trans_code_vals
-          trans_code_vals.each { |val|
-            result << val if !result.include?(val)
-          }
-        end
-        result
-      end
-      # Values are the contents of:
-      #   subject/name/namePart
-      #  "Values from namePart subelements should be concatenated in the order they appear (e.g. "Shakespeare, William, 1564-1616")"
-      # @param [String] sep - the separator string for joining namePart sub elements
-      # @return [Array<String>] values for names inside subject elements or [] if none
-      def sw_subject_names(sep = ', ')
-        result = []
-        @mods_ng_xml.subject.name_el.select { |n_el| n_el.namePart }.each { |name_el_w_np|
-          parts = name_el_w_np.namePart.map { |npn| npn.text unless npn.text.empty? }.compact
-          result << parts.join(sep).strip unless parts.empty?
-        }
-        result
-      end
-      # Values are the contents of:
-      #   subject/titleInfo/(subelements)
-      # @param [String] sep - the separator string for joining titleInfo sub elements
-      # @return [Array<String>] values for titles inside subject elements or [] if none
-      def sw_subject_titles(sep = ' ')
-        result = []
-        @mods_ng_xml.subject.titleInfo.each { |ti_el|
-          parts = ti_el.element_children.map { |el| el.text unless el.text.empty? }.compact
-          result << parts.join(sep).strip unless parts.empty?
-        }
-        result
-      end
-      # Values are the contents of:
-      #   mods/genre
-      #   mods/subject/topic
-      # @return [Array<String>] values for the topic_search Solr field for this document or nil if none
-      def topic_search
-        @topic_search ||= begin
-          vals = self.term_values(:genre) || []
-          vals.concat(subject_topics) if subject_topics
-          vals.empty? ? nil : vals
-        end
-      end
-      # Values are the contents of:
-      #   subject/topic
-      #   subject/name
-      #   subject/title
-      #   subject/occupation
-      #  with trailing comma, semicolon, and backslash (and any preceding spaces) removed
-      # @return [Array<String>] values for the topic_facet Solr field for this document or nil if none
-      def topic_facet
-        vals = subject_topics ? Array.new(subject_topics) : []
-        vals.concat(subject_names) if subject_names
-        vals.concat(subject_titles) if subject_titles
-        vals.concat(subject_occupations) if subject_occupations
-        vals.map! { |val|
-          v = val.sub(/[\\,;]$/, '')
-          v.strip
-        }
-        vals.empty? ? nil : vals
-      end
-      # geographic_search values with trailing comma, semicolon, and backslash (and any preceding spaces) removed
-      # @return [Array<String>] values for the geographic_facet Solr field for this document or nil if none
-      def geographic_facet
-        geographic_search.map { |val| val.sub(/[\\,;]$/, '').strip } unless !geographic_search
-      end
-      # subject/temporal values with trailing comma, semicolon, and backslash (and any preceding spaces) removed
-      # @return [Array<String>] values for the era_facet Solr field for this document or nil if none
-      def era_facet
-        subject_temporal.map { |val| val.sub(/[\\,;]$/, '').strip } unless !subject_temporal
-      end
-      # Values are the contents of:
-      #   subject/geographic
-      #   subject/hierarchicalGeographic
-      #   subject/geographicCode  (only include the translated value if it isn't already present from other mods geo fields)
-      # @return [Array<String>] values for the geographic_search Solr field for this document or nil if none
-      def geographic_search
-        @geographic_search ||= begin
-          result = self.sw_geographic_search
-          # TODO:  this should go into stanford-mods ... but then we have to set that gem up with a Logger
-          # print a message for any unrecognized encodings
-          xvals = self.subject.geographicCode.translated_value
-          codes = self.term_values([:subject, :geographicCode])
-          if codes && codes.size > xvals.size
-            self.subject.geographicCode.each { |n|
-              if n.authority != 'marcgac' && n.authority != 'marccountry'
-                sw_logger.info("#{druid} has subject geographicCode element with untranslated encoding (#{n.authority}): #{n.to_xml}")
-              end
-            }
-          end
-          # FIXME:  stanford-mods should be returning [], not nil ...
-          return nil if !result || result.empty?
-          result
-        end
-      end
-      # Values are the contents of:
-      #   subject/name
-      #   subject/occupation  - no subelements
-      #   subject/titleInfo
-      # @return [Array<String>] values for the subject_other_search Solr field for this document or nil if none
-      def subject_other_search
-        @subject_other_search ||= begin
-          vals = subject_occupations ? Array.new(subject_occupations) : []
-          vals.concat(subject_names) if subject_names
-          vals.concat(subject_titles) if subject_titles
-          vals.empty? ? nil : vals
-        end
-      end
-      # Values are the contents of:
-      #   subject/temporal
-      #   subject/genre
-      # @return [Array<String>] values for the subject_other_subvy_search Solr field for this document or nil if none
-      def subject_other_subvy_search
-        @subject_other_subvy_search ||= begin
-          vals = subject_temporal ? Array.new(subject_temporal) : []
-          gvals = self.term_values([:subject, :genre])
-          vals.concat(gvals) if gvals
-          # print a message for any temporal encodings
-          self.subject.temporal.each { |n|
-            sw_logger.info("#{druid} has subject temporal element with untranslated encoding: #{n.to_xml}") if !n.encoding.empty?
-          }
-          vals.empty? ? nil : vals
-        end
-      end
-      # Values are the contents of:
-      #  all subject subelements except subject/cartographic plus  genre top level element
-      # @return [Array<String>] values for the subject_all_search Solr field for this document or nil if none
-      def subject_all_search
-        vals = topic_search ? Array.new(topic_search) : []
-        vals.concat(geographic_search) if geographic_search
-        vals.concat(subject_other_search) if subject_other_search
-        vals.concat(subject_other_subvy_search) if subject_other_subvy_search
-        vals.empty? ? nil : vals
-      end
+      # see searchworks_subjects.rb
       # ---- end SUBJECT ----
       # ---- PUBLICATION (place, year) ----
-      def place
-        vals = self.term_values([:origin_info,:place,:placeTerm])
-        vals
-      end
-      # For the date display only, the first place to look is in the dates without encoding=marc array.
-      # If no such dates, select the first date in the dates_marc_encoding array.  Otherwise return nil
-      # @return [String] value for the pub_date_display Solr field for this document or nil if none
-      def pub_date_display
-          return dates_no_marc_encoding.first unless dates_no_marc_encoding.empty?
-          return dates_marc_encoding.first    unless dates_marc_encoding.empty?
-          return nil
-      end
-      # For the date indexing, sorting and faceting, the first place to look is in the dates with encoding=marc array.
-      # If that doesn't exist, look in the dates without encoding=marc array.  Otherwise return nil
-      # @return [Array<String>] values for the date Solr field for this document or nil if none
-      def pub_dates
-        return dates_marc_encoding    unless dates_marc_encoding.empty?
-        return dates_no_marc_encoding unless dates_no_marc_encoding.empty?
-        return nil
-      end
-      def is_number?(object)
-        true if Integer(object) rescue false
-      end
-      def is_date?(object)
-        true if Date.parse(object) rescue false
-      end
-      # Get the publish year from mods
-      # @return [String] 4 character year or nil if no valid date was found
-      def pub_year
-        #use the cached year if there is one
-        if @pub_year
-          if @pub_year == ''
-            return nil
-          end
-          return @pub_year
-        end
-        dates = pub_dates
-        if dates
-          pruned_dates = []
-          dates.each do |f_date|
-            #remove ? and []
-            if (f_date.length == 4 && f_date.end_with?('?'))
-              pruned_dates << f_date.gsub('?','0')
-            else
-              pruned_dates << f_date.gsub('?','').gsub('[','').gsub(']','')
-            end
-          end
-          #try to find a date starting with the most normal date formats and progressing to more wonky ones
-          @pub_year = get_plain_four_digit_year pruned_dates
-          return @pub_year if @pub_year
-          # Check for years in u notation, e.g., 198u
-          @pub_year = get_u_year pruned_dates
-          return @pub_year if @pub_year
-          @pub_year = get_double_digit_century pruned_dates
-          return @pub_year if @pub_year
-          @pub_year = get_bc_year pruned_dates
-          return @pub_year if @pub_year
-          @pub_year = get_three_digit_year pruned_dates
-          return @pub_year if @pub_year
-          @pub_year = get_single_digit_century pruned_dates
-          return @pub_year if @pub_year
-        end
-        @pub_year=''
-        return nil
-      end
-      #creates a date suitable for sorting. Guarnteed to be 4 digits or nil
-      def pub_date_sort
-        pd=nil
-        if pub_date
-          pd=pub_date
-          if pd.length == 3
-            pd='0'+pd
-          end
-          pd=pd.gsub('--','00')
-        end
-        raise "pub_date_sort was about to return a non 4 digit value #{pd}!" if pd and pd.length !=4
-        pd
-      end
-      #The year the object was published, , filtered based on max_pub_date and min_pub_date from the config file
-      #@return [String] 4 character year or nil
-      def pub_date
-        pub_year || nil
-      end
-      #Values for the pub date facet. This is less strict than the 4 year date requirements for pub_date
-      #@return <Array[String]> with values for the pub date facet
-      def pub_date_facet
-        if pub_date
-          if pub_date.start_with?('-')
-            return (pub_date.to_i + 1000).to_s + ' B.C.'
-          end
-          if pub_date.include? '--'
-            cent=pub_date[0,2].to_i
-            cent+=1
-            cent=cent.to_s+'th century'
-            return cent
-          else
-            return pub_date
-          end
-        else
-          nil
-        end
-      end
+      # see origin_info.rb  (as all this information comes from top level originInfo element)
       # ---- end PUBLICATION (place, year) ----
       def sw_logger
@@ -525,23 +248,23 @@ module Stanford
               when 'still image'
                 val << 'Image'
               when 'text'
-                val << 'Book' if issuance and issuance.include? 'monographic'
+                val << 'Book' if issuance && issuance.include?('monographic')
                 book_genres = ['book chapter', 'Book chapter', 'Book Chapter',
                   'issue brief', 'Issue brief', 'Issue Brief',
                   'librettos', 'Librettos',
                   'project report', 'Project report', 'Project Report',
                   'technical report', 'Technical report', 'Technical Report',
                   'working paper', 'Working paper', 'Working Paper']
-                val << 'Book' if genres and !(genres & book_genres).empty?
+                val << 'Book' if genres && !(genres & book_genres).empty?
                 conf_pub = ['conference publication', 'Conference publication', 'Conference Publication']
-                val << 'Conference Proceedings' if genres and !(genres & conf_pub).empty?
-                val << 'Journal/Periodical' if issuance and issuance.include? 'continuing'
+                val << 'Conference Proceedings' if genres && !(genres & conf_pub).empty?
+                val << 'Journal/Periodical' if issuance && issuance.include?('continuing')
                 article = ['article', 'Article']
-                val << 'Journal/Periodical' if genres and !(genres & article).empty?
+                val << 'Journal/Periodical' if genres && !(genres & article).empty?
                 stu_proj_rpt = ['student project report', 'Student project report', 'Student Project report', 'Student Project Report']
-                val << 'Other' if genres and !(genres & stu_proj_rpt).empty?
+                val << 'Other' if genres && !(genres & stu_proj_rpt).empty?
                 thesis = ['thesis', 'Thesis']
-                val << 'Thesis' if genres and !(genres & thesis).empty?
+                val << 'Thesis' if genres && !(genres & thesis).empty?
               when 'three dimensional object'
                 val << 'Other'
             end
@@ -571,7 +294,7 @@ module Stanford
         ]
         if types
           genres = self.term_values(:genre)
-          issuance = self.term_values([:origin_info,:issuance])
+          issuance = self.term_values([:origin_info, :issuance])
           types.each do |type|
             case type
               when 'cartographic'
@@ -583,7 +306,7 @@ module Stanford
               when 'notated music'
                 val << 'Music score'
               when 'software, multimedia'
-                if genres and (genres.include?('dataset') || genres.include?('Dataset'))
+                if genres && (genres.include?('dataset') || genres.include?('Dataset'))
                   val << 'Dataset'
                 else
                   val << 'Software/Multimedia'
@@ -595,10 +318,10 @@ module Stanford
               when 'still image'
                 val << 'Image'
               when 'text'
-                val << 'Book' if genres   and !(genres & article_genres).empty?
-                val << 'Book' if issuance and issuance.include? 'monographic'
-                val << 'Book' if genres   and !(genres & book_genres).empty?
-                val << 'Journal/Periodical' if issuance and issuance.include? 'continuing'
+                val << 'Book' if genres && !(genres & article_genres).empty?
+                val << 'Book' if issuance && issuance.include?('monographic')
+                val << 'Book' if genres && !(genres & book_genres).empty?
+                val << 'Journal/Periodical' if issuance && issuance.include?('continuing')
               when 'three dimensional object'
                 val << 'Object'
             end
@@ -633,195 +356,21 @@ module Stanford
       # @return [String] value with the numeric catkey in it, or nil if none exists
       def catkey
-        catkey=self.term_values([:record_info,:recordIdentifier])
-        if catkey and catkey.length>0
-          return catkey.first.gsub('a','') #need to ensure catkey is numeric only
+        catkey = self.term_values([:record_info, :recordIdentifier])
+        if catkey && catkey.length > 0
+          return catkey.first.tr('a', '') # ensure catkey is numeric only
         end
         nil
       end
-      def druid= new_druid
-        @druid=new_druid
-      end
-      def druid
-        @druid ? @druid : 'Unknown item'
-      end
-# protected ----------------------------------------------------------
-      # convenience method for subject/name/namePart values (to avoid parsing the mods for the same thing multiple times)
-      def subject_names
-        @subject_names ||= self.sw_subject_names
-      end
-      # convenience method for subject/occupation values (to avoid parsing the mods for the same thing multiple times)
-      def subject_occupations
-        @subject_occupations ||= self.term_values([:subject, :occupation])
-      end
-      # convenience method for subject/temporal values (to avoid parsing the mods for the same thing multiple times)
-      def subject_temporal
-        @subject_temporal ||= self.term_values([:subject, :temporal])
-      end
-      # convenience method for subject/titleInfo values (to avoid parsing the mods for the same thing multiple times)
-      def subject_titles
-        @subject_titles ||= self.sw_subject_titles
+      def druid=(new_druid)
+        @druid = new_druid
       end
-      # convenience method for subject/topic values (to avoid parsing the mods for the same thing multiple times)
-      def subject_topics
-        @subject_topics ||= self.term_values([:subject, :topic])
-      end
-      #get a 4 digit year like 1865 from the date array
-      def get_plain_four_digit_year dates
-        dates.each do |f_date|
-          matches=f_date.scan(/\d{4}/)
-          if matches.length == 1
-            @pub_year=matches.first
-            return matches.first
-          else
-            #if there are multiples, check for ones with CE after them
-            matches.each do |match|
-              #look for things like '1865-6 CE'
-              pos = f_date.index(Regexp.new(match+'...CE'))
-              pos = pos ? pos.to_i : 0
-              if f_date.include?(match+' CE') or pos > 0
-                @pub_year=match
-                return match
-              end
-            end
-            return matches.first
-          end
-        end
-        return nil
-      end
-      # If a year has a "u" in it, replace instances of u with 0
-      # @param [String] dates
-      # @return String
-      def get_u_year dates
-        dates.each do |f_date|
-          # Single digit u notation
-          matches = f_date.scan(/\d{3}u/)
-          if matches.length == 1
-            return matches.first.gsub('u','0')
-          end
-          # Double digit u notation
-          matches = f_date.scan(/\d{2}u{2}/)
-          if matches.length == 1
-            return matches.first.gsub('u','-')
-          end
-        end
-        return nil
-      end
-      #get a double digit century like '12th century' from the date array
-      def get_double_digit_century dates
-        dates.each do |f_date|
-          matches=f_date.scan(/\d{2}th/)
-          if matches.length == 1
-            @pub_year=((matches.first[0,2].to_i)-1).to_s+'--'
-            return @pub_year
-          end
-          #if there are multiples, check for ones with CE after them
-          if matches.length > 0
-            matches.each do |match|
-              pos = f_date.index(Regexp.new(match+'...CE'))
-              pos = pos ? pos.to_i : f_date.index(Regexp.new(match+' century CE'))
-              pos = pos ? pos.to_i : 0
-              if f_date.include?(match+' CE') or pos > 0
-                @pub_year=((match[0,2].to_i) - 1).to_s+'--'
-                return @pub_year
-              end
-            end
-          end
-        end
-        return nil
-      end
-      #get a 3 digit year like 965 from the date array
-      def get_three_digit_year dates
-        dates.each do |f_date|
-          matches=f_date.scan(/\d{3}/)
-          if matches.length > 0
-            return matches.first
-          end
-        end
-        return nil
-      end
-      #get the 3 digit BC year, return it as a negative, so -700 for 300 BC. Other methods will translate it to proper display, this is good for sorting.
-      def get_bc_year dates
-        dates.each do |f_date|
-          matches=f_date.scan(/\d{3} B.C./)
-          if matches.length > 0
-            bc_year=matches.first[0..2]
-            return (bc_year.to_i-1000).to_s
-          end
-        end
-        return nil
-      end
-      #get a single digit century like '9th century' from the date array
-      def get_single_digit_century dates
-        dates.each do |f_date|
-          matches=f_date.scan(/\d{1}th/)
-          if matches.length == 1
-            @pub_year=((matches.first[0,2].to_i)-1).to_s+'--'
-            return @pub_year
-          end
-          #if there are multiples, check for ones with CE after them
-          if matches.length > 0
-            matches.each do |match|
-              pos = f_date.index(Regexp.new(match+'...CE'))
-              pos = pos ? pos.to_i : f_date.index(Regexp.new(match+' century CE'))
-              pos = pos ? pos.to_i : 0
-              if f_date.include?(match+' CE') or pos > 0
-                @pub_year=((match[0,1].to_i) - 1).to_s+'--'
-                return @pub_year
-              end
-            end
-          end
-        end
-        return nil
-      end
-      # @return [Array<String>] dates from dateIssued and dateCreated tags from origin_info with encoding="marc"
-      def dates_marc_encoding
-        @dates_marc_encoding ||= begin
-          parse_dates_from_originInfo
-          @dates_marc_encoding
-        end
-      end
-      # @return [Array<String>] dates from dateIssued and dateCreated tags from origin_info with encoding not "marc"
-      def dates_no_marc_encoding
-        @dates_no_marc_encoding ||= begin
-          parse_dates_from_originInfo
-          @dates_no_marc_encoding
-        end
+      def druid
+        @druid ? @druid : 'Unknown item'
       end
-      # Populate @dates_marc_encoding and @dates_no_marc_encoding from dateIssued and dateCreated tags from origin_info
-      # with and without encoding=marc
-      def parse_dates_from_originInfo
-        @dates_marc_encoding = []
-        @dates_no_marc_encoding = []
-        self.origin_info.dateIssued.each { |di|
-          if di.encoding == "marc"
-            @dates_marc_encoding << di.text
-          else
-            @dates_no_marc_encoding << di.text
-          end
-        }
-        self.origin_info.dateCreated.each { |dc|
-          if dc.encoding == "marc"
-            @dates_marc_encoding << dc.text
-          else
-            @dates_no_marc_encoding << dc.text
-          end
-        }
-      end
     end # class Record
   end # Module Mods
 end # Module Stanford