gdor-indexer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,114 @@
1
+ # A mixin to the GDor::Indexer::SolrDocBuilder class.
2
+ # Methods for Solr field values determined from MODS
3
+ module GDor::Indexer::ModsFields
4
# Build the MODS-derived portion of a Solr document.
#
# Values are read from the MODS record returned by +smods_rec+; the two format
# fields come from this mixin's +format_main_ssim+ and +format+ methods.
# When the publication sort date passes +is_positive_int?+, it is additionally
# stored as :pub_year_tisim and under the field named by +date_type_sym+ (if any).
# @return [Hash] Solr field name (Symbol) => value(s) taken from the MODS
def doc_hash_from_mods
  mods = smods_rec

  title_fields = {
    title_245a_search: mods.sw_short_title,
    title_245_search: mods.sw_full_title,
    title_variant_search: mods.sw_addl_titles,
    title_sort: mods.sw_sort_title,
    title_245a_display: mods.sw_short_title,
    title_display: mods.sw_title_display,
    title_full_display: mods.sw_full_title
  }

  author_fields = {
    author_1xx_search: mods.sw_main_author,
    author_7xx_search: mods.sw_addl_authors,
    author_person_facet: mods.sw_person_authors,
    author_other_facet: mods.sw_impersonal_authors,
    author_sort: mods.sw_sort_author,
    author_corp_display: mods.sw_corporate_authors,
    author_meeting_display: mods.sw_meeting_authors,
    author_person_display: mods.sw_person_authors,
    author_person_full_display: mods.sw_person_authors
  }

  # subject search and facet fields
  subject_fields = {
    topic_search: mods.topic_search,
    geographic_search: mods.geographic_search,
    subject_other_search: mods.subject_other_search,
    subject_other_subvy_search: mods.subject_other_subvy_search,
    subject_all_search: mods.subject_all_search,
    topic_facet: mods.topic_facet,
    geographic_facet: mods.geographic_facet,
    era_facet: mods.era_facet
  }

  format_fields = {
    format_main_ssim: format_main_ssim,
    format: format # for backwards compatibility
  }

  descriptive_fields = {
    language: mods.sw_language_facet,
    physical: mods.term_values([:physical_description, :extent]),
    summary_search: mods.term_values(:abstract),
    toc_search: mods.term_values(:tableOfContents),
    url_suppl: mods.term_values([:related_item, :location, :url])
  }

  publication_fields = {
    pub_search: mods.place,
    pub_date_sort: mods.pub_date_sort,
    imprint_display: mods.pub_date_display,
    pub_date: mods.pub_date_facet,
    pub_date_display: mods.pub_date_display # pub_date_display may be deprecated
  }

  # whole record text with every whitespace run collapsed to a single space
  full_text_fields = { all_search: mods.text.gsub(/\s+/, ' ') }

  # merge the groups in their original order so the key order of the result is unchanged
  groups = [title_fields, author_fields, subject_fields, format_fields,
            descriptive_fields, publication_fields, full_text_fields]
  doc_hash = groups.inject({}) { |merged, fields| merged.merge(fields) }

  # more pub date fields, only when the sort date is usable as a year
  sort_year = doc_hash[:pub_date_sort]
  return doc_hash unless is_positive_int?(sort_year)

  doc_hash[:pub_year_tisim] = sort_year # for date slider
  # put the displayable year in the correct field, :creation_year_isi for example
  year_field = date_type_sym
  doc_hash[year_field] = mods.pub_date_sort if year_field
  doc_hash
end
67
+
68
# SearchWorks format value(s) for this object, as computed by the MODS record
# (via the stanford-mods gem). Values come from the controlled vocabulary listed at
# http://searchworks-solr-lb.stanford.edu:8983/solr/select?facet.field=format&rows=0&facet.sort=index
# A warning naming the druid is logged when the record yields no format.
# @return [Array<String>] value(s) in the SearchWorks controlled vocabulary, or []
def format
  formats = smods_rec.format
  return formats unless formats.empty?

  logger.warn "#{druid} has no SearchWorks format from MODS - check <typeOfResource> and other implicated MODS elements"
  formats
end
79
+
80
# SearchWorks resource type value(s), obtained from the MODS record's +format_main+
# (stanford-mods gem). A warning naming the druid is logged when none is found.
# @return [Array<String>] value(s) in the SearchWorks controlled vocabulary, or []
def format_main_ssim
  smods_rec.format_main.tap do |resource_types|
    if resource_types.empty?
      logger.warn "#{druid} has no SearchWorks Resource Type from MODS - check <typeOfResource> and other implicated MODS elements"
    end
  end
end
89
+
90
# Genre value(s) for this object; a straight pass-through to the MODS record's
# +sw_genre+ (stanford-mods gem).
# @return [Array<String>] value(s)
def genre_ssim
  mods = smods_rec
  mods.sw_genre
end
95
+
96
+ protected
97
+
98
# Checks whether a value is the decimal representation of a non-negative integer.
#
# A bare +to_i+ comparison is not enough here: String#to_i returns 0 for text with
# no leading digits (and nil.to_i is 0), so "abc", "" and nil would all be accepted,
# and "19uu" would be read as 19. The value is therefore matched against a
# digits-only pattern instead.
# Despite the method name, zero is accepted (the contract is ">= 0").
# @param [String, Integer, nil] str the value to test, e.g. a pub date sort string such as "1950" or "0800"
# @return [Boolean] true if str, ignoring surrounding whitespace, consists solely of
#   digits with an optional leading "+"; false otherwise (including nil, '' and negatives)
def is_positive_int?(str)
  !(str.to_s.strip =~ /\A\+?\d+\z/).nil?
end
104
+
105
# Picks the Solr field that should hold the displayable publication year.
# originInfo/dateIssued is checked first, then originInfo/dateCreated; the first
# element with at least one value decides the field.
# @return [Symbol, nil] :publication_year_isi, :creation_year_isi, or nil when the
#   MODS has neither date element
def date_type_sym
  field_for_element = {
    dateIssued: :publication_year_isi,
    dateCreated: :creation_year_isi
  }
  field_for_element.each do |element, solr_field|
    found = smods_rec.term_values([:origin_info, element])
    return solr_field if found && !found.empty?
  end
  nil
end
114
+ end
@@ -0,0 +1,42 @@
1
# Monkey patch for Nokogiri to cache xpath contexts and make things faster under jRuby
module Nokogiri
  module XML
    # Reopened to replace #xpath with a variant that keeps one XPathContext per node.
    class Node
      # Class-level instance variable; the per-node cache below lives in each
      # instance's own @context.
      @context = nil

      # Evaluate one or more XPath expressions against this node.
      #
      # The XPathContext is created on the first call for a given node and reused on
      # later calls instead of being rebuilt for every expression.
      # NOTE(review): namespaces and variables are registered on that reused context on
      # every call; presumably they persist between calls - confirm this is acceptable.
      # @param paths expressions plus the optional handler / namespace / binding
      #   arguments, as understood by +extract_params+
      # @return the single result when one expression is given, otherwise a NodeSet
      #   holding the nodes of every result; an empty NodeSet when the node has no document
      def xpath(*paths)
        return NodeSet.new(document) unless document

        paths, handler, ns, binds = extract_params(paths)

        results = paths.map do |expression|
          @context ||= XPathContext.new(self)
          context = @context
          context.register_namespaces(ns)
          expression = expression.gsub(/xmlns:/, ' :') unless Nokogiri.uses_libxml?
          if binds
            binds.each { |name, value| context.register_variable(name.to_s, value) }
          end
          context.evaluate(expression, handler)
        end

        return results.first if results.length == 1

        NodeSet.new(document) do |combined|
          results.each do |result|
            result.each { |node| combined << node }
          end
        end
      end # def xpath
    end # class Node
  end # module XML
end # module Nokogiri
@@ -0,0 +1,81 @@
1
+ # A mixin to the GDor::Indexer::SolrDocBuilder class.
2
+ # Methods for Solr field values determined from the DOR object's purl page public xml
3
+ module GDor::Indexer::PublicXmlFields
4
# Tells the SearchWorks UI app which display treatment an object needs, derived from
# the DOR content type of the object (see +dor_content_type+):
#   'book'                        => 'book'
#   'image', 'manuscript', 'map'  => 'image'
#   anything else (or none)       => 'file'
#
# information on DOR content types:
#   https://consul.stanford.edu/display/chimera/DOR+content+types%2C+resource+types+and+interpretive+metadata
# @return [String] 'book', 'image' or 'file'
def display_type
  content_type = dor_content_type
  return 'book' if content_type == 'book'
  return 'image' if %w(image manuscript map).include?(content_type)

  'file'
end
24
+
25
# The @id attribute values (file names, including extension) of the resource/file
# elements in the contentMetadata that are relevant to the display_type:
# only files inside image resources for 'image', every file for 'file', and
# nothing for any other display type. Empty ids are skipped.
# A non-empty result is memoized.
# @return [Array<String>, nil] file names, or nil when there is no contentMetadata
#   or no id was found
def file_ids
  @file_ids ||= begin
    md = content_md
    # display_type is only consulted when contentMetadata exists
    id_query = md && {
      'image' => './resource[@type="image"]/file/@id',
      'file' => './resource/file/@id'
    }[display_type]
    names = id_query ? md.xpath(id_query).map(&:text).reject(&:empty?) : []
    names.empty? ? nil : names
  end
end
45
+
46
# Whether this object is a collection; the answer is delegated to the resource
# (per the original note: true if the identityMetadata has
# <objectType>collection</objectType>, false otherwise).
# @return [Boolean]
def collection?
  source = resource
  source.collection?
end
50
+
51
# The collections this object belongs to, exactly as reported by the resource.
# @return whatever +resource.collections+ returns
def collections
  source = resource
  source.collections
end
54
+
55
+ protected #---------------------------------------------------------------------
56
+
57
# The value of the type attribute on the DOR object's contentMetadata.
# An error naming the druid is logged when the contentMetadata is absent or the
# attribute value is empty. A non-nil result is memoized.
# more info about these values is here:
#   https://consul.stanford.edu/display/chimera/DOR+content+types%2C+resource+types+and+interpretive+metadata
#   https://consul.stanford.edu/display/chimera/Summary+of+Content+Types%2C+Resource+Types+and+their+behaviors
# @return [String, nil] the type value ('' if the attribute is missing), or nil when
#   there is no contentMetadata
def dor_content_type
  @dor_content_type ||= begin
    md = content_md
    type_value = md.xpath('@type').text if md
    if type_value.nil? || type_value.empty?
      logger.error "#{druid} has no DOR content type (<contentMetadata> element may be missing type attribute)"
    end
    type_value
  end
end
69
+
70
# The contentMetadata for this object, derived from the public_xml; obtained from
# the resource's +content_metadata+.
# Callers in this module check the result for nil before using it.
# @return [Nokogiri::XML::Element] containing the contentMetadata
def content_md
  source = resource
  source.content_metadata
end
75
+
76
# The identityMetadata for this object, derived from the public_xml; obtained from
# the resource's +identity_metadata+.
# @return [Nokogiri::XML::Element] containing the identityMetadata
def identity_md
  source = resource
  source.identity_metadata
end
81
+ end # GDor::Indexer::SolrDocBuilder class
@@ -0,0 +1,85 @@
1
+ require 'logger'
2
+
3
+ require 'harvestdor'
4
+ require 'stanford-mods'
5
+ require 'gdor/indexer/solr_doc_hash'
6
+ require 'gdor/indexer/mods_fields'
7
+ require 'gdor/indexer/public_xml_fields'
8
+
9
+ # Class to build the Hash representing a Solr document for a particular druid
10
+ class GDor::Indexer::SolrDocBuilder
11
+ include GDor::Indexer::ModsFields
12
+ include GDor::Indexer::PublicXmlFields
13
+
14
+ # The resource being indexed (its druid is available via #druid) and the logger for indexing messages
15
+ attr_reader :resource
16
+ attr_reader :logger
17
+
18
# Stores the collaborators; nothing is fetched at construction time.
# @param [Harvestdor::Indexer::Resource] resource used to get MODS and public_xml
# @param [Logger] logger for indexing messages
def initialize(resource, logger)
  @resource, @logger = resource, logger
end
24
+
25
# The druid of the item being indexed, as reported by the resource.
# @return whatever +resource.druid+ returns
def druid
  source = resource
  source.druid
end
28
+
29
# The Solr document for this druid, built once and memoized.
# It starts with :id (the resource's bare druid) and :modsxml (the MODS as XML) and
# then has the MODS-derived fields from +doc_hash_from_mods+ merged in.
# @return [GDor::Indexer::SolrDocHash] Hash-like object representing the Solr document
def doc_hash
  return @doc_hash if @doc_hash

  solr_doc = GDor::Indexer::SolrDocHash.new(id: resource.bare_druid, modsxml: smods_rec.to_xml)
  mods_fields = doc_hash_from_mods # defined in gdor_mods_fields
  solr_doc.merge!(mods_fields) if mods_fields
  @doc_hash = solr_doc
end
39
+
40
# The SIRSI/Symphony catkey for this object, if it has one. A found value is memoized.
#
# Lookup order:
#   1. identityMetadata/otherId[@name='catkey'] in the public xml
#   2. otherwise, if identityMetadata/otherId[@name='barcode'] exists, the first
#      mods/recordInfo/recordIdentifier when its source is "SIRSI", with the
#      leading "a" removed
# An error is logged when a barcode exists but the MODS has no SIRSI record identifier.
# When there is no public xml at all, nil is returned; both lookups are skipped
# rather than calling xpath on nil.
# @return [String, nil] value with SIRSI/Symphony numeric catkey in it, or nil if none exists
def catkey
  @catkey ||= begin
    xml = public_xml
    catkey_node = xml.xpath("/publicObject/identityMetadata/otherId[@name='catkey']").first if xml
    if catkey_node
      catkey_node.content
    elsif xml
      # if there's a barcode in the identity metadata then look for a ckey in the MODS
      barcode_node = xml.xpath("/publicObject/identityMetadata/otherId[@name='barcode']").first
      if barcode_node
        rec_id = smods_rec.record_info.recordIdentifier
        if rec_id && !rec_id.empty? && rec_id.first.source == 'SIRSI'
          # strip only the leading "a" prefix so the catkey is numeric
          rec_id.first.text.sub(/\Aa/, '')
        else
          logger.error("#{druid} has barcode #{barcode_node.content} in identityMetadata but no SIRSI catkey in mods")
          nil
        end
      end
    end
  end
end
69
+
70
# The MODS for the druid as a Stanford::Mods::Record object, fetched from the
# resource on first use and memoized. The record's druid is set to this builder's
# druid before it is returned.
# @return [Stanford::Mods::Record] created from the MODS xml for the druid
def smods_rec
  return @smods_rec if @smods_rec

  record = resource.smods_rec
  record.druid = druid # why?
  @smods_rec = record
end
79
+
80
# The public_xml for the druid as a Nokogiri::XML::Document object, exactly as
# provided by the resource.
# @return [Nokogiri::XML::Document] containing the public xml for the druid
def public_xml
  source = resource
  source.public_xml
end
85
+ end # GDor::Indexer::SolrDocBuilder class
@@ -0,0 +1,112 @@
1
+ require 'delegate'
2
+
3
+ class GDor::Indexer
4
+ class SolrDocHash < SimpleDelegator
5
# Wraps the given Hash; every Hash method not defined here is delegated to it.
# @param [Hash] hash the initial Solr field values (defaults to an empty Hash)
def initialize(hash = {})
  super
end
8
+
9
# Checks whether a field has a meaningful value in this hash.
#
# A field whose key is absent, or whose value(s) are all blank, is never present.
# Otherwise the answer depends on exp_val:
#   nil    - any non-blank value is enough
#   Regexp - some value (the field value itself, or a member of an Array value) must match
#   other  - the value itself, or a member of an Array value, must equal exp_val
# @param field the field (hash key) to inspect
# @param exp_val [nil, Regexp, Object] the expected value or pattern, if any
# @return [Boolean] true if the field is non-trivially present in the hash, false otherwise
def field_present?(field, exp_val = nil)
  return false unless include?(field)

  values = Array(self[field])
  return false if values.all?(&:blank?)

  case exp_val
  when nil
    true
  when Regexp
    values.any? { |candidate| exp_val.match(candidate) }
  else
    values.include?(exp_val)
  end
end
27
+
28
# Merge field values from new_hash into this hash.
#
# Entries of new_hash whose value(s) are all blank are ignored. For a field that is
# already present, old and new values are combined into one Array with blanks and
# duplicates removed; for a field that is not yet present, the new value is stored
# as given. Afterwards every field left without a non-blank value is removed.
# @param new_hash [Hash] the field values to merge in
# @return [self]
def combine(new_hash)
  # pick the usable entries before touching self, then fold them in
  usable = new_hash.reject { |_key, value| Array(value).all?(&:blank?) }
  usable.each do |key, incoming|
    self[key] = if field_present?(key)
                  (Array(self[key]) + Array(incoming)).reject(&:blank?).uniq
                else
                  incoming
                end
  end

  compact_blank_fields!

  self
end
52
+
53
# Remove, in place, every field that +field_present?+ does not consider present
# (i.e. fields whose value(s) are all blank).
# @return [self]
def compact_blank_fields!
  # keys is a snapshot, so deleting while walking it is safe
  keys.each do |key|
    delete(key) unless field_present?(key)
  end
  self
end
59
+
60
# The value stored under the :druid field of this Solr document.
# @return the :druid value, or whatever the hash yields when the key is absent
def druid
  self[:druid]
end
63
+
64
# Validate fields that should be in the hash for any item object in SearchWorks Solr.
# On top of the checks in +validate_gdor_fields+, an item needs a collection, a
# collection_with_title value of the form "<collection druid>-|-<title>" for each of
# its collections, and at least one file_id.
# @param config configuration object, passed through to +validate_gdor_fields+
# @return [Array<String>] Array of messages suitable for notification email and/or logs
def validate_item(config)
  messages = validate_gdor_fields(config)
  messages << "#{druid} missing collection\n" unless field_present?(:collection)

  Array(self[:collection]).each do |collection_druid|
    title_pattern = Regexp.new("#{collection_druid}-\\|-.+")
    next if field_present?(:collection_with_title, title_pattern)

    messages << "#{druid} missing collection_with_title (or collection #{collection_druid} is missing title)\n"
  end

  messages << "#{druid} missing file_id(s)\n" unless field_present?(:file_id)
  messages
end
76
+
77
# Validate fields that should be in the hash for any collection object in SearchWorks Solr.
# On top of the checks in +validate_gdor_fields+, a collection needs collection_type
# 'Digital Collection' and format_main_ssim 'Archive/Manuscript'.
# @param config configuration object, passed through to +validate_gdor_fields+
# @return [Array<String>] Array of messages suitable for notification email and/or logs
def validate_collection(config)
  messages = validate_gdor_fields(config)
  required = {
    collection_type: 'Digital Collection',
    format_main_ssim: 'Archive/Manuscript'
  }
  required.each do |field, expected|
    messages << "#{druid} missing #{field} '#{expected}'\n" unless field_present?(field, expected)
  end
  messages
end
85
+
86
# Validate fields that should be in the hash for every gryphonDOR object in SearchWorks Solr:
# the druid field, a url_fulltext pointing at "<purl base url>/<druid>", access_facet
# 'Online', a display_type matching file, image, media or book, and building_facet
# 'Stanford Digital Repository'.
# @param config configuration object; +config.harvestdor.purl+ supplies the purl base url
# @return [Array<String>] Array of messages suitable for notification email and/or logs
def validate_gdor_fields(config)
  # each row: field, expected value or pattern, text of the complaint
  checks = [
    [:druid, druid, 'missing druid field'],
    [:url_fulltext, "#{config.harvestdor.purl}/#{druid}", 'missing url_fulltext for purl'],
    [:access_facet, 'Online', "missing access_facet 'Online'"],
    [:display_type, /(file)|(image)|(media)|(book)/,
     'missing or bad display_type, possibly caused by unrecognized @type attribute on <contentMetadata>'],
    [:building_facet, 'Stanford Digital Repository', "missing building_facet 'Stanford Digital Repository'"]
  ]

  checks.each_with_object([]) do |(field, expected, complaint), messages|
    messages << "#{druid} #{complaint}\n" unless field_present?(field, expected)
  end
end
97
+
98
# Validate fields that should be in the doc hash for every unmerged gryphonDOR object
# in SearchWorks Solr; each listed field only needs some non-blank value.
# @param _config unused
# @return [Array<String>] array of Strings indicating absence of required fields
def validate_mods(_config)
  # field => what to call it in the complaint
  required = {
    modsxml: 'modsxml',
    format_main_ssim: 'resource type',
    format: 'format', # for backwards compatibility
    title_display: 'title',
    pub_year_tisim: 'pub year for date slider',
    author_person_display: 'author',
    language: 'language'
  }

  absent = required.reject { |field, _label| field_present?(field) }
  absent.values.map { |label| "#{druid} missing #{label}\n" }
end
111
+ end
112
+ end
@@ -0,0 +1,5 @@
1
module GDor
  class Indexer
    # Release number of the gem; frozen so the shared String cannot be mutated in place.
    VERSION = '0.1.0'.freeze
  end
end
@@ -0,0 +1,44 @@
1
+ # Copy this file and change the following settings:
2
+ # 1. whitelist
3
+ # 2. dor_fetcher service_url
4
+ # 3. harvestdor log_dir, log_name
5
+ # 4. solr url
6
+
7
+ # whitelist: which objects will you index?
8
+ # if this is missing, 0 records will be fetched from the Dor Fetcher service
9
+ # the whitelist can be
10
+ # 1. an array of druids inline here, e.g. ['druid:oo123oo1234', 'druid:oo234oo2345']
11
+ # 2. a filename containing a list of druids (one per line)
12
+ # if a druid is for a collection record (per the object's identityMetadata at purl page)
13
+ # then we process all the item druids in that collection (as if they were included individually in the whitelist)
14
+ # if a druid is for an item object, then we process that druid
15
+ #whitelist: ['druid:dq441rn2614']
16
+ # either give absolute path or path relative to where the command will be executed
17
+ #whitelist: config/ap_whitelist.txt
18
+ whitelist: ['druid:ms016pb9280']
19
+
20
+ dor_fetcher:
21
+ # the baseurl of the DOR Fetcher service from which we get the item druids (per whitelist above)
22
+ # do not include 'collections' at end.
23
+ service_url: http://127.0.0.1:3000
24
+ # if skip_heartbeat set to true, this will skip a check that the dorfetcher service is alive before making API calls
25
+ # (useful for testing)
26
+ skip_heartbeat: true
27
+
28
+ harvestdor:
29
+ # log_name: name of log file (default: harvestdor.log)
30
+ log_name: testcoll.log
31
+
32
+ # log_dir: directory for log file (default logs, relative to harvestdor gem path)
33
+ log_dir: spec/test_logs
34
+
35
+ # purl: url for the DOR purl server (used to get ContentMetadata, etc.)
36
+ purl: https://purl.stanford.edu
37
+
38
+ # ----------- SOLR index (that we're writing INTO) parameters ------------
39
+ solr:
40
+ url: http://solr.baseurl.org
41
+ # timeouts are in seconds; read_timeout -> open/read, open_timeout -> connection open
42
+ read_timeout: 60
43
+ open_timeout: 60
44
+ max_retries: 10