dor_indexing 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +2 -0
  3. data/.rubocop.yml +355 -0
  4. data/Gemfile +16 -0
  5. data/Gemfile.lock +218 -0
  6. data/README.md +33 -0
  7. data/Rakefile +11 -0
  8. data/dor_indexing.gemspec +40 -0
  9. data/lib/dor_indexing/builders/all_search_text_builder.rb +58 -0
  10. data/lib/dor_indexing/builders/author_builder.rb +31 -0
  11. data/lib/dor_indexing/builders/collection_rights_description_builder.rb +29 -0
  12. data/lib/dor_indexing/builders/document_builder.rb +106 -0
  13. data/lib/dor_indexing/builders/event_date_builder.rb +71 -0
  14. data/lib/dor_indexing/builders/event_place_builder.rb +73 -0
  15. data/lib/dor_indexing/builders/geographic_builder.rb +82 -0
  16. data/lib/dor_indexing/builders/name_builder.rb +70 -0
  17. data/lib/dor_indexing/builders/orcid_builder.rb +62 -0
  18. data/lib/dor_indexing/builders/publisher_name_builder.rb +53 -0
  19. data/lib/dor_indexing/builders/temporal_builder.rb +56 -0
  20. data/lib/dor_indexing/builders/topic_builder.rb +96 -0
  21. data/lib/dor_indexing/cocina_repository.rb +24 -0
  22. data/lib/dor_indexing/indexers/administrative_tag_indexer.rb +69 -0
  23. data/lib/dor_indexing/indexers/collection_title_indexer.rb +27 -0
  24. data/lib/dor_indexing/indexers/composite_indexer.rb +36 -0
  25. data/lib/dor_indexing/indexers/content_metadata_indexer.rb +69 -0
  26. data/lib/dor_indexing/indexers/data_indexer.rb +66 -0
  27. data/lib/dor_indexing/indexers/default_object_rights_indexer.rb +36 -0
  28. data/lib/dor_indexing/indexers/descriptive_metadata_indexer.rb +226 -0
  29. data/lib/dor_indexing/indexers/embargo_metadata_indexer.rb +32 -0
  30. data/lib/dor_indexing/indexers/identifiable_indexer.rb +92 -0
  31. data/lib/dor_indexing/indexers/identity_metadata_indexer.rb +85 -0
  32. data/lib/dor_indexing/indexers/process_indexer.rb +63 -0
  33. data/lib/dor_indexing/indexers/releasable_indexer.rb +62 -0
  34. data/lib/dor_indexing/indexers/rights_metadata_indexer.rb +59 -0
  35. data/lib/dor_indexing/indexers/role_metadata_indexer.rb +31 -0
  36. data/lib/dor_indexing/indexers/workflow_indexer.rb +51 -0
  37. data/lib/dor_indexing/indexers/workflows_indexer.rb +40 -0
  38. data/lib/dor_indexing/marc_country.rb +359 -0
  39. data/lib/dor_indexing/selectors/event_selector.rb +112 -0
  40. data/lib/dor_indexing/selectors/pub_year_selector.rb +119 -0
  41. data/lib/dor_indexing/version.rb +5 -0
  42. data/lib/dor_indexing/workflow_fields.rb +63 -0
  43. data/lib/dor_indexing/workflow_solr_document.rb +93 -0
  44. data/lib/dor_indexing.rb +19 -0
  45. metadata +173 -0
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
class DorIndexing
  module Indexers
    # Builds the "identifiable" part of a Solr document: the titles of the governing
    # admin policy (APO), the metadata source, and the druid in its searchable forms.
    class IdentifiableIndexer
      CURRENT_CATALOG_TYPE = 'folio'

      # Cache of APO lookups, keyed by druid. Because this is a class variable it is
      # shared by every instance of this indexer (and of any subclass) until
      # reset_cache! is called.
      @@apo_hash = {} # rubocop:disable Style/ClassVars

      attr_reader :cocina, :cocina_repository

      # Clears out the cache of items. Used primarily in testing.
      def self.reset_cache!
        @@apo_hash = {} # rubocop:disable Style/ClassVars
      end

      # @param cocina the cocina object being indexed
      # @param cocina_repository object used to look up related objects (#find)
      #   and their administrative tags (#administrative_tags)
      def initialize(cocina:, cocina_repository:, **)
        @cocina = cocina
        @cocina_repository = cocina_repository
      end

      # @return [Hash] the partial solr document for identifiable concerns
      def to_solr
        solr_doc = {}
        add_apo_titles(solr_doc, cocina.administrative.hasAdminPolicy)

        unless cocina.is_a?(Cocina::Models::AdminPolicyWithMetadata)
          solr_doc['metadata_source_ssim'] = identity_metadata_sources
        end

        # This used to be added to the index by https://github.com/sul-dlss/dor-services/commit/11b80d249d19326ef591411ffeb634900e75c2c3
        # and was called dc_identifier_druid_tesim
        # It is used to search based on druid (with and without the "druid:" prefix).
        druid = cocina.externalIdentifier
        solr_doc['objectId_tesim'] = [druid, druid.delete_prefix('druid:')]
        solr_doc
      end

      # @return [Array<String>] the capitalized current catalog types, or ['DOR'] when the
      #   object has no catalog links of the current catalog type
      def identity_metadata_sources
        if cocina.identification.respond_to?(:catalogLinks) && !distinct_current_catalog_types.empty?
          distinct_current_catalog_types.map(&:capitalize)
        else
          ['DOR']
        end
      end

      private

      # @return [Array<String>] the distinct catalog types that match CURRENT_CATALOG_TYPE;
      #   this filters out e.g. "previous symphony", "previous folio"
      def distinct_current_catalog_types
        @distinct_current_catalog_types ||= begin
          catalog_types = cocina.identification.catalogLinks.map(&:catalog).uniq.sort
          catalog_types.select { |catalog_type| catalog_type == CURRENT_CATALOG_TYPE }
        end
      end

      # Adds the APO title both to the generic field and to the hydrus/non-hydrus specific one.
      # @param [Hash] solr_doc
      # @param [String] admin_policy_id
      def add_apo_titles(solr_doc, admin_policy_id)
        cached = populate_cache(admin_policy_id)
        apo_title = cached['related_obj_title']
        origin_field = cached['is_from_hydrus'] ? 'hydrus_apo_title' : 'nonhydrus_apo_title'

        ::Solrizer.insert_field(solr_doc, origin_field, apo_title, :symbol)
        ::Solrizer.insert_field(solr_doc, 'apo_title', apo_title, :symbol)
      end

      # populate cache if necessary
      # @param [String] rel_druid
      # @return [Hash] 'related_obj_title' and 'is_from_hydrus' for the given druid
      def populate_cache(rel_druid)
        @@apo_hash[rel_druid] ||= lookup_related_object(rel_druid)
      end

      # Fetches the related object and derives the values that get cached for it.
      # @param [String] rel_druid
      # @return [Hash] 'related_obj_title' and 'is_from_hydrus'
      def lookup_related_object(rel_druid)
        related_obj = cocina_repository.find(rel_druid)
        # APOs don't have projects, and since Hydrus is set to be retired, I don't want to
        # add the cocina property. Just check the tags service instead.
        from_hydrus = hydrus_tag?(rel_druid)
        related_title = Cocina::Models::Builders::TitleBuilder.build(related_obj.description.title)
        { 'related_obj_title' => related_title, 'is_from_hydrus' => from_hydrus }
      rescue CocinaRepository::RepositoryError
        # This may happen if the given APO or Collection does not exist (bad data);
        # the druid itself then stands in for the title.
        Honeybadger.notify("Bad association found on #{cocina.externalIdentifier}. #{rel_druid} could not be found")
        { 'related_obj_title' => rel_druid, 'is_from_hydrus' => false }
      end

      # @param [String] id
      # @return [Boolean] whether the administrative tags include the Hydrus project tag
      def hydrus_tag?(id)
        cocina_repository.administrative_tags(id).include?('Project : Hydrus')
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
class DorIndexing
  module Indexers
    # Indexes the identity metadata (object type, source id, barcode, catalog record ids, DOI)
    class IdentityMetadataIndexer
      attr_reader :cocina_object

      # @param cocina the cocina object being indexed
      def initialize(cocina:, **)
        @cocina_object = cocina
      end

      # @return [Hash] the partial solr document for identityMetadata; only the object
      #   type is indexed for admin policies and for objects without identification
      def to_solr
        type_only = { 'objectType_ssim' => [object_type] }
        return type_only if object_type == 'adminPolicy' || cocina_object.identification.nil?

        type_only.merge(identifier_fields)
      end

      private

      # @return [Hash] every identifier-derived solr field
      def identifier_fields
        {
          'dor_id_tesim' => [source_id_value, barcode, folio_instance_hrid, previous_ils_ids].flatten.compact,
          'identifier_ssim' => prefixed_identifiers,
          'identifier_tesim' => prefixed_identifiers,
          'barcode_id_ssim' => [barcode].compact,
          'source_id_ssi' => source_id,
          'source_id_text_nostem_i' => source_id,
          'folio_instance_hrid_ssim' => [folio_instance_hrid].compact,
          'doi_ssim' => [doi].compact
        }
      end

      # @return [String, nil] the full source id, e.g. "prefix:value"
      def source_id
        @source_id ||= cocina_object.identification.sourceId
      end

      # @return [String, nil] the part of the source id after the first colon
      def source_id_value
        @source_id_value ||= (source_id.split(':', 2)[1] if source_id)
      end

      # @return [String, nil] the barcode; never reported for collections
      def barcode
        @barcode ||= (cocina_object.identification.barcode unless object_type == 'collection')
      end

      # @return [String, nil] the DOI; only reported for items
      def doi
        @doi ||= (cocina_object.identification.doi if object_type == 'item')
      end

      # @return [Array] the catalog links, or an empty array when there are none
      def catalog_links
        Array(cocina_object.identification.catalogLinks)
      end

      # @return [String, nil] the record id of the first 'folio' catalog link
      def folio_instance_hrid
        @folio_instance_hrid ||= begin
          current_link = catalog_links.find { |link| link.catalog == 'folio' }
          current_link&.catalogRecordId
        end
      end

      # @return [Array<String>] the record ids of all 'previous folio' catalog links
      def previous_folio_instance_hrids
        @previous_folio_instance_hrids ||=
          catalog_links.select { |link| link.catalog == 'previous folio' }.filter_map(&:catalogRecordId)
      end

      # @return [Array<String>] record ids the object was previously known by
      def previous_ils_ids
        @previous_ils_ids ||= previous_folio_instance_hrids
      end

      # @return [String] 'adminPolicy', 'collection', 'agreement' or 'item'
      def object_type
        return 'adminPolicy' if cocina_object.is_a?(Cocina::Models::AdminPolicyWithMetadata)
        return 'collection' if cocina_object.is_a?(Cocina::Models::CollectionWithMetadata)
        return 'agreement' if cocina_object.type == Cocina::Models::ObjectType.agreement

        'item'
      end

      # @return [Array<String>] source id plus the barcode and folio id with their prefixes
      def prefixed_identifiers
        [
          source_id,
          ("barcode:#{barcode}" if barcode),
          ("folio:#{folio_instance_hrid}" if folio_instance_hrid)
        ].compact
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
class DorIndexing
  module Indexers
    # Indexes one process (step) of a workflow into a workflow solr document
    class ProcessIndexer
      ERROR_OMISSION = '... (continued)'
      private_constant :ERROR_OMISSION

      # The limit is expressed in BYTES, not characters, so the error message has to be
      # measured with String#bytesize.
      # see https://lucene.apache.org/core/7_3_1/core/org/apache/lucene/util/BytesRefHash.MaxBytesLengthExceededException.html
      MAX_ERROR_BYTES = 32_768 - 2
      private_constant :MAX_ERROR_BYTES

      # @param [WorkflowSolrDocument] solr_doc
      # @param [String] workflow_name
      # @param [Dor::Workflow::Response::Process] process
      def initialize(solr_doc:, workflow_name:, process:)
        @solr_doc = solr_doc
        @workflow_name = workflow_name
        @process = process
      end

      # Readers forwarded to the process. They are deliberately public: they were
      # previously generated with `delegate`, whose methods are public even when
      # declared after `private`, so callers may rely on them.
      def status
        process.status
      end

      def name
        process.name
      end

      def state
        process.state
      end

      def error_message
        process.error_message
      end

      def datetime
        process.datetime
      end

      # Records this process on the solr document passed to the constructor.
      # Nothing is recorded when the process has no status.
      # @return the value returned by solr_doc.add_swp, or nil when the process has no status
      def to_solr
        return unless status

        # add a record of the robot having operated on this item, so we can track robot activity
        solr_doc.add_process_time(workflow_name, name, Time.parse(datetime)) if time?

        index_error_message

        # workflow name, process status then process name
        solr_doc.add_wsp("#{workflow_name}:#{status}", "#{workflow_name}:#{status}:#{name}")

        # workflow name, process name then process status
        solr_doc.add_wps("#{workflow_name}:#{name}", "#{workflow_name}:#{name}:#{status}")

        # process status, workflow name then process name
        solr_doc.add_swp(status.to_s, "#{status}:#{workflow_name}", "#{status}:#{workflow_name}:#{name}")
      end

      private

      attr_reader :process, :workflow_name, :solr_doc

      # @return [Boolean] whether the process has a timestamp worth recording
      def time?
        datetime && %w[completed error].include?(status)
      end

      # index the error message without the druid so we hopefully get some overlap
      # truncate to avoid org.apache.lucene.util.BytesRefHash$MaxBytesLengthExceededException
      def index_error_message
        return unless error_message

        solr_doc.error = truncate_bytes("#{workflow_name}:#{name}:#{error_message}")
      end

      # Shortens text so that, including ERROR_OMISSION, it occupies at most MAX_ERROR_BYTES bytes.
      # @param [String] text
      # @return [String] text itself when it already fits, otherwise a shortened copy ending in ERROR_OMISSION
      def truncate_bytes(text)
        return text if text.bytesize <= MAX_ERROR_BYTES

        # byteslice may cut a multibyte character in half; scrub drops the dangling bytes
        kept = text.byteslice(0, MAX_ERROR_BYTES - ERROR_OMISSION.bytesize).scrub('')
        "#{kept}#{ERROR_OMISSION}"
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
class DorIndexing
  module Indexers
    # Indexes the object's release tags
    class ReleasableIndexer
      attr_reader :cocina, :parent_collections

      # @param cocina the cocina object being indexed
      # @param [Array] parent_collections the cocina collections the object is a member of
      def initialize(cocina:, parent_collections:, **)
        @cocina = cocina
        @parent_collections = parent_collections
      end

      # @return [Hash] the partial solr document for releasable concerns; empty when
      #   the object is not released anywhere
      def to_solr
        return {} if tags.empty?

        {
          'released_to_ssim' => tags.map(&:to).uniq,
          'released_to_searchworks_dttsi' => release_date_for('Searchworks'),
          'released_to_earthworks_dttsi' => release_date_for('Earthworks')
        }.compact
      end

      private

      # @param [String] target the release target, e.g. 'Searchworks'
      # @return [String, nil] the ISO 8601 UTC date of the effective release to the target
      def release_date_for(target)
        tags.find { |tag| tag.to == target }&.date&.utc&.iso8601
      end

      # Item tags have precedence over collection tags, so if the collection is release=true
      # and the item is release=false, then it is not released
      # @return [Array] the effective tag per target, limited to those with release=true
      def tags
        @tags ||= tags_from_collection.merge(tags_from_item).values.select(&:release)
      end

      # The most recent 'collection' tag per target across ALL parent collections, so the
      # outcome does not depend on the order in which the collections are supplied.
      # @return [Hash] release target => tag
      def tags_from_collection
        collection_tags = parent_collections.flat_map do |collection|
          Array(collection.administrative.releaseTags).select { |tag| tag.what == 'collection' }
        end
        latest_by_target(collection_tags)
      end

      # @return [Hash] release target => most recent tag set on the object itself
      def tags_from_item
        latest_by_target(Array(cocina.administrative.releaseTags))
      end

      # @param [Array] release_tags
      # @return [Hash] release target => the tag with the latest date for that target
      def latest_by_target(release_tags)
        release_tags.group_by(&:to).transform_values do |tags_for_target|
          tags_for_target.max_by(&:date)
        end
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
class DorIndexing
  module Indexers
    # Indexes the rights metadata
    class RightsMetadataIndexer
      # Short codes indexed in place of the license URIs we know about.
      LICENSE_CODE = {
        'http://cocina.sul.stanford.edu/licenses/none' => 'none', # Only used in some legacy ETDs.
        'https://creativecommons.org/licenses/by/3.0/legalcode' => 'CC-BY-3.0',
        'https://creativecommons.org/licenses/by-sa/3.0/legalcode' => 'CC-BY-SA-3.0',
        'https://creativecommons.org/licenses/by-nd/3.0/legalcode' => 'CC-BY-ND-3.0',
        'https://creativecommons.org/licenses/by-nc/3.0/legalcode' => 'CC-BY-NC-3.0',
        'https://creativecommons.org/licenses/by-nc-sa/3.0/legalcode' => 'CC-BY-NC-SA-3.0',
        'https://creativecommons.org/licenses/by-nc-nd/3.0/legalcode' => 'CC-BY-NC-ND-3.0',
        'https://creativecommons.org/licenses/by/4.0/legalcode' => 'CC-BY-4.0',
        'https://creativecommons.org/licenses/by-sa/4.0/legalcode' => 'CC-BY-SA-4.0',
        'https://creativecommons.org/licenses/by-nd/4.0/legalcode' => 'CC-BY-ND-4.0',
        'https://creativecommons.org/licenses/by-nc/4.0/legalcode' => 'CC-BY-NC-4.0',
        'https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode' => 'CC-BY-NC-SA-4.0',
        'https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode' => 'CC-BY-NC-ND-4.0',
        'https://creativecommons.org/publicdomain/zero/1.0/legalcode' => 'CC0-1.0',
        'https://creativecommons.org/publicdomain/mark/1.0/' => 'PDM',
        'https://opendatacommons.org/licenses/pddl/1-0/' => 'PDDL-1.0',
        'https://opendatacommons.org/licenses/by/1-0/' => 'ODC-By-1.0',
        'https://opendatacommons.org/licenses/odbl/1-0/' => 'ODbL-1.0'
      }.freeze

      attr_reader :cocina

      # @param cocina the cocina object being indexed
      def initialize(cocina:, **)
        @cocina = cocina
      end

      # @return [Hash] the partial solr document for rightsMetadata, without the
      #   fields that have no value
      def to_solr
        access = cocina.access
        solr_doc = {
          'copyright_ssim' => access.copyright,
          'use_statement_ssim' => access.useAndReproductionStatement,
          'use_license_machine_ssi' => license,
          'rights_descriptions_ssim' => rights_description
        }
        solr_doc.compact
      end

      private

      # @return the rights description built by the collection or DRO builder,
      #   depending on whether the object is a collection
      def rights_description
        if cocina.collection?
          DorIndexing::Builders::CollectionRightsDescriptionBuilder.build(cocina)
        else
          Cocina::Models::Builders::DroRightsDescriptionBuilder.build(cocina)
        end
      end

      # @return [String] the code if we've defined one, or the URI if we haven't.
      def license
        license_uri = cocina.access.license
        LICENSE_CODE.fetch(license_uri) { license_uri }
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
class DorIndexing
  module Indexers
    # Indexes the role metadata
    class RoleMetadataIndexer
      attr_reader :cocina

      # @param cocina the cocina object being indexed
      def initialize(cocina:, **)
        @cocina = cocina
      end

      # @return [Hash] the partial solr document for roleMetadata: per role, the
      #   workgroup members and the sunetid members, plus every member of the
      #   'dor-apo-manager' role as the register permissions
      def to_solr
        solr_doc = {}
        Array(cocina.administrative.roles).each do |role|
          role_name = role.name
          solr_doc['apo_register_permissions_ssim'] = serialize(role.members) if role_name == 'dor-apo-manager'
          solr_doc["apo_role_#{role_name}_ssim"] = serialize(members_of_type(role, 'workgroup'))
          solr_doc["apo_role_person_#{role_name}_ssim"] = serialize(members_of_type(role, 'sunetid'))
        end
        solr_doc
      end

      private

      # @param role a role with #members
      # @param [String] member_type
      # @return [Array] the members of the role having the given type
      def members_of_type(role, member_type)
        role.members.select { |member| member.type == member_type }
      end

      # @param [Array] members
      # @return [Array<String>] each member as "type:identifier"
      def serialize(members)
        members.map do |member|
          [member.type, member.identifier].join(':')
        end
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
class DorIndexing
  module Indexers
    # Indexes the object's position in a single workflow
    class WorkflowIndexer
      # @param [Workflow::Response::Workflow] workflow the workflow document to index
      # @param workflow_client client used to fetch the workflow template
      def initialize(workflow:, workflow_client:)
        @workflow = workflow
        @workflow_client = workflow_client
      end

      # @return [String] the name of the workflow being indexed (read from the workflow)
      def workflow_name
        workflow.workflow_name
      end

      # @return [WorkflowSolrDocument] the partial solr document for the workflow document
      def to_solr
        WorkflowSolrDocument.new do |solr_doc|
          solr_doc.name = workflow_name

          processes.each do |process|
            ProcessIndexer.new(solr_doc: solr_doc, workflow_name: workflow_name, process: process).to_solr
          end

          # The error count is used by the Report class in Argo
          error_count = processes.count { |process| process.status == 'error' }
          solr_doc.status = "#{workflow_name}|#{workflow_status}|#{error_count}"
        end
      end

      private

      attr_reader :workflow, :workflow_client

      # @return [Array<String>] the process names listed in the workflow template
      def definition_process_names
        @definition_process_names ||=
          workflow_client.workflow_template(workflow_name)['processes'].map { |definition| definition['name'] }
      end

      # @return [Array] for each template process, the workflow's process for the recent version
      def processes
        @processes ||= definition_process_names.map { |process_name| workflow.process_for_recent_version(name: process_name) }
      end

      # @return [String] 'completed' when the workflow is complete, 'active' otherwise
      def workflow_status
        return 'completed' if workflow.complete?

        'active'
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
class DorIndexing
  module Indexers
    # Indexes the object's position in all of its workflows
    class WorkflowsIndexer
      attr_reader :id

      # @param [String] id identifier of the object whose workflows are indexed
      # @param workflow_client client used to fetch the object's workflows
      def initialize(id:, workflow_client:, **)
        @id = id
        @workflow_client = workflow_client
      end

      # @return [Hash] the partial solr document for workflow concerns, i.e. the
      #   documents of the individual workflows merged into one
      def to_solr
        combined = WorkflowSolrDocument.new do |combined_doc|
          workflows.each do |workflow|
            workflow_doc = WorkflowIndexer.new(workflow: workflow, workflow_client: workflow_client).to_solr
            combined_doc.merge!(workflow_doc)
          end
        end
        combined.to_h
      end

      private

      attr_reader :workflow_client

      # @return [Array<Workflow::Response::Workflow>]
      def workflows
        all_workflows.workflows
      end

      # TODO: remove Dor::Workflow::Document
      # @return [Workflow::Response::Workflows]
      def all_workflows
        @all_workflows ||= workflow_client.workflow_routes.all_workflows(pid: id)
      end
    end
  end
end