dor_indexing 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +2 -0
  3. data/.rubocop.yml +355 -0
  4. data/Gemfile +16 -0
  5. data/Gemfile.lock +218 -0
  6. data/README.md +33 -0
  7. data/Rakefile +11 -0
  8. data/dor_indexing.gemspec +40 -0
  9. data/lib/dor_indexing/builders/all_search_text_builder.rb +58 -0
  10. data/lib/dor_indexing/builders/author_builder.rb +31 -0
  11. data/lib/dor_indexing/builders/collection_rights_description_builder.rb +29 -0
  12. data/lib/dor_indexing/builders/document_builder.rb +106 -0
  13. data/lib/dor_indexing/builders/event_date_builder.rb +71 -0
  14. data/lib/dor_indexing/builders/event_place_builder.rb +73 -0
  15. data/lib/dor_indexing/builders/geographic_builder.rb +82 -0
  16. data/lib/dor_indexing/builders/name_builder.rb +70 -0
  17. data/lib/dor_indexing/builders/orcid_builder.rb +62 -0
  18. data/lib/dor_indexing/builders/publisher_name_builder.rb +53 -0
  19. data/lib/dor_indexing/builders/temporal_builder.rb +56 -0
  20. data/lib/dor_indexing/builders/topic_builder.rb +96 -0
  21. data/lib/dor_indexing/cocina_repository.rb +24 -0
  22. data/lib/dor_indexing/indexers/administrative_tag_indexer.rb +69 -0
  23. data/lib/dor_indexing/indexers/collection_title_indexer.rb +27 -0
  24. data/lib/dor_indexing/indexers/composite_indexer.rb +36 -0
  25. data/lib/dor_indexing/indexers/content_metadata_indexer.rb +69 -0
  26. data/lib/dor_indexing/indexers/data_indexer.rb +66 -0
  27. data/lib/dor_indexing/indexers/default_object_rights_indexer.rb +36 -0
  28. data/lib/dor_indexing/indexers/descriptive_metadata_indexer.rb +226 -0
  29. data/lib/dor_indexing/indexers/embargo_metadata_indexer.rb +32 -0
  30. data/lib/dor_indexing/indexers/identifiable_indexer.rb +92 -0
  31. data/lib/dor_indexing/indexers/identity_metadata_indexer.rb +85 -0
  32. data/lib/dor_indexing/indexers/process_indexer.rb +63 -0
  33. data/lib/dor_indexing/indexers/releasable_indexer.rb +62 -0
  34. data/lib/dor_indexing/indexers/rights_metadata_indexer.rb +59 -0
  35. data/lib/dor_indexing/indexers/role_metadata_indexer.rb +31 -0
  36. data/lib/dor_indexing/indexers/workflow_indexer.rb +51 -0
  37. data/lib/dor_indexing/indexers/workflows_indexer.rb +40 -0
  38. data/lib/dor_indexing/marc_country.rb +359 -0
  39. data/lib/dor_indexing/selectors/event_selector.rb +112 -0
  40. data/lib/dor_indexing/selectors/pub_year_selector.rb +119 -0
  41. data/lib/dor_indexing/version.rb +5 -0
  42. data/lib/dor_indexing/workflow_fields.rb +63 -0
  43. data/lib/dor_indexing/workflow_solr_document.rb +93 -0
  44. data/lib/dor_indexing.rb +19 -0
  45. metadata +173 -0
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
class DorIndexing
  module Indexers
    # Builds the partial Solr document for the "identifiable" concerns of an object:
    # the title of its admin policy (APO), its metadata sources, and its druid in
    # both prefixed and bare form.
    class IdentifiableIndexer
      # The only catalog type that counts as a current metadata source.
      CURRENT_CATALOG_TYPE = 'folio'

      # Cache of related-object lookups, keyed by druid. This is a class variable, so
      # every instance of this indexer reads from and writes to the same hash.
      @@apo_hash = {} # rubocop:disable Style/ClassVars

      attr_reader :cocina, :cocina_repository

      # Clears out the cache of items. Used primarily in testing.
      def self.reset_cache!
        @@apo_hash = {} # rubocop:disable Style/ClassVars
      end

      # @param cocina the cocina object being indexed
      # @param cocina_repository the object used to look up related objects and their tags
      def initialize(cocina:, cocina_repository:, **)
        @cocina = cocina
        @cocina_repository = cocina_repository
      end

      # @return [Hash] the partial solr document for identifiable concerns
      def to_solr
        solr_doc = {}
        add_apo_titles(solr_doc, cocina.administrative.hasAdminPolicy)

        solr_doc['metadata_source_ssim'] = identity_metadata_sources unless cocina.is_a?(Cocina::Models::AdminPolicyWithMetadata)

        # This used to be added to the index by https://github.com/sul-dlss/dor-services/commit/11b80d249d19326ef591411ffeb634900e75c2c3
        # and was called dc_identifier_druid_tesim
        # It is used to search based on druid.
        druid = cocina.externalIdentifier
        solr_doc['objectId_tesim'] = [druid, druid.delete_prefix('druid:')]
        solr_doc
      end

      # @return [Array<String>] calculated values for Solr index; ['DOR'] when the object
      #   has no catalog links of the current catalog type
      def identity_metadata_sources
        return ['DOR'] unless cocina.identification.respond_to?(:catalogLinks)
        return ['DOR'] if distinct_current_catalog_types.empty?

        distinct_current_catalog_types.map(&:capitalize)
      end

      private

      # @return [Array<String>] the distinct catalog types that equal CURRENT_CATALOG_TYPE
      def distinct_current_catalog_types
        @distinct_current_catalog_types ||= begin
          catalog_types = cocina.identification.catalogLinks.map(&:catalog).uniq.sort
          # Filter out e.g. "previous symphony", "previous folio"
          catalog_types.select { |catalog_type| catalog_type == CURRENT_CATALOG_TYPE }
        end
      end

      # Adds the APO title to the generic field and to either the hydrus or the
      # non-hydrus specific field.
      # @param [Hash] solr_doc
      # @param [String] admin_policy_id
      def add_apo_titles(solr_doc, admin_policy_id)
        row = populate_cache(admin_policy_id)
        title = row['related_obj_title']
        origin_field = row['is_from_hydrus'] ? 'hydrus_apo_title' : 'nonhydrus_apo_title'

        ::Solrizer.insert_field(solr_doc, origin_field, title, :symbol)
        ::Solrizer.insert_field(solr_doc, 'apo_title', title, :symbol)
      end

      # populate cache if necessary
      # @param [String] rel_druid
      # @return [Hash] the cached row with 'related_obj_title' and 'is_from_hydrus' keys
      def populate_cache(rel_druid)
        @@apo_hash[rel_druid] ||= lookup_related_object(rel_druid)
      end

      # Fetches the related object and derives its title and hydrus flag. When the
      # repository raises a RepositoryError, the failure is reported and the druid
      # itself stands in for the title.
      # @param [String] rel_druid
      # @return [Hash]
      def lookup_related_object(rel_druid)
        related_obj = cocina_repository.find(rel_druid)
        # APOs don't have projects, and since Hydrus is set to be retired, I don't want to
        # add the cocina property. Just check the tags service instead.
        is_from_hydrus = hydrus_tag?(rel_druid)
        title = Cocina::Models::Builders::TitleBuilder.build(related_obj.description.title)
        { 'related_obj_title' => title, 'is_from_hydrus' => is_from_hydrus }
      rescue CocinaRepository::RepositoryError
        Honeybadger.notify("Bad association found on #{cocina.externalIdentifier}. #{rel_druid} could not be found")
        # This may happen if the given APO or Collection does not exist (bad data)
        { 'related_obj_title' => rel_druid, 'is_from_hydrus' => false }
      end

      # @return [Boolean] true when the administrative tags of the given id include the Hydrus project tag
      def hydrus_tag?(id)
        cocina_repository.administrative_tags(id).include?('Project : Hydrus')
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
class DorIndexing
  module Indexers
    # Builds the partial Solr document holding an object's type and its identifiers
    # (source id, barcode, folio instance HRID, DOI).
    class IdentityMetadataIndexer
      attr_reader :cocina_object

      # @param cocina the cocina object being indexed
      def initialize(cocina:, **)
        @cocina_object = cocina
      end

      # Admin policies, and objects whose identification is nil, only get the object type.
      # @return [Hash] the partial solr document for identityMetadata
      # rubocop:disable Metrics/AbcSize
      # rubocop:disable Metrics/MethodLength
      def to_solr
        type_only = { 'objectType_ssim' => [object_type] }
        return type_only if object_type == 'adminPolicy' || cocina_object.identification.nil?

        type_only.merge(
          'dor_id_tesim' => [source_id_value, barcode, folio_instance_hrid, previous_ils_ids].flatten.compact,
          'identifier_ssim' => prefixed_identifiers,
          'identifier_tesim' => prefixed_identifiers,
          'barcode_id_ssim' => [barcode].compact,
          'source_id_ssi' => source_id,
          'source_id_text_nostem_i' => source_id,
          'folio_instance_hrid_ssim' => [folio_instance_hrid].compact,
          'doi_ssim' => [doi].compact
        )
      end
      # rubocop:enable Metrics/AbcSize
      # rubocop:enable Metrics/MethodLength

      private

      # Shorthand for the identification section of the cocina object.
      def identification
        cocina_object.identification
      end

      # @return [Array] the catalog links, or an empty array when there are none
      def catalog_links
        Array(identification.catalogLinks)
      end

      # @return [String, nil] the full source id, e.g. "prefix:value"
      def source_id
        @source_id ||= identification.sourceId
      end

      # @return [String, nil] the part of the source id after the first colon
      def source_id_value
        @source_id_value ||= (source_id.split(':', 2)[1] if source_id)
      end

      # Collections never report a barcode.
      def barcode
        @barcode ||= (identification.barcode unless object_type == 'collection')
      end

      # Only objects typed as 'item' report a DOI.
      def doi
        @doi ||= (identification.doi if object_type == 'item')
      end

      # @return [String, nil] the record id of the first catalog link whose catalog is 'folio'
      def folio_instance_hrid
        @folio_instance_hrid ||= catalog_links.find { |link| link.catalog == 'folio' }&.catalogRecordId
      end

      # @return [Array<String>] the record ids of all 'previous folio' catalog links
      def previous_folio_instance_hrids
        @previous_folio_instance_hrids ||=
          catalog_links.each_with_object([]) do |link, hrids|
            next unless link.catalog == 'previous folio'

            hrid = link.catalogRecordId
            hrids << hrid if hrid
          end
      end

      def previous_ils_ids
        @previous_ils_ids ||= previous_folio_instance_hrids
      end

      # @return [String] one of 'adminPolicy', 'collection', 'agreement' or 'item'
      def object_type
        case cocina_object
        when Cocina::Models::AdminPolicyWithMetadata then 'adminPolicy'
        when Cocina::Models::CollectionWithMetadata then 'collection'
        else
          cocina_object.type == Cocina::Models::ObjectType.agreement ? 'agreement' : 'item'
        end
      end

      # @return [Array<String>] the identifiers that are present, each carrying its prefix
      def prefixed_identifiers
        [
          source_id,
          (barcode && "barcode:#{barcode}"),
          (folio_instance_hrid && "folio:#{folio_instance_hrid}")
        ].select(&:itself)
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
class DorIndexing
  module Indexers
    # Indexes a single process (step) of a workflow into the given workflow solr document.
    class ProcessIndexer
      # Marker appended to an error message that had to be cut short.
      ERROR_OMISSION = '... (continued)'
      private_constant :ERROR_OMISSION

      # Budget for the indexed error message, in BYTES: the exception below is raised on
      # the byte length of a term, not on its character count.
      # see https://lucene.apache.org/core/7_3_1/core/org/apache/lucene/util/BytesRefHash.MaxBytesLengthExceededException.html
      MAX_ERROR_LENGTH = 32_768 - 2 - ERROR_OMISSION.length
      private_constant :MAX_ERROR_LENGTH

      # @param [WorkflowSolrDocument] solr_doc
      # @param [String] workflow_name
      # @param [Dor::Workflow::Response::Process] process
      def initialize(solr_doc:, workflow_name:, process:)
        @solr_doc = solr_doc
        @workflow_name = workflow_name
        @process = process
      end

      # Records this process on the solr document passed to the constructor. Does nothing
      # when the process has no status.
      # @return [Object, nil] nil when the process has no status; otherwise the result of
      #   the final solr_doc.add_swp call
      # rubocop:disable Metrics/AbcSize
      def to_solr
        return unless status

        # add a record of the robot having operated on this item, so we can track robot activity
        solr_doc.add_process_time(workflow_name, name, Time.parse(process.datetime)) if time?

        index_error_message

        # workflow name, process status then process name
        solr_doc.add_wsp("#{workflow_name}:#{status}", "#{workflow_name}:#{status}:#{name}")

        # workflow name, process name then process status
        solr_doc.add_wps("#{workflow_name}:#{name}", "#{workflow_name}:#{name}:#{status}")

        # process status, workflow name then process name
        solr_doc.add_swp(process.status.to_s, "#{status}:#{workflow_name}", "#{status}:#{workflow_name}:#{name}")
      end
      # rubocop:enable Metrics/AbcSize

      private

      attr_reader :process, :workflow_name, :solr_doc

      delegate :status, :name, :state, :error_message, :datetime, to: :process

      # @return [Boolean] whether a process time should be recorded: the process has a
      #   datetime and is either completed or in error
      def time?
        datetime && (status == 'completed' || status == 'error')
      end

      # index the error message without the druid so we hopefully get some overlap
      # truncate to avoid org.apache.lucene.util.BytesRefHash$MaxBytesLengthExceededException
      #
      # The cut is made on bytes rather than characters because a message containing
      # multi-byte characters can be under the character budget and still too many bytes.
      # For ASCII-only messages the result matches a character-based cut at the same size.
      def index_error_message
        return unless error_message

        solr_doc.error = "#{workflow_name}:#{name}:#{error_message}".truncate_bytes(MAX_ERROR_LENGTH, omission: ERROR_OMISSION)
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
class DorIndexing
  module Indexers
    # Builds the partial Solr document describing where an object is released to,
    # based on its own release tags and those of its parent collections.
    class ReleasableIndexer
      attr_reader :cocina, :parent_collections

      # @param cocina the cocina object being indexed
      # @param parent_collections the collections whose collection-level release tags apply
      def initialize(cocina:, parent_collections:, **)
        @cocina = cocina
        @parent_collections = parent_collections
      end

      # @return [Hash] the partial solr document for releasable concerns; empty when
      #   nothing is released, and without date fields for targets that are not released
      def to_solr
        return {} if tags.empty?

        {
          'released_to_ssim' => tags.map(&:to).uniq,
          'released_to_searchworks_dttsi' => release_date_for('Searchworks'),
          'released_to_earthworks_dttsi' => release_date_for('Earthworks')
        }.compact
      end

      private

      # @param [String] target the release target name
      # @return [String, nil] the ISO 8601 UTC date of the effective release tag for the
      #   target, or nil when the object is not released there
      def release_date_for(target)
        tags.find { |tag| tag.to == target }&.date&.utc&.iso8601
      end

      # Item tags have precedence over collection tags, so if the collection is release=true
      # and the item is release=false, then it is not released
      # @return [Array] the effective tags (one per target) whose release flag is set
      def tags
        @tags ||= begin
          effective_by_target = tags_from_collection.merge(tags_from_item)
          effective_by_target.values.select(&:release)
        end
      end

      # Only tags whose `what` is 'collection' are taken from the parent collections;
      # for a target tagged by several collections, the later collection's tag is kept.
      # @return [Hash] target name => most recent collection-level tag
      def tags_from_collection
        parent_collections.each_with_object({}) do |collection, result|
          collection_level = Array(collection.administrative.releaseTags).select { |tag| tag.what == 'collection' }
          result.merge!(latest_by_target(collection_level))
        end
      end

      # @return [Hash] target name => most recent tag on the item itself
      def tags_from_item
        latest_by_target(released_for)
      end

      # @param [Array] release_tags
      # @return [Hash] target name => the tag with the greatest date for that target
      def latest_by_target(release_tags)
        release_tags.group_by(&:to).transform_values do |releases_for_project|
          releases_for_project.max_by(&:date)
        end
      end

      # @return [Array] the release tags on the object itself (empty when there are none)
      def released_for
        Array(cocina.administrative.releaseTags)
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
class DorIndexing
  module Indexers
    # Builds the partial Solr document for an object's rights: copyright, use statement,
    # license code and rights descriptions.
    class RightsMetadataIndexer
      # Maps known license URIs to the short code that is indexed in their place.
      LICENSE_CODE = {
        'http://cocina.sul.stanford.edu/licenses/none' => 'none', # Only used in some legacy ETDs.
        'https://creativecommons.org/licenses/by/3.0/legalcode' => 'CC-BY-3.0',
        'https://creativecommons.org/licenses/by-sa/3.0/legalcode' => 'CC-BY-SA-3.0',
        'https://creativecommons.org/licenses/by-nd/3.0/legalcode' => 'CC-BY-ND-3.0',
        'https://creativecommons.org/licenses/by-nc/3.0/legalcode' => 'CC-BY-NC-3.0',
        'https://creativecommons.org/licenses/by-nc-sa/3.0/legalcode' => 'CC-BY-NC-SA-3.0',
        'https://creativecommons.org/licenses/by-nc-nd/3.0/legalcode' => 'CC-BY-NC-ND-3.0',
        'https://creativecommons.org/licenses/by/4.0/legalcode' => 'CC-BY-4.0',
        'https://creativecommons.org/licenses/by-sa/4.0/legalcode' => 'CC-BY-SA-4.0',
        'https://creativecommons.org/licenses/by-nd/4.0/legalcode' => 'CC-BY-ND-4.0',
        'https://creativecommons.org/licenses/by-nc/4.0/legalcode' => 'CC-BY-NC-4.0',
        'https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode' => 'CC-BY-NC-SA-4.0',
        'https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode' => 'CC-BY-NC-ND-4.0',
        'https://creativecommons.org/publicdomain/zero/1.0/legalcode' => 'CC0-1.0',
        'https://creativecommons.org/publicdomain/mark/1.0/' => 'PDM',
        'https://opendatacommons.org/licenses/pddl/1-0/' => 'PDDL-1.0',
        'https://opendatacommons.org/licenses/by/1-0/' => 'ODC-By-1.0',
        'https://opendatacommons.org/licenses/odbl/1-0/' => 'ODbL-1.0'
      }.freeze

      attr_reader :cocina

      # @param cocina the cocina object being indexed
      def initialize(cocina:, **)
        @cocina = cocina
      end

      # @return [Hash] the partial solr document for rightsMetadata; fields whose
      #   value is nil are left out
      def to_solr
        access = cocina.access
        solr_doc = {
          'copyright_ssim' => access.copyright,
          'use_statement_ssim' => access.useAndReproductionStatement,
          'use_license_machine_ssi' => license,
          'rights_descriptions_ssim' => rights_description
        }
        solr_doc.compact
      end

      private

      # Collections and other objects use different rights description builders.
      # @return the rights description produced by the matching builder
      def rights_description
        builder =
          if cocina.collection?
            DorIndexing::Builders::CollectionRightsDescriptionBuilder
          else
            Cocina::Models::Builders::DroRightsDescriptionBuilder
          end
        builder.build(cocina)
      end

      # @return [String] the code if we've defined one, or the URI if we haven't.
      def license
        uri = cocina.access.license
        LICENSE_CODE.fetch(uri) { uri }
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
class DorIndexing
  module Indexers
    # Builds the partial Solr document listing, per role, the workgroups and people
    # that hold it.
    class RoleMetadataIndexer
      attr_reader :cocina

      # @param cocina the cocina object being indexed
      def initialize(cocina:, **)
        @cocina = cocina
      end

      # Every role yields a workgroup field and a person field. The 'dor-apo-manager'
      # role additionally yields the register-permissions field with all of its members.
      # @return [Hash] the partial solr document for roleMetadata
      # rubocop:disable Metrics/AbcSize
      def to_solr
        solr_doc = {}
        Array(cocina.administrative.roles).each do |role|
          role_name = role.name
          solr_doc['apo_register_permissions_ssim'] = serialize(role.members) if role_name == 'dor-apo-manager'
          solr_doc["apo_role_#{role_name}_ssim"] = serialize(members_of_type(role, 'workgroup'))
          solr_doc["apo_role_person_#{role_name}_ssim"] = serialize(members_of_type(role, 'sunetid'))
        end
        solr_doc
      end
      # rubocop:enable Metrics/AbcSize

      private

      # @param role the role whose members are filtered
      # @param [String] member_type
      # @return [Array] the members of the role that have the given type
      def members_of_type(role, member_type)
        role.members.select { |member| member.type == member_type }
      end

      # @param [Array] members
      # @return [Array<String>] each member rendered as "type:identifier"
      def serialize(members)
        members.map do |member|
          [member.type, member.identifier].join(':')
        end
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
class DorIndexing
  module Indexers
    # Indexes the object's position in a single workflow
    class WorkflowIndexer
      # @param [Workflow::Response::Workflow] workflow the workflow document to index
      # @param workflow_client the client used to fetch the workflow's template
      def initialize(workflow:, workflow_client:)
        @workflow = workflow
        @workflow_client = workflow_client
      end

      # Indexes every process named in the workflow template and records the overall
      # workflow status as "name|status|error count".
      # @return [WorkflowSolrDocument] the solr document for this workflow
      def to_solr
        WorkflowSolrDocument.new do |solr_doc|
          solr_doc.name = workflow_name

          processes.each do |process|
            ProcessIndexer.new(solr_doc:, workflow_name:, process:).to_solr
          end

          # The error count is used by the Report class in Argo
          error_count = processes.count { |process| process.status == 'error' }
          solr_doc.status = [workflow_name, workflow_status, error_count].join('|')
        end
      end

      private

      attr_reader :workflow, :workflow_client

      delegate :workflow_name, to: :workflow

      # @return [Array] the process names listed in the workflow's template
      def definition_process_names
        @definition_process_names ||=
          workflow_client.workflow_template(workflow_name)['processes'].pluck('name')
      end

      # @return [Array] for each template process name, the process the workflow
      #   returns for its most recent version
      def processes
        @processes ||= definition_process_names.map do |name|
          workflow.process_for_recent_version(name:)
        end
      end

      # @return [String] 'completed' when the workflow reports itself complete, else 'active'
      def workflow_status
        return 'completed' if workflow.complete?

        'active'
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
class DorIndexing
  module Indexers
    # Indexes the object's position in all of its workflows
    class WorkflowsIndexer
      attr_reader :id

      # @param [String] id the identifier passed to the workflow client as `pid`
      # @param workflow_client the client used to fetch the object's workflows
      def initialize(id:, workflow_client:, **)
        @id = id
        @workflow_client = workflow_client
      end

      # Indexes each workflow separately and merges the results into one document.
      # @return [Hash] the partial solr document for workflow concerns
      def to_solr
        combined = WorkflowSolrDocument.new do |combined_doc|
          workflows.each do |wf|
            combined_doc.merge!(WorkflowIndexer.new(workflow: wf, workflow_client:).to_solr)
          end
        end
        combined.to_h
      end

      private

      attr_reader :workflow_client

      # @return [Array<Workflow::Response::Workflow>]
      def workflows
        all_workflows.workflows
      end

      # TODO: remove Dor::Workflow::Document
      # Fetched once per indexer instance and then reused.
      # @return [Workflow::Response::Workflows]
      def all_workflows
        @all_workflows ||= workflow_client.workflow_routes.all_workflows(pid: id)
      end
    end
  end
end