dor_indexing 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/.rubocop.yml +355 -0
- data/Gemfile +16 -0
- data/Gemfile.lock +218 -0
- data/README.md +33 -0
- data/Rakefile +11 -0
- data/dor_indexing.gemspec +40 -0
- data/lib/dor_indexing/builders/all_search_text_builder.rb +58 -0
- data/lib/dor_indexing/builders/author_builder.rb +31 -0
- data/lib/dor_indexing/builders/collection_rights_description_builder.rb +29 -0
- data/lib/dor_indexing/builders/document_builder.rb +106 -0
- data/lib/dor_indexing/builders/event_date_builder.rb +71 -0
- data/lib/dor_indexing/builders/event_place_builder.rb +73 -0
- data/lib/dor_indexing/builders/geographic_builder.rb +82 -0
- data/lib/dor_indexing/builders/name_builder.rb +70 -0
- data/lib/dor_indexing/builders/orcid_builder.rb +62 -0
- data/lib/dor_indexing/builders/publisher_name_builder.rb +53 -0
- data/lib/dor_indexing/builders/temporal_builder.rb +56 -0
- data/lib/dor_indexing/builders/topic_builder.rb +96 -0
- data/lib/dor_indexing/cocina_repository.rb +24 -0
- data/lib/dor_indexing/indexers/administrative_tag_indexer.rb +69 -0
- data/lib/dor_indexing/indexers/collection_title_indexer.rb +27 -0
- data/lib/dor_indexing/indexers/composite_indexer.rb +36 -0
- data/lib/dor_indexing/indexers/content_metadata_indexer.rb +69 -0
- data/lib/dor_indexing/indexers/data_indexer.rb +66 -0
- data/lib/dor_indexing/indexers/default_object_rights_indexer.rb +36 -0
- data/lib/dor_indexing/indexers/descriptive_metadata_indexer.rb +226 -0
- data/lib/dor_indexing/indexers/embargo_metadata_indexer.rb +32 -0
- data/lib/dor_indexing/indexers/identifiable_indexer.rb +92 -0
- data/lib/dor_indexing/indexers/identity_metadata_indexer.rb +85 -0
- data/lib/dor_indexing/indexers/process_indexer.rb +63 -0
- data/lib/dor_indexing/indexers/releasable_indexer.rb +62 -0
- data/lib/dor_indexing/indexers/rights_metadata_indexer.rb +59 -0
- data/lib/dor_indexing/indexers/role_metadata_indexer.rb +31 -0
- data/lib/dor_indexing/indexers/workflow_indexer.rb +51 -0
- data/lib/dor_indexing/indexers/workflows_indexer.rb +40 -0
- data/lib/dor_indexing/marc_country.rb +359 -0
- data/lib/dor_indexing/selectors/event_selector.rb +112 -0
- data/lib/dor_indexing/selectors/pub_year_selector.rb +119 -0
- data/lib/dor_indexing/version.rb +5 -0
- data/lib/dor_indexing/workflow_fields.rb +63 -0
- data/lib/dor_indexing/workflow_solr_document.rb +93 -0
- data/lib/dor_indexing.rb +19 -0
- metadata +173 -0
@@ -0,0 +1,96 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
|
4
|
+
module Builders
|
5
|
+
# Builds the topic fields for a solr document
|
6
|
+
class TopicBuilder
  # A run of spaces, commas, semicolons and backslashes at the end of the
  # string. `\Z` (unlike `$`) does not match at the end of an inner line, so
  # only the true tail of a multi-line value is trimmed; a single final
  # newline after the run is still tolerated.
  TRAILING_PUNCTUATION = /[ ,;\\]+\Z/
  private_constant :TRAILING_PUNCTUATION

  # Convenience entry point: builds an instance and runs it over the subjects.
  #
  # @param [Array] subjects
  # @param [String] filter can either be 'topic' or 'name'
  # @param [Boolean] remove_trailing_punctuation when true, trailing spaces,
  #   commas, semicolons and backslashes are stripped from each value
  # @return [Array] unique, non-nil values of the matching subjects
  def self.build(subjects, filter:, remove_trailing_punctuation: false)
    new(filter:, remove_trailing_punctuation:).build(subjects)
  end

  # @param [String] filter can either be 'topic' or 'name'
  # @param [Boolean] remove_trailing_punctuation
  def initialize(filter:, remove_trailing_punctuation:)
    @filter = filter
    @remove_trailing_punctuation = remove_trailing_punctuation
  end

  # @param [Array] subjects
  # @return [Array] unique, non-nil values of the matching subjects
  def build(subjects)
    topics(subjects).flat_map { |topic| flat_topic(topic) }.compact.uniq
  end

  private

  attr_reader :filter

  def remove_trailing_punctuation?
    @remove_trailing_punctuation
  end

  # Filter the subjects we are interested in.
  # Handles:
  #  parallelValue that contain structuredValue and the parallelValue has the type AND
  #  parallelValue that contain structuredValue each with their own type AND
  #  parallelValue that has a type conferred to the child AND
  #  structuredValue that contains structuredValue where the type can be at the higher or lower level.
  def topics(subjects)
    (
      subjects.flat_map { |subject| basic_value(subject) } +
      subjects.flat_map { |subject| structured_values(subject) } +
      parallel_subjects(subjects)
    ).compact
  end

  # Typed parallel subjects are kept whole when their type matches the filter;
  # untyped ones are searched recursively through their parallelValue.
  # The blocks yield nil for the non-applicable case; #topics compacts those away.
  # NOTE(review): an empty array is truthy, so subjects whose parallelValue is []
  # are selected here as well -- confirm that is intended.
  def parallel_subjects(subjects)
    parallels = subjects.select(&:parallelValue)
    parallels.flat_map { |subject| parallel_with_type(subject, subject.type) if subject.type } +
      parallels.flat_map { |subject| topics(subject.parallelValue) unless subject.type }
  end

  # Reduces a selected subject to its value(s), descending into parallelValue
  # when present and optionally trimming trailing punctuation.
  def flat_topic(value)
    if value.parallelValue.present?
      value.parallelValue.flat_map { |topic| flat_topic(topic) }
    elsif remove_trailing_punctuation?
      # space, comma, semicolon, and backslash are dropped from the end of the value
      Array(value.value&.sub(TRAILING_PUNCTUATION, ''))
    else
      Array(value.value)
    end
  end

  # @return the item itself when the parent's type matches the filter, otherwise nil
  def parallel_with_type(item, type_from_parent)
    return unless type_matches_filter?(type_from_parent)

    item
  end

  # For the 'name' filter, person and title subjects are rebuilt through the
  # name/title builders; any other subject is returned as-is when its type matches.
  def basic_value(subject)
    return create_fullname(subject) if filter == 'name' && subject.type == 'person'
    return create_title(subject) if filter == 'name' && subject.type == 'title'

    subject if type_matches_filter?(subject.type)
  end

  # Selects the structuredValue children whose type matches, then filters them
  # again through #topics so nested structures are handled.
  def structured_values(subject)
    selected = Array(subject.structuredValue).select { |child| type_matches_filter?(child.type) }

    topics(selected)
  end

  # Wraps each title string produced by the Cocina TitleBuilder in a DescriptiveValue.
  def create_title(title)
    titles = Cocina::Models::Builders::TitleBuilder.build([title], strategy: :all, add_punctuation: false)
    titles.map { |value| Cocina::Models::DescriptiveValue.new(value:) }
  end

  # Wraps each name string produced by NameBuilder in a DescriptiveValue.
  def create_fullname(name)
    names = NameBuilder.build([name], strategy: :all)
    names.map { |value| Cocina::Models::DescriptiveValue.new(value:) }
  end

  # The 'name' filter accepts several name-like types; any other filter must
  # equal the type exactly.
  def type_matches_filter?(type)
    (filter == 'name' && %w[person organization title occupation].include?(type)) ||
      type == filter
  end
end
|
95
|
+
end
|
96
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
|
4
|
+
# Interface for retrieving Cocina objects.
|
5
|
+
# In DSA, the concrete implementation backs this with CocinaObjectStore.
|
6
|
+
# In DIA, the concrete implementation backs this with Dor Services Client.
|
7
|
+
class CocinaRepository
  # Error that the method docs below say is raised when a lookup fails.
  RepositoryError = Class.new(StandardError)

  # Abstract: this base implementation always raises NotImplementedError.
  #
  # @param [String] druid
  # @return [Cocina::Models::DROWithMetadata,Cocina::Models::CollectionWithMetadata,Cocina::Models::AdminPolicyWithMetadata]
  # @raise [RepositoryError] if the object is not found or other error occurs
  def find(druid) = raise(NotImplementedError)

  # Abstract: this base implementation always raises NotImplementedError.
  #
  # @param [String] druid
  # @return [Array<String>] administrative tags
  # @raise [RepositoryError] if the object is not found or other error occurs
  def administrative_tags(druid) = raise(NotImplementedError)
end
|
24
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
|
4
|
+
module Indexers
|
5
|
+
# Index administrative tags for an object.
|
6
|
+
# NOTE: Most of this code was extracted from the dor-services gem:
|
7
|
+
# https://github.com/sul-dlss/dor-services/blob/v9.0.0/lib/dor/datastreams/identity_metadata_ds.rb#L196-L218
|
8
|
+
class AdministrativeTagIndexer
  TAG_PART_DELIMITER = ' : '
  SPECIAL_TAG_TYPES_TO_INDEX = ['Project', 'Registered By'].freeze

  attr_reader :id

  # @param [String] id
  # @param [Array<String>] administrative_tags tags whose parts are joined by TAG_PART_DELIMITER
  def initialize(id:, administrative_tags:, **)
    @id = id
    @administrative_tags = administrative_tags
  end

  # @return [Hash] the partial solr document for administrative tags
  # rubocop:disable Metrics/MethodLength
  # rubocop:disable Metrics/AbcSize
  # rubocop:disable Metrics/CyclomaticComplexity
  def to_solr
    skeleton = {
      'tag_ssim' => [],
      'tag_text_unstemmed_im' => [],
      'exploded_nonproject_tag_ssim' => []
    }

    administrative_tags.each_with_object(skeleton) do |tag, doc|
      raw_prefix, remainder = tag.split(TAG_PART_DELIMITER, 2)
      field_prefix = raw_prefix.downcase.strip.gsub(/\s/, '_')
      project_tag = field_prefix == 'project'

      doc['tag_ssim'] << tag # for facet and display
      doc['tag_text_unstemmed_im'] << tag # for search
      doc['exploded_nonproject_tag_ssim'].concat(exploded_tags_from(tag)) unless project_tag

      # Only the special tag types that actually carry a value get their own field.
      next unless SPECIAL_TAG_TYPES_TO_INDEX.include?(raw_prefix) && remainder

      tag_value = remainder.strip
      (doc["#{field_prefix}_tag_ssim"] ||= []) << tag_value
      (doc['exploded_project_tag_ssim'] ||= []).concat(exploded_tags_from(tag_value)) if project_tag
    end
  end
  # rubocop:enable Metrics/MethodLength
  # rubocop:enable Metrics/AbcSize
  # rubocop:enable Metrics/CyclomaticComplexity

  private

  attr_reader :administrative_tags

  # Expands a tag into each of its leading prefixes, ending with the full tag.
  # e.g. "A : B : C" becomes ["A", "A : B", "A : B : C"].
  def exploded_tags_from(tag)
    parts = tag.split(TAG_PART_DELIMITER)

    parts.each_index.map { |last| parts[0..last].join(TAG_PART_DELIMITER) }
  end
end
|
68
|
+
end
|
69
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
|
4
|
+
module Indexers
|
5
|
+
# Indexes the collection title
|
6
|
+
class CollectionTitleIndexer
  attr_reader :cocina, :parent_collections

  # @param cocina the object being indexed (stored; not read by #to_solr)
  # @param parent_collections objects responding to `description.title`
  def initialize(cocina:, parent_collections:, **)
    @cocina = cocina
    @parent_collections = parent_collections
  end

  # @return [Hash] the partial solr document holding the parent collection titles
  def to_solr
    parent_collections.each_with_object({}) do |collection, solr_doc|
      title = Cocina::Models::Builders::TitleBuilder.build(collection.description.title)

      # create/append collection_title_tesim and collection_title_ssim
      ::Solrizer.insert_field(solr_doc, 'collection_title', title, :stored_searchable, :symbol)
    end
  end
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
|
4
|
+
module Indexers
|
5
|
+
# Borrowed from https://github.com/samvera/valkyrie/blob/master/lib/valkyrie/persistence/solr/composite_indexer.rb
|
6
|
+
class CompositeIndexer
  attr_reader :indexers

  # @param [Array<Class>] indexers indexer classes; each must accept keyword
  #   arguments in `new` and its instances must respond to `to_solr`
  def initialize(*indexers)
    @indexers = indexers
  end

  # Instantiates every configured indexer with the same keyword arguments.
  #
  # @return [Instance]
  # @raise [ArgumentError] if any indexer cannot be built from the given keywords
  def new(**kwargs)
    Instance.new(indexers, **kwargs)
  end

  # Instance for a composite indexer
  class Instance
    attr_reader :indexers

    # The keyword rest parameter is named (not anonymous `**`) because it is
    # forwarded from inside a block, which Ruby 3.3.0 rejects as a syntax error
    # for anonymous parameters.
    #
    # @param [Array<Class>] indexers
    # @raise [ArgumentError] naming the indexer class that failed to initialize
    def initialize(indexers, **kwargs)
      @indexers = indexers.map do |indexer_class|
        indexer_class.new(**kwargs)
      rescue ArgumentError => e
        raise ArgumentError, "Unable to initialize #{indexer_class}. #{e.message}"
      end
    end

    # Later indexers win when two of them emit the same key.
    #
    # @return [Hash] the merged solr document for all the sub-indexers
    def to_solr
      indexers.map(&:to_solr).inject({}, &:merge)
    end
  end
end
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
|
4
|
+
module Indexers
|
5
|
+
# Indexes the content metadata
|
6
|
+
class ContentMetadataIndexer
  # Solr content type for each Cocina object type; any type not listed here
  # is indexed as 'file' (see #type).
  TYPES = {
    Cocina::Models::ObjectType.image => 'image',
    Cocina::Models::ObjectType.manuscript => 'image',
    Cocina::Models::ObjectType.book => 'book',
    Cocina::Models::ObjectType.map => 'map',
    Cocina::Models::ObjectType.three_dimensional => '3d',
    Cocina::Models::ObjectType.media => 'media',
    Cocina::Models::ObjectType.webarchive_seed => 'webarchive-seed',
    Cocina::Models::ObjectType.webarchive_binary => 'webarchive-binary',
    Cocina::Models::ObjectType.geo => 'geo',
    Cocina::Models::ObjectType.document => 'document'
  }.freeze

  attr_reader :cocina

  def initialize(cocina:, **)
    @cocina = cocina
  end

  # @return [Hash] the partial solr document for contentMetadata
  def to_solr
    all_files = files

    {
      'content_type_ssim' => type(cocina.type),
      'content_file_mimetypes_ssim' => all_files.map(&:hasMimeType).uniq,
      'content_file_count_itsi' => all_files.size,
      'shelved_content_file_count_itsi' => shelved_files.size,
      'resource_count_itsi' => file_sets.size,
      'preserved_size_dbtsi' => preserved_size, # double (trie) to support very large sizes
      'content_file_roles_ssim' => all_files.filter_map(&:use),
      # first_shelved_image is neither indexed nor multiple
      'first_shelved_image_ss' => first_shelved_image
    }
  end

  private

  # Filename of the first shelved file whose name ends with 'jp2', or nil.
  def first_shelved_image
    jp2 = shelved_files.find { |candidate| candidate.filename.end_with?('jp2') }
    jp2&.filename
  end

  # Files whose administrative.shelve flag is truthy.
  def shelved_files
    files.select { |candidate| candidate.administrative.shelve }
  end

  # Sum of the sizes of files flagged sdrPreserve; files without a size are skipped.
  def preserved_size
    files.filter_map { |candidate| candidate.size if candidate.administrative.sdrPreserve }.sum
  end

  # Every file of every file set, memoized.
  def files
    @files ||= file_sets.flat_map { |file_set| file_set.structural.contains }
  end

  # The object's file sets (empty when there is no structural metadata), memoized.
  def file_sets
    @file_sets ||= Array(cocina.structural&.contains)
  end

  # @return [String] the mapped content type, or 'file' when the type is not in TYPES
  def type(object_type)
    TYPES.fetch(object_type) { 'file' }
  end
end
|
68
|
+
end
|
69
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
|
4
|
+
module Indexers
|
5
|
+
# Indexing provided by ActiveFedora
|
6
|
+
class DataIndexer
  attr_reader :cocina, :workflow_client

  # @param cocina the Cocina object being indexed
  # @param workflow_client passed through to DorIndexing::WorkflowFields
  def initialize(cocina:, workflow_client:, **)
    @cocina = cocina
    @workflow_client = workflow_client
  end

  # @return [Hash] the partial solr document with string keys: basic object
  #   data merged with the workflow fields
  # rubocop:disable Metrics/AbcSize
  # rubocop:disable Metrics/MethodLength
  def to_solr
    {}.tap do |solr_doc|
      solr_doc[:id] = cocina.externalIdentifier
      solr_doc['current_version_isi'] = cocina.version # Argo Facet field "Version"
      solr_doc['obj_label_tesim'] = cocina.label

      solr_doc['modified_latest_dttsi'] = modified_latest
      solr_doc['created_at_dttsi'] = created_at

      # is_member_of_collection_ssim is used by dor-services-app for querying for members of a
      # collection and it is a facet in Argo
      solr_doc['is_member_of_collection_ssim'] = legacy_collections
      solr_doc['is_governed_by_ssim'] = legacy_apo # Argo facet

      # Used so that DSA can generate public XML whereas a constituent can find the virtual object it is part of.
      solr_doc['has_constituents_ssim'] = virtual_object_constituents
    end.merge(DorIndexing::WorkflowFields.for(druid: cocina.externalIdentifier, version: cocina.version, workflow_client:))
      .transform_keys(&:to_s) # the :id key above is a symbol; normalize every key to a string
  end
  # rubocop:enable Metrics/AbcSize
  # rubocop:enable Metrics/MethodLength

  # @return [String] the modification time as a UTC timestamp, e.g. "2023-05-02T03:30:00Z"
  def modified_latest
    solr_timestamp(cocina.modified)
  end

  # @return [String] the creation time as a UTC timestamp, e.g. "2023-01-02T03:04:05Z"
  def created_at
    solr_timestamp(cocina.created)
  end

  # @return [Array<String>] "info:fedora/"-prefixed collection ids; always empty
  #   for admin policies and collections
  def legacy_collections
    case cocina.type
    when Cocina::Models::ObjectType.admin_policy, Cocina::Models::ObjectType.collection
      []
    else
      Array(cocina.structural&.isMemberOf).map { |col_id| "info:fedora/#{col_id}" }
    end
  end

  # @return the members of the first hasMemberOrders entry, or nil when the
  #   object is not a DRO or has no member orders
  def virtual_object_constituents
    return unless cocina.dro?

    Array(cocina.structural&.hasMemberOrders).first&.members
  end

  # @return [String] the "info:fedora/"-prefixed admin policy id
  def legacy_apo
    "info:fedora/#{cocina.administrative.hasAdminPolicy}"
  end

  private

  # The format string ends in a literal "Z", so the value is shifted to a zero
  # offset first; otherwise a timestamp carrying another offset would be
  # labeled as UTC while still showing local wall-clock time.
  def solr_timestamp(time)
    time.to_datetime.new_offset(0).strftime('%FT%TZ')
  end
end
|
65
|
+
end
|
66
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
|
4
|
+
module Indexers
|
5
|
+
# Indexes the default object rights
|
6
|
+
class DefaultObjectRightsIndexer
  attr_reader :cocina

  def initialize(cocina:, **)
    @cocina = cocina
  end

  # @return [Hash] the partial solr document for defaultObjectRights; empty
  #   when the object has no access template
  def to_solr
    return {} unless access_template

    {
      'use_statement_ssim' => use_statement,
      'copyright_ssim' => copyright,
      'rights_descriptions_ssim' => 'dark',
      'default_rights_descriptions_ssim' => Cocina::Models::Builders::RightsDescriptionBuilder.build(cocina)
    }
  end

  private

  # The access template held in the object's administrative metadata.
  def access_template
    cocina.administrative.accessTemplate
  end

  def use_statement
    access_template.useAndReproductionStatement
  end

  def copyright
    access_template.copyright
  end
end
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,226 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'stanford-mods'
|
4
|
+
|
5
|
+
class DorIndexing
|
6
|
+
module Indexers
|
7
|
+
# rubocop:disable Metrics/ClassLength
|
8
|
+
# Indexes the descriptive metadata
|
9
|
+
class DescriptiveMetadataIndexer
  # See https://github.com/sul-dlss/stanford-mods/blob/master/lib/stanford-mods/searchworks.rb#L244
  FORMAT = {
    'cartographic' => 'Map',
    'manuscript' => 'Archive/Manuscript',
    'mixed material' => 'Archive/Manuscript',
    'moving image' => 'Video',
    'notated music' => 'Music score',
    'software, multimedia' => 'Software/Multimedia',
    'sound recording-musical' => 'Music recording',
    'sound recording-nonmusical' => 'Sound recording',
    'sound recording' => 'Sound recording',
    'still image' => 'Image',
    'three dimensional object' => 'Object',
    'text' => 'Book'
  }.freeze

  attr_reader :cocina, :stanford_mods_record

  # Keeps the Cocina object and a Stanford::Mods record derived from its
  # description (the description is first mapped to MODS).
  def initialize(cocina:, **)
    @cocina = cocina
    mods_document = Cocina::Models::Mapping::ToMods::Description.transform(cocina.description, cocina.externalIdentifier)
    @stanford_mods_record = Stanford::Mods::Record.new.from_nk_node(mods_document.root)
  end

  # Fields whose value is blank are left out of the result.
  #
  # @return [Hash] the partial solr document for descriptive metadata
  # rubocop:disable Metrics/MethodLength
  # rubocop:disable Metrics/AbcSize
  def to_solr
    fields = {
      # title
      'sw_display_title_tesim' => title,
      # contributor
      'author_text_nostem_im' => author_primary, # primary author tokenized but not stemmed
      'sw_author_tesim' => author_primary, # used for author display in Argo
      'contributor_text_nostem_im' => author_all, # author names should be tokenized but not stemmed
      'contributor_orcids_ssim' => orcids,
      # topic
      'topic_ssim' => stanford_mods_record.topic_facet&.uniq,
      'topic_tesim' => stemmable_topics,
      # publication
      'originInfo_date_created_tesim' => creation_date,
      'originInfo_publisher_tesim' => publisher_name,
      'originInfo_place_placeTerm_tesim' => event_place, # do we want this?
      'sw_pub_date_facet_ssi' => stanford_mods_record.pub_year_int.to_s, # SW Date facet

      'metadata_format_ssim' => 'mods', # no longer used? https://github.com/search?q=org%3Asul-dlss+metadata_format_ssim&type=code

      # SW facets plus a friend facet
      'sw_format_ssim' => sw_format, # SW Resource Type facet
      'mods_typeOfResource_ssim' => resource_type, # MODS Resource Type facet
      'sw_genre_ssim' => stanford_mods_record.sw_genre, # SW Genre facet
      'sw_language_ssim' => stanford_mods_record.sw_language_facet, # SW Language facet
      'sw_subject_temporal_ssim' => stanford_mods_record.era_facet, # SW Era facet
      'sw_subject_geographic_ssim' => subject_geographic, # SW Region facet

      # all the descriptive data that we want to search on, with different flavors for better recall and precision
      'descriptive_tiv' => all_search_text, # ICU tokenized, ICU folded
      'descriptive_text_nostem_i' => all_search_text, # whitespace tokenized, ICU folded, word delimited
      'descriptive_teiv' => all_search_text # ICU tokenized, ICU folded, minimal stemming
    }

    fields.reject { |_field, value| value.blank? }
  end
  # rubocop:enable Metrics/MethodLength
  # rubocop:enable Metrics/AbcSize

  private

  def subject_temporal
    DorIndexing::Builders::TemporalBuilder.build(subjects)
  end

  def subject_geographic
    DorIndexing::Builders::GeographicBuilder.build(subjects)
  end

  # Subjects of the description as an array, memoized.
  def subjects
    @subjects ||= Array(cocina.description.subject)
  end

  def author_primary
    author_builder.build_primary
  end

  def author_all
    author_builder.build_all
  end

  # One AuthorBuilder shared by the primary and all-author fields.
  def author_builder
    @author_builder ||= DorIndexing::Builders::AuthorBuilder.new(Array(cocina.description.contributor))
  end

  def orcids
    DorIndexing::Builders::OrcidBuilder.build(Array(cocina.description.contributor))
  end

  def title
    Cocina::Models::Builders::TitleBuilder.build(cocina.description.title)
  end

  # Forms of the description as an array, memoized.
  def forms
    @forms ||= Array(cocina.description.form)
  end

  # Values of the top-level forms sourced from 'MODS resource types', leaving
  # out the 'collection' and 'manuscript' values.
  def resource_type
    @resource_type ||= forms.each_with_object([]) do |form, values|
      next unless form.source&.value == 'MODS resource types'
      next if %w[collection manuscript].include?(form.value)

      values << form.value
    end
  end

  # Chooses the format(s): a few resource type / genre / issuance combinations
  # map to a single fixed format; otherwise the mapped resource types are
  # combined with the capitalized genres.
  #
  # @return [Array<String>, nil] nil when nothing applies
  # rubocop:disable Metrics/CyclomaticComplexity
  # rubocop:disable Metrics/PerceivedComplexity
  # rubocop:disable Metrics/AbcSize
  def sw_format
    if resource_type?('software, multimedia')
      return ['Map'] if resource_type?('cartographic')
      return ['Dataset'] if genre?('dataset')
    end

    if resource_type?('text')
      return ['Archived website'] if genre?('archived website')
      return ['Book'] if issuance?('monographic')
      return ['Journal/Periodical'] if issuance?('continuing') || issuance?('serial') || frequency?
    end

    mapped = flat_forms_for('resource type').filter_map { |form| FORMAT[form.value&.downcase] }.uniq
    # 'Book' is dropped when the item is also an archive/manuscript
    mapped -= ['Book'] if mapped.include?('Archive/Manuscript')

    return mapped if mapped == ['Book']

    genres = flat_forms_for('genre').map { |form| form.value&.capitalize }.uniq

    (mapped + genres).presence
  end
  # rubocop:enable Metrics/CyclomaticComplexity
  # rubocop:enable Metrics/PerceivedComplexity
  # rubocop:enable Metrics/AbcSize

  # True when some 'resource type' form value equals the given type exactly.
  def resource_type?(type)
    flat_forms_for('resource type').any? { |form| form.value == type }
  end

  # True when some 'genre' form value equals the given genre exactly.
  def genre?(genre)
    flat_forms_for('genre').any? { |form| form.value == genre }
  end

  # True when some event note is an 'issuance' note with the given value.
  def issuance?(issuance)
    flat_event_notes.any? { |note| note.type == 'issuance' && note.value == issuance }
  end

  # True when some event note is a 'frequency' note.
  def frequency?
    flat_event_notes.any? { |note| note.type == 'frequency' }
  end

  # Flattened form values of the given type. When the form itself carries the
  # type, all of its flattened values are taken; otherwise only the flattened
  # values that carry the type themselves.
  def flat_forms_for(type)
    forms.flat_map do |form|
      values = flat_value(form)
      form.type == type ? values : values.select { |child| child.type == type }
    end
  end

  # All notes of all events (parallel events included), flattened and memoized.
  def flat_event_notes
    @flat_event_notes ||= events.flat_map do |event|
      flat_event(event).flat_map do |single_event|
        Array(single_event.note).flat_map { |note| flat_value(note) }
      end
    end
  end

  def pub_year
    DorIndexing::Selectors::PubYearSelector.build(events)
  end

  def creation_date
    @creation_date ||= DorIndexing::Builders::EventDateBuilder.build(creation_event, 'creation')
  end

  # Uses the first 'publication' event, falling back to the first event of any type.
  def event_place
    chosen = events.find { |event| event.type == 'publication' } || events.first
    DorIndexing::Builders::EventPlaceBuilder.build(chosen)
  end

  # Each event is represented by its first parallel event when it has one.
  def publisher_name
    candidates = events.map { |event| event.parallelEvent&.first || event }
    return if candidates.blank?

    DorIndexing::Builders::PublisherNameBuilder.build(candidates)
  end

  def stemmable_topics
    DorIndexing::Builders::TopicBuilder.build(Array(cocina.description.subject), filter: 'topic')
  end

  def publication_event
    @publication_event ||= DorIndexing::Selectors::EventSelector.select(events, 'publication')
  end

  def creation_event
    @creation_event ||= DorIndexing::Selectors::EventSelector.select(events, 'creation')
  end

  # Non-nil events of the description, memoized.
  def events
    @events ||= Array(cocina.description.event).compact
  end

  # The event's parallel events when present, otherwise the event itself in an array.
  def flat_event(event)
    event.parallelEvent.presence || Array(event)
  end

  # The first non-blank of parallelValue, groupedValue and structuredValue,
  # otherwise the value itself in an array.
  def flat_value(value)
    %i[parallelValue groupedValue structuredValue].each do |nested_attr|
      nested = value.public_send(nested_attr).presence
      return nested if nested
    end

    Array(value)
  end

  def all_search_text
    @all_search_text ||= DorIndexing::Builders::AllSearchTextBuilder.build(cocina.description)
  end
end
|
224
|
+
# rubocop:enable Metrics/ClassLength
|
225
|
+
end
|
226
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
|
4
|
+
module Indexers
|
5
|
+
# Indexes the embargo metadata
|
6
|
+
class EmbargoMetadataIndexer
  attr_reader :cocina

  def initialize(cocina:, **)
    @cocina = cocina
  end

  # These fields are used by the EmbargoReleaseService in dor-services-app
  # @return [Hash] the partial solr document for embargoMetadata; empty when
  #   the object has no embargo release date
  def to_solr
    release_date = embargo_release_date(cocina)
    return {} unless release_date.present?

    {
      'embargo_status_ssim' => ['embargoed'],
      'embargo_release_dtsim' => [release_date.utc.iso8601]
    }
  end

  private

  # @return the embargo release date, or nil when there is no embargo or the date is blank
  def embargo_release_date(cocina)
    release_date = cocina.access.embargo&.releaseDate
    release_date if release_date.present?
  end
end
|
31
|
+
end
|
32
|
+
end
|