dor_indexing 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/.rubocop.yml +355 -0
- data/Gemfile +16 -0
- data/Gemfile.lock +218 -0
- data/README.md +33 -0
- data/Rakefile +11 -0
- data/dor_indexing.gemspec +40 -0
- data/lib/dor_indexing/builders/all_search_text_builder.rb +58 -0
- data/lib/dor_indexing/builders/author_builder.rb +31 -0
- data/lib/dor_indexing/builders/collection_rights_description_builder.rb +29 -0
- data/lib/dor_indexing/builders/document_builder.rb +106 -0
- data/lib/dor_indexing/builders/event_date_builder.rb +71 -0
- data/lib/dor_indexing/builders/event_place_builder.rb +73 -0
- data/lib/dor_indexing/builders/geographic_builder.rb +82 -0
- data/lib/dor_indexing/builders/name_builder.rb +70 -0
- data/lib/dor_indexing/builders/orcid_builder.rb +62 -0
- data/lib/dor_indexing/builders/publisher_name_builder.rb +53 -0
- data/lib/dor_indexing/builders/temporal_builder.rb +56 -0
- data/lib/dor_indexing/builders/topic_builder.rb +96 -0
- data/lib/dor_indexing/cocina_repository.rb +24 -0
- data/lib/dor_indexing/indexers/administrative_tag_indexer.rb +69 -0
- data/lib/dor_indexing/indexers/collection_title_indexer.rb +27 -0
- data/lib/dor_indexing/indexers/composite_indexer.rb +36 -0
- data/lib/dor_indexing/indexers/content_metadata_indexer.rb +69 -0
- data/lib/dor_indexing/indexers/data_indexer.rb +66 -0
- data/lib/dor_indexing/indexers/default_object_rights_indexer.rb +36 -0
- data/lib/dor_indexing/indexers/descriptive_metadata_indexer.rb +226 -0
- data/lib/dor_indexing/indexers/embargo_metadata_indexer.rb +32 -0
- data/lib/dor_indexing/indexers/identifiable_indexer.rb +92 -0
- data/lib/dor_indexing/indexers/identity_metadata_indexer.rb +85 -0
- data/lib/dor_indexing/indexers/process_indexer.rb +63 -0
- data/lib/dor_indexing/indexers/releasable_indexer.rb +62 -0
- data/lib/dor_indexing/indexers/rights_metadata_indexer.rb +59 -0
- data/lib/dor_indexing/indexers/role_metadata_indexer.rb +31 -0
- data/lib/dor_indexing/indexers/workflow_indexer.rb +51 -0
- data/lib/dor_indexing/indexers/workflows_indexer.rb +40 -0
- data/lib/dor_indexing/marc_country.rb +359 -0
- data/lib/dor_indexing/selectors/event_selector.rb +112 -0
- data/lib/dor_indexing/selectors/pub_year_selector.rb +119 -0
- data/lib/dor_indexing/version.rb +5 -0
- data/lib/dor_indexing/workflow_fields.rb +63 -0
- data/lib/dor_indexing/workflow_solr_document.rb +93 -0
- data/lib/dor_indexing.rb +19 -0
- metadata +173 -0
@@ -0,0 +1,96 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
  module Builders
    # Builds the topic fields for a solr document
    class TopicBuilder
      # A run of spaces, commas, semicolons and backslashes at the end of a value.
      # Anchored with \Z (end of string, ignoring one final newline) instead of $,
      # because $ also matches at the end of the first line of a multi-line value
      # and would strip punctuation from the middle of the value.
      TRAILING_PUNCTUATION = /[ ,;\\]+\Z/
      private_constant :TRAILING_PUNCTUATION

      # @param [Array] subjects
      # @param [String] filter can either be 'topic' or 'name'
      # @param [Boolean] remove_trailing_punctuation when true, trailing spaces, commas,
      #   semicolons and backslashes are stripped from each value
      # @return [Array] the unique, non-nil values of the matching subjects
      def self.build(subjects, filter:, remove_trailing_punctuation: false)
        new(filter:, remove_trailing_punctuation:).build(subjects)
      end

      def initialize(filter:, remove_trailing_punctuation:)
        @filter = filter
        @remove_trailing_punctuation = remove_trailing_punctuation
      end

      # @param [Array] subjects
      # @return [Array] the unique, non-nil values of the subjects selected by the filter
      def build(subjects)
        topics(subjects).flat_map { |topic| flat_topic(topic) }.compact.uniq
      end

      private

      attr_reader :filter

      def remove_trailing_punctuation?
        @remove_trailing_punctuation
      end

      # Filter the subjects we are interested in.
      # Handles:
      #   parallelValue that contain structuredValue and the parallelValue has the type AND
      #   parallelValue that contain structuredValue each with their own type AND
      #   parallelValue that has a type conferred to the child AND
      #   structuredValue that contains structuredValue where the type can be at the higher or lower level.
      def topics(subjects)
        (
          subjects.flat_map { |subject| basic_value(subject) } +
          subjects.flat_map { |subject| structured_values(subject) } +
          parallel_subjects(subjects)
        ).compact
      end

      # Subjects with a parallelValue: typed ones are kept whole when the type matches the
      # filter, untyped ones are searched recursively. The nils produced for the
      # non-applicable branch are removed by the caller's compact.
      def parallel_subjects(subjects)
        parallels = subjects.select(&:parallelValue)
        parallels.flat_map { |subject| parallel_with_type(subject, subject.type) if subject.type } +
          parallels.flat_map { |subject| topics(subject.parallelValue) unless subject.type }
      end

      # Reduces a selected subject to its value(s), descending into parallel values.
      def flat_topic(value)
        if value.parallelValue.present?
          value.parallelValue.flat_map { |topic| flat_topic(topic) }
        elsif remove_trailing_punctuation?
          # trailing spaces, commas, semicolons, and backslashes are dropped
          Array(value.value&.sub(TRAILING_PUNCTUATION, ''))
        else
          Array(value.value)
        end
      end

      # @return the item when the type inherited from its parent matches the filter, otherwise nil
      def parallel_with_type(item, type_from_parent)
        return unless type_matches_filter?(type_from_parent)

        item
      end

      # For the 'name' filter, person and title subjects are rebuilt into display values;
      # everything else is returned as-is when its type matches the filter.
      def basic_value(subject)
        return create_fullname(subject) if filter == 'name' && subject.type == 'person'
        return create_title(subject) if filter == 'name' && subject.type == 'title'

        subject if type_matches_filter?(subject.type)
      end

      # Recurses into the structuredValue children whose type matches the filter.
      def structured_values(subject)
        selected = Array(subject.structuredValue).select { |child| type_matches_filter?(child.type) }

        topics(selected)
      end

      # Wraps each title string built by the cocina TitleBuilder in a DescriptiveValue.
      def create_title(title)
        titles = Cocina::Models::Builders::TitleBuilder.build([title], strategy: :all, add_punctuation: false)
        titles.map { |value| Cocina::Models::DescriptiveValue.new(value:) }
      end

      # Wraps each name string built by NameBuilder in a DescriptiveValue.
      def create_fullname(name)
        names = NameBuilder.build([name], strategy: :all)
        names.map { |value| Cocina::Models::DescriptiveValue.new(value:) }
      end

      # The 'name' filter accepts several subject types; any other filter must equal the type.
      def type_matches_filter?(type)
        (filter == 'name' && %w[person organization title occupation].include?(type)) ||
          type == filter
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
  # Abstract interface for looking up Cocina objects and their administrative tags.
  # Both methods raise NotImplementedError here; a concrete repository overrides them.
  # In DSA, the concrete implementation backs this with CocinaObjectStore.
  # In DIA, the concrete implementation backs this with Dor Services Client.
  class CocinaRepository
    # Error concrete implementations raise when a lookup fails.
    RepositoryError = Class.new(StandardError)

    # @param [String] druid
    # @return [Cocina::Models::DROWithMetadata,Cocina::Models::CollectionWithMetadata,Cocina::Models::AdminPolicyWithMetadata]
    # @raise [RepositoryError] if the object is not found or other error occurs
    def find(druid) = raise(NotImplementedError)

    # @param [String] druid
    # @return [Array<String>] administrative tags
    # @raise [RepositoryError] if the object is not found or other error occurs
    def administrative_tags(druid) = raise(NotImplementedError)
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
  module Indexers
    # Index administrative tags for an object.
    # NOTE: Most of this code was extracted from the dor-services gem:
    #       https://github.com/sul-dlss/dor-services/blob/v9.0.0/lib/dor/datastreams/identity_metadata_ds.rb#L196-L218
    class AdministrativeTagIndexer
      TAG_PART_DELIMITER = ' : '
      SPECIAL_TAG_TYPES_TO_INDEX = ['Project', 'Registered By'].freeze

      attr_reader :id

      # @param [String] id
      # @param [Array<String>] administrative_tags tags whose parts are separated by TAG_PART_DELIMITER
      def initialize(id:, administrative_tags:, **)
        @id = id
        @administrative_tags = administrative_tags
      end

      # @return [Hash] the partial solr document for administrative tags
      # rubocop:disable Metrics/MethodLength
      # rubocop:disable Metrics/AbcSize
      # rubocop:disable Metrics/CyclomaticComplexity
      def to_solr
        solr_doc = {
          'tag_ssim' => [],
          'tag_text_unstemmed_im' => [],
          'exploded_nonproject_tag_ssim' => []
        }
        administrative_tags.each do |tag|
          tag_prefix, rest = tag.split(TAG_PART_DELIMITER, 2)
          # An empty tag splits into no parts at all; there is nothing to index,
          # and calling downcase on the missing prefix would raise.
          next if tag_prefix.nil?

          prefix = tag_prefix.downcase.strip.gsub(/\s/, '_')

          solr_doc['tag_ssim'] << tag # for facet and display
          solr_doc['tag_text_unstemmed_im'] << tag # for search

          solr_doc['exploded_nonproject_tag_ssim'] += exploded_tags_from(tag) unless prefix == 'project'

          # Only the special tag types that actually carry a value get their own field.
          next unless SPECIAL_TAG_TYPES_TO_INDEX.include?(tag_prefix) && rest

          (solr_doc["#{prefix}_tag_ssim"] ||= []) << rest.strip

          if prefix == 'project'
            solr_doc['exploded_project_tag_ssim'] ||= []
            solr_doc['exploded_project_tag_ssim'] += exploded_tags_from(rest.strip)
          end
        end
        solr_doc
      end
      # rubocop:enable Metrics/MethodLength
      # rubocop:enable Metrics/AbcSize
      # rubocop:enable Metrics/CyclomaticComplexity

      private

      attr_reader :administrative_tags

      # solrize each possible prefix for the tag, inclusive of the full tag.
      # e.g., for a tag such as "A : B : C", this will solrize to an _ssim field
      # that contains ["A", "A : B", "A : B : C"].
      def exploded_tags_from(tag)
        tag_parts = tag.split(TAG_PART_DELIMITER)

        1.upto(tag_parts.count).map do |i|
          tag_parts.take(i).join(TAG_PART_DELIMITER)
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
  module Indexers
    # Indexes the titles of an object's parent collections
    class CollectionTitleIndexer
      attr_reader :cocina, :parent_collections

      # @param cocina the object being indexed
      # @param parent_collections the collection objects whose titles are indexed
      def initialize(cocina:, parent_collections:, **)
        @cocina = cocina
        @parent_collections = parent_collections
      end

      # @return [Hash] the partial solr document holding the parent collection titles
      def to_solr
        parent_collections.each_with_object({}) do |collection, solr_doc|
          title = Cocina::Models::Builders::TitleBuilder.build(collection.description.title)

          # create/append collection_title_tesim and collection_title_ssim
          ::Solrizer.insert_field(solr_doc, 'collection_title', title, :stored_searchable, :symbol)
        end
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
  module Indexers
    # Combines several indexer classes so they can be instantiated and run as one.
    # Borrowed from https://github.com/samvera/valkyrie/blob/master/lib/valkyrie/persistence/solr/composite_indexer.rb
    class CompositeIndexer
      attr_reader :indexers

      # @param [Array<Class>] indexers the indexer classes to combine
      def initialize(*indexers)
        @indexers = indexers
      end

      # Mirrors the interface of a single indexer class: instantiates every wrapped
      # indexer with the same keyword arguments.
      # @return [Instance]
      def new(**kwargs)
        Instance.new(indexers, **kwargs)
      end

      # Instance for a composite indexer
      class Instance
        attr_reader :indexers

        # @param [Array<Class>] indexers the indexer classes to instantiate
        # @raise [ArgumentError] naming the indexer class that could not be initialized
        def initialize(indexers, **kwargs)
          # The keyword arguments are named rather than forwarded anonymously (**):
          # anonymous forwarding from inside a block needs Ruby 3.2+ and, as far as
          # I recall, is rejected as a SyntaxError by Ruby 3.3.0 (confirm against the
          # Ruby release notes).
          @indexers = indexers.map do |indexer_class|
            indexer_class.new(**kwargs)
          rescue ArgumentError => e
            raise ArgumentError, "Unable to initialize #{indexer_class}. #{e.message}"
          end
        end

        # @return [Hash] the merged solr document for all the sub-indexers
        def to_solr
          indexers.map(&:to_solr).inject({}, &:merge)
        end
      end
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
  module Indexers
    # Indexes the content metadata
    class ContentMetadataIndexer
      # Value indexed in content_type_ssim for each Cocina object type;
      # object types not listed here are indexed as 'file'.
      TYPES = {
        Cocina::Models::ObjectType.image => 'image',
        Cocina::Models::ObjectType.manuscript => 'image',
        Cocina::Models::ObjectType.book => 'book',
        Cocina::Models::ObjectType.map => 'map',
        Cocina::Models::ObjectType.three_dimensional => '3d',
        Cocina::Models::ObjectType.media => 'media',
        Cocina::Models::ObjectType.webarchive_seed => 'webarchive-seed',
        Cocina::Models::ObjectType.webarchive_binary => 'webarchive-binary',
        Cocina::Models::ObjectType.geo => 'geo',
        Cocina::Models::ObjectType.document => 'document'
      }.freeze

      attr_reader :cocina

      def initialize(cocina:, **)
        @cocina = cocina
      end

      # @return [Hash] the partial solr document for contentMetadata
      def to_solr
        {
          'content_type_ssim' => type(cocina.type),
          'content_file_mimetypes_ssim' => files.map(&:hasMimeType).uniq,
          'content_file_count_itsi' => files.size,
          'shelved_content_file_count_itsi' => shelved_files.size,
          'resource_count_itsi' => file_sets.size,
          'preserved_size_dbtsi' => preserved_size, # double (trie) to support very large sizes
          'content_file_roles_ssim' => files.filter_map(&:use),
          # first_shelved_image is neither indexed nor multiple
          'first_shelved_image_ss' => first_shelved_image
        }
      end

      private

      # Filename of the first shelved file whose name ends in 'jp2', or nil when there is none.
      def first_shelved_image
        image = shelved_files.find { |shelved| shelved.filename.end_with?('jp2') }
        image&.filename
      end

      # Files flagged for shelving.
      def shelved_files
        files.select { |candidate| candidate.administrative.shelve }
      end

      # Total size of the files flagged for preservation; files without a size are skipped.
      def preserved_size
        preserved = files.select { |candidate| candidate.administrative.sdrPreserve }
        preserved.filter_map(&:size).sum
      end

      # Every file of every file set, memoized.
      def files
        @files ||= file_sets.flat_map { |file_set| file_set.structural.contains }
      end

      # The object's file sets (empty when there is no structural metadata), memoized.
      def file_sets
        @file_sets ||= Array(cocina.structural&.contains)
      end

      # Looks the object type up in TYPES, falling back to 'file'.
      def type(object_type)
        TYPES.fetch(object_type) { 'file' }
      end
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
  module Indexers
    # Indexing provided by ActiveFedora
    class DataIndexer
      attr_reader :cocina, :workflow_client

      def initialize(cocina:, workflow_client:, **)
        @cocina = cocina
        @workflow_client = workflow_client
      end

      # @return [Hash] the partial solr document (string keys) for the basic object data,
      #   merged with the fields returned by DorIndexing::WorkflowFields
      # rubocop:disable Metrics/AbcSize
      # rubocop:disable Metrics/MethodLength
      def to_solr
        {}.tap do |solr_doc|
          solr_doc[:id] = cocina.externalIdentifier
          solr_doc['current_version_isi'] = cocina.version # Argo Facet field "Version"
          solr_doc['obj_label_tesim'] = cocina.label

          solr_doc['modified_latest_dttsi'] = modified_latest
          solr_doc['created_at_dttsi'] = created_at

          # is_member_of_collection_ssim is used by dor-services-app for querying for members of a
          # collection and it is a facet in Argo
          solr_doc['is_member_of_collection_ssim'] = legacy_collections
          solr_doc['is_governed_by_ssim'] = legacy_apo # Argo facet

          # Used so that DSA can generate public XML whereas a constituent can find the virtual object it is part of.
          solr_doc['has_constituents_ssim'] = virtual_object_constituents
        end.merge(DorIndexing::WorkflowFields.for(druid: cocina.externalIdentifier, version: cocina.version, workflow_client:))
           .transform_keys(&:to_s) # the :id key above is a symbol; emit string keys only
      end
      # rubocop:enable Metrics/AbcSize
      # rubocop:enable Metrics/MethodLength

      # @return [String] the modification time as a UTC timestamp (e.g. 2023-01-02T03:04:05Z)
      def modified_latest
        solr_timestamp(cocina.modified)
      end

      # @return [String] the creation time as a UTC timestamp (e.g. 2023-01-02T03:04:05Z)
      def created_at
        solr_timestamp(cocina.created)
      end

      # @return [Array<String>] the collections the object is a member of, as info:fedora/ URIs;
      #   always empty for admin policies and collections
      def legacy_collections
        case cocina.type
        when Cocina::Models::ObjectType.admin_policy, Cocina::Models::ObjectType.collection
          []
        else
          Array(cocina.structural&.isMemberOf).map { |col_id| "info:fedora/#{col_id}" }
        end
      end

      # @return the members of the first member order, or nil when the object is not a DRO
      #   or has no member orders
      def virtual_object_constituents
        return unless cocina.dro?

        Array(cocina.structural&.hasMemberOrders).first&.members
      end

      # @return [String] the governing admin policy as an info:fedora/ URI
      def legacy_apo
        "info:fedora/#{cocina.administrative.hasAdminPolicy}"
      end

      private

      # The format ends in a literal "Z", so the value is shifted to offset 0 (UTC) first;
      # otherwise a time carrying another offset would be labelled as UTC while still
      # showing its local clock reading.
      def solr_timestamp(time)
        time.to_datetime.new_offset(0).strftime('%FT%TZ')
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
  module Indexers
    # Indexes the default object rights
    class DefaultObjectRightsIndexer
      attr_reader :cocina

      def initialize(cocina:, **)
        @cocina = cocina
      end

      # @return [Hash] the partial solr document for defaultObjectRights;
      #   empty when the object has no access template
      def to_solr
        template = cocina.administrative.accessTemplate
        return {} unless template

        {
          'use_statement_ssim' => template.useAndReproductionStatement,
          'copyright_ssim' => template.copyright,
          'rights_descriptions_ssim' => 'dark',
          'default_rights_descriptions_ssim' => Cocina::Models::Builders::RightsDescriptionBuilder.build(cocina)
        }
      end
    end
  end
end
|
@@ -0,0 +1,226 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'stanford-mods'
|
4
|
+
|
5
|
+
class DorIndexing
  module Indexers
    # rubocop:disable Metrics/ClassLength
    # Indexes the descriptive metadata
    class DescriptiveMetadataIndexer
      # See https://github.com/sul-dlss/stanford-mods/blob/master/lib/stanford-mods/searchworks.rb#L244
      FORMAT = {
        'cartographic' => 'Map',
        'manuscript' => 'Archive/Manuscript',
        'mixed material' => 'Archive/Manuscript',
        'moving image' => 'Video',
        'notated music' => 'Music score',
        'software, multimedia' => 'Software/Multimedia',
        'sound recording-musical' => 'Music recording',
        'sound recording-nonmusical' => 'Sound recording',
        'sound recording' => 'Sound recording',
        'still image' => 'Image',
        'three dimensional object' => 'Object',
        'text' => 'Book'
      }.freeze

      attr_reader :cocina, :stanford_mods_record

      # Besides storing the cocina object, converts its description to MODS and wraps it
      # in a Stanford::Mods::Record, which supplies several of the indexed fields.
      def initialize(cocina:, **)
        @cocina = cocina
        mods_ng = Cocina::Models::Mapping::ToMods::Description.transform(cocina.description, cocina.externalIdentifier)
        @stanford_mods_record = Stanford::Mods::Record.new.from_nk_node(mods_ng.root)
      end

      # @return [Hash] the partial solr document for descriptive metadata;
      #   fields whose value is blank are left out
      # rubocop:disable Metrics/MethodLength
      # rubocop:disable Metrics/AbcSize
      def to_solr
        {
          # title
          'sw_display_title_tesim' => title,
          # contributor
          'author_text_nostem_im' => author_primary, # primary author tokenized but not stemmed
          'sw_author_tesim' => author_primary, # used for author display in Argo
          'contributor_text_nostem_im' => author_all, # author names should be tokenized but not stemmed
          'contributor_orcids_ssim' => orcids,
          # topic
          'topic_ssim' => stanford_mods_record.topic_facet&.uniq,
          'topic_tesim' => stemmable_topics,
          # publication
          'originInfo_date_created_tesim' => creation_date,
          'originInfo_publisher_tesim' => publisher_name,
          'originInfo_place_placeTerm_tesim' => event_place, # do we want this?
          'sw_pub_date_facet_ssi' => stanford_mods_record.pub_year_int.to_s, # SW Date facet

          'metadata_format_ssim' => 'mods', # no longer used? https://github.com/search?q=org%3Asul-dlss+metadata_format_ssim&type=code

          # SW facets plus a friend facet
          'sw_format_ssim' => sw_format, # SW Resource Type facet
          'mods_typeOfResource_ssim' => resource_type, # MODS Resource Type facet
          'sw_genre_ssim' => stanford_mods_record.sw_genre, # SW Genre facet
          'sw_language_ssim' => stanford_mods_record.sw_language_facet, # SW Language facet
          'sw_subject_temporal_ssim' => stanford_mods_record.era_facet, # SW Era facet
          'sw_subject_geographic_ssim' => subject_geographic, # SW Region facet

          # all the descriptive data that we want to search on, with different flavors for better recall and precision
          'descriptive_tiv' => all_search_text, # ICU tokenized, ICU folded
          'descriptive_text_nostem_i' => all_search_text, # whitespace tokenized, ICU folded, word delimited
          'descriptive_teiv' => all_search_text # ICU tokenized, ICU folded, minimal stemming
        }.select { |_k, v| v.present? }
      end
      # rubocop:enable Metrics/MethodLength
      # rubocop:enable Metrics/AbcSize

      private

      def subject_temporal
        DorIndexing::Builders::TemporalBuilder.build(subjects)
      end

      def subject_geographic
        DorIndexing::Builders::GeographicBuilder.build(subjects)
      end

      # The description's subjects (empty array when there are none), memoized.
      def subjects
        @subjects ||= Array(cocina.description.subject)
      end

      # The description's contributors (empty array when there are none), memoized.
      def contributors
        @contributors ||= Array(cocina.description.contributor)
      end

      def author_primary
        author_builder.build_primary
      end

      def author_all
        author_builder.build_all
      end

      def author_builder
        @author_builder ||= DorIndexing::Builders::AuthorBuilder.new(contributors)
      end

      def orcids
        DorIndexing::Builders::OrcidBuilder.build(contributors)
      end

      def title
        Cocina::Models::Builders::TitleBuilder.build(cocina.description.title)
      end

      # The description's forms (empty array when there are none), memoized.
      def forms
        @forms ||= Array(cocina.description.form)
      end

      # Values of the top-level forms sourced from 'MODS resource types',
      # leaving out 'collection' and 'manuscript'.
      def resource_type
        @resource_type ||= forms.select do |form|
          form.source&.value == 'MODS resource types' &&
            %w[collection manuscript].exclude?(form.value)
        end.map(&:value)
      end

      # Derives the format facet: a handful of resource type / genre / issuance combinations
      # map to a single fixed format; otherwise the formats come from the resource types
      # (via FORMAT) followed by the capitalized genres.
      # @return [Array<String>, nil] nil when no format could be determined
      # rubocop:disable Metrics/CyclomaticComplexity
      # rubocop:disable Metrics/PerceivedComplexity
      # rubocop:disable Metrics/AbcSize
      def sw_format
        return ['Map'] if resource_type?('software, multimedia') && resource_type?('cartographic')
        return ['Dataset'] if resource_type?('software, multimedia') && genre?('dataset')
        return ['Archived website'] if resource_type?('text') && genre?('archived website')
        return ['Book'] if resource_type?('text') && issuance?('monographic')
        return ['Journal/Periodical'] if resource_type?('text') && (issuance?('continuing') || issuance?('serial') || frequency?)

        resource_type_formats = flat_forms_for('resource type').map { |form| FORMAT[form.value&.downcase] }.uniq.compact
        # A manuscript that is also text is an Archive/Manuscript, not a Book.
        resource_type_formats.delete('Book') if resource_type_formats.include?('Archive/Manuscript')

        return resource_type_formats if resource_type_formats == ['Book']

        # compact: a genre form without a value must not put nil into the indexed field
        genre_formats = flat_forms_for('genre').map { |form| form.value&.capitalize }.uniq.compact

        (resource_type_formats + genre_formats).presence
      end
      # rubocop:enable Metrics/CyclomaticComplexity
      # rubocop:enable Metrics/PerceivedComplexity
      # rubocop:enable Metrics/AbcSize

      def resource_type?(type)
        flat_forms_for('resource type').any? { |form| form.value == type }
      end

      def genre?(genre)
        flat_forms_for('genre').any? { |form| form.value == genre }
      end

      def issuance?(issuance)
        flat_event_notes.any? { |note| note.type == 'issuance' && note.value == issuance }
      end

      def frequency?
        flat_event_notes.any? { |note| note.type == 'frequency' }
      end

      # Flattened form values of the given type: when the form itself has the type all of its
      # flattened values are taken, otherwise only the flattened values carrying the type.
      def flat_forms_for(type)
        forms.flat_map do |form|
          if form.type == type
            flat_value(form)
          else
            flat_value(form).select { |form_value| form_value.type == type }
          end
        end
      end

      # Flattened notes of all events, including those of parallel events; memoized.
      def flat_event_notes
        @flat_event_notes ||= events.flat_map { |event| flat_event(event) }.flat_map do |event|
          Array(event.note).flat_map do |note|
            flat_value(note)
          end
        end
      end

      def pub_year
        DorIndexing::Selectors::PubYearSelector.build(events)
      end

      def creation_date
        @creation_date ||= DorIndexing::Builders::EventDateBuilder.build(creation_event, 'creation')
      end

      # Uses the first publication event, falling back to the first event of any type.
      def event_place
        place_event = events.find { |event| event.type == 'publication' } || events.first
        DorIndexing::Builders::EventPlaceBuilder.build(place_event)
      end

      # For an event with parallel events only the first parallel event is considered.
      def publisher_name
        publish_events = events.map { |event| event.parallelEvent&.first || event }
        return if publish_events.blank?

        DorIndexing::Builders::PublisherNameBuilder.build(publish_events)
      end

      def stemmable_topics
        DorIndexing::Builders::TopicBuilder.build(subjects, filter: 'topic')
      end

      def publication_event
        @publication_event ||= DorIndexing::Selectors::EventSelector.select(events, 'publication')
      end

      def creation_event
        @creation_event ||= DorIndexing::Selectors::EventSelector.select(events, 'creation')
      end

      # The description's events without nils (empty array when there are none), memoized.
      def events
        @events ||= Array(cocina.description.event).compact
      end

      # The event's parallel events when it has any, otherwise the event itself in an array.
      def flat_event(event)
        event.parallelEvent.presence || Array(event)
      end

      # The first non-empty of parallelValue, groupedValue and structuredValue,
      # otherwise the value itself in an array.
      def flat_value(value)
        value.parallelValue.presence || value.groupedValue.presence || value.structuredValue.presence || Array(value)
      end

      def all_search_text
        @all_search_text ||= DorIndexing::Builders::AllSearchTextBuilder.build(cocina.description)
      end
    end
    # rubocop:enable Metrics/ClassLength
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
  module Indexers
    # Indexes the embargo metadata
    class EmbargoMetadataIndexer
      attr_reader :cocina

      def initialize(cocina:, **)
        @cocina = cocina
      end

      # These fields are used by the EmbargoReleaseService in dor-services-app
      # @return [Hash] the partial solr document for embargoMetadata;
      #   empty when the object has no embargo release date
      def to_solr
        release_date = embargo_release_date(cocina)
        return {} unless release_date.present?

        {
          'embargo_status_ssim' => ['embargoed'],
          'embargo_release_dtsim' => [release_date.utc.iso8601]
        }
      end

      private

      # @return the embargo release date, or nil when there is no embargo or no date on it
      def embargo_release_date(cocina)
        embargo = cocina.access.embargo
        return unless embargo&.releaseDate.present?

        embargo.releaseDate
      end
    end
  end
end
|